/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_device_info.h"
#include "pvr_end_of_tile.h"
#include "pvr_formats.h"
#include "pvr_hw_pass.h"
#include "pvr_job_common.h"
#include "pvr_job_render.h"
#include "pvr_limits.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/bitscan.h"
#include "util/compiler.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_pack_color.h"
#include "vk_alloc.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/* Structure used to pass data to pvr_compute_generate_control_stream(). */
struct pvr_compute_kernel_info {
   pvr_dev_addr_t indirect_buffer_addr;
   bool global_offsets_present;
   uint32_t usc_common_size;
   uint32_t usc_unified_size;
   uint32_t pds_temp_size;
   uint32_t pds_data_size;
   enum PVRX(CDMCTRL_USC_TARGET) usc_target;
   bool is_fence;
   uint32_t pds_data_offset;
   uint32_t pds_code_offset;
   enum PVRX(CDMCTRL_SD_TYPE) sd_type;
   bool usc_common_shared;
   uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t max_instances;
};

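/* Free the resources owned by a sub-command (control streams, uploaded
 * tables, queued transfer commands), then unlink it from the command buffer
 * and free the sub-command itself.
 */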
static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                        struct pvr_sub_cmd *sub_cmd)
{
   switch (sub_cmd->type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      pvr_csb_finish(&sub_cmd->gfx.control_stream);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo);
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      pvr_csb_finish(&sub_cmd->compute.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      list_for_each_entry_safe (struct pvr_transfer_cmd,
                                transfer_cmd,
                                &sub_cmd->transfer.transfer_cmds,
                                link) {
         list_del(&transfer_cmd->link);
         vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
      }
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
      break;
   }

   list_del(&sub_cmd->link);
   vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
}

static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe (struct pvr_sub_cmd,
                             sub_cmd,
                             &cmd_buffer->sub_cmds,
                             link) {
      pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
   }
}

static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
   struct pvr_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);

   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.attachments);
   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.clear_values);

   pvr_cmd_buffer_free_sub_cmds(cmd_buffer);

   list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
      list_del(&bo->link);
      pvr_bo_free(cmd_buffer->device, bo);
   }

   util_dynarray_fini(&cmd_buffer->scissor_array);
   util_dynarray_fini(&cmd_buffer->depth_bias_array);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}

static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
                                      struct vk_command_pool *pool,
                                      VkCommandBufferLevel level,
                                      VkCommandBuffer *pCommandBuffer)
{
   struct pvr_cmd_buffer *cmd_buffer;
   VkResult result;

   cmd_buffer = vk_zalloc(&pool->alloc,
                          sizeof(*cmd_buffer),
                          8U,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmd_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = vk_command_buffer_init(&cmd_buffer->vk, pool, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->vk.destroy = pvr_cmd_buffer_destroy;
   cmd_buffer->device = device;

   util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
   util_dynarray_init(&cmd_buffer->scissor_array, NULL);

   cmd_buffer->state.status = VK_SUCCESS;
   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;

   list_inithead(&cmd_buffer->sub_cmds);
   list_inithead(&cmd_buffer->bo_list);

   *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

VkResult
pvr_AllocateCommandBuffers(VkDevice _device,
                           const VkCommandBufferAllocateInfo *pAllocateInfo,
                           VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = pvr_cmd_buffer_create(device,
                                     pool,
                                     pAllocateInfo->level,
                                     &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      while (i--) {
         VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
         pvr_cmd_buffer_destroy(cmd_buffer);
      }

      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

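/* Record which pipeline stages the new sub-command will write to, so that
 * later barriers know which stages may still have outstanding work. Graphics
 * sub-commands cover both the geometry and fragment stages.
 */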
static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
                                           enum pvr_sub_cmd_type type)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   uint32_t barriers;

   switch (type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
      break;

   default:
      barriers = 0;
      pvr_finishme("Unsupported sub-command type %d", type);
      break;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
      state->barriers_needed[i] |= barriers;
}

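/* Upload the depth bias and scissor arrays accumulated for this sub-command
 * into GPU buffers aligned to the SLC cache line size. The CPU-side arrays
 * are cleared only once both uploads have succeeded.
 */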
static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_gfx *const sub_cmd)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   VkResult result;

   assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);

   if (cmd_buffer->depth_bias_array.size > 0) {
      result =
         pvr_gpu_upload(device,
                        device->heaps.general_heap,
                        util_dynarray_begin(&cmd_buffer->depth_bias_array),
                        cmd_buffer->depth_bias_array.size,
                        cache_line_size,
                        &sub_cmd->depth_bias_bo);
      if (result != VK_SUCCESS)
         return result;
   }

   if (cmd_buffer->scissor_array.size > 0) {
      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              util_dynarray_begin(&cmd_buffer->scissor_array),
                              cmd_buffer->scissor_array.size,
                              cache_line_size,
                              &sub_cmd->scissor_bo);
      if (result != VK_SUCCESS)
         goto err_free_depth_bias_bo;
   }

   util_dynarray_clear(&cmd_buffer->depth_bias_array);
   util_dynarray_clear(&cmd_buffer->scissor_array);

   return VK_SUCCESS;

err_free_depth_bias_bo:
   pvr_bo_free(device, sub_cmd->depth_bias_bo);
   sub_cmd->depth_bias_bo = NULL;

   return result;
}

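/* Emit VDMCTRL_PPP_STATE0/1 words pointing at the framebuffer's pre-packed
 * PPP state buffer: STATE0 takes the address MSBs and the word count, STATE1
 * the address LSBs.
 */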
static VkResult
pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer,
                              struct pvr_sub_cmd_gfx *const sub_cmd)
{
   struct pvr_framebuffer *framebuffer =
      cmd_buffer->state.render_pass_info.framebuffer;

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) {
      state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr;
      state0.word_count = framebuffer->ppp_state_size;
   }

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) {
      state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr;
   }

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
                              const void *const data,
                              const size_t size,
                              struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_gpu_upload(device,
                           device->heaps.general_heap,
                           data,
                           size,
                           cache_line_size,
                           &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
                          const void *const code,
                          const size_t code_size,
                          uint64_t code_alignment,
                          struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   code_alignment = MAX2(code_alignment, cache_line_size);

   result =
      pvr_gpu_upload_usc(device, code, code_size, code_alignment, &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
                          const uint32_t *data,
                          uint32_t data_size_dwords,
                          uint32_t data_alignment,
                          const uint32_t *code,
                          uint32_t code_size_dwords,
                          uint32_t code_alignment,
                          uint64_t min_alignment,
                          struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   VkResult result;

   result = pvr_gpu_upload_pds(device,
                               data,
                               data_size_dwords,
                               data_alignment,
                               code,
                               code_size_dwords,
                               code_alignment,
                               min_alignment,
                               pds_upload_out);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);

   return VK_SUCCESS;
}

static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
                               const uint32_t *data,
                               uint32_t data_size_dwords,
                               uint32_t data_alignment,
                               struct pvr_pds_upload *const pds_upload_out)
{
   return pvr_cmd_buffer_upload_pds(cmd_buffer,
                                    data,
                                    data_size_dwords,
                                    data_alignment,
                                    NULL,
                                    0,
                                    0,
                                    data_alignment,
                                    pds_upload_out);
}

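/* Upload the end-of-tile USC program for this job, patch PBE state word 0
 * into it (see the FIXME below about generating the program at run-time
 * instead), then generate and upload the data segment of the PDS pixel
 * event program that launches it.
 */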
static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
   struct pvr_cmd_buffer *const cmd_buffer,
   const uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_event_program pixel_event_program = {
      /* No data to DMA, just a DOUTU needed. */
      .num_emit_word_pairs = 0,
   };
   const uint32_t staging_buffer_size =
      cmd_buffer->device->pixel_event_data_size_in_dwords * sizeof(uint32_t);
   const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
   struct pvr_device *const device = cmd_buffer->device;
   /* FIXME: This should come from the compiler for the USC pixel program. */
   const uint32_t usc_temp_count = 0;
   struct pvr_bo *usc_eot_program;
   uint8_t *usc_eot_program_ptr;
   uint32_t *staging_buffer;
   VkResult result;

   result = pvr_cmd_buffer_upload_usc(cmd_buffer,
                                      pvr_end_of_tile_program,
                                      sizeof(pvr_end_of_tile_program),
                                      4,
                                      &usc_eot_program);
   if (result != VK_SUCCESS)
      return result;

   assert((pbe_cs_words[1] & 0x3F) == 0x20);

   /* FIXME: Stop patching the framebuffer address (this will require the
    * end-of-tile program to be generated at run-time).
    */
   pvr_bo_cpu_map(device, usc_eot_program);
   usc_eot_program_ptr = usc_eot_program->bo->map;
   usc_eot_program_ptr[6] = (pbe_cs_words[0] >> 0) & 0xFF;
   usc_eot_program_ptr[7] = (pbe_cs_words[0] >> 8) & 0xFF;
   usc_eot_program_ptr[8] = (pbe_cs_words[0] >> 16) & 0xFF;
   usc_eot_program_ptr[9] = (pbe_cs_words[0] >> 24) & 0xFF;
   pvr_bo_cpu_unmap(device, usc_eot_program);

   pvr_pds_setup_doutu(&pixel_event_program.task_control,
                       usc_eot_program->vma->dev_addr.addr,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* TODO: We could skip allocating this and generate directly into the
    * device buffer, removing one allocation and memcpy() per job. Would that
    * speed things up noticeably?
    */
   staging_buffer = vk_alloc(allocator,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_usc_pixel_program;
   }

   /* Generate the data segment. The code segment was uploaded earlier when
    * setting up the PDS static heap data.
    */
   pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
                                             staging_buffer,
                                             &device->pdevice->dev_info);

   result = pvr_cmd_buffer_upload_pds_data(
      cmd_buffer,
      staging_buffer,
      cmd_buffer->device->pixel_event_data_size_in_dwords,
      4,
      pds_upload_out);
   if (result != VK_SUCCESS)
      goto err_free_pixel_event_staging_buffer;

   vk_free(allocator, staging_buffer);

   return VK_SUCCESS;

err_free_pixel_event_staging_buffer:
   vk_free(allocator, staging_buffer);

err_free_usc_pixel_program:
   list_del(&usc_eot_program->link);
   pvr_bo_free(device, usc_eot_program);

   return result;
}

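/* Pack a Vulkan clear color into the 32-bit value written by the hardware.
 * As an illustration, a float32 clear color of (1.0, 0.0, 0.0, 1.0) should
 * pack to 0xFF0000FF for PIPE_FORMAT_R8G8B8A8_UNORM (R in the lowest byte).
 */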
static uint32_t pvr_get_hw_clear_color(VkFormat vk_format,
                                       const VkClearValue *clear_value)
{
   union util_color uc = { .ui = 0 };

   switch (vk_format) {
   case VK_FORMAT_B8G8R8A8_UNORM:
      util_pack_color(clear_value->color.float32,
                      PIPE_FORMAT_R8G8B8A8_UNORM,
                      &uc);
      break;

   default:
      assert(!"Unsupported format");
      uc.ui[0] = 0;
      break;
   }

   return uc.ui[0];
}

static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                        uint32_t idx,
                                        pvr_dev_addr_t *const addr_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_render_pass *pass = render_pass_info->pass;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   ASSERTED const struct pvr_load_op *load_op = hw_render->client_data;
   const struct pvr_renderpass_colorinit *color_init =
      &hw_render->color_init[0];
   const struct pvr_render_pass_attachment *attachment =
      &pass->attachments[color_init->driver_id];
   const VkClearValue *clear_value =
      &render_pass_info->clear_values[color_init->driver_id];
   uint32_t hw_clear_value;
   struct pvr_bo *clear_bo;
   VkResult result;

   pvr_finishme("Add missing load op data support");

   assert(load_op->is_hw_object);
   assert(hw_render->color_init_count == 1);

   /* FIXME: add support for RENDERPASS_SURFACE_INITOP_LOAD. */
   assert(color_init->op == RENDERPASS_SURFACE_INITOP_CLEAR);

   /* FIXME: do this at the point we store the clear values? */
   hw_clear_value = pvr_get_hw_clear_color(attachment->vk_format, clear_value);

   result = pvr_cmd_buffer_upload_general(cmd_buffer,
                                          &hw_clear_value,
                                          sizeof(hw_clear_value),
                                          &clear_bo);
   if (result != VK_SUCCESS)
      return result;

   *addr_out = clear_bo->vma->dev_addr;

   return VK_SUCCESS;
}

static VkResult pvr_load_op_pds_data_create_and_upload(
   struct pvr_cmd_buffer *cmd_buffer,
   uint32_t idx,
   pvr_dev_addr_t constants_addr,
   struct pvr_pds_upload *const pds_upload_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_load_op *load_op =
      render_pass_info->pass->hw_setup->renders[idx].client_data;
   struct pvr_device *device = cmd_buffer->device;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_pixel_shader_sa_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   program.num_texture_dma_kicks = 1;

   pvr_csb_pack (&program.texture_dma_address[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
                 value) {
      value.sbase = constants_addr;
   }

   pvr_csb_pack (&program.texture_dma_control[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
                 value) {
      value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
      value.a0 = load_op->shareds_dest_offset;
      value.bsize = load_op->shareds_count;
   }

   pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);

   staging_buffer_size = program.data_size * sizeof(*staging_buffer);

   staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
                                                       staging_buffer,
                                                       dev_info);

   result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
                                           staging_buffer,
                                           program.data_size,
                                           1,
                                           pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
      return result;
   }

   vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);

   return VK_SUCCESS;
}

/* FIXME: Should this function be specific to the HW background object, in
 * which case its name should be changed, or should it have the load op
 * structure passed in?
 */
static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                   uint32_t idx,
                                   struct pvr_pds_upload *const pds_upload_out)
{
   pvr_dev_addr_t constants_addr;
   VkResult result;

   result =
      pvr_load_op_constants_create_and_upload(cmd_buffer, idx, &constants_addr);
   if (result != VK_SUCCESS)
      return result;

   return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
                                                 idx,
                                                 constants_addr,
                                                 pds_upload_out);
}

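/* Pack the CR_PDS_BGRND0/1/3 register values describing the background
 * object's PDS programs: the fragment and texture state program addresses,
 * the texture data address, and the shared/texture-state/temp sizes in
 * hardware allocation units.
 */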
static void pvr_pds_bgnd_pack_state(
   const struct pvr_load_op *load_op,
   const struct pvr_pds_upload *load_op_program,
   uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
{
   pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
      value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
      value.texunicode_addr =
         PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
   }

   pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
      value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
   }

   pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
      value.usc_sharedsize =
         DIV_ROUND_UP(load_op->const_shareds_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
      value.pds_texturestatesize = DIV_ROUND_UP(
         load_op_program->data_size,
         PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
      value.pds_tempsize =
         DIV_ROUND_UP(load_op->temps_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
   }
}

/**
 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
 * format.
 *
 * \param[in] pitch     Width pitch in bytes.
 * \param[in] vk_format Vulkan image format.
 * \return Stride in pixels.
 */
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
{
   const unsigned int cpp = vk_format_get_blocksize(vk_format);

   assert(pitch % cpp == 0);

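   /* As an example, assuming VK_FORMAT_B8G8R8A8_UNORM (4 bytes per pixel),
    * a 256 byte pitch would give a 64 pixel stride.
    */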
   return pitch / cpp;
}

static void pvr_setup_pbe_state(
   const struct pvr_device_info *dev_info,
   struct pvr_framebuffer *framebuffer,
   uint32_t mrt_index,
   const struct usc_mrt_resource *mrt_resource,
   const struct pvr_image_view *const iview,
   const VkRect2D *render_area,
   const bool down_scale,
   const uint32_t samples,
   uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
{
   const struct pvr_image *image = iview->image;
   uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;

   struct pvr_pbe_surf_params surface_params;
   struct pvr_pbe_render_params render_params;
   bool with_packed_usc_channel;
   const uint8_t *swizzle;
   uint32_t position;

   /* down_scale should be true when performing a resolve, in which case there
    * should be more than one sample.
    */
   assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));

   /* Setup surface parameters. */

   if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
      switch (iview->vk.format) {
      case VK_FORMAT_B8G8R8A8_UNORM:
         with_packed_usc_channel = true;
         break;
      case VK_FORMAT_D32_SFLOAT:
         with_packed_usc_channel = false;
         break;
      default:
         unreachable("Unsupported Vulkan image format");
      }
   } else {
      with_packed_usc_channel = false;
   }

   swizzle = pvr_get_format_swizzle(iview->vk.format);
   memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));

   pvr_pbe_get_src_format_and_gamma(iview->vk.format,
                                    PVR_PBE_GAMMA_NONE,
                                    with_packed_usc_channel,
                                    &surface_params.source_format,
                                    &surface_params.gamma);

   surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
   surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
   surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);

   /* FIXME: Should we have an inline function to return the address of a mip
    * level?
    */
   surface_params.addr =
      PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
                          image->mip_levels[iview->vk.base_mip_level].offset);

   surface_params.mem_layout = image->memlayout;
   surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
   surface_params.depth = iview->vk.extent.depth;
   surface_params.width = iview->vk.extent.width;
   surface_params.height = iview->vk.extent.height;
   surface_params.z_only_render = false;
   surface_params.down_scale = down_scale;
   surface_params.msaa_mode = samples;

   /* Setup render parameters. */

   if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
      position = mrt_resource->u.mem.offset_in_dwords;
   } else {
      assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REGISTER);
      assert(mrt_resource->u.reg.offset == 0);

      position = mrt_resource->u.reg.out_reg;
   }

   assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));

   switch (position) {
   case 0:
   case 4:
      render_params.source_start = PVR_PBE_STARTPOS_BIT0;
      break;
   case 1:
   case 5:
      render_params.source_start = PVR_PBE_STARTPOS_BIT32;
      break;
   case 2:
   case 6:
      render_params.source_start = PVR_PBE_STARTPOS_BIT64;
      break;
   case 3:
   case 7:
      render_params.source_start = PVR_PBE_STARTPOS_BIT96;
      break;
   default:
      assert(!"Invalid output register");
      break;
   }

   render_params.min_x_clip = MAX2(0, render_area->offset.x);
   render_params.min_y_clip = MAX2(0, render_area->offset.y);
   render_params.max_x_clip =
      MIN2(framebuffer->width,
           render_area->offset.x + render_area->extent.width) -
      1;
   render_params.max_y_clip =
      MIN2(framebuffer->height,
           render_area->offset.y + render_area->extent.height) -
      1;

   render_params.slice = 0;
   render_params.mrt_index = mrt_index;

   pvr_pbe_pack_state(dev_info,
                      &surface_params,
                      &render_params,
                      pbe_cs_words,
                      pbe_reg_words);
}

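/* The framebuffer stores one render target per supported sample count,
 * indexed by log2(samples), so sample counts of 1/2/4/8 map to indices 0
 * through 3.
 */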
static struct pvr_render_target *
pvr_get_render_target(const struct pvr_render_pass *pass,
                      const struct pvr_framebuffer *framebuffer,
                      uint32_t idx)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   uint32_t rt_idx = 0;

   switch (hw_render->sample_count) {
   case 1:
   case 2:
   case 4:
   case 8:
      rt_idx = util_logbase2(hw_render->sample_count);
      break;

   default:
      unreachable("Unsupported sample count");
      break;
   }

   return &framebuffer->render_targets[rt_idx];
}

static uint32_t
pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
                                uint32_t idx,
                                const struct pvr_device_info *dev_info)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   /* Default value based on the maximum value found in all existing cores. The
    * maximum is used as this is being treated as a lower bound, making it a
    * "safer" choice than the minimum value found in all existing cores.
    */
   const uint32_t min_output_regs =
      PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
   const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);

   return util_next_power_of_two(width);
}

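/* Fill out the render job from the sub-command and render pass state: PBE
 * words for each end-of-tile surface, the pixel event and (optional) load op
 * PDS programs, the depth bias/scissor table addresses and the depth/stencil
 * surface parameters.
 */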
static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
                                         struct pvr_cmd_buffer *cmd_buffer,
                                         struct pvr_sub_cmd_gfx *sub_cmd)
{
   struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
   struct pvr_render_job *job = &sub_cmd->job;
   struct pvr_pds_upload pds_pixel_event_program;

   uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
                        [ROGUE_NUM_PBESTATE_STATE_WORDS];
   struct pvr_render_target *render_target;
   VkResult result;

   assert(hw_render->eot_surface_count < ARRAY_SIZE(pbe_cs_words));

   for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
      const struct pvr_renderpass_hwsetup_eot_surface *surface =
         &hw_render->eot_surfaces[i];
      const struct pvr_image_view *iview =
         render_pass_info->attachments[surface->attachment_index];
      const struct usc_mrt_resource *mrt_resource =
         &hw_render->eot_setup.mrt_resources[surface->mrt_index];
      uint32_t samples = 1;

      if (surface->need_resolve)
         pvr_finishme("Set up job resolve information.");

      pvr_setup_pbe_state(dev_info,
                          render_pass_info->framebuffer,
                          surface->mrt_index,
                          mrt_resource,
                          iview,
                          &render_pass_info->render_area,
                          surface->need_resolve,
                          samples,
                          pbe_cs_words[i],
                          job->pbe_reg_words[i]);
   }

   /* FIXME: The fragment program only supports a single surface at present. */
   assert(hw_render->eot_surface_count == 1);
   result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
      cmd_buffer,
      pbe_cs_words[0],
      &pds_pixel_event_program);
   if (result != VK_SUCCESS)
      return result;

   job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;

   /* FIXME: Don't do this if there is a barrier load. */
   if (render_pass_info->enable_bg_tag) {
      const struct pvr_load_op *load_op = hw_render->client_data;
      struct pvr_pds_upload load_op_program;

      /* FIXME: Should we free the PDS pixel event data or let it be freed
       * when the pool gets emptied?
       */
      result = pvr_load_op_data_create_and_upload(cmd_buffer,
                                                  sub_cmd->hw_render_idx,
                                                  &load_op_program);
      if (result != VK_SUCCESS)
         return result;

      pvr_pds_bgnd_pack_state(load_op,
                              &load_op_program,
                              job->pds_bgnd_reg_values);
   }

   job->enable_bg_tag = render_pass_info->enable_bg_tag;
   job->process_empty_tiles = render_pass_info->process_empty_tiles;

   render_target = pvr_get_render_target(render_pass_info->pass,
                                         render_pass_info->framebuffer,
                                         sub_cmd->hw_render_idx);
   job->rt_dataset = render_target->rt_dataset;

   job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   job->border_colour_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->depth_bias_bo)
      job->depth_bias_table_addr = sub_cmd->depth_bias_bo->vma->dev_addr;
   else
      job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->scissor_bo)
      job->scissor_table_addr = sub_cmd->scissor_bo->vma->dev_addr;
   else
      job->scissor_table_addr = PVR_DEV_ADDR_INVALID;

   job->pixel_output_width =
      pvr_pass_get_pixel_output_width(render_pass_info->pass,
                                      sub_cmd->hw_render_idx,
                                      dev_info);

   if (hw_render->ds_surface_id != -1) {
      struct pvr_image_view *iview =
         render_pass_info->attachments[hw_render->ds_surface_id];
      const struct pvr_image *image = iview->image;

      if (vk_format_has_depth(image->vk.format)) {
         uint32_t level_pitch =
            image->mip_levels[iview->vk.base_mip_level].pitch;

         /* FIXME: Is this sufficient for depth buffers? */
         job->depth_addr = image->dev_addr;

         job->depth_stride =
            pvr_stride_from_pitch(level_pitch, iview->vk.format);
         job->depth_height = iview->vk.extent.height;
         job->depth_physical_width =
            u_minify(image->physical_extent.width, iview->vk.base_mip_level);
         job->depth_physical_height =
            u_minify(image->physical_extent.height, iview->vk.base_mip_level);
         job->depth_layer_size = image->layer_size;

         if (hw_render->ds_surface_id < render_pass_info->clear_value_count) {
            VkClearValue *clear_values =
               &render_pass_info->clear_values[hw_render->ds_surface_id];

            job->depth_clear_value = clear_values->depthStencil.depth;
         } else {
            job->depth_clear_value = 1.0f;
         }

         job->depth_vk_format = iview->vk.format;

         job->depth_memlayout = image->memlayout;
      } else {
         job->depth_addr = PVR_DEV_ADDR_INVALID;
         job->depth_stride = 0;
         job->depth_height = 0;
         job->depth_physical_width = 0;
         job->depth_physical_height = 0;
         job->depth_layer_size = 0;
         job->depth_clear_value = 1.0f;
         job->depth_vk_format = VK_FORMAT_UNDEFINED;
         job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;
      }

      if (vk_format_has_stencil(image->vk.format)) {
         /* FIXME: Is this sufficient for stencil buffers? */
         job->stencil_addr = image->dev_addr;
      } else {
         job->stencil_addr = PVR_DEV_ADDR_INVALID;
      }

      job->samples = image->vk.samples;
   } else {
      pvr_finishme("Set up correct number of samples for render job");

      job->depth_addr = PVR_DEV_ADDR_INVALID;
      job->depth_stride = 0;
      job->depth_height = 0;
      job->depth_physical_width = 0;
      job->depth_physical_height = 0;
      job->depth_layer_size = 0;
      job->depth_clear_value = 1.0f;
      job->depth_vk_format = VK_FORMAT_UNDEFINED;
      job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;

      job->stencil_addr = PVR_DEV_ADDR_INVALID;

      job->samples = 1;
   }

   if (sub_cmd->max_tiles_in_flight ==
       PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
      /* Use the default limit based on the partition store. */
      job->max_tiles_in_flight = 0U;
   } else {
      job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
   }

   job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
   job->disable_compute_overlap = false;
   job->max_shared_registers = cmd_buffer->state.max_shared_regs;
   job->run_frag = true;
   job->geometry_terminate = true;

   return VK_SUCCESS;
}

/* Number of shareds used in the Issue Data Fence (IDF)/Wait Data Fence (WDF)
 * kernel.
 */
#define PVR_IDF_WDF_IN_REGISTER_CONST_COUNT 12U

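/* Fill out the compute job submit info: the control stream base address,
 * border color table, the shared register count to preserve on context
 * switch, and the cluster/core restrictions needed when atomic ops are in
 * use.
 */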
static void
pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_compute *sub_cmd)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;

   if (sub_cmd->uses_barrier)
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_PREVENT_ALL_OVERLAP;

   pvr_csb_pack (&sub_cmd->submit_info.regs.cdm_ctrl_stream_base,
                 CR_CDM_CTRL_STREAM_BASE,
                 value) {
      value.addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
   }

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_CDM,
                 value) {
      value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
   }

   sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
                                   cmd_buffer->state.max_shared_regs);

   cmd_buffer->state.max_shared_regs = 0U;

   if (PVR_HAS_FEATURE(dev_info, compute_morton_capable))
      sub_cmd->submit_info.regs.cdm_item = 0;

   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
       PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
       dev_runtime_info->num_phantoms > 1 && sub_cmd->uses_atomic_ops) {
      /* Each phantom has its own MCU, so atomicity can only be guaranteed
       * when all work items are processed on the same phantom. This means we
       * need to disable all USCs other than those of the first phantom, which
       * has 4 clusters.
       */
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0xFU;
      }
   } else {
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0U;
      }
   }

   if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support) &&
       sub_cmd->uses_atomic_ops) {
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_SINGLE_CORE;
   }
}

#define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
   (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))

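/* Calculate how many USC instances can run in a single compute slot, given
 * the number of coefficient (local memory) registers each work-group needs,
 * whether a barrier is used and the total number of work-items per
 * work-group.
 */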
static uint32_t
pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
                           uint32_t coeff_regs_count,
                           bool use_barrier,
                           uint32_t total_workitems)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;
   uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
   uint32_t max_avail_coeff_regs =
      dev_runtime_info->cdm_max_local_mem_size_regs;
   uint32_t localstore_chunks_count =
      DIV_ROUND_UP(coeff_regs_count << 2,
                   PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

   /* Ensure that we cannot have more work-groups in a slot than the available
    * number of coefficient registers allows.
    */
   if (coeff_regs_count > 0U) {
      /* If TA or 3D can overlap with CDM, or if the TA is running a geometry
       * shader, then we need to consider this when calculating the maximum
       * allowed number of work-groups.
       */
      if (PVR_HAS_QUIRK(dev_info, 52354) &&
          (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
           PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
         /* Solve for n (number of work-groups per task). All values are in
          * size of common store alloc blocks:
          *
          * n + (2n + 7) * (local_memory_size_max - 1) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          * ==>
          * n + 2n * (local_memory_size_max - 1) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *    - (7 * (local_memory_size_max - 1))
          * ==>
          * n * (1 + 2 * (local_memory_size_max - 1)) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *    - (7 * (local_memory_size_max - 1))
          * ==>
          * n = ((coefficient_memory_pool_size) -
          *    (7 * pixel_allocation_size_max) -
          *    (7 * (local_memory_size_max - 1))) /
          *    (1 + 2 * (local_memory_size_max - 1))
          */
         uint32_t max_common_store_blocks =
            DIV_ROUND_UP(max_avail_coeff_regs * 4U,
                         PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

         /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          */
         max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                    PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;

         /* - (7 * (local_memory_size_max - 1)) */
         max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                     (localstore_chunks_count - 1U));

         /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
         max_workgroups_per_task = max_common_store_blocks /
                                   (1U + 2U * (localstore_chunks_count - 1U));

         max_workgroups_per_task =
            MIN2(max_workgroups_per_task,
                 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);

      } else {
         max_workgroups_per_task =
            MIN2((max_avail_coeff_regs / coeff_regs_count),
                 max_workgroups_per_task);
      }
   }

   /* max_workgroups_per_task should at least be one. */
   assert(max_workgroups_per_task >= 1U);

   if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
      /* In this case, the work-group size will have been padded up to the
       * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
       * ROGUE_MAX_INSTANCES_PER_TASK.
       */
      return ROGUE_MAX_INSTANCES_PER_TASK;
   }

   /* In this case, the number of instances in the slot must be clamped to
    * accommodate whole work-groups only.
    */
   if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
      max_workgroups_per_task =
         MIN2(max_workgroups_per_task,
              ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
      return total_workitems * max_workgroups_per_task;
   }

   return MIN2(total_workitems * max_workgroups_per_task,
               ROGUE_MAX_INSTANCES_PER_TASK);
}

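/* Emit the CDMCTRL kernel words for a compute kernel. Kernels 0-2 describe
 * the USC and PDS programs, kernels 3-5 hold the direct work-group counts
 * (replaced by the indirect buffer address in kernels 6-7 for indirect
 * dispatches) and kernel 8 holds the work-group size and instance limit.
 */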
static void
pvr_compute_generate_control_stream(struct pvr_csb *csb,
                                    struct pvr_sub_cmd_compute *sub_cmd,
                                    const struct pvr_compute_kernel_info *info)
{
   /* Compute kernel 0. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
      kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
      kernel0.global_offsets_present = info->global_offsets_present;
      kernel0.usc_common_size = info->usc_common_size;
      kernel0.usc_unified_size = info->usc_unified_size;
      kernel0.pds_temp_size = info->pds_temp_size;
      kernel0.pds_data_size = info->pds_data_size;
      kernel0.usc_target = info->usc_target;
      kernel0.fence = info->is_fence;
   }

   /* Compute kernel 1. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
      kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
      kernel1.sd_type = info->sd_type;
      kernel1.usc_common_shared = info->usc_common_shared;
   }

   /* Compute kernel 2. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
      kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
   }

   if (info->indirect_buffer_addr.addr) {
      /* Compute kernel 6. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
         kernel6.indirect_addrmsb = info->indirect_buffer_addr;
      }

      /* Compute kernel 7. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
         kernel7.indirect_addrlsb = info->indirect_buffer_addr;
      }
   } else {
      /* Compute kernel 3. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
         assert(info->global_size[0U] > 0U);
         kernel3.workgroup_x = info->global_size[0U] - 1U;
      }

      /* Compute kernel 4. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
         assert(info->global_size[1U] > 0U);
         kernel4.workgroup_y = info->global_size[1U] - 1U;
      }

      /* Compute kernel 5. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
         assert(info->global_size[2U] > 0U);
         kernel5.workgroup_z = info->global_size[2U] - 1U;
      }
   }

   /* Compute kernel 8. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
      if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
         kernel8.max_instances = 0U;
      else
         kernel8.max_instances = info->max_instances;

      assert(info->local_size[0U] > 0U);
      kernel8.workgroup_size_x = info->local_size[0U] - 1U;
      assert(info->local_size[1U] > 0U);
      kernel8.workgroup_size_y = info->local_size[1U] - 1U;
      assert(info->local_size[2U] > 0U);
      kernel8.workgroup_size_z = info->local_size[2U] - 1U;
   }

   /* Track the highest shared register usage in this dispatch. This is used
    * by the FW for context switching, so it must be large enough to contain
    * all the shared registers that might be in use for this compute job.
    * Coefficients don't need to be included as the context switch will not
    * happen within the execution of a single work-group, thus nothing needs
    * to be preserved.
    */
   if (info->usc_common_shared) {
      sub_cmd->num_shared_regs =
         MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
   }
}

/* TODO: This can be pre-packed and uploaded directly. Would that provide any
 * speed up?
 */
static void
pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
                            struct pvr_sub_cmd_compute *const sub_cmd)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   bool *const is_sw_barrier_required =
      &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
   struct pvr_csb *csb = &sub_cmd->control_stream;
   const struct pvr_pds_upload *program;

   if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
       *is_sw_barrier_required) {
      *is_sw_barrier_required = false;
      program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
   } else {
      program = &cmd_buffer->device->idfwdf_state.pds;
   }

   struct pvr_compute_kernel_info info = {
      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
      .global_offsets_present = false,
      .usc_common_size =
         DIV_ROUND_UP(cmd_buffer->device->idfwdf_state.usc_shareds << 2,
                      PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
      .usc_unified_size = 0U,
      .pds_temp_size = 0U,
      .pds_data_size =
         DIV_ROUND_UP(program->data_size << 2,
                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
      .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
      .is_fence = false,
      .pds_data_offset = program->data_offset,
      .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
      .usc_common_shared = true,
      .pds_code_offset = program->code_offset,
      .global_size = { 1U, 1U, 1U },
      .local_size = { 1U, 1U, 1U },
   };

   /* We don't need to pad the work-group size for this case. */

   info.max_instances =
      pvr_compute_flat_slot_size(pdevice,
                                 cmd_buffer->device->idfwdf_state.usc_shareds,
                                 false,
                                 1U);

   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}

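/* Emit a fence kernel into the control stream. When deallocate_shareds is
 * set, usc_common_shared is enabled with a zero common size, which (as we
 * understand it) releases the shared register allocation of the preceding
 * kernels.
 */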
static void
pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
                           struct pvr_sub_cmd_compute *const sub_cmd,
                           bool deallocate_shareds)
{
   const struct pvr_pds_upload *program =
      &cmd_buffer->device->pds_compute_fence_program;
   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
   struct pvr_csb *csb = &sub_cmd->control_stream;

   struct pvr_compute_kernel_info info = {
      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
      .global_offsets_present = false,
      .usc_common_size = 0U,
      .usc_unified_size = 0U,
      .pds_temp_size = 0U,
      .pds_data_size =
         DIV_ROUND_UP(program->data_size << 2,
                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
      .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
      .is_fence = true,
      .pds_data_offset = program->data_offset,
      .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
      .usc_common_shared = deallocate_shareds,
      .pds_code_offset = program->code_offset,
      .global_size = { 1U, 1U, 1U },
      .local_size = { 1U, 1U, 1U },
   };

   /* We don't need to pad the work-group size for this case. */
   /* Here we calculate the slot size. This can depend on the use of barriers,
    * local memory, BRNs, or other factors.
    */
   info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);

   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}

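/* Finish the current sub-command: emit any pending state and a terminate
 * (or a return, for secondary command buffers) into its control stream,
 * then initialize the job that will later be submitted for it.
 */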
static VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
   struct pvr_device *device = cmd_buffer->device;
   VkResult result;

   /* FIXME: Is this NULL check required because this function is called from
    * pvr_resolve_unemitted_resolve_attachments()? See comment about this
    * function being called twice in a row in pvr_CmdEndRenderPass().
    */
   if (!sub_cmd)
      return VK_SUCCESS;

   switch (sub_cmd->type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS: {
      struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;

      if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
         result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
         if (result != VK_SUCCESS) {
            state->status = result;
            return result;
         }

         break;
      }

      /* TODO: Check if the sub_cmd can be skipped based on
       * sub_cmd->gfx.empty_cmd flag.
       */

      result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
                                        cmd_buffer,
                                        gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      break;
   }

   case PVR_SUB_CMD_TYPE_COMPUTE: {
      struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;

      pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);

      result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      pvr_sub_cmd_compute_job_init(device->pdevice,
                                   cmd_buffer,
                                   compute_sub_cmd);
      break;
   }

   case PVR_SUB_CMD_TYPE_TRANSFER:
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
      break;
   }

   state->current_sub_cmd = NULL;

   return VK_SUCCESS;
}

1508static void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer_state *state,
1509                                           bool start_geom)
1510{
1511   if (start_geom) {
1512      /*
       * Initial geometry phase state.
       * It's the driver's responsibility to ensure that the state of the
       * hardware is correctly initialized at the start of every geometry
       * phase. This is required to prevent stale state from a previous
       * geometry phase erroneously affecting the next geometry phase. The
       * following fields in PPP State Header, and their corresponding state
       * words, must be supplied in the first PPP State Update of a geometry
       * phase that contains any geometry (draw calls). Any field not listed
       * below is safe to ignore.
       *
       *    TA_PRES_STREAM_OUT_SIZE
       *    TA_PRES_PPPCTRL
       *    TA_PRES_VARYING_WORD2
       *    TA_PRES_VARYING_WORD1
       *    TA_PRES_VARYING_WORD0
       *    TA_PRES_OUTSELECTS
       *    TA_PRES_WCLAMP
       *    TA_VIEWPORT_COUNT
       *    TA_PRES_VIEWPORT
       *    TA_PRES_REGION_CLIP
       *    TA_PRES_PDSSTATEPTR0
       *    TA_PRES_ISPCTLFB
       *    TA_PRES_ISPCTLFA
       *    TA_PRES_ISPCTL
       *
       * If a geometry phase does not contain any geometry, this restriction
       * can be ignored. If the first draw call in a geometry phase will only
       * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
       * in the ISP State Control Word, the PDS State Pointers
       * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
       * be supplied, since they will never reach the PDS in the fragment
       * phase.
       */

      state->emit_state_bits = 0;

      state->emit_state.stream_out = true;
      state->emit_state.ppp_control = true;
      state->emit_state.varying_word2 = true;
      state->emit_state.varying_word1 = true;
      state->emit_state.varying_word0 = true;
      state->emit_state.output_selects = true;
      state->emit_state.wclamp = true;
      state->emit_state.viewport = true;
      state->emit_state.region_clip = true;
      state->emit_state.pds_fragment_stateptr0 = true;
      state->emit_state.isp_fb = true;
      state->emit_state.isp = true;
   } else {
      state->emit_state.ppp_control = true;
      state->emit_state.varying_word1 = true;
      state->emit_state.varying_word0 = true;
      state->emit_state.output_selects = true;
      state->emit_state.viewport = true;
      state->emit_state.region_clip = true;
      state->emit_state.pds_fragment_stateptr0 = true;
      state->emit_state.isp_fb = true;
      state->emit_state.isp = true;
   }

   memset(&state->ppp_state, 0U, sizeof(state->ppp_state));

   state->dirty.vertex_bindings = true;
   state->dirty.gfx_pipeline_binding = true;
   state->dirty.viewport = true;
}

static VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                             enum pvr_sub_cmd_type type)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_device *device = cmd_buffer->device;
   struct pvr_sub_cmd *sub_cmd;
   VkResult result;

   /* Check the current status of the buffer. */
   if (state->status != VK_SUCCESS)
      return state->status;

   pvr_cmd_buffer_update_barriers(cmd_buffer, type);

   if (state->current_sub_cmd) {
      if (state->current_sub_cmd->type == type) {
         /* Continue adding to the current sub command. */
         return VK_SUCCESS;
      }

      /* End the current sub command. */
      result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
                       sizeof(*sub_cmd),
                       8,
                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!sub_cmd) {
      state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
      return state->status;
   }

   sub_cmd->type = type;

   switch (type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
      sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
      sub_cmd->gfx.modifies_depth = false;
      sub_cmd->gfx.modifies_stencil = false;
      sub_cmd->gfx.max_tiles_in_flight =
         PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
                               isp_max_tiles_in_flight,
                               1);
      sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
      sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
      sub_cmd->gfx.empty_cmd = true;

      pvr_reset_graphics_dirty_state(state, true);
      pvr_csb_init(device,
                   PVR_CMD_STREAM_TYPE_GRAPHICS,
                   &sub_cmd->gfx.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      pvr_csb_init(device,
                   PVR_CMD_STREAM_TYPE_COMPUTE,
                   &sub_cmd->compute.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      list_inithead(&sub_cmd->transfer.transfer_cmds);
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", type);
      break;
   }

   list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
   state->current_sub_cmd = sub_cmd;

   return VK_SUCCESS;
}

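/* Allocates a device buffer aligned to the SLC cache line size and tracks it
 * on the command buffer's bo_list so it is freed when the command buffer is
 * reset or destroyed. On failure the error is latched into
 * cmd_buffer->state.status so that it is reported by vkEndCommandBuffer().
 */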
VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
                                  struct pvr_winsys_heap *heap,
                                  uint64_t size,
                                  uint32_t flags,
                                  struct pvr_bo **const pvr_bo_out)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_bo_alloc(cmd_buffer->device,
                         heap,
                         size,
                         cache_line_size,
                         flags,
                         &pvr_bo);
   if (result != VK_SUCCESS) {
      cmd_buffer->state.status = result;
      return result;
   }

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

VkResult pvr_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                                VkCommandBufferResetFlags flags)
{
   assert(!"Unimplemented");
   return VK_SUCCESS;
}

static void pvr_cmd_bind_compute_pipeline(
   const struct pvr_compute_pipeline *const compute_pipeline,
   struct pvr_cmd_buffer *const cmd_buffer)
{
   cmd_buffer->state.compute_pipeline = compute_pipeline;
   cmd_buffer->state.dirty.compute_pipeline_binding = true;
}

static void pvr_cmd_bind_graphics_pipeline(
   const struct pvr_graphics_pipeline *const gfx_pipeline,
   struct pvr_cmd_buffer *const cmd_buffer)
{
   struct pvr_dynamic_state *const dest_state =
      &cmd_buffer->state.dynamic.common;
   const struct pvr_dynamic_state *const src_state =
      &gfx_pipeline->dynamic_state;
   struct pvr_cmd_buffer_state *const cmd_buffer_state = &cmd_buffer->state;
   const uint32_t state_mask = src_state->mask;

   cmd_buffer_state->gfx_pipeline = gfx_pipeline;
   cmd_buffer_state->dirty.gfx_pipeline_binding = true;

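   /* For each state the pipeline does not declare as dynamic (i.e. its bit is
    * clear in state_mask), copy the value baked into the pipeline over the
    * command buffer's dynamic state and mark it dirty; dynamic states keep
    * whatever the vkCmdSet*() entry points last programmed.
    */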
   /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_VIEWPORT. */
   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_VIEWPORT)) {
      assert(!"Unimplemented");
   }

   /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_SCISSOR. */
   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_SCISSOR)) {
      assert(!"Unimplemented");
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) {
      dest_state->line_width = src_state->line_width;

      cmd_buffer_state->dirty.line_width = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
      memcpy(&dest_state->depth_bias,
             &src_state->depth_bias,
             sizeof(src_state->depth_bias));

      cmd_buffer_state->dirty.depth_bias = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
      STATIC_ASSERT(
         __same_type(dest_state->blend_constants, src_state->blend_constants));

      typed_memcpy(dest_state->blend_constants,
                   src_state->blend_constants,
                   ARRAY_SIZE(dest_state->blend_constants));

      cmd_buffer_state->dirty.blend_constants = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
      dest_state->compare_mask.front = src_state->compare_mask.front;
      dest_state->compare_mask.back = src_state->compare_mask.back;

      cmd_buffer_state->dirty.compare_mask = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
      dest_state->write_mask.front = src_state->write_mask.front;
      dest_state->write_mask.back = src_state->write_mask.back;

      cmd_buffer_state->dirty.write_mask = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
      dest_state->reference.front = src_state->reference.front;
      dest_state->reference.back = src_state->reference.back;

      cmd_buffer_state->dirty.reference = true;
   }
}

void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipeline _pipeline)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
                                    cmd_buffer);
      break;

   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
                                     cmd_buffer);
      break;

   default:
      unreachable("Invalid bind point.");
      break;
   }
}

#if defined(DEBUG)
static void check_viewport_quirk_70165(const struct pvr_device *device,
                                       const VkViewport *pViewport)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
   float min_screen_space_value, max_screen_space_value;
   float sign_to_unsigned_offset, fixed_point_max;
   float guardband_width, guardband_height;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      /* Max representable value in 13.4 fixed point format.
       * Rounded down to avoid precision issues.
       * Calculated as (2 ** 13) - 2 * (2 ** -4).
       */
      fixed_point_max = 8192.0f - 2.0f / 16.0f;

      if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
         if (pViewport->width <= 4096 && pViewport->height <= 4096) {
            guardband_width = pViewport->width / 4.0f;
            guardband_height = pViewport->height / 4.0f;

            /* 2k of the range is negative */
            sign_to_unsigned_offset = 2048.0f;
         } else {
            guardband_width = 0.0f;
            guardband_height = 0.0f;

            /* For > 4k renders, the entire range is positive */
            sign_to_unsigned_offset = 0.0f;
         }
      } else {
         guardband_width = pViewport->width / 4.0f;
         guardband_height = pViewport->height / 4.0f;

         /* 2k of the range is negative */
         sign_to_unsigned_offset = 2048.0f;
      }
   } else {
      /* Max representable value in 16.8 fixed point format.
       * Calculated as (2 ** 16) - (2 ** -8).
       */
      fixed_point_max = 65535.99609375f;
      guardband_width = pViewport->width / 4.0f;
      guardband_height = pViewport->height / 4.0f;

      /* 4k/20k of the range is negative */
      sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
   }

   min_screen_space_value = -sign_to_unsigned_offset;
   max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;

   min_vertex_x = pViewport->x - guardband_width;
   max_vertex_x = pViewport->x + pViewport->width + guardband_width;
   min_vertex_y = pViewport->y - guardband_height;
   max_vertex_y = pViewport->y + pViewport->height + guardband_height;
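   /* Illustrative example (assuming PVR_MAX_NEG_OFFSCREEN_OFFSET == 4096): a
    * 1920x1080 viewport at (0, 0) on the 16.8 path gets a 480x270 guardband,
    * so its vertex range of [-480, 2400] x [-270, 1350] sits comfortably
    * inside the representable [-4096, 61439.996] range and no warning is
    * logged.
    */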
   if (min_vertex_x < min_screen_space_value ||
       max_vertex_x > max_screen_space_value ||
       min_vertex_y < min_screen_space_value ||
       max_vertex_y > max_screen_space_value) {
      mesa_logw("Viewport is affected by BRN70165, geometry outside "
                "the viewport could be corrupted");
   }
}
#endif

void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
                        uint32_t firstViewport,
                        uint32_t viewportCount,
                        const VkViewport *pViewports)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   const uint32_t total_count = firstViewport + viewportCount;
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
   assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

#if defined(DEBUG)
   if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
      for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
         check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
      }
   }
#endif

   if (state->dynamic.common.viewport.count < total_count)
      state->dynamic.common.viewport.count = total_count;

   memcpy(&state->dynamic.common.viewport.viewports[firstViewport],
          pViewports,
          viewportCount * sizeof(*pViewports));

   state->dirty.viewport = true;
}

void pvr_CmdSetScissor(VkCommandBuffer commandBuffer,
                       uint32_t firstScissor,
                       uint32_t scissorCount,
                       const VkRect2D *pScissors)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   const uint32_t total_count = firstScissor + scissorCount;
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   assert(firstScissor < PVR_MAX_VIEWPORTS && scissorCount > 0);
   assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   if (state->dynamic.common.scissor.count < total_count)
      state->dynamic.common.scissor.count = total_count;

   memcpy(&state->dynamic.common.scissor.scissors[firstScissor],
          pScissors,
          scissorCount * sizeof(*pScissors));

   state->dirty.scissor = true;
}

void pvr_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   state->dynamic.common.line_width = lineWidth;
   state->dirty.line_width = true;
}

void pvr_CmdSetDepthBias(VkCommandBuffer commandBuffer,
                         float depthBiasConstantFactor,
                         float depthBiasClamp,
                         float depthBiasSlopeFactor)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   state->dynamic.common.depth_bias.constant_factor = depthBiasConstantFactor;
   state->dynamic.common.depth_bias.clamp = depthBiasClamp;
   state->dynamic.common.depth_bias.slope_factor = depthBiasSlopeFactor;
   state->dirty.depth_bias = true;
}

void pvr_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
                              const float blendConstants[4])
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   STATIC_ASSERT(ARRAY_SIZE(state->dynamic.common.blend_constants) == 4);
   memcpy(state->dynamic.common.blend_constants,
          blendConstants,
          sizeof(state->dynamic.common.blend_constants));

   state->dirty.blend_constants = true;
}

void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                           float minDepthBounds,
                           float maxDepthBounds)
{
   mesa_logd("No support for depth bounds testing.");
}

void pvr_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
                                  VkStencilFaceFlags faceMask,
                                  uint32_t compareMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.compare_mask.front = compareMask;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.compare_mask.back = compareMask;

   state->dirty.compare_mask = true;
}

void pvr_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
                                VkStencilFaceFlags faceMask,
                                uint32_t writeMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.write_mask.front = writeMask;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.write_mask.back = writeMask;

   state->dirty.write_mask = true;
}

void pvr_CmdSetStencilReference(VkCommandBuffer commandBuffer,
                                VkStencilFaceFlags faceMask,
                                uint32_t reference)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.reference.front = reference;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.reference.back = reference;

   state->dirty.reference = true;
}

void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                               VkPipelineBindPoint pipelineBindPoint,
                               VkPipelineLayout _layout,
                               uint32_t firstSet,
                               uint32_t descriptorSetCount,
                               const VkDescriptorSet *pDescriptorSets,
                               uint32_t dynamicOffsetCount,
                               const uint32_t *pDynamicOffsets)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_descriptor_state *descriptor_state;

   assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      descriptor_state = &cmd_buffer->state.gfx_desc_state;
      cmd_buffer->state.dirty.gfx_desc_dirty = true;
      break;

   case VK_PIPELINE_BIND_POINT_COMPUTE:
      descriptor_state = &cmd_buffer->state.compute_desc_state;
      cmd_buffer->state.dirty.compute_desc_dirty = true;
      break;

   default:
      unreachable("Unsupported bind point.");
      break;
   }

   for (uint32_t i = 0; i < descriptorSetCount; i++) {
      PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
      uint32_t index = firstSet + i;

      if (descriptor_state->descriptor_sets[index] != set) {
         descriptor_state->descriptor_sets[index] = set;
         descriptor_state->valid_mask |= (1u << index);
      }
   }
}

void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                              uint32_t firstBinding,
                              uint32_t bindingCount,
                              const VkBuffer *pBuffers,
                              const VkDeviceSize *pOffsets)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;

   /* We have to defer setting up the vertex buffers since we need the buffer
    * stride from the pipeline.
    */

   assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
          bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   for (uint32_t i = 0; i < bindingCount; i++) {
      vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
      vb[firstBinding + i].offset = pOffsets[i];
   }

   cmd_buffer->state.dirty.vertex_bindings = true;
}

void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                            VkBuffer buffer,
                            VkDeviceSize offset,
                            VkIndexType indexType)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   assert(offset < index_buffer->vk.size);
   assert(indexType == VK_INDEX_TYPE_UINT32 ||
          indexType == VK_INDEX_TYPE_UINT16);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   state->index_buffer_binding.buffer = index_buffer;
   state->index_buffer_binding.offset = offset;
   state->index_buffer_binding.type = indexType;
   state->dirty.index_buffer_binding = true;
}

void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
                          VkPipelineLayout layout,
                          VkShaderStageFlags stageFlags,
                          uint32_t offset,
                          uint32_t size,
                          const void *pValues)
{
#if defined(DEBUG)
   const uint64_t ending = (uint64_t)offset + (uint64_t)size;
#endif

   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);

   memcpy(&state->push_constants.data[offset], pValues, size);

   state->push_constants.dirty_stages |= stageFlags;
}

static VkResult
pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
                                 const struct pvr_render_pass *pass,
                                 const struct pvr_framebuffer *framebuffer)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_render_pass_info *info = &state->render_pass_info;

   assert(pass->attachment_count == framebuffer->attachment_count);

   /* Free any previously allocated attachments. */
   vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);

   if (pass->attachment_count == 0) {
      info->attachments = NULL;
      return VK_SUCCESS;
   }

   info->attachments =
      vk_zalloc(&cmd_buffer->vk.pool->alloc,
                pass->attachment_count * sizeof(*info->attachments),
                8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!info->attachments) {
      /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
      state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
      return state->status;
   }

   if (framebuffer) {
      for (uint32_t i = 0; i < pass->attachment_count; i++)
         info->attachments[i] = framebuffer->attachments[i];
   }

   return VK_SUCCESS;
}

static VkResult pvr_init_render_targets(struct pvr_device *device,
                                        struct pvr_render_pass *pass,
                                        struct pvr_framebuffer *framebuffer)
{
   for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
      struct pvr_render_target *render_target =
         pvr_get_render_target(pass, framebuffer, i);

      pthread_mutex_lock(&render_target->mutex);

      if (!render_target->valid) {
         const struct pvr_renderpass_hwsetup_render *hw_render =
            &pass->hw_setup->renders[i];
         VkResult result;

         result = pvr_render_target_dataset_create(device,
                                                   framebuffer->width,
                                                   framebuffer->height,
                                                   hw_render->sample_count,
                                                   framebuffer->layers,
                                                   &render_target->rt_dataset);
         if (result != VK_SUCCESS) {
            pthread_mutex_unlock(&render_target->mutex);
            return result;
         }

         render_target->valid = true;
      }

      pthread_mutex_unlock(&render_target->mutex);
   }

   return VK_SUCCESS;
}

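/* Translates a render pass subpass index into the HW subpass it was assigned
 * to, via the render/subpass mapping built when the render pass was compiled
 * into its HW setup.
 */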
static const struct pvr_renderpass_hwsetup_subpass *
pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
{
   const struct pvr_renderpass_hw_map *map =
      &pass->hw_setup->subpass_map[subpass];

   return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
}

static void pvr_perform_start_of_render_attachment_clear(
   struct pvr_cmd_buffer *cmd_buffer,
   const struct pvr_framebuffer *framebuffer,
   uint32_t index,
   bool is_depth_stencil,
   uint32_t *index_list_clear_mask)
{
   struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
   const struct pvr_render_pass *pass = info->pass;
   const struct pvr_renderpass_hwsetup_render *hw_render;
   const struct pvr_renderpass_hwsetup *hw_setup;
   struct pvr_image_view *iview;
   uint32_t view_idx;
   uint32_t height;
   uint32_t width;

   hw_setup = pass->hw_setup;
   hw_render =
      &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];

   if (is_depth_stencil) {
      bool stencil_clear;
      bool depth_clear;
      bool is_stencil;
      bool is_depth;

      assert(hw_render->ds_surface_id != -1);
      assert(index == 0);

      view_idx = hw_render->ds_surface_id;

      is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
      is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
      depth_clear = hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR;
      stencil_clear = hw_render->stencil_init ==
                      RENDERPASS_SURFACE_INITOP_CLEAR;

      /* Attempt to clear the ds attachment. Do not erroneously discard an
       * attachment that has no depth clear but has a stencil attachment.
       */
      /* I.e. bail out unless (is_depth && depth_clear) ||
       * (is_stencil && stencil_clear).
       */
      if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
         return;
   } else if (hw_render->color_init[index].op !=
              RENDERPASS_SURFACE_INITOP_CLEAR) {
      return;
   } else {
      view_idx = hw_render->color_init[index].driver_id;
   }

   iview = info->attachments[view_idx];
   width = iview->vk.extent.width;
   height = iview->vk.extent.height;

   /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
    * were doing the same check (even if it's just an assert) to determine if a
    * clear is needed.
    */
   /* If this is single-layer fullscreen, we already do the clears in
    * pvr_sub_cmd_gfx_job_init().
    */
   if (info->render_area.offset.x == 0 && info->render_area.offset.y == 0 &&
       info->render_area.extent.width == width &&
       info->render_area.extent.height == height && framebuffer->layers == 1) {
      return;
   }

   pvr_finishme("Unimplemented path!");
}

static void
pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
{
   struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
   const struct pvr_framebuffer *framebuffer = info->framebuffer;
   const struct pvr_render_pass *pass = info->pass;
   const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
   const struct pvr_renderpass_hwsetup_render *hw_render;

   /* Mask of attachments cleared using index lists rather than the background
    * object.
    */
   uint32_t index_list_clear_mask = 0;

   hw_render =
      &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
   if (!hw_render) {
      info->process_empty_tiles = false;
      info->enable_bg_tag = false;
      return;
   }

   for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
      pvr_perform_start_of_render_attachment_clear(cmd_buffer,
                                                   framebuffer,
                                                   i,
                                                   false,
                                                   &index_list_clear_mask);
   }

   info->enable_bg_tag = !!hw_render->color_init_count;

   /* If we're not using index lists for all clears/loads then we need to run
    * the background object on empty tiles.
    */
   if (hw_render->color_init_count &&
       index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
      info->process_empty_tiles = true;
   } else {
      info->process_empty_tiles = false;
   }

   if (hw_render->ds_surface_id != -1) {
      uint32_t ds_index_list = 0;

      pvr_perform_start_of_render_attachment_clear(cmd_buffer,
                                                   framebuffer,
                                                   0,
                                                   true,
                                                   &ds_index_list);
   }

   if (index_list_clear_mask)
      pvr_finishme("Add support for generating loadops shaders!");
}

static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
                                   struct pvr_sub_cmd_gfx *const sub_cmd)
{
   const struct pvr_render_pass *pass = state->render_pass_info.pass;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[sub_cmd->hw_render_idx];

   if (hw_render->ds_surface_id != -1) {
      struct pvr_image_view **iviews = state->render_pass_info.attachments;

      state->depth_format = iviews[hw_render->ds_surface_id]->vk.format;
   }
}

static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
{
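   /* color_init appears to be laid out as color_init_count groups of
    * init_setup.render_targets_count entries; walk every entry looking for a
    * clear op.
    */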
   for (uint32_t i = 0; i < hw_setup->render_count; i++) {
      struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
      uint32_t render_targets_count =
         hw_render->init_setup.render_targets_count;

      for (uint32_t j = 0;
           j < (hw_render->color_init_count * render_targets_count);
           j += render_targets_count) {
         for (uint32_t k = 0; k < render_targets_count; k++) {
            if (hw_render->color_init[j + k].op ==
                RENDERPASS_SURFACE_INITOP_CLEAR) {
               return true;
            }
         }
      }
      if (hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR ||
          hw_render->stencil_init == RENDERPASS_SURFACE_INITOP_CLEAR) {
         return true;
      }
   }

   return false;
}

static VkResult
pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
                                const VkRenderPassBeginInfo *pRenderPassBegin)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;

   /* Free any previously allocated clear values. */
   vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);

   if (pRenderPassBegin->clearValueCount) {
      const size_t size = pRenderPassBegin->clearValueCount *
                          sizeof(*state->render_pass_info.clear_values);

      state->render_pass_info.clear_values =
         vk_zalloc(&cmd_buffer->vk.pool->alloc,
                   size,
                   8,
                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!state->render_pass_info.clear_values) {
         state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
         return state->status;
      }

      memcpy(state->render_pass_info.clear_values,
             pRenderPassBegin->pClearValues,
             size);
   } else {
      state->render_pass_info.clear_values = NULL;
   }

   state->render_pass_info.clear_value_count =
      pRenderPassBegin->clearValueCount;

   return VK_SUCCESS;
}

void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                             const VkRenderPassBeginInfo *pRenderPassBeginInfo,
                             const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   PVR_FROM_HANDLE(pvr_framebuffer,
                   framebuffer,
                   pRenderPassBeginInfo->framebuffer);
   PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   assert(!state->render_pass_info.pass);
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   /* FIXME: Create a separate function for everything using pass->subpasses,
    * look at cmd_buffer_begin_subpass() for example.
    */
   state->render_pass_info.pass = pass;
   state->render_pass_info.framebuffer = framebuffer;
   state->render_pass_info.subpass_idx = 0;
   state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
   state->render_pass_info.current_hw_subpass = 0;
   state->render_pass_info.pipeline_bind_point =
      pass->subpasses[0].pipeline_bind_point;
   state->render_pass_info.userpass_spawn = pass->subpasses[0].userpass_spawn;
   state->dirty.userpass_spawn = true;

   result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
   if (result != VK_SUCCESS)
      return;

   state->status =
      pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
   if (state->status != VK_SUCCESS)
      return;

   result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
   if (result != VK_SUCCESS)
      return;

   assert(pass->subpasses[0].pipeline_bind_point ==
          VK_PIPELINE_BIND_POINT_GRAPHICS);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
   if (result != VK_SUCCESS)
      return;

   /* Run subpass 0 "soft" background object after the actual background
    * object.
    */
   hw_subpass = pvr_get_hw_subpass(pass, 0);
   if (hw_subpass->client_data)
      pvr_finishme("Unimplemented path!");

   pvr_perform_start_of_render_clears(cmd_buffer);
   pvr_stash_depth_format(&cmd_buffer->state,
                          &cmd_buffer->state.current_sub_cmd->gfx);

   if (!pvr_loadops_contain_clear(pass->hw_setup)) {
      state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_CHECK_FOR_CLEAR;
      state->dynamic.scissor_accum_bounds.offset.x = 0;
      state->dynamic.scissor_accum_bounds.offset.y = 0;
      state->dynamic.scissor_accum_bounds.extent.width = 0;
      state->dynamic.scissor_accum_bounds.extent.height = 0;
   } else {
      state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_DISABLED;
   }
}

static void pvr_cmd_buffer_reset(struct pvr_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->status != PVR_CMD_BUFFER_STATUS_INITIAL) {
      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      pvr_cmd_buffer_free_sub_cmds(cmd_buffer);

      list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
         list_del(&bo->link);
         pvr_bo_free(cmd_buffer->device, bo);
      }

      util_dynarray_clear(&cmd_buffer->scissor_array);
      util_dynarray_clear(&cmd_buffer->depth_bias_array);

      cmd_buffer->state.status = VK_SUCCESS;
      cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;
   }
}

VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                                const VkCommandBufferBeginInfo *pBeginInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state;
   VkResult result;

   pvr_cmd_buffer_reset(cmd_buffer);

   cmd_buffer->usage_flags = pBeginInfo->flags;
   state = &cmd_buffer->state;

   /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
    * primary level command buffers.
    *
    * From the Vulkan 1.0 spec:
    *
    *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
    *    secondary command buffer is considered to be entirely inside a render
    *    pass. If this is a primary command buffer, then this bit is ignored.
    */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      cmd_buffer->usage_flags &=
         ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
   }

   if (cmd_buffer->usage_flags &
       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
      const VkCommandBufferInheritanceInfo *inheritance_info =
         pBeginInfo->pInheritanceInfo;
      struct pvr_render_pass *pass;

      pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
      state->render_pass_info.pass = pass;
      state->render_pass_info.framebuffer =
         pvr_framebuffer_from_handle(inheritance_info->framebuffer);
      state->render_pass_info.subpass_idx = inheritance_info->subpass;
      state->render_pass_info.userpass_spawn =
         pass->subpasses[inheritance_info->subpass].userpass_spawn;

      result =
         pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
      if (result != VK_SUCCESS)
         return result;
   }

   memset(state->barriers_needed,
          0xFF,
          sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));

   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                         struct pvr_transfer_cmd *transfer_cmd)
{
   struct pvr_sub_cmd_transfer *sub_cmd;
   VkResult result;

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
   if (result != VK_SUCCESS)
      return result;

   sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;

   list_addtail(&transfer_cmd->link, &sub_cmd->transfer_cmds);

   return VK_SUCCESS;
}

static void
pvr_validate_push_descriptors(struct pvr_cmd_buffer *cmd_buffer,
                              bool *const push_descriptors_dirty_out)
{
   /* TODO: Implement this function, based on ValidatePushDescriptors. */
   pvr_finishme("Add support for push descriptors!");
   *push_descriptors_dirty_out = false;
}

#define PVR_WRITE(_buffer, _value, _offset, _max)                \
   do {                                                          \
      __typeof__(_value) __value = _value;                       \
      uint64_t __offset = _offset;                               \
      uint32_t __nr_dwords = sizeof(__value) / sizeof(uint32_t); \
      static_assert(__same_type(*_buffer, __value),              \
                    "Buffer and value type mismatch");           \
      assert((__offset + __nr_dwords) <= (_max));                \
      assert((__offset % __nr_dwords) == 0U);                    \
      _buffer[__offset / __nr_dwords] = __value;                 \
   } while (0)
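
/* PVR_WRITE() stores a 32- or 64-bit value into a mapped PDS data section.
 * _offset is in dwords from the start of the section and _max is the section
 * size in dwords, so a 64-bit store requires a 2-dword-aligned offset. An
 * illustrative use:
 *
 *    PVR_WRITE(qword_buffer, addr.addr, entry->const_offset,
 *              pds_info->data_size_in_dwords);
 */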

static VkResult
pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
                         const struct pvr_graphics_pipeline *const gfx_pipeline)
{
   const struct pvr_vertex_shader_state *const vertex_state =
      &gfx_pipeline->vertex_shader_state;
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_pds_info *const pds_info = state->pds_shader.info;
   const uint8_t *entries;
   uint32_t *dword_buffer;
   uint64_t *qword_buffer;
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
                                     cmd_buffer->device->heaps.pds_heap,
                                     pds_info->data_size_in_dwords,
                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
                                     &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   dword_buffer = (uint32_t *)pvr_bo->bo->map;
   qword_buffer = (uint64_t *)pvr_bo->bo->map;

   entries = (uint8_t *)pds_info->entries;

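   /* The entries array is a packed stream of variable-size records: each one
    * begins with a struct pvr_const_map_entry header identifying its type,
    * and the cursor advances by the size of the concrete record just
    * consumed.
    */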
   for (uint32_t i = 0; i < pds_info->entry_count; i++) {
      const struct pvr_const_map_entry *const entry_header =
         (struct pvr_const_map_entry *)entries;

      switch (entry_header->type) {
      case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
         const struct pvr_const_map_entry_literal32 *const literal =
            (struct pvr_const_map_entry_literal32 *)entries;

         PVR_WRITE(dword_buffer,
                   literal->literal_value,
                   literal->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*literal);
         break;
      }

      case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
         const struct pvr_const_map_entry_doutu_address *const doutu_addr =
            (struct pvr_const_map_entry_doutu_address *)entries;
         const pvr_dev_addr_t exec_addr =
            PVR_DEV_ADDR_OFFSET(vertex_state->bo->vma->dev_addr,
                                vertex_state->entry_offset);
         uint64_t addr = 0ULL;

         pvr_set_usc_execution_address64(&addr, exec_addr.addr);

         PVR_WRITE(qword_buffer,
                   addr | doutu_addr->doutu_control,
                   doutu_addr->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*doutu_addr);
         break;
      }

      case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
         const struct pvr_const_map_entry_base_instance *const base_instance =
            (struct pvr_const_map_entry_base_instance *)entries;

         PVR_WRITE(dword_buffer,
                   state->draw_state.base_instance,
                   base_instance->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*base_instance);
         break;
      }

      case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
         const struct pvr_const_map_entry_vertex_attribute_address
            *const attribute =
               (struct pvr_const_map_entry_vertex_attribute_address *)entries;
         const struct pvr_vertex_binding *const binding =
            &state->vertex_bindings[attribute->binding_index];
         const pvr_dev_addr_t addr =
            PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
                                binding->offset + attribute->offset);

         PVR_WRITE(qword_buffer,
                   addr.addr,
                   attribute->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*attribute);
         break;
      }

      default:
         unreachable("Unsupported data section map");
         break;
      }
   }

   state->pds_vertex_attrib_offset =
      pvr_bo->vma->dev_addr.addr -
      cmd_buffer->device->heaps.pds_heap->base_addr.addr;

   pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);

   return VK_SUCCESS;
}

static VkResult pvr_setup_descriptor_mappings(
   struct pvr_cmd_buffer *const cmd_buffer,
   enum pvr_stage_allocation stage,
   const struct pvr_stage_allocation_descriptor_state *descriptor_state,
   UNUSED const pvr_dev_addr_t *const num_workgroups_buff_addr,
   uint32_t *const descriptor_data_offset_out)
{
   const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
   const struct pvr_descriptor_state *desc_state;
   const uint8_t *entries;
   uint32_t *dword_buffer;
   uint64_t *qword_buffer;
   struct pvr_bo *pvr_bo;
   VkResult result;

   pvr_finishme("Handle num_workgroups_buff_addr");

   if (!pds_info->data_size_in_dwords)
      return VK_SUCCESS;

   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
                                     cmd_buffer->device->heaps.pds_heap,
                                     pds_info->data_size_in_dwords,
                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
                                     &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   dword_buffer = (uint32_t *)pvr_bo->bo->map;
   qword_buffer = (uint64_t *)pvr_bo->bo->map;

   entries = (uint8_t *)pds_info->entries;

   switch (stage) {
   case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
   case PVR_STAGE_ALLOCATION_FRAGMENT:
      desc_state = &cmd_buffer->state.gfx_desc_state;
      break;

   case PVR_STAGE_ALLOCATION_COMPUTE:
      desc_state = &cmd_buffer->state.compute_desc_state;
      break;

   default:
      unreachable("Unsupported stage.");
      break;
   }

   for (uint32_t i = 0; i < pds_info->entry_count; i++) {
      const struct pvr_const_map_entry *const entry_header =
         (struct pvr_const_map_entry *)entries;

      /* TODO: See if instead of reusing the blend constant buffer type entry,
       * we can set up a new buffer type specifically for num_workgroups or
       * other built-in variables. The mappings are set up at pipeline creation
       * when creating the descriptor program.
       */
      pvr_finishme("Handle blend constant reuse for compute.");

      switch (entry_header->type) {
      case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
         const struct pvr_const_map_entry_literal32 *const literal =
            (struct pvr_const_map_entry_literal32 *)entries;

         PVR_WRITE(dword_buffer,
                   literal->literal_value,
                   literal->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*literal);
         break;
      }

      case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
         const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
            (struct pvr_const_map_entry_constant_buffer *)entries;
         const uint32_t desc_set = const_buffer_entry->desc_set;
         const uint32_t binding = const_buffer_entry->binding;
         const struct pvr_descriptor_set *descriptor_set;
         const struct pvr_descriptor *descriptor;
         pvr_dev_addr_t buffer_addr;

         /* TODO: Handle push descriptors. */

         assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
         descriptor_set = desc_state->descriptor_sets[desc_set];

         /* TODO: Handle dynamic buffers. */
         descriptor = &descriptor_set->descriptors[binding];
         assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);

         assert(descriptor->buffer_desc_range ==
                const_buffer_entry->size_in_dwords * sizeof(uint32_t));
         assert(descriptor->buffer_create_info_size ==
                const_buffer_entry->size_in_dwords * sizeof(uint32_t));

         buffer_addr =
            PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
                                const_buffer_entry->offset * sizeof(uint32_t));

         PVR_WRITE(qword_buffer,
                   buffer_addr.addr,
                   const_buffer_entry->const_offset,
                   pds_info->data_size_in_dwords);

         entries += sizeof(*const_buffer_entry);
         break;
      }

      case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
         const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
            (struct pvr_const_map_entry_descriptor_set *)entries;
         const uint32_t desc_set_num = desc_set_entry->descriptor_set;
         const struct pvr_descriptor_set *descriptor_set;
         pvr_dev_addr_t desc_set_addr;

         assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);

         /* TODO: Remove this when the compiler provides us with usage info?
          */
         /* We skip DMAing unbound descriptor sets. */
         if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
            const struct pvr_const_map_entry_literal32 *literal;
            uint32_t zero_literal_value;

            entries += sizeof(*desc_set_entry);
            literal = (struct pvr_const_map_entry_literal32 *)entries;

            /* TODO: Is there any guarantee that a literal will follow the
             * descriptor set entry?
             */
            assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);

            /* We zero out the DMA size so the DMA isn't performed. */
            zero_literal_value =
               literal->literal_value &
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;

            PVR_WRITE(qword_buffer,
                      UINT64_C(0),
                      desc_set_entry->const_offset,
                      pds_info->data_size_in_dwords);

            PVR_WRITE(dword_buffer,
                      zero_literal_value,
                      desc_set_entry->const_offset,
                      pds_info->data_size_in_dwords);

            entries += sizeof(*literal);
            i++;
            continue;
         }

         descriptor_set = desc_state->descriptor_sets[desc_set_num];

         pvr_finishme("Handle push descriptor entry.");

         desc_set_addr = descriptor_set->pvr_bo->vma->dev_addr;

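         /* Each stage's view of a descriptor set is split into a primary and
          * a secondary block; which block a given entry addresses was decided
          * when the descriptor program was created, so offset to the matching
          * block base before applying the per-entry dword offset.
          */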
2868         if (desc_set_entry->primary) {
2869            desc_set_addr = PVR_DEV_ADDR_OFFSET(
2870               desc_set_addr,
2871               descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2872                     .primary_offset
2873                  << 2U);
2874         } else {
2875            desc_set_addr = PVR_DEV_ADDR_OFFSET(
2876               desc_set_addr,
2877               descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2878                     .secondary_offset
2879                  << 2U);
2880         }
2881
2882         desc_set_addr = PVR_DEV_ADDR_OFFSET(
2883            desc_set_addr,
2884            (uint64_t)desc_set_entry->offset_in_dwords << 2U);
2885
2886         PVR_WRITE(qword_buffer,
2887                   desc_set_addr.addr,
2888                   desc_set_entry->const_offset,
2889                   pds_info->data_size_in_dwords);
2890
2891         entries += sizeof(*desc_set_entry);
2892         break;
2893      }
2894
2895      case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
2896         const struct pvr_const_map_entry_special_buffer *special_buff_entry =
2897            (struct pvr_const_map_entry_special_buffer *)entries;
2898
2899         switch (special_buff_entry->buffer_type) {
2900         case PVR_BUFFER_TYPES_COMPILE_TIME: {
2901            uint64_t addr = descriptor_state->static_consts->vma->dev_addr.addr;
2902
2903            PVR_WRITE(qword_buffer,
2904                      addr,
2905                      special_buff_entry->const_offset,
2906                      pds_info->data_size_in_dwords);
2907            break;
2908         }
2909
2910         default:
2911            unreachable("Unsupported special buffer type.");
2912         }
2913
2914         entries += sizeof(*special_buff_entry);
2915         break;
2916      }
2917
2918      default:
2919         unreachable("Unsupported map entry type.");
2920      }
2921   }
2922
2923   pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
2924
2925   *descriptor_data_offset_out =
2926      pvr_bo->vma->dev_addr.addr -
2927      cmd_buffer->device->heaps.pds_heap->base_addr.addr;
2928
2929   return VK_SUCCESS;
2930}
2931
2932#undef PVR_WRITE
2933
2934static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
2935                                      struct pvr_sub_cmd_compute *const sub_cmd)
2936{
2937   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2938   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2939   struct pvr_csb *csb = &sub_cmd->control_stream;
2940   const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
2941   const uint32_t const_shared_reg_count =
2942      pipeline->state.shader.const_shared_reg_count;
2943   struct pvr_compute_kernel_info info;
2944
2945   /* No shared regs, no need to use an allocation kernel. */
2946   if (!const_shared_reg_count)
2947      return;
2948
2949   info = (struct pvr_compute_kernel_info){
2950      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2951      .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
2952
2953      .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
2954      .usc_common_shared = true,
2955      .usc_common_size =
2956         DIV_ROUND_UP(const_shared_reg_count,
2957                      PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
2958
2959      .local_size = { 1, 1, 1 },
2960      .global_size = { 1, 1, 1 },
2961   };
2962
   /* Sometimes we don't have a secondary program, e.g. when there were no
    * constants to write, but we still need to run a PDS program to perform
    * the allocation of the local/common store shared registers, so we
    * repurpose the deallocation PDS program.
    */
2968   if (pipeline->state.descriptor.pds_info.code_size_in_dwords) {
2969      uint32_t pds_data_size_in_dwords =
2970         pipeline->state.descriptor.pds_info.data_size_in_dwords;
2971
2972      info.pds_data_offset = state->pds_compute_descriptor_data_offset;
2973      info.pds_data_size =
2974         DIV_ROUND_UP(pds_data_size_in_dwords << 2U,
2975                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
2976
      /* Check that we have uploaded the code section. */
2978      assert(pipeline->state.descriptor.pds_code.code_size);
2979      info.pds_code_offset = pipeline->state.descriptor.pds_code.code_offset;
2980   } else {
2981      /* FIXME: There should be a deallocation pds program already uploaded
2982       * that we use at this point.
2983       */
2984      assert(!"Unimplemented");
2985   }
2986
2987   /* We don't need to pad the workgroup size. */
2988
2989   info.max_instances =
2990      pvr_compute_flat_slot_size(pdevice, const_shared_reg_count, false, 1U);
2991
2992   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2993}
2994
2995static uint32_t
2996pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
2997                                    uint32_t workgroup_size,
2998                                    uint32_t coeff_regs_count)
2999{
3000   const struct pvr_device_runtime_info *dev_runtime_info =
3001      &pdevice->dev_runtime_info;
3002   const struct pvr_device_info *dev_info = &pdevice->dev_info;
3003   uint32_t max_avail_coeff_regs =
3004      dev_runtime_info->cdm_max_local_mem_size_regs;
3005   uint32_t coeff_regs_count_aligned =
3006      ALIGN_POT(coeff_regs_count,
3007                PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
3008
   /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK we
    * *always* pad it to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
    *
    * We also round the workgroup size up to the next multiple of
    * ROGUE_MAX_INSTANCES_PER_TASK if we use more than 1/8th of the maximum
    * coefficient registers.
    */
3016   /* TODO: See if this can be optimized. */
3017   if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
3018       coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
3019      assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
3020
3021      return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
3022   }
3023
3024   return workgroup_size;
3025}
3026
3027/* TODO: Wire up the base_workgroup variant program when implementing
3028 * VK_KHR_device_group. The values will also need patching into the program.
3029 */
3030static void pvr_compute_update_kernel(
3031   struct pvr_cmd_buffer *cmd_buffer,
3032   struct pvr_sub_cmd_compute *const sub_cmd,
3033   const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
3034{
3035   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3036   const struct pvr_device_runtime_info *dev_runtime_info =
3037      &pdevice->dev_runtime_info;
3038   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3039   struct pvr_csb *csb = &sub_cmd->control_stream;
3040   const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
3041   const struct pvr_pds_info *program_info =
3042      &pipeline->state.primary_program_info;
3043
3044   struct pvr_compute_kernel_info info = {
3045      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
3046      .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
3047      .pds_temp_size =
3048         DIV_ROUND_UP(program_info->temps_required << 2U,
3049                      PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
3050
3051      .pds_data_size =
3052         DIV_ROUND_UP(program_info->data_size_in_dwords << 2U,
3053                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
3054      .pds_data_offset = pipeline->state.primary_program.data_offset,
3055      .pds_code_offset = pipeline->state.primary_program.code_offset,
3056
3057      .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
3058
3059      .usc_unified_size =
3060         DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U,
3061                      PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
3062
3063      /* clang-format off */
3064      .global_size = {
3065         global_workgroup_size[0],
3066         global_workgroup_size[1],
3067         global_workgroup_size[2]
3068      },
3069      /* clang-format on */
3070   };
3071
3072   uint32_t work_size = pipeline->state.shader.work_size;
3073   uint32_t coeff_regs;
3074
3075   if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
3076      /* Enforce a single workgroup per cluster through allocation starvation.
3077       */
3078      coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
3079   } else {
3080      coeff_regs = pipeline->state.shader.coefficient_register_count;
3081   }
3082
3083   info.usc_common_size =
3084      DIV_ROUND_UP(coeff_regs << 2U,
3085                   PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
3086
3087   /* Use a whole slot per workgroup. */
3088   work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
3089
3090   coeff_regs += pipeline->state.shader.const_shared_reg_count;
3091
3092   work_size =
3093      pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
3094
3095   info.local_size[0] = work_size;
3096   info.local_size[1] = 1U;
3097   info.local_size[2] = 1U;
3098
3099   info.max_instances =
3100      pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
3101
3102   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
3103}
3104
3105void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
3106                     uint32_t groupCountX,
3107                     uint32_t groupCountY,
3108                     uint32_t groupCountZ)
3109{
3110   const uint32_t workgroup_size[] = { groupCountX, groupCountY, groupCountZ };
3111   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3112   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3113   const struct pvr_compute_pipeline *compute_pipeline =
3114      state->compute_pipeline;
3115   const VkShaderStageFlags push_consts_stage_mask =
3116      compute_pipeline->base.layout->push_constants_shader_stages;
3117   bool push_descriptors_dirty;
3118   struct pvr_sub_cmd_compute *sub_cmd;
3119   VkResult result;
3120
3121   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3122   assert(compute_pipeline);
3123
3124   if (!groupCountX || !groupCountY || !groupCountZ)
3125      return;
3126
   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
   if (result != VK_SUCCESS)
      return;
3128
3129   sub_cmd = &state->current_sub_cmd->compute;
3130
3131   sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops;
3132   sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier;
3133
3134   if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) {
3135      /* TODO: Add a dirty push constants mask in the cmd_buffer state and
3136       * check for dirty compute stage.
3137       */
3138      pvr_finishme("Add support for push constants.");
3139   }
3140
3141   pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);
3142
3143   if (compute_pipeline->state.shader.uses_num_workgroups) {
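      /* The shader reads the dispatch size (its NumWorkgroups builtin) from
       * memory, so upload the group counts and patch their device address
       * into the descriptor mappings below.
       */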
3144      struct pvr_bo *num_workgroups_bo;
3145
3146      result = pvr_cmd_buffer_upload_general(cmd_buffer,
3147                                             workgroup_size,
3148                                             sizeof(workgroup_size),
3149                                             &num_workgroups_bo);
3150      if (result != VK_SUCCESS)
3151         return;
3152
3153      result = pvr_setup_descriptor_mappings(
3154         cmd_buffer,
3155         PVR_STAGE_ALLOCATION_COMPUTE,
3156         &compute_pipeline->state.descriptor,
3157         &num_workgroups_bo->vma->dev_addr,
3158         &state->pds_compute_descriptor_data_offset);
3159      if (result != VK_SUCCESS)
3160         return;
3161   } else if ((compute_pipeline->base.layout
3162                  ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
3163               state->dirty.compute_desc_dirty) ||
3164              state->dirty.compute_pipeline_binding || push_descriptors_dirty) {
3165      result = pvr_setup_descriptor_mappings(
3166         cmd_buffer,
3167         PVR_STAGE_ALLOCATION_COMPUTE,
3168         &compute_pipeline->state.descriptor,
3169         NULL,
3170         &state->pds_compute_descriptor_data_offset);
3171      if (result != VK_SUCCESS)
3172         return;
3173   }
3174
3175   pvr_compute_update_shared(cmd_buffer, sub_cmd);
3176
3177   pvr_compute_update_kernel(cmd_buffer, sub_cmd, workgroup_size);
3178}
3179
3180void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3181                             VkBuffer _buffer,
3182                             VkDeviceSize offset)
3183{
3184   assert(!"Unimplemented");
3185}
3186
3187static void
3188pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
3189                      const struct pvr_cmd_buffer_draw_state *const draw_state)
3190{
3191   /* We don't have a state to tell us that base_instance is being used so it
3192    * gets used as a boolean - 0 means we'll use a pds program that skips the
3193    * base instance addition. If the base_instance gets used (and the last
3194    * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib
3195    * program.
3196    *
3197    * If base_instance changes then we only need to update the data section.
3198    *
3199    * The only draw call state that doesn't really matter is the start vertex
3200    * as that is handled properly in the VDM state in all cases.
3201    */
3202   if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
3203       (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
3204       (state->draw_state.base_instance == 0 &&
3205        draw_state->base_instance != 0)) {
3206      state->dirty.draw_variant = true;
3207   } else if (state->draw_state.base_instance != draw_state->base_instance) {
3208      state->dirty.draw_base_instance = true;
3209   }
3210
3211   state->draw_state = *draw_state;
3212}
3213
3214static uint32_t pvr_calc_shared_regs_count(
3215   const struct pvr_graphics_pipeline *const gfx_pipeline)
3216{
3217   const struct pvr_pipeline_stage_state *const vertex_state =
3218      &gfx_pipeline->vertex_shader_state.stage_state;
3219   uint32_t shared_regs = vertex_state->const_shared_reg_count +
3220                          vertex_state->const_shared_reg_offset;
3221
3222   if (gfx_pipeline->fragment_shader_state.bo) {
3223      const struct pvr_pipeline_stage_state *const fragment_state =
3224         &gfx_pipeline->fragment_shader_state.stage_state;
3225      uint32_t fragment_regs = fragment_state->const_shared_reg_count +
3226                               fragment_state->const_shared_reg_offset;
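      /* The shared register bank is common to the vertex and fragment
       * stages, so the job must allocate for whichever needs more.
       */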
3227
3228      shared_regs = MAX2(shared_regs, fragment_regs);
3229   }
3230
3231   return shared_regs;
3232}
3233
3234static void
3235pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
3236                         struct pvr_sub_cmd_gfx *const sub_cmd,
3237                         const uint32_t pds_vertex_descriptor_data_offset)
3238{
3239   const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3240   const struct pvr_stage_allocation_descriptor_state
3241      *const vertex_descriptor_state =
3242         &state->gfx_pipeline->vertex_shader_state.descriptor_state;
3243   const struct pvr_pipeline_stage_state *const vertex_stage_state =
3244      &state->gfx_pipeline->vertex_shader_state.stage_state;
3245   struct pvr_csb *const csb = &sub_cmd->control_stream;
3246
3247   if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
3248      return;
3249
3250   pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
3251      state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
3252
3253      state0.usc_common_size =
3254         DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
3255                      PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
3256
3257      state0.pds_data_size = DIV_ROUND_UP(
3258         vertex_descriptor_state->pds_info.data_size_in_dwords << 2,
3259         PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
3260   }
3261
3262   pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
3263      state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
3264      state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
3265   }
3266
3267   pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
3268      state2.pds_code_addr =
3269         PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
3270   }
3271}
3272
3273static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
3274{
3275   struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3276   const struct pvr_graphics_pipeline *const gfx_pipeline =
3277      cmd_buffer->state.gfx_pipeline;
3278   struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3279   const struct pvr_vertex_shader_state *const vertex_state =
3280      &gfx_pipeline->vertex_shader_state;
3281   uint32_t output_selects;
3282
3283   /* TODO: Handle vertex and fragment shader state flags. */
3284
3285   pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
3286      const VkPrimitiveTopology topology =
3287         gfx_pipeline->input_asm_state.topology;
3288
3289      state.rhw_pres = true;
3290      state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
3291      state.psprite_size_pres = (topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
3292   }
3293
3294   if (ppp_state->output_selects != output_selects) {
3295      ppp_state->output_selects = output_selects;
3296      emit_state->output_selects = true;
3297   }
3298
3299   if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
3300      ppp_state->varying_word[0] = vertex_state->varying[0];
3301      emit_state->varying_word0 = true;
3302   }
3303
3304   if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
3305      ppp_state->varying_word[1] = vertex_state->varying[1];
3306      emit_state->varying_word1 = true;
3307   }
3308}
3309
3310static void
3311pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
3312                                struct PVRX(TA_STATE_ISPA) *const ispa_out)
3313{
3314   struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3315   const struct pvr_graphics_pipeline *const gfx_pipeline =
3316      cmd_buffer->state.gfx_pipeline;
3317   struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3318   const struct pvr_dynamic_state *const dynamic_state =
3319      &cmd_buffer->state.dynamic.common;
3320   const struct pvr_render_pass_info *const pass_info =
3321      &cmd_buffer->state.render_pass_info;
3322   const uint32_t subpass_idx = pass_info->subpass_idx;
3323   const uint32_t *depth_stencil_attachment_idx =
3324      pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
3325   const struct pvr_image_view *const attachment =
3326      (!depth_stencil_attachment_idx)
3327         ? NULL
3328         : pass_info->attachments[*depth_stencil_attachment_idx];
3329
3330   const VkCullModeFlags cull_mode = gfx_pipeline->raster_state.cull_mode;
3331   const bool raster_discard_enabled =
3332      gfx_pipeline->raster_state.discard_enable;
3333   const bool disable_all = raster_discard_enabled || !attachment;
3334
3335   const VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3336   const enum PVRX(TA_OBJTYPE) obj_type = pvr_ta_objtype(topology);
3337
3338   const bool disable_stencil_write = disable_all;
3339   const bool disable_stencil_test =
3340      disable_all || !vk_format_has_stencil(attachment->vk.format);
3341
3342   const bool disable_depth_write = disable_all;
3343   const bool disable_depth_test = disable_all ||
3344                                   !vk_format_has_depth(attachment->vk.format);
3345
3346   uint32_t ispb_stencil_off;
3347   bool is_two_sided = false;
3348   uint32_t isp_control;
3349
3350   uint32_t line_width;
3351   uint32_t common_a;
3352   uint32_t front_a;
3353   uint32_t front_b;
3354   uint32_t back_a;
3355   uint32_t back_b;
3356
3357   /* Convert to 4.4 fixed point format. */
3358   line_width = util_unsigned_fixed(dynamic_state->line_width, 4);
3359
3360   /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
3361    * If 0 it stays at 0, otherwise we subtract 1.
3362    */
3363   line_width = (!!line_width) * (line_width - 1);
3364
3365   line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
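   /* e.g. a line width of 1.0f becomes 16 in 4.4 fixed point and 15 after
    * the shift above, so the encodings 0..255 cover 1/16 to 16 pixels.
    */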
3366
3367   /* TODO: Part of the logic in this function is duplicated in another part
3368    * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
3369    */
3370
3371   pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
3372      ispa.pointlinewidth = line_width;
3373
3374      if (disable_depth_test)
3375         ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS);
3376      else
3377         ispa.dcmpmode = pvr_ta_cmpmode(gfx_pipeline->depth_compare_op);
3378
3379      /* FIXME: Can we just have this and remove the assignment above?
3380       * The user provides a depthTestEnable at vkCreateGraphicsPipelines()
3381       * should we be using that?
3382       */
      ispa.dcmpmode |= pvr_ta_cmpmode(gfx_pipeline->depth_compare_op);
3384
3385      ispa.dwritedisable = disable_depth_test || disable_depth_write;
3386      /* FIXME: Can we just have this and remove the assignment above? */
3387      ispa.dwritedisable = ispa.dwritedisable ||
3388                           gfx_pipeline->depth_write_disable;
3389
3390      ispa.passtype = gfx_pipeline->fragment_shader_state.pass_type;
3391
3392      ispa.objtype = obj_type;
3393
3394      /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
3395       * objtype are needed by pvr_setup_triangle_merging_flag.
3396       */
3397      if (ispa_out)
3398         *ispa_out = ispa;
3399   }
3400
3401   /* FIXME: This logic should be redone and improved. Can we also get rid of
3402    * the front and back variants?
3403    */
3404
3405   pvr_csb_pack (&front_a, TA_STATE_ISPA, ispa) {
3406      ispa.sref = (!disable_stencil_test) * dynamic_state->reference.front;
3407   }
3408   front_a |= common_a;
3409
3410   pvr_csb_pack (&back_a, TA_STATE_ISPA, ispa) {
      ispa.sref = (!disable_stencil_test) * dynamic_state->reference.back;
3412   }
3413   back_a |= common_a;
3414
3415   /* TODO: Does this actually represent the ispb control word on stencil off?
3416    * If not, rename the variable.
3417    */
3418   pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
3419      ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
3420      ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
3421      ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
3422      ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
3423   }
3424
3425   if (disable_stencil_test) {
3426      back_b = front_b = ispb_stencil_off;
3427   } else {
3428      pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
3429         ispb.swmask =
3430            (!disable_stencil_write) * dynamic_state->write_mask.front;
3431         ispb.scmpmask = dynamic_state->compare_mask.front;
3432
3433         ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_front.pass_op);
3434         ispb.sop2 =
3435            pvr_ta_stencilop(gfx_pipeline->stencil_front.depth_fail_op);
3436         ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_front.fail_op);
3437
3438         ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_front.compare_op);
3439      }
3440
3441      pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
3442         ispb.swmask =
3443            (!disable_stencil_write) * dynamic_state->write_mask.back;
3444         ispb.scmpmask = dynamic_state->compare_mask.back;
3445
3446         ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_back.pass_op);
3447         ispb.sop2 = pvr_ta_stencilop(gfx_pipeline->stencil_back.depth_fail_op);
3448         ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_back.fail_op);
3449
3450         ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_back.compare_op);
3451      }
3452   }
3453
3454   if (front_a != back_a || front_b != back_b) {
3455      if (cull_mode & VK_CULL_MODE_BACK_BIT) {
3456         /* Single face, using front state. */
3457      } else if (cull_mode & VK_CULL_MODE_FRONT_BIT) {
3458         /* Single face, using back state. */
3459
3460         front_a = back_a;
3461         front_b = back_b;
3462      } else {
3463         /* Both faces. */
3464
3465         emit_state->isp_ba = is_two_sided = true;
3466
3467         if (gfx_pipeline->raster_state.front_face ==
3468             VK_FRONT_FACE_COUNTER_CLOCKWISE) {
3469            uint32_t tmp = front_a;
3470
3471            front_a = back_a;
3472            back_a = tmp;
3473
3474            tmp = front_b;
3475            front_b = back_b;
3476            back_b = tmp;
3477         }
3478
3479         /* HW defaults to stencil off. */
3480         if (back_b != ispb_stencil_off)
3481            emit_state->isp_fb = emit_state->isp_bb = true;
3482      }
3483   }
3484
3485   if (!disable_stencil_test && front_b != ispb_stencil_off)
3486      emit_state->isp_fb = true;
3487
3488   pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
3489      ispctl.upass = pass_info->userpass_spawn;
3490
3491      /* TODO: is bo ever NULL? Figure out what to do. */
3492      ispctl.tagwritedisable = raster_discard_enabled ||
3493                               !gfx_pipeline->fragment_shader_state.bo;
3494
3495      ispctl.two_sided = is_two_sided;
3496      ispctl.bpres = emit_state->isp_fb || emit_state->isp_bb;
3497
3498      ispctl.dbenable = !raster_discard_enabled &&
3499                        gfx_pipeline->raster_state.depth_bias_enable &&
3500                        obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
3501      ispctl.scenable = !raster_discard_enabled;
3502
3503      ppp_state->isp.control_struct = ispctl;
3504   }
3505
3506   emit_state->isp = true;
3507
3508   ppp_state->isp.control = isp_control;
3509   ppp_state->isp.front_a = front_a;
3510   ppp_state->isp.front_b = front_b;
3511   ppp_state->isp.back_a = back_a;
3512   ppp_state->isp.back_b = back_b;
3513}
3514
3515static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
3516                                             const VkRect2D *const scissor,
3517                                             VkRect2D *const rect_out)
3518{
3519   /* TODO: See if we can remove this struct. */
3520   struct pvr_rect {
3521      int32_t x0, y0;
3522      int32_t x1, y1;
3523   };
3524
3525   /* TODO: Worry about overflow? */
3526   const struct pvr_rect scissor_rect = {
3527      .x0 = scissor->offset.x,
3528      .y0 = scissor->offset.y,
3529      .x1 = scissor->offset.x + scissor->extent.width,
3530      .y1 = scissor->offset.y + scissor->extent.height
3531   };
3532   struct pvr_rect viewport_rect = { 0 };
3533
3534   assert(viewport->width >= 0.0f);
3535   assert(scissor_rect.x0 >= 0);
3536   assert(scissor_rect.y0 >= 0);
3537
3538   if (scissor->extent.width == 0 || scissor->extent.height == 0) {
3539      *rect_out = (VkRect2D){ 0 };
3540      return;
3541   }
3542
3543   viewport_rect.x0 = (int32_t)viewport->x;
3544   viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
3545
3546   /* TODO: Is there a mathematical way of doing all this and then clamp at
3547    * the end?
3548    */
3549   /* We flip the y0 and y1 when height is negative. */
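   /* e.g. viewport y = 10, height = -10 (a negative-height y-flip) yields
    * y0 = 0 and y1 = 10.
    */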
3550   viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
3551   viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
3552
3553   if (scissor_rect.x1 <= viewport_rect.x0 ||
3554       scissor_rect.y1 <= viewport_rect.y0 ||
3555       scissor_rect.x0 >= viewport_rect.x1 ||
3556       scissor_rect.y0 >= viewport_rect.y1) {
3557      *rect_out = (VkRect2D){ 0 };
3558      return;
3559   }
3560
3561   /* Determine the overlapping rectangle. */
3562   viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
3563   viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
3564   viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
3565   viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
3566
3567   /* TODO: Is this conversion safe? Is this logic right? */
3568   rect_out->offset.x = (uint32_t)viewport_rect.x0;
3569   rect_out->offset.y = (uint32_t)viewport_rect.y0;
3570   rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
3571   rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
3572}
3573
3574static inline uint32_t
3575pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
3576{
3577   /* TODO: This should come from rogue_ppp.xml. */
3578   return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
3579}
3580
3581/* FIXME: Remove device param when PVR_HAS_FEATURE() accepts const dev_info */
3582static void
3583pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
3584{
3585   struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3586   struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3587   const struct pvr_dynamic_state *const dynamic_state =
3588      &cmd_buffer->state.dynamic.common;
3589   const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
3590      &ppp_state->isp.control_struct;
3591   struct pvr_device_info *const dev_info =
3592      &cmd_buffer->device->pdevice->dev_info;
3593
3594   if (ispctl->dbenable)
3595      assert(!"Unimplemented");
3596
3597   if (ispctl->scenable) {
3598      const uint32_t region_clip_align_size =
3599         pvr_get_geom_region_clip_align_size(dev_info);
3600      const VkViewport *const viewport = &dynamic_state->viewport.viewports[0];
3601      const VkRect2D *const scissor = &dynamic_state->scissor.scissors[0];
3602      VkRect2D overlap_rect;
3603      uint32_t scissor_words[2];
3604      uint32_t height;
3605      uint32_t width;
3606      uint32_t x;
3607      uint32_t y;
3608
3609      /* For region clip. */
3610      uint32_t bottom;
3611      uint32_t right;
3612      uint32_t left;
3613      uint32_t top;
3614
3615      /* We don't support multiple viewport calculations. */
3616      assert(dynamic_state->viewport.count == 1);
3617      /* We don't support multiple scissor calculations. */
3618      assert(dynamic_state->scissor.count == 1);
3619
3620      pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
3621
3622      x = overlap_rect.offset.x;
3623      y = overlap_rect.offset.y;
3624      width = overlap_rect.extent.width;
3625      height = overlap_rect.extent.height;
3626
3627      pvr_csb_pack (&scissor_words[0], IPF_SCISSOR_WORD_0, word0) {
3628         word0.scw0_xmax = x + width;
3629         word0.scw0_xmin = x;
3630      }
3631
3632      pvr_csb_pack (&scissor_words[1], IPF_SCISSOR_WORD_1, word1) {
3633         word1.scw1_ymax = y + height;
3634         word1.scw1_ymin = y;
3635      }
3636
3637      if (cmd_buffer->scissor_array.size &&
3638          cmd_buffer->scissor_words[0] == scissor_words[0] &&
3639          cmd_buffer->scissor_words[1] == scissor_words[1]) {
3640         return;
3641      }
3642
3643      cmd_buffer->scissor_words[0] = scissor_words[0];
3644      cmd_buffer->scissor_words[1] = scissor_words[1];
3645
3646      /* Calculate region clip. */
3647
3648      left = x / region_clip_align_size;
3649      top = y / region_clip_align_size;
3650
      /* Clamp to 0 so that right can't underflow to -1 when x + width is 0. */
3652      /* TODO: Is there a better way of doing this? */
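      /* e.g. x = 0, width = 100 with a 16 pixel alignment gives
       * right = DIV_ROUND_UP(100, 16) - 1 = 6.
       */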
3653      if ((x + width) != 0U)
3654         right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
3655      else
3656         right = 0;
3657
3658      if ((y + height) != 0U)
3659         bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
3660      else
3661         bottom = 0U;
3662
3663      /* Setup region clip to clip everything outside what was calculated. */
3664
3665      /* FIXME: Should we mask to prevent writing over other words? */
3666      pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
3667         word0.right = right;
3668         word0.left = left;
3669         word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
3670      }
3671
3672      pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
3673         word1.bottom = bottom;
3674         word1.top = top;
3675      }
3676
3677      ppp_state->depthbias_scissor_indices.scissor_index =
3678         util_dynarray_num_elements(&cmd_buffer->scissor_array,
3679                                    __typeof__(cmd_buffer->scissor_words));
3680
3681      memcpy(util_dynarray_grow_bytes(&cmd_buffer->scissor_array,
3682                                      1,
3683                                      sizeof(cmd_buffer->scissor_words)),
3684             cmd_buffer->scissor_words,
3685             sizeof(cmd_buffer->scissor_words));
3686
3687      emit_state->isp_dbsc = true;
3688      emit_state->region_clip = true;
3689   }
3690}
3691
3692static void
3693pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
3694                                struct PVRX(TA_STATE_ISPA) * ispa)
3695{
3696   struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3697   struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3698   uint32_t merge_word;
3699   uint32_t mask;
3700
3701   pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
3702      /* Disable for lines or punch-through or for DWD and depth compare
3703       * always.
3704       */
3705      if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
3706          ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
3707          (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
3708         size_info.pds_tri_merge_disable = true;
3709      }
3710   }
3711
3712   pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
3713      size_info.pds_tri_merge_disable = true;
3714   }
3715
3716   merge_word |= ppp_state->pds.size_info2 & ~mask;
3717
3718   if (merge_word != ppp_state->pds.size_info2) {
3719      ppp_state->pds.size_info2 = merge_word;
3720      emit_state->pds_fragment_stateptr0 = true;
3721   }
3722}
3723
3724static void
3725pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
3726                                  struct pvr_sub_cmd_gfx *const sub_cmd)
3727{
3728   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3729   const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
3730      &state->gfx_pipeline->fragment_shader_state.descriptor_state;
3731   const struct pvr_pds_upload *pds_coeff_program =
3732      &state->gfx_pipeline->fragment_shader_state.pds_coeff_program;
3733   const struct pvr_pipeline_stage_state *fragment_state =
3734      &state->gfx_pipeline->fragment_shader_state.stage_state;
3735   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3736   struct pvr_emit_state *const emit_state = &state->emit_state;
3737   struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3738
3739   const uint32_t pds_uniform_size =
3740      DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
3741                   PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
3742
3743   const uint32_t pds_varying_state_size =
3744      DIV_ROUND_UP(pds_coeff_program->data_size,
3745                   PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
3746
3747   const uint32_t usc_varying_size =
3748      DIV_ROUND_UP(fragment_state->coefficient_size,
3749                   PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
3750
3751   const uint32_t pds_temp_size =
3752      DIV_ROUND_UP(fragment_state->temps_count,
3753                   PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3754
3755   const uint32_t usc_shared_size =
3756      DIV_ROUND_UP(fragment_state->const_shared_reg_count,
3757                   PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3758
3759   const uint32_t max_tiles_in_flight =
3760      pvr_calc_fscommon_size_and_tiles_in_flight(
3761         pdevice,
3762         usc_shared_size *
3763            PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
3764         1);
3765   uint32_t size_info_mask;
3766   uint32_t size_info2;
3767
3768   if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
3769      sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
3770
3771   pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
3772                 TA_STATE_PDS_SHADERBASE,
3773                 shader_base) {
3774      const struct pvr_pds_upload *const pds_upload =
3775         &state->gfx_pipeline->fragment_shader_state.pds_fragment_program;
3776
3777      shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
3778   }
3779
3780   if (descriptor_shader_state->pds_code.pvr_bo) {
3781      pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
3782                    TA_STATE_PDS_TEXUNICODEBASE,
3783                    tex_base) {
3784         tex_base.addr =
3785            PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
3786      }
3787   } else {
3788      ppp_state->pds.texture_uniform_code_base = 0U;
3789   }
3790
3791   pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
3792      info1.pds_uniformsize = pds_uniform_size;
3793      info1.pds_texturestatesize = 0U;
3794      info1.pds_varyingsize = pds_varying_state_size;
3795      info1.usc_varyingsize = usc_varying_size;
3796      info1.pds_tempsize = pds_temp_size;
3797   }
3798
3799   pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
3800      mask.pds_tri_merge_disable = true;
3801   }
3802
3803   ppp_state->pds.size_info2 &= size_info_mask;
3804
3805   pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
3806      info2.usc_sharedsize = usc_shared_size;
3807   }
3808
3809   ppp_state->pds.size_info2 |= size_info2;
3810
3811   if (pds_coeff_program->pvr_bo) {
3812      state->emit_state.pds_fragment_stateptr1 = true;
3813
3814      pvr_csb_pack (&ppp_state->pds.varying_base,
3815                    TA_STATE_PDS_VARYINGBASE,
3816                    base) {
3817         base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
3818      }
3819   } else {
3820      ppp_state->pds.varying_base = 0U;
3821   }
3822
3823   pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
3824                 TA_STATE_PDS_UNIFORMDATABASE,
3825                 base) {
3826      base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
3827   }
3828
3829   emit_state->pds_fragment_stateptr0 = true;
3830   emit_state->pds_fragment_stateptr3 = true;
3831}
3832
3833static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
3834{
3835   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3836   struct pvr_emit_state *const emit_state = &state->emit_state;
3837   struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3838
3839   if (ppp_state->viewport_count != state->dynamic.common.viewport.count) {
3840      ppp_state->viewport_count = state->dynamic.common.viewport.count;
3841      emit_state->viewport = true;
3842   }
3843
3844   if (state->gfx_pipeline->raster_state.discard_enable) {
3845      /* We don't want to emit any viewport data as it'll just get thrown
3846       * away. It's after the previous condition because we still want to
3847       * stash the viewport_count as it's our trigger for when
3848       * rasterizer discard gets disabled.
3849       */
3850      emit_state->viewport = false;
3851      return;
3852   }
3853
3854   for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
3855      VkViewport *viewport = &state->dynamic.common.viewport.viewports[i];
3856      uint32_t x_scale = fui(viewport->width * 0.5f);
3857      uint32_t y_scale = fui(viewport->height * 0.5f);
3858      uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
3859      uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
3860      uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
3861      uint32_t z_center = fui(viewport->minDepth);
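      /* Each (a, m) pair encodes out = a + in * m, i.e. the offset and
       * scale of the viewport transform for that component.
       */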
3862
3863      if (ppp_state->viewports[i].a0 != x_center ||
3864          ppp_state->viewports[i].m0 != x_scale ||
3865          ppp_state->viewports[i].a1 != y_center ||
3866          ppp_state->viewports[i].m1 != y_scale ||
3867          ppp_state->viewports[i].a2 != z_center ||
3868          ppp_state->viewports[i].m2 != z_scale) {
3869         ppp_state->viewports[i].a0 = x_center;
3870         ppp_state->viewports[i].m0 = x_scale;
3871         ppp_state->viewports[i].a1 = y_center;
3872         ppp_state->viewports[i].m1 = y_scale;
3873         ppp_state->viewports[i].a2 = z_center;
3874         ppp_state->viewports[i].m2 = z_scale;
3875
3876         emit_state->viewport = true;
3877      }
3878   }
3879}
3880
3881static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
3882{
3883   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3884   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
3885   struct pvr_emit_state *const emit_state = &state->emit_state;
3886   struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3887   uint32_t ppp_control;
3888
3889   pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
3890      const struct pvr_raster_state *raster_state = &gfx_pipeline->raster_state;
3891      VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3892      control.drawclippededges = true;
3893      control.wclampen = true;
3894
3895      if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
3896         control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
3897      else
3898         control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
3899
3900      if (raster_state->depth_clamp_enable)
3901         control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
3902      else
3903         control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
3904
3905      /* +--- FrontIsCCW?
3906       * | +--- Cull Front?
3907       * v v
3908       * 0|0 CULLMODE_CULL_CCW,
3909       * 0|1 CULLMODE_CULL_CW,
3910       * 1|0 CULLMODE_CULL_CW,
3911       * 1|1 CULLMODE_CULL_CCW,
3912       */
3913      switch (raster_state->cull_mode) {
3914      case VK_CULL_MODE_BACK_BIT:
3915      case VK_CULL_MODE_FRONT_BIT:
3916         if ((raster_state->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
3917             (raster_state->cull_mode == VK_CULL_MODE_FRONT_BIT)) {
3918            control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
3919         } else {
3920            control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
3921         }
3922
3923         break;
3924
3925      case VK_CULL_MODE_NONE:
3926         control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
3927         break;
3928
3929      default:
3930         unreachable("Unsupported cull mode!");
3931      }
3932   }
3933
3934   if (ppp_control != ppp_state->ppp_control) {
3935      ppp_state->ppp_control = ppp_control;
3936      emit_state->ppp_control = true;
3937   }
3938}
3939
3940/* Largest valid PPP State update in words = 31
3941 * 1 - Header
3942 * 3 - Stream Out Config words 0, 1 and 2
3943 * 1 - PPP Control word
3944 * 3 - Varying Config words 0, 1 and 2
3945 * 1 - Output Select
3946 * 1 - WClamp
3947 * 6 - Viewport Transform words
3948 * 2 - Region Clip words
3949 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
3950 * 4 - PDS State for fragment phase (PDSSTATEPTR0)
3951 * 6 - ISP Control Words
3952 */
3953#define PVR_MAX_PPP_STATE_DWORDS 31
3954
3955static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
3956                                   struct pvr_sub_cmd_gfx *const sub_cmd)
3957{
3958   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3959   struct pvr_emit_state *const emit_state = &state->emit_state;
3960   struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3961   struct pvr_csb *const control_stream = &sub_cmd->control_stream;
3962   uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
3963   uint32_t ppp_state_words_count;
3964   uint32_t ppp_state_header;
3965   bool deferred_secondary;
3966   struct pvr_bo *pvr_bo;
3967   uint32_t *buffer_ptr;
3968   VkResult result;
3969
3970   buffer_ptr = ppp_state_words;
3971
3972   pvr_csb_pack (&ppp_state_header, TA_STATE_HEADER, header) {
3973      header.view_port_count = (ppp_state->viewport_count == 0)
3974                                  ? 0U
3975                                  : (ppp_state->viewport_count - 1);
3976
3977      /* Skip over header. */
3978      buffer_ptr++;
3979
3980      /* Set ISP state. */
3981      if (emit_state->isp) {
3982         header.pres_ispctl = true;
3983         *buffer_ptr++ = ppp_state->isp.control;
3984         header.pres_ispctl_fa = true;
3985         *buffer_ptr++ = ppp_state->isp.front_a;
3986
3987         if (emit_state->isp_fb) {
3988            header.pres_ispctl_fb = true;
3989            *buffer_ptr++ = ppp_state->isp.front_b;
3990         }
3991
3992         if (emit_state->isp_ba) {
3993            header.pres_ispctl_ba = true;
3994            *buffer_ptr++ = ppp_state->isp.back_a;
3995         }
3996
3997         if (emit_state->isp_bb) {
3998            header.pres_ispctl_bb = true;
3999            *buffer_ptr++ = ppp_state->isp.back_b;
4000         }
4001      }
4002
4003      /* Depth bias / scissor
4004       * If deferred_secondary is true then we do a separate state update
4005       * which gets patched in ExecuteDeferredCommandBuffer.
4006       */
4007      /* TODO: Update above comment when we port ExecuteDeferredCommandBuffer.
4008       */
4009      deferred_secondary =
4010         cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
4011         cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
4012
4013      if (emit_state->isp_dbsc && !deferred_secondary) {
4014         header.pres_ispctl_dbsc = true;
4015
4016         pvr_csb_pack (buffer_ptr++, TA_STATE_ISPDBSC, ispdbsc) {
4017            ispdbsc.dbindex =
4018               ppp_state->depthbias_scissor_indices.depthbias_index;
4019            ispdbsc.scindex =
4020               ppp_state->depthbias_scissor_indices.scissor_index;
4021         }
4022      }
4023
4024      /* PDS state. */
4025      if (emit_state->pds_fragment_stateptr0) {
4026         header.pres_pds_state_ptr0 = true;
4027
4028         *buffer_ptr++ = ppp_state->pds.pixel_shader_base;
4029         *buffer_ptr++ = ppp_state->pds.texture_uniform_code_base;
4030         *buffer_ptr++ = ppp_state->pds.size_info1;
4031         *buffer_ptr++ = ppp_state->pds.size_info2;
4032      }
4033
4034      if (emit_state->pds_fragment_stateptr1) {
4035         header.pres_pds_state_ptr1 = true;
4036         *buffer_ptr++ = ppp_state->pds.varying_base;
4037      }
4038
4039      /* We don't use the pds_fragment_stateptr2 (texture state programs)
4040       * control word, but this doesn't mean we need to set it to 0. This is
4041       * because the hardware runs the texture state program only when the
4042       * pds_texture state field of PDS_SIZEINFO1 is non-zero.
4043       */
4044
4045      if (emit_state->pds_fragment_stateptr3) {
4046         header.pres_pds_state_ptr3 = true;
4047         *buffer_ptr++ = ppp_state->pds.uniform_state_data_base;
4048      }
4049
4050      /* Region clip. */
4051      if (emit_state->region_clip) {
4052         header.pres_region_clip = true;
4053         *buffer_ptr++ = ppp_state->region_clipping.word0;
4054         *buffer_ptr++ = ppp_state->region_clipping.word1;
4055      }
4056
4057      /* Viewport. */
4058      if (emit_state->viewport) {
4059         const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
4060
4061         header.pres_viewport = true;
4062         for (uint32_t i = 0; i < viewports; i++) {
4063            *buffer_ptr++ = ppp_state->viewports[i].a0;
4064            *buffer_ptr++ = ppp_state->viewports[i].m0;
4065            *buffer_ptr++ = ppp_state->viewports[i].a1;
4066            *buffer_ptr++ = ppp_state->viewports[i].m1;
4067            *buffer_ptr++ = ppp_state->viewports[i].a2;
4068            *buffer_ptr++ = ppp_state->viewports[i].m2;
4069         }
4070      }
4071
4072      /* W clamp. */
4073      if (emit_state->wclamp) {
4074         const float wclamp = 0.00001f;
4075
4076         header.pres_wclamp = true;
4077         *buffer_ptr++ = fui(wclamp);
4078      }
4079
4080      /* Output selects. */
4081      if (emit_state->output_selects) {
4082         header.pres_outselects = true;
4083         *buffer_ptr++ = ppp_state->output_selects;
4084      }
4085
4086      /* Varying words. */
4087      if (emit_state->varying_word0) {
4088         header.pres_varying_word0 = true;
4089         *buffer_ptr++ = ppp_state->varying_word[0];
4090      }
4091
4092      if (emit_state->varying_word1) {
4093         header.pres_varying_word1 = true;
4094         *buffer_ptr++ = ppp_state->varying_word[1];
4095      }
4096
4097      if (emit_state->varying_word2) {
4098         /* We only emit this on the first draw of a render job to prevent us
4099          * from inheriting a non-zero value set elsewhere.
4100          */
4101         header.pres_varying_word2 = true;
4102         *buffer_ptr++ = 0;
4103      }
4104
4105      /* PPP control. */
4106      if (emit_state->ppp_control) {
4107         header.pres_ppp_ctrl = true;
4108         *buffer_ptr++ = ppp_state->ppp_control;
4109      }
4110
4111      if (emit_state->stream_out) {
4112         /* We only emit this on the first draw of a render job to prevent us
4113          * from inheriting a non-zero value set elsewhere.
4114          */
4115         header.pres_stream_out_size = true;
4116         *buffer_ptr++ = 0;
4117      }
4118   }
4119
4120   if (!ppp_state_header)
4121      return VK_SUCCESS;
4122
4123   ppp_state_words_count = buffer_ptr - ppp_state_words;
4124   ppp_state_words[0] = ppp_state_header;
4125
4126   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4127                                     cmd_buffer->device->heaps.general_heap,
4128                                     ppp_state_words_count * sizeof(uint32_t),
4129                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
4130                                     &pvr_bo);
4131   if (result != VK_SUCCESS)
4132      return result;
4133
4134   memcpy(pvr_bo->bo->map,
4135          ppp_state_words,
4136          ppp_state_words_count * sizeof(uint32_t));
4137
4138   /* Write the VDM state update into the VDM control stream. */
4139   pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
4140      state0.word_count = ppp_state_words_count;
4141      state0.addrmsb = pvr_bo->vma->dev_addr;
4142   }
4143
4144   pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
4145      state1.addrlsb = pvr_bo->vma->dev_addr;
4146   }
4147
4148   if (emit_state->isp_dbsc &&
4149       cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
4150      pvr_finishme("Unimplemented path!!");
4151   }
4152
4153   state->emit_state_bits = 0;
4154
4155   return VK_SUCCESS;
4156}
4157
4158static VkResult
4159pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
4160                         struct pvr_sub_cmd_gfx *const sub_cmd)
4161{
4162   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4163   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
4164   const bool dirty_stencil = state->dirty.compare_mask ||
4165                              state->dirty.write_mask || state->dirty.reference;
4166   VkResult result;
4167
4168   if (!(dirty_stencil || state->dirty.depth_bias ||
4169         state->dirty.fragment_descriptors || state->dirty.line_width ||
4170         state->dirty.gfx_pipeline_binding || state->dirty.scissor ||
4171         state->dirty.userpass_spawn || state->dirty.viewport ||
4172         state->emit_state_bits)) {
4173      return VK_SUCCESS;
4174   }
4175
4176   if (state->dirty.gfx_pipeline_binding) {
4177      struct PVRX(TA_STATE_ISPA) ispa;
4178
4179      pvr_setup_output_select(cmd_buffer);
4180      pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
4181      pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
4182   } else if (dirty_stencil || state->dirty.line_width ||
4183              state->dirty.userpass_spawn) {
4184      pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
4185   }
4186
4187   if (!gfx_pipeline->raster_state.discard_enable &&
4188       state->dirty.fragment_descriptors &&
4189       gfx_pipeline->fragment_shader_state.bo) {
4190      pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
4191   }
4192
4193   pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
4194
4195   if (state->dirty.viewport)
4196      pvr_setup_viewport(cmd_buffer);
4197
4198   pvr_setup_ppp_control(cmd_buffer);
4199
4200   if (gfx_pipeline->raster_state.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
4201      /* FIXME: Port SetNegativeViewport(). */
4202   }
4203
4204   result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
4205   if (result != VK_SUCCESS)
4206      return result;
4207
4208   return VK_SUCCESS;
4209}
4210
4211static void
4212pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
4213                              const uint32_t vs_output_size,
4214                              const bool raster_enable,
4215                              uint32_t *const cam_size_out,
4216                              uint32_t *const vs_max_instances_out)
4217{
4218   /* First work out the size of a vertex in the UVS and multiply by 4 for
4219    * column ordering.
4220    */
4221   const uint32_t uvs_vertex_vector_size_in_dwords =
4222      (vs_output_size + 1U + raster_enable * 4U) * 4U;
4223   const uint32_t vdm_cam_size =
4224      PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
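   /* e.g. vs_output_size = 16 with rasterization enabled gives
    * (16 + 1 + 4) * 4 = 84 dwords per vertex vector, which lands in the
    * first bucket of the non-SIPF path below.
    */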
4225
4226   /* This is a proxy for 8XE. */
4227   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
4228       vdm_cam_size < 96U) {
4229      /* Comparisons are based on size including scratch per vertex vector. */
4230      if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
4231         *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
4232         *vs_max_instances_out = 16U;
4233      } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
4234         *cam_size_out = 15U;
4235         *vs_max_instances_out = 16U;
4236      } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
4237         *cam_size_out = 11U;
4238         *vs_max_instances_out = 12U;
4239      } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
4240         *cam_size_out = 7U;
4241         *vs_max_instances_out = 8U;
4242      } else if (PVR_HAS_FEATURE(dev_info,
4243                                 simple_internal_parameter_format_v2) ||
4244                 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
4245         *cam_size_out = 7U;
4246         *vs_max_instances_out = 4U;
4247      } else {
4248         *cam_size_out = 3U;
4249         *vs_max_instances_out = 2U;
4250      }
4251   } else {
4252      /* Comparisons are based on size including scratch per vertex vector. */
4253      if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
4254         /* output size <= 27 + 5 scratch. */
4255         *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
4256         *vs_max_instances_out = 0U;
4257      } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
4258         /* output size <= 43 + 5 scratch */
4259         *cam_size_out = 63U;
4260         if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
4261            *vs_max_instances_out = 16U;
4262         else
4263            *vs_max_instances_out = 0U;
4264      } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
4265         /* output size <= 59 + 5 scratch. */
4266         *cam_size_out = 31U;
4267         if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
4268            *vs_max_instances_out = 16U;
4269         else
4270            *vs_max_instances_out = 0U;
4271      } else {
4272         *cam_size_out = 15U;
4273         *vs_max_instances_out = 16U;
4274      }
4275   }
4276}
4277
4278static void
4279pvr_emit_dirty_vdm_state(const struct pvr_cmd_buffer *const cmd_buffer,
4280                         struct pvr_sub_cmd_gfx *const sub_cmd)
4281{
4282   /* FIXME: Assume all state is dirty for the moment. */
4283   struct pvr_device_info *const dev_info =
4284      &cmd_buffer->device->pdevice->dev_info;
4285   ASSERTED const uint32_t max_user_vertex_output_components =
4286      pvr_get_max_user_vertex_output_components(dev_info);
4287   struct PVRX(VDMCTRL_VDM_STATE0)
4288      header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
4289   const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4290   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
4291   struct pvr_csb *const csb = &sub_cmd->control_stream;
4292   uint32_t vs_output_size;
4293   uint32_t max_instances;
4294   uint32_t cam_size;
4295
4296   assert(gfx_pipeline);
4297
4298   /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
4299   vs_output_size =
4300      DIV_ROUND_UP(gfx_pipeline->vertex_shader_state.vertex_output_size,
4301                   PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
4302
4303   assert(vs_output_size <= max_user_vertex_output_components);

   pvr_calculate_vertex_cam_size(dev_info,
                                 vs_output_size,
                                 true,
                                 &cam_size,
                                 &max_instances);

   pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
      state0.cam_size = cam_size;

      if (gfx_pipeline->input_asm_state.primitive_restart) {
         state0.cut_index_enable = true;
         state0.cut_index_present = true;
      }

      switch (gfx_pipeline->input_asm_state.topology) {
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
         state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
         break;

      default:
         state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
         break;
      }

      /* If we've bound a different vertex buffer, or this draw call requires
       * a different PDS attrib data-section from the last draw call (changed
       * base_instance), then we need to specify a new data section. This is
       * also the case if we've switched pipeline or attrib program, as the
       * data-section layout will be different.
       */
      state0.vs_data_addr_present =
         state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
         state->dirty.draw_base_instance || state->dirty.draw_variant;

      /* We need to specify a new PDS attrib program if we've bound a
       * different pipeline or this draw call needed a different PDS attrib
       * variant.
       */
      state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
                                state->dirty.draw_variant;

      /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
       * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
       * Vulkan doesn't support stream output and the vertex position is
       * always emitted to the UVB.
       */
      state0.uvs_scratch_size_select =
         PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);

      header = state0;
   }

   if (header.cut_index_present) {
      pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
         switch (state->index_buffer_binding.type) {
         case VK_INDEX_TYPE_UINT32:
            /* TODO: Add defines for these. They are the fixed primitive
             * restart values the Vulkan spec. requires for
             * VkPipelineInputAssemblyStateCreateInfo primitiveRestartEnable:
             * all bits set for the bound index type.
             */
            state1.cut_index = 0xFFFFFFFF;
            break;

         case VK_INDEX_TYPE_UINT16:
            state1.cut_index = 0xFFFF;
            break;

         default:
            unreachable("Invalid index type");
         }
      }
   }

   if (header.vs_data_addr_present) {
      pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
         state2.vs_pds_data_base_addr =
            PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
      }
   }

   if (header.vs_other_present) {
      const uint32_t usc_unified_store_size_in_bytes =
         gfx_pipeline->vertex_shader_state.vertex_input_size << 2;
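      /* Note: the << 2 shifts here and in the STATE5 words below convert
       * dword counts into bytes (1 dword = 4 bytes) before rounding up to
       * the hw unit sizes.
       */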

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
         state3.vs_pds_code_base_addr =
            PVR_DEV_ADDR(state->pds_shader.code_offset);
      }

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
         state4.vs_output_size = vs_output_size;
      }

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
         state5.vs_max_instances = max_instances;
         state5.vs_usc_common_size = 0U;
         state5.vs_usc_unified_size = DIV_ROUND_UP(
            usc_unified_store_size_in_bytes,
            PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
         state5.vs_pds_temp_size =
            DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
                         PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
         state5.vs_pds_data_size =
            DIV_ROUND_UP(state->pds_shader.info->data_size_in_dwords << 2,
                         PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
      }
   }
}

static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
{
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
   const struct pvr_pipeline_stage_state *const fragment_state =
      &gfx_pipeline->fragment_shader_state.stage_state;
   struct pvr_sub_cmd_gfx *sub_cmd;
   bool fstencil_writemask_zero;
   bool bstencil_writemask_zero;
   bool push_descriptors_dirty;
   bool fstencil_keep;
   bool bstencil_keep;
   VkResult result;

   result =
      pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
   if (result != VK_SUCCESS)
      return result;

   sub_cmd = &state->current_sub_cmd->gfx;
   sub_cmd->empty_cmd = false;

   /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
    * stencil testing, those attachments consume their loaded values, so
    * their loadOps cannot be optimized out.
    */
   /* Pipeline uses depth testing. */
   if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
       gfx_pipeline->depth_compare_op != VK_COMPARE_OP_ALWAYS) {
      sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
   }

   /* Pipeline uses stencil testing. */
   if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
       (gfx_pipeline->stencil_front.compare_op != VK_COMPARE_OP_ALWAYS ||
        gfx_pipeline->stencil_back.compare_op != VK_COMPARE_OP_ALWAYS)) {
      sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
   }

   if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
                       compute_overlap)) {
      uint32_t coefficient_size =
         DIV_ROUND_UP(fragment_state->coefficient_size,
                      PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));

      if (coefficient_size >
          PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
         sub_cmd->disable_compute_overlap = true;
   }

   sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
   sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
   sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
   sub_cmd->vertex_uses_texture_rw |=
      gfx_pipeline->vertex_shader_state.stage_state.uses_texture_rw;

   fstencil_keep =
      (gfx_pipeline->stencil_front.fail_op == VK_STENCIL_OP_KEEP) &&
      (gfx_pipeline->stencil_front.pass_op == VK_STENCIL_OP_KEEP);
   bstencil_keep = (gfx_pipeline->stencil_back.fail_op == VK_STENCIL_OP_KEEP) &&
                   (gfx_pipeline->stencil_back.pass_op == VK_STENCIL_OP_KEEP);
   fstencil_writemask_zero = (state->dynamic.common.write_mask.front == 0);
   bstencil_writemask_zero = (state->dynamic.common.write_mask.back == 0);

   /* Set the stencil modified flag unless one of the following holds:
    * - Both the front and back-facing stencil states keep the stencil value
    *   on fail and pass.
    * - Both the front and back-facing stencil write masks are zero.
    */
   if (!(fstencil_keep && bstencil_keep) &&
       !(fstencil_writemask_zero && bstencil_writemask_zero)) {
      sub_cmd->modifies_stencil = true;
   }

   /* Set depth modified flag if depth write is enabled. */
   if (!gfx_pipeline->depth_write_disable)
      sub_cmd->modifies_depth = true;

   /* If either the data or code changes for the PDS vertex attribs,
    * regenerate the data segment.
    */
   if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
       state->dirty.draw_variant || state->dirty.draw_base_instance) {
      enum pvr_pds_vertex_attrib_program_type prog_type;
      const struct pvr_pds_attrib_program *program;

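      /* The PDS vertex attrib program variants differ in how the draw's
       * base values reach the vertex fetch: the indirect variant takes them
       * from the indirect draw parameters, the base-instance variant folds
       * firstInstance into the fetch, and the basic variant assumes both
       * are zero.
       */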
      if (state->draw_state.draw_indirect)
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
      else if (state->draw_state.base_instance)
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
      else
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;

      program =
         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[prog_type];
      state->pds_shader.info = &program->info;
      state->pds_shader.code_offset = program->program.code_offset;

      state->max_shared_regs =
         MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));

      pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
   }

   /* TODO: Check for dirty push constants */

   pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);

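   /* Both stages share the same dirtiness here: push descriptor updates and
    * a new pipeline binding invalidate the descriptor mappings for vertex
    * and fragment alike.
    */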
   state->dirty.vertex_descriptors = push_descriptors_dirty ||
                                     state->dirty.gfx_pipeline_binding;
   state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;

   if (state->dirty.fragment_descriptors) {
      result = pvr_setup_descriptor_mappings(
         cmd_buffer,
         PVR_STAGE_ALLOCATION_FRAGMENT,
         &state->gfx_pipeline->fragment_shader_state.descriptor_state,
         NULL,
         &state->pds_fragment_descriptor_data_offset);
      if (result != VK_SUCCESS) {
         mesa_loge("Could not set up fragment descriptor mappings.");
         return result;
      }
   }

   if (state->dirty.vertex_descriptors) {
      uint32_t pds_vertex_descriptor_data_offset;

      result = pvr_setup_descriptor_mappings(
         cmd_buffer,
         PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
         &state->gfx_pipeline->vertex_shader_state.descriptor_state,
         NULL,
         &pds_vertex_descriptor_data_offset);
      if (result != VK_SUCCESS) {
         mesa_loge("Could not set up vertex descriptor mappings.");
         return result;
      }

      pvr_emit_dirty_pds_state(cmd_buffer,
                               sub_cmd,
                               pds_vertex_descriptor_data_offset);
   }

   pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
   pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);

   state->dirty.gfx_desc_dirty = false;
   state->dirty.blend_constants = false;
   state->dirty.compare_mask = false;
   state->dirty.depth_bias = false;
   state->dirty.draw_base_instance = false;
   state->dirty.draw_variant = false;
   state->dirty.fragment_descriptors = false;
   state->dirty.line_width = false;
   state->dirty.gfx_pipeline_binding = false;
   state->dirty.reference = false;
   state->dirty.scissor = false;
   state->dirty.userpass_spawn = false;
   state->dirty.vertex_bindings = false;
   state->dirty.viewport = false;
   state->dirty.write_mask = false;

   return VK_SUCCESS;
}

static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
   default:
      unreachable("Undefined primitive topology");
   }
}

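/* Emit the VDM index list for a draw. INDEX_LIST0 is a header whose present
 * bits select which of the following words are emitted: INDEX_LIST1 (index
 * base address lsb), INDEX_LIST2 (count), INDEX_LIST3 (instance count) and
 * INDEX_LIST4 (vertex offset).
 */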
static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
                                    struct pvr_sub_cmd_gfx *const sub_cmd,
                                    VkPrimitiveTopology topology,
                                    uint32_t first_vertex,
                                    uint32_t vertex_count,
                                    uint32_t first_index,
                                    uint32_t index_count,
                                    uint32_t instance_count)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_csb *const csb = &sub_cmd->control_stream;
   struct PVRX(VDMCTRL_INDEX_LIST0)
      list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
   pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
   unsigned int index_stride = 0;

   pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
      const bool vertex_shader_has_side_effects =
         cmd_buffer->state.gfx_pipeline->vertex_shader_state.stage_state
            .has_side_effects;

      list0.primitive_topology = pvr_get_hw_primitive_topology(topology);

      /* The first instance is not handled in the VDM state; it's implemented
       * as an addition in the PDS vertex fetch.
       */
      list0.index_count_present = true;

      if (instance_count > 1)
         list0.index_instance_count_present = true;

      if (first_vertex != 0)
         list0.index_offset_present = true;

      if (state->draw_state.draw_indexed) {
         struct pvr_buffer *buffer = state->index_buffer_binding.buffer;

         switch (state->index_buffer_binding.type) {
         case VK_INDEX_TYPE_UINT32:
            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
            index_stride = 4;
            break;

         case VK_INDEX_TYPE_UINT16:
            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
            index_stride = 2;
            break;

         default:
            unreachable("Invalid index type");
         }

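         /* The index buffer address is split across two control stream
          * words: the msb part is written here in INDEX_LIST0 and the lsb
          * part in INDEX_LIST1, emitted below when index_addr_present is
          * set.
          */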
         list0.index_addr_present = true;
         index_buffer_addr = PVR_DEV_ADDR_OFFSET(
            buffer->dev_addr,
            state->index_buffer_binding.offset + first_index * index_stride);
         list0.index_base_addrmsb = index_buffer_addr;
      }

      list0.degen_cull_enable =
         PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
                         vdm_degenerate_culling) &&
         !vertex_shader_has_side_effects;

      list_hdr = list0;
   }

   if (list_hdr.index_addr_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
         list1.index_base_addrlsb = index_buffer_addr;
      }
   }

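   /* Exactly one of vertex_count and index_count is non-zero: the callers
    * pass zero for whichever does not apply, so OR-ing them yields the
    * count for the active draw type.
    */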
   if (list_hdr.index_count_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
         list2.index_count = vertex_count | index_count;
      }
   }

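   /* The instance count field is biased: a value of N requests N + 1
    * instances, which is presumably why the single-instance case skips this
    * word entirely.
    */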
   if (list_hdr.index_instance_count_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
         list3.instance_count = instance_count - 1;
      }
   }

   if (list_hdr.index_offset_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
         list4.index_offset = first_vertex;
      }
   }

   /* TODO: See if we need list_words[5-9]. */
}

void pvr_CmdDraw(VkCommandBuffer commandBuffer,
                 uint32_t vertexCount,
                 uint32_t instanceCount,
                 uint32_t firstVertex,
                 uint32_t firstInstance)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_cmd_buffer_draw_state draw_state;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   draw_state.base_vertex = firstVertex;
   draw_state.base_instance = firstInstance;
   draw_state.draw_indirect = false;
   draw_state.draw_indexed = false;
   pvr_update_draw_state(state, &draw_state);

   result = pvr_validate_draw_state(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Write the VDM control stream for the primitive. */
   pvr_emit_vdm_index_list(cmd_buffer,
                           &state->current_sub_cmd->gfx,
                           state->gfx_pipeline->input_asm_state.topology,
                           firstVertex,
                           vertexCount,
                           0U,
                           0U,
                           instanceCount);
}

void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                        uint32_t indexCount,
                        uint32_t instanceCount,
                        uint32_t firstIndex,
                        int32_t vertexOffset,
                        uint32_t firstInstance)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_cmd_buffer_draw_state draw_state;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   draw_state.base_vertex = vertexOffset;
   draw_state.base_instance = firstInstance;
   draw_state.draw_indirect = false;
   draw_state.draw_indexed = true;
   pvr_update_draw_state(state, &draw_state);

   result = pvr_validate_draw_state(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Write the VDM control stream for the primitive. */
   pvr_emit_vdm_index_list(cmd_buffer,
                           &state->current_sub_cmd->gfx,
                           state->gfx_pipeline->input_asm_state.topology,
                           vertexOffset,
                           0U,
                           firstIndex,
                           indexCount,
                           instanceCount);
}

void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                                VkBuffer _buffer,
                                VkDeviceSize offset,
                                uint32_t drawCount,
                                uint32_t stride)
{
   assert(!"Unimplemented");
}

void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                         VkBuffer _buffer,
                         VkDeviceSize offset,
                         uint32_t drawCount,
                         uint32_t stride)
{
   assert(!"Unimplemented");
}

static VkResult
pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer)
{
   pvr_finishme("Add attachment resolve support!");
   return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                           const VkSubpassEndInfo *pSubpassEndInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_image_view **attachments;
   VkClearValue *clear_values;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   assert(state->render_pass_info.pass);
   assert(state->render_pass_info.framebuffer);

   /* TODO: Investigate why pvr_cmd_buffer_end_sub_cmd/EndSubCommand is
    * called twice in this path: once here and once from
    * pvr_resolve_unemitted_resolve_attachments.
    */
   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Save the required fields before clearing render_pass_info struct. */
   attachments = state->render_pass_info.attachments;
   clear_values = state->render_pass_info.clear_values;

   memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));

   state->render_pass_info.attachments = attachments;
   state->render_pass_info.clear_values = clear_values;
}

void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                            uint32_t commandBufferCount,
                            const VkCommandBuffer *pCommandBuffers)
{
   assert(!"Unimplemented");
}

void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                         const VkSubpassBeginInfo *pSubpassBeginInfo,
                         const VkSubpassEndInfo *pSubpassEndInfo)
{
   assert(!"Unimplemented");
}

/* This is just enough to handle vkCmdPipelineBarrier().
 * TODO: Complete the remaining barrier handling.
 */
void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                             const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_render_pass *const render_pass =
      state->render_pass_info.pass;
   VkPipelineStageFlags vk_src_stage_mask = 0U;
   VkPipelineStageFlags vk_dst_stage_mask = 0U;
   uint32_t required_stage_mask = 0U;
   uint32_t src_stage_mask;
   uint32_t dst_stage_mask;
   bool is_barrier_needed;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
      vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
   }

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
      vk_src_stage_mask |=
         pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |=
         pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
   }

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
      vk_src_stage_mask |=
         pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |=
         pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
   }

   src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
   dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
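   /* From here on we work in the driver's coarse per-job sync stages
    * (PVR_PIPELINE_STAGE_*) rather than the fine-grained Vulkan stage bits.
    */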

   for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
      if (!(dst_stage_mask & BITFIELD_BIT(stage)))
         continue;

      required_stage_mask |= state->barriers_needed[stage];
   }

   src_stage_mask &= required_stage_mask;
   for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
      if (!(dst_stage_mask & BITFIELD_BIT(stage)))
         continue;

      state->barriers_needed[stage] &= ~src_stage_mask;
   }
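   /* src_stage_mask now holds only the source stages some destination stage
    * still needs to wait on; those bits have been retired from
    * barriers_needed so later barriers won't wait on them again.
    */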

   if (src_stage_mask == 0 || dst_stage_mask == 0) {
      is_barrier_needed = false;
   } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
              dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
      /* This dependency is implicit, so there is no need for a barrier. */
      is_barrier_needed = false;
   } else if (src_stage_mask == dst_stage_mask &&
              util_bitcount(src_stage_mask) == 1) {
      switch (src_stage_mask) {
      case PVR_PIPELINE_STAGE_FRAG_BIT:
         pvr_finishme("Handle fragment stage pipeline barrier.");
         is_barrier_needed = true;
         break;

      case PVR_PIPELINE_STAGE_COMPUTE_BIT: {
         struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;

         is_barrier_needed = false;

         if (!current_sub_cmd ||
             current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
            break;
         }

         /* Multiple dispatches can be merged into a single job. When
          * back-to-back dispatches have a sequential dependency (CDM -> CDM
          * pipeline barrier) we need to do the following.
          *   - Dispatch a kernel which fences all previous memory writes and
          *     flushes the MADD cache.
          *   - Issue a CDM fence which ensures all previous tasks emitted by
          *     the CDM are completed before starting anything new.
          */

         /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
          * for data.
          */
         pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);

         pvr_compute_generate_fence(cmd_buffer,
                                    &current_sub_cmd->compute,
                                    false);
         break;
      }

      default:
         is_barrier_needed = false;
         break;
      }
   } else {
      is_barrier_needed = true;
   }

   if (render_pass) {
      pvr_finishme("Insert mid fragment stage barrier if needed.");
   } else {
      if (is_barrier_needed)
         pvr_finishme("Insert barrier if needed.");
   }
}

void pvr_CmdResetEvent2KHR(VkCommandBuffer commandBuffer,
                           VkEvent _event,
                           VkPipelineStageFlags2 stageMask)
{
   assert(!"Unimplemented");
}

void pvr_CmdSetEvent2KHR(VkCommandBuffer commandBuffer,
                         VkEvent _event,
                         const VkDependencyInfo *pDependencyInfo)
{
   assert(!"Unimplemented");
}

void pvr_CmdWaitEvents2KHR(VkCommandBuffer commandBuffer,
                           uint32_t eventCount,
                           const VkEvent *pEvents,
                           const VkDependencyInfo *pDependencyInfos)
{
   assert(!"Unimplemented");
}

void pvr_CmdWriteTimestamp2KHR(VkCommandBuffer commandBuffer,
                               VkPipelineStageFlags2 stage,
                               VkQueryPool queryPool,
                               uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   /* From the Vulkan 1.0 spec:
    *
    *    commandBuffer must be in the recording state.
    */
   assert(cmd_buffer->status == PVR_CMD_BUFFER_STATUS_RECORDING);

   if (state->status != VK_SUCCESS)
      return state->status;

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      return result;

   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}
