/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * based in part on v3dv driver which is:
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "rogue/rogue_build_data.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/*****************************************************************************
   PDS functions
*****************************************************************************/
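
/* Note: most of the PDS helpers below follow the same three-step pattern (a
 * summary of the code that follows, not an additional API guarantee):
 *
 *    1. Call the PDS generator with a NULL buffer and PDS_GENERATE_SIZES to
 *       query the code/data segment sizes.
 *    2. Allocate a host staging buffer and generate the segments into it.
 *    3. Upload the segments with pvr_gpu_upload_pds() and free the staging
 *       buffer.
 */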

/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const uint32_t *fpu_iterators,
   uint32_t fpu_iterators_count,
   const uint32_t *destinations,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_coeff_loading_program program = {
      .num_fpu_iterators = fpu_iterators_count,
   };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);

   /* Get the size of the program and then allocate that much memory. */
   pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);

   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: Should we save pointers when we redesign the pds gen api? */
   typed_memcpy(program.FPU_iterators,
                fpu_iterators,
                program.num_fpu_iterators);

   typed_memcpy(program.destination, destinations, program.num_fpu_iterators);

   /* Generate the program into the staging_buffer. */
   pvr_pds_coefficient_loading(&program,
                               staging_buffer,
                               PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
/* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const struct pvr_bo *fragment_shader_bo,
   uint32_t fragment_temp_count,
   enum rogue_msaa_mode msaa_mode,
   bool has_phase_rate_change,
   struct pvr_pds_upload *const pds_upload_out)
{
   const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
      sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
   struct pvr_pds_kickusc_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   /* FIXME: Should it be passing in the USC offset rather than address here?
    */
   /* Note this is not strictly required to be done before calculating the
    * staging_buffer_size in this particular case. It can also be done after
    * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
    */
   pvr_pds_setup_doutu(&program.usc_task_control,
                       fragment_shader_bo->vma->dev_addr.addr,
                       fragment_temp_count,
                       sample_rate,
                       has_phase_rate_change);

   pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);

   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_kick_usc(&program,
                    staging_buffer,
                    0,
                    false,
                    PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
   const struct pvr_device_info *dev_info,
   bool robust_buffer_access)
{
   /* FIXME: Use more local variables to improve formatting. */

   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_vertex_primary_program().
    * When robustBufferAccess is disabled, it must be >= 410.
    * When robustBufferAccess is enabled, it must be >= 570.
    *
    * 1. Size of entry for base instance
    *        (pvr_const_map_entry_base_instance)
    *
    * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
    *     if (!robustBufferAccess)
    *         size of vertex attribute entry
    *             (pvr_const_map_entry_vertex_attribute_address) +
    *     else
    *         size of robust vertex attribute entry
    *             (pvr_const_map_entry_robust_vertex_attribute_address) +
    *         size of entry for max attribute index
    *             (pvr_const_map_entry_vertex_attribute_max_index) +
    *     fi
    *     size of Unified Store burst entry
    *         (pvr_const_map_entry_literal32) +
    *     size of entry for vertex stride
    *         (pvr_const_map_entry_literal32) +
    *     size of entries for DDMAD control word
    *         (num_ddmad_literals * pvr_const_map_entry_literal32))
    *
    * 3. Size of entry for DOUTW vertex/instance control word
    *     (pvr_const_map_entry_literal32)
    *
    * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    */

   const size_t attribute_size =
      (!robust_buffer_access)
         ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
         : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
              sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);

   /* If the device has the pds_ddmadt feature, the DDMAD control word is
    * instead a DDMADT control word and is one DWORD larger, to hold the data
    * for the DDMADT's out-of-bounds check.
    */
   const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
      1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);

   return (sizeof(struct pvr_const_map_entry_base_instance) +
           PVR_MAX_VERTEX_INPUT_BINDINGS *
              (attribute_size +
               (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
                  sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_literal32) +
           sizeof(struct pvr_const_map_entry_doutu_address));
}

/* This is a const pointer to an array of pvr_pds_vertex_dma structs.
 * The array being pointed to has PVR_MAX_VERTEX_ATTRIB_DMAS elements.
 */
typedef struct pvr_pds_vertex_dma (
      *const
         pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
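
/* Usage sketch (illustrative only): the caller passes the address of a local
 * array and the callee indexes through the dereferenced pointer, e.g.
 *
 *    struct pvr_pds_vertex_dma descs[PVR_MAX_VERTEX_ATTRIB_DMAS];
 *    pvr_pds_attrib_dma_descriptions_array_ptr ptr = &descs;
 *    (*ptr)[0].offset = 0;
 *
 * Keeping the array type (rather than letting it decay to a plain pointer)
 * preserves the element count for constructs like ARRAY_SIZE(*ptr).
 */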

/* dma_descriptions_out_ptr is a pointer to the array used as output.
 * The whole array might not be filled so dma_count_out indicates how many
 * elements were used.
 */
static void pvr_pds_vertex_attrib_init_dma_descriptions(
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   const struct rogue_vs_build_data *vs_data,
   pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
   uint32_t *const dma_count_out)
{
   struct pvr_pds_vertex_dma *const dma_descriptions =
      *dma_descriptions_out_ptr;
   uint32_t dma_count = 0;

   if (!vertex_input_state) {
      *dma_count_out = 0;
      return;
   }

   for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
        i++) {
      const VkVertexInputAttributeDescription *const attrib_desc =
         &vertex_input_state->pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription *binding_desc = NULL;

      /* Find the matching binding description. */
      for (uint32_t j = 0;
           j < vertex_input_state->vertexBindingDescriptionCount;
           j++) {
         const VkVertexInputBindingDescription *const current_binding_desc =
            &vertex_input_state->pVertexBindingDescriptions[j];

         if (current_binding_desc->binding == attrib_desc->binding) {
            binding_desc = current_binding_desc;
            break;
         }
      }

      /* From the Vulkan 1.2.195 spec for
       * VkPipelineVertexInputStateCreateInfo:
       *
       *    "For every binding specified by each element of
       *    pVertexAttributeDescriptions, a
       *    VkVertexInputBindingDescription must exist in
       *    pVertexBindingDescriptions with the same value of binding"
       *
       * So we don't check if we found the matching binding description
       * or not.
       */

      struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];

      size_t location = attrib_desc->location;
      assert(location < vs_data->inputs.num_input_vars);

      dma_desc->offset = attrib_desc->offset;
      dma_desc->stride = binding_desc->stride;

      dma_desc->flags = 0;

      if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;

      dma_desc->size_in_dwords = vs_data->inputs.components[location];
      /* TODO: This will be different when other types are supported.
       * Store in vs_data with base and components?
       */
      /* TODO: Use attrib_desc->format. */
      dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
      dma_desc->destination = vs_data->inputs.base[location];
      dma_desc->binding_index = attrib_desc->binding;
      dma_desc->divisor = 1;
      dma_desc->robustness_buffer_offset = 0;

      ++dma_count;
   }

   *dma_count_out = dma_count;
}

static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_vertex_primary_program_input *const input,
   struct pvr_pds_attrib_program *const program_out)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
         &device->pdevice->dev_info,
         device->features.robustBufferAccess);
   struct pvr_pds_upload *const program = &program_out->program;
   struct pvr_pds_info *const info = &program_out->info;
   struct pvr_const_map_entry *entries_buffer;
   ASSERTED uint32_t code_size_in_dwords;
   size_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   memset(info, 0, sizeof(*info));

   entries_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              const_entries_size_in_bytes,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   info->entries = entries_buffer;
   info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_vertex_primary_program(input,
                                           NULL,
                                           info,
                                           device->features.robustBufferAccess,
                                           &device->pdevice->dev_info);

   code_size_in_dwords = info->code_size_in_dwords;
   staging_buffer_size = info->code_size_in_dwords * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* This also fills in info->entries. */
   pvr_pds_generate_vertex_primary_program(input,
                                           staging_buffer,
                                           info,
                                           device->features.robustBufferAccess,
                                           &device->pdevice->dev_info);

   assert(info->code_size_in_dwords <= code_size_in_dwords);

   /* FIXME: Add a vk_realloc2() ? */
   entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                               entries_buffer,
                               info->entries_written_size_in_bytes,
                               8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   info->entries = entries_buffer;
   info->entries_size_in_bytes = info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               info->code_size_in_dwords,
                               16,
                               16,
                               program);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static inline void pvr_pds_vertex_attrib_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_attrib_program *const program)
{
   pvr_bo_free(device, program->program.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, program->info.entries);
}

/* This is a const pointer to an array of pvr_pds_attrib_program structs.
 * The array being pointed to has PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT elements.
 */
typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
   [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];

/* Generates and uploads a PDS program for DMAing vertex attribs into USC
 * vertex inputs. This will bake the code segment and create a template of the
 * data segment for the command buffer to fill in.
 */
/* If allocator == NULL, the internal one will be used.
 *
 * programs_out_ptr is a pointer to the array where the outputs will be placed.
 */
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   uint32_t usc_temp_count,
   const struct rogue_vs_build_data *vs_data,
   pvr_pds_attrib_programs_array_ptr programs_out_ptr)
{
   struct pvr_pds_vertex_dma dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
   struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
   struct pvr_pds_vertex_primary_program_input input = {
      .dma_list = dma_descriptions,
   };
   VkResult result;

   pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
                                               vs_data,
                                               &dma_descriptions,
                                               &input.dma_count);

   pvr_pds_setup_doutu(&input.usc_task_control,
                       0,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* TODO: If statements for all the "bRequired"s + ui32ExtraFlags. */

   /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
    * typedef.
    */
   for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
      switch (i) {
      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
         input.flags = 0;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
         input.flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
         /* We unset INSTANCE and set INDIRECT. */
         input.flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
         break;

      default:
         unreachable("Invalid vertex attrib program type.");
      }

      result =
         pvr_pds_vertex_attrib_program_create_and_upload(device,
                                                         allocator,
                                                         &input,
                                                         &programs_out[i]);
      if (result != VK_SUCCESS) {
         for (uint32_t j = 0; j < i; j++) {
            pvr_pds_vertex_attrib_program_destroy(device,
                                                  allocator,
                                                  &programs_out[j]);
         }

         return result;
      }
   }

   return VK_SUCCESS;
}

static size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
{
   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_descriptor_upload_program().
    * It must be >= 688 bytes. This size is calculated as the sum of:
    *
    *  1. Max. number of descriptor sets (8) * (
    *         size of descriptor entry
    *             (pvr_const_map_entry_descriptor_set) +
    *         size of Common Store burst entry
    *             (pvr_const_map_entry_literal32))
    *
    *  2. Max. number of PDS program buffers (24) * (
    *         size of the largest buffer structure
    *             (pvr_const_map_entry_constant_buffer) +
    *         size of Common Store burst entry
    *             (pvr_const_map_entry_literal32))
    *
    *  3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    */

   /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
    * say that it should be 8.
    * Figure out a define for this, or is the comment wrong?
    */
   return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
                sizeof(struct pvr_const_map_entry_literal32)) +
           PVR_PDS_MAX_BUFFERS *
              (sizeof(struct pvr_const_map_entry_constant_buffer) +
               sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_doutu_address));
}

/* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
 * structs.
 */
typedef struct pvr_pds_buffer (
      *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];

/**
 * \brief Sets up buffers for the PDS descriptor program.
 *
 * Sets up buffers required by the PDS gen api based on compiler info.
 *
 * For compile-time static constants that need DMAing, this uploads them and
 * returns the upload in \p static_consts_pvr_bo_out.
 */
static VkResult pvr_pds_descriptor_program_setup_buffers(
   struct pvr_device *device,
   bool robust_buffer_access,
   const struct rogue_compile_time_consts_data *compile_time_consts_data,
   const struct rogue_ubo_data *ubo_data,
   pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
   uint32_t *const buffer_count_out,
   struct pvr_bo **const static_consts_pvr_bo_out)
{
   struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
   uint32_t buffer_count = 0;

   for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
      struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      current_buffer->type = PVR_BUFFER_TYPE_UBO;
      current_buffer->size_in_dwords = ubo_data->size[i];
      current_buffer->destination = ubo_data->dest[i];

      current_buffer->buffer_id = buffer_count;
      current_buffer->desc_set = ubo_data->desc_set[i];
      current_buffer->binding = ubo_data->binding[i];
      /* TODO: Is this always the case?
       * E.g. can multiple UBOs have the same base buffer?
       */
      current_buffer->source_offset = 0;

      buffer_count++;
   }

   if (compile_time_consts_data->static_consts.num > 0) {
      VkResult result;

      assert(compile_time_consts_data->static_consts.num <=
             ARRAY_SIZE(compile_time_consts_data->static_consts.value));

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      /* TODO: Is it possible to have multiple static consts buffers whose
       * destinations are not adjoining? If so, we need to handle that.
       * Currently we're only setting up a single buffer.
       */
      buffers[buffer_count++] = (struct pvr_pds_buffer){
         .type = PVR_BUFFER_TYPES_COMPILE_TIME,
         .size_in_dwords = compile_time_consts_data->static_consts.num,
         .destination = compile_time_consts_data->static_consts.dest,
      };

      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              compile_time_consts_data->static_consts.value,
                              compile_time_consts_data->static_consts.num *
                                 ROGUE_REG_SIZE_BYTES,
                              ROGUE_REG_SIZE_BYTES,
                              static_consts_pvr_bo_out);
      if (result != VK_SUCCESS)
         return result;
   } else {
      *static_consts_pvr_bo_out = NULL;
   }

   *buffer_count_out = buffer_count;

   return VK_SUCCESS;
}

static VkResult pvr_pds_descriptor_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const struct rogue_compile_time_consts_data *const compile_time_consts_data,
   const struct rogue_ubo_data *const ubo_data,
   const struct pvr_explicit_constant_usage *const explicit_const_usage,
   const struct pvr_pipeline_layout *const layout,
   enum pvr_stage_allocation stage,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
   struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
   struct pvr_descriptor_program_input program = { 0 };
   struct pvr_const_map_entry *entries_buffer;
   ASSERTED uint32_t code_size_in_dwords;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(stage != PVR_STAGE_ALLOCATION_COUNT);

   *pds_info = (struct pvr_pds_info){ 0 };

   result = pvr_pds_descriptor_program_setup_buffers(
      device,
      device->features.robustBufferAccess,
      compile_time_consts_data,
      ubo_data,
      &program.buffers,
      &program.buffer_count,
      &descriptor_state->static_consts);
   if (result != VK_SUCCESS)
      return result;

   if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
      assert(!"Unimplemented");

   for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
      const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
         &layout->register_layout_in_dwords_per_stage[stage][set_num];
      const uint32_t start_offset = explicit_const_usage->start_offset;

      /* TODO: Use compiler usage info to optimize this? */

      /* Only dma primaries if they are actually required. */
      if (reg_layout->primary_size) {
         program.descriptor_sets[program.descriptor_set_count++] =
            (struct pvr_pds_descriptor_set){
               .descriptor_set = set_num,
               .size_in_dwords = reg_layout->primary_size,
               .destination = reg_layout->primary_offset + start_offset,
               .primary = true,
            };
      }

      /* Only dma secondaries if they are actually required. */
      if (!reg_layout->secondary_size)
         continue;

      program.descriptor_sets[program.descriptor_set_count++] =
         (struct pvr_pds_descriptor_set){
            .descriptor_set = set_num,
            .size_in_dwords = reg_layout->secondary_size,
            .destination = reg_layout->secondary_offset + start_offset,
         };
   }

   entries_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              const_entries_size_in_bytes,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pds_info->entries = entries_buffer;
   pds_info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);

   code_size_in_dwords = pds_info->code_size_in_dwords;
   staging_buffer_size =
      pds_info->code_size_in_dwords * sizeof(*staging_buffer);

   if (!staging_buffer_size) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);

      *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };

      return VK_SUCCESS;
   }

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, entries_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pvr_pds_generate_descriptor_upload_program(&program,
                                              staging_buffer,
                                              pds_info);

   assert(pds_info->code_size_in_dwords <= code_size_in_dwords);

   /* FIXME: use vk_realloc2() ? */
   entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                               entries_buffer,
                               pds_info->entries_written_size_in_bytes,
                               8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pds_info->entries = entries_buffer;
   pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               pds_info->code_size_in_dwords,
                               16,
                               16,
                               &descriptor_state->pds_code);
   if (result != VK_SUCCESS) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static void pvr_pds_descriptor_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   pvr_bo_free(device, descriptor_state->pds_code.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
   pvr_bo_free(device, descriptor_state->static_consts);
}

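/* Fills out the PDS compute shader program structure and queries the code/data
 * segment sizes (the trailing PDS_GENERATE_SIZES pass); callers then generate
 * the actual segments into their own buffers. add_base_workgroup selects the
 * variant whose data segment carries a patchable base workgroup ID.
 */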
static void pvr_pds_compute_program_setup(
   const struct pvr_device_info *dev_info,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   bool add_base_workgroup,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_compute_shader_program *const program)
{
   *program = (struct pvr_pds_compute_shader_program){
      /* clang-format off */
      .local_input_regs = {
         local_input_regs[0],
         local_input_regs[1],
         local_input_regs[2]
      },
      .work_group_input_regs = {
         work_group_input_regs[0],
         work_group_input_regs[1],
         work_group_input_regs[2]
      },
      .global_input_regs = {
         [0 ... (PVR_WORKGROUP_DIMENSIONS - 1)] =
            PVR_PDS_COMPUTE_INPUT_REG_UNUSED
      },
      /* clang-format on */
      .barrier_coefficient = barrier_coefficient,
      .flattened_work_groups = true,
      .clear_pds_barrier = false,
      .add_base_workgroup = add_base_workgroup,
      .kick_usc = true,
   };

   STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);

   pvr_pds_setup_doutu(&program->usc_task_control,
                       usc_shader_dev_addr.addr,
                       usc_temps,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}

/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
 */
static VkResult pvr_pds_compute_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_upload *const pds_upload_out,
   struct pvr_pds_info *const pds_info_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 false,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: pvr_pds_compute_shader doesn't implement
    * PDS_GENERATE_CODEDATA_SEGMENTS.
    */
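   /* Note: here the code segment is generated at the start of the staging
    * buffer and the data segment after it, so the upload below takes its data
    * pointer from &staging_buffer[program.code_size]. The coeff/fragment
    * helpers above lay the segments out the other way around.
    */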
   pvr_pds_compute_shader(&program,
                          &staging_buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   pvr_pds_compute_shader(&program,
                          &staging_buffer[program.code_size],
                          PDS_GENERATE_DATA_SEGMENT,
                          dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[program.code_size],
                               program.data_size,
                               16,
                               &staging_buffer[0],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   *pds_info_out = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static void pvr_pds_compute_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_upload *const pds_program,
   struct pvr_pds_info *const pds_info)
{
   /* We don't allocate an entries buffer so we don't need to free it. */
   pvr_bo_free(device, pds_program->pvr_bo);
}

/* This only uploads the code segment. The data segment will need to be patched
 * with the base workgroup before uploading.
 */
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_base_workgroup_program *program_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t buffer_size;
   uint32_t *buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 true,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer);

   buffer = vk_alloc2(&device->vk.alloc,
                      allocator,
                      buffer_size,
                      8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_compute_shader(&program,
                          &buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               buffer,
                               program.code_size,
                               16,
                               16,
                               &program_out->code_upload);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, buffer);
      return result;
   }

   pvr_pds_compute_shader(&program,
                          buffer,
                          PDS_GENERATE_DATA_SEGMENT,
                          dev_info);

   program_out->data_section = buffer;

   /* We'll need to patch the base workgroup in the PDS data section before
    * dispatch so we save the offsets at which to patch. We only need to save
    * the offset for the first workgroup id since the workgroup ids are stored
    * contiguously in the data segment.
    */
   program_out->base_workgroup_data_patching_offset =
      program.base_workgroup_constant_offset_in_dwords[0];
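
   /* At dispatch time the patching would look roughly like this (sketch only;
    * the actual patching happens outside this function):
    *
    *    for (uint32_t i = 0; i < PVR_WORKGROUP_DIMENSIONS; i++)
    *       data_section[patching_offset + i] = base_workgroup[i];
    */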

   program_out->info = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   return VK_SUCCESS;
}

static void pvr_pds_compute_base_workgroup_variant_program_finish(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_base_workgroup_program *const state)
{
   pvr_bo_free(device, state->code_upload.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, state->data_section);
}

/******************************************************************************
   Generic pipeline functions
 ******************************************************************************/

static void pvr_pipeline_init(struct pvr_device *device,
                              enum pvr_pipeline_type type,
                              struct pvr_pipeline *const pipeline)
{
   assert(!pipeline->layout);

   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);

   pipeline->type = type;
}

static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
{
   vk_object_base_finish(&pipeline->base);
}

/******************************************************************************
   Compute pipeline functions
 ******************************************************************************/

/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
   struct pvr_device *const device,
   struct pvr_pipeline_cache *pipeline_cache,
   const VkComputePipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   struct rogue_compile_time_consts_data compile_time_consts_data;
   uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct pvr_explicit_constant_usage explicit_const_usage;
   uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct rogue_ubo_data ubo_data;
   uint32_t barrier_coefficient;
   uint32_t usc_temps;
   VkResult result;

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
      struct pvr_hard_code_compute_build_info build_info;

      result = pvr_hard_code_compute_pipeline(device,
                                              &compute_pipeline->state.shader,
                                              &build_info);
      if (result != VK_SUCCESS)
         return result;

      ubo_data = build_info.ubo_data;
      compile_time_consts_data = build_info.compile_time_consts_data;

      /* We make sure that the compiler's unused reg value is compatible with
       * the pds api.
       */
      STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);

      barrier_coefficient = build_info.barrier_reg;

      /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
      local_input_regs[0] = build_info.local_invocation_regs[0];
      local_input_regs[1] = build_info.local_invocation_regs[1];
      /* This is not a mistake: element 1 of local_invocation_regs is
       * deliberately assigned to element 2 as well.
       */
      local_input_regs[2] = build_info.local_invocation_regs[1];

      STATIC_ASSERT(
         __same_type(work_group_input_regs, build_info.work_group_regs));
      typed_memcpy(work_group_input_regs,
                   build_info.work_group_regs,
                   PVR_WORKGROUP_DIMENSIONS);

      usc_temps = build_info.usc_temps;

      explicit_const_usage = build_info.explicit_conts_usage;

   } else {
      /* FIXME: Compile and upload the shader. */
      /* FIXME: Initialize the shader state and setup build info. */
      abort();
   }

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &compile_time_consts_data,
      &ubo_data,
      &explicit_const_usage,
      compute_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_COMPUTE,
      &compute_pipeline->state.descriptor);
   if (result != VK_SUCCESS)
      goto err_free_shader;

   result = pvr_pds_compute_program_create_and_upload(
      device,
      allocator,
      local_input_regs,
      work_group_input_regs,
      barrier_coefficient,
      usc_temps,
      compute_pipeline->state.shader.bo->vma->dev_addr,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);
   if (result != VK_SUCCESS)
      goto err_free_descriptor_program;

   /* If the workgroup ID is required, then we require the base workgroup
    * variant of the PDS compute program as well.
    */
   compute_pipeline->state.flags.base_workgroup =
      work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;

   if (compute_pipeline->state.flags.base_workgroup) {
      result = pvr_pds_compute_base_workgroup_variant_program_init(
         device,
         allocator,
         local_input_regs,
         work_group_input_regs,
         barrier_coefficient,
         usc_temps,
         compute_pipeline->state.shader.bo->vma->dev_addr,
         &compute_pipeline->state.primary_base_workgroup_variant_program);
      if (result != VK_SUCCESS)
         goto err_destroy_compute_program;
   }

   return VK_SUCCESS;

err_destroy_compute_program:
   pvr_pds_compute_program_destroy(
      device,
      allocator,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);

err_free_descriptor_program:
   pvr_bo_free(device, compute_pipeline->state.descriptor.pds_code.pvr_bo);

err_free_shader:
   pvr_bo_free(device, compute_pipeline->state.shader.bo);

   return result;
}

static VkResult
pvr_compute_pipeline_init(struct pvr_device *device,
                          struct pvr_pipeline_cache *pipeline_cache,
                          const VkComputePipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *allocator,
                          struct pvr_compute_pipeline *compute_pipeline)
{
   VkResult result;

   pvr_pipeline_init(device,
                     PVR_PIPELINE_TYPE_COMPUTE,
                     &compute_pipeline->base);

   compute_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   result = pvr_compute_pipeline_compile(device,
                                         pipeline_cache,
                                         pCreateInfo,
                                         allocator,
                                         compute_pipeline);
   if (result != VK_SUCCESS) {
      pvr_pipeline_finish(&compute_pipeline->base);
      return result;
   }

   return VK_SUCCESS;
}

static VkResult
pvr_compute_pipeline_create(struct pvr_device *device,
                            struct pvr_pipeline_cache *pipeline_cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *allocator,
                            VkPipeline *const pipeline_out)
{
   struct pvr_compute_pipeline *compute_pipeline;
   VkResult result;

   compute_pipeline = vk_zalloc2(&device->vk.alloc,
                                 allocator,
                                 sizeof(*compute_pipeline),
                                 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!compute_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_compute_pipeline_init(device,
                                      pipeline_cache,
                                      pCreateInfo,
                                      allocator,
                                      compute_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, compute_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);

   return VK_SUCCESS;
}

static void pvr_compute_pipeline_destroy(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   if (compute_pipeline->state.flags.base_workgroup) {
      pvr_pds_compute_base_workgroup_variant_program_finish(
         device,
         allocator,
         &compute_pipeline->state.primary_base_workgroup_variant_program);
   }

   pvr_pds_compute_program_destroy(
      device,
      allocator,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);
   pvr_pds_descriptor_program_destroy(device,
                                      allocator,
                                      &compute_pipeline->state.descriptor);
   pvr_bo_free(device, compute_pipeline->state.shader.bo);

   pvr_pipeline_finish(&compute_pipeline->base);

   vk_free2(&device->vk.alloc, allocator, compute_pipeline);
}

VkResult
pvr_CreateComputePipelines(VkDevice _device,
                           VkPipelineCache pipelineCache,
                           uint32_t createInfoCount,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
{
   PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_compute_pipeline_create(device,
                                     pipeline_cache,
                                     &pCreateInfos[i],
                                     pAllocator,
                                     &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/******************************************************************************
   Graphics pipeline functions
 ******************************************************************************/

static inline uint32_t pvr_dynamic_state_bit_from_vk(VkDynamicState state)
{
   switch (state) {
   case VK_DYNAMIC_STATE_VIEWPORT:
      return PVR_DYNAMIC_STATE_BIT_VIEWPORT;
   case VK_DYNAMIC_STATE_SCISSOR:
      return PVR_DYNAMIC_STATE_BIT_SCISSOR;
   case VK_DYNAMIC_STATE_LINE_WIDTH:
      return PVR_DYNAMIC_STATE_BIT_LINE_WIDTH;
   case VK_DYNAMIC_STATE_DEPTH_BIAS:
      return PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS;
   case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
      return PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS;
   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE;
   default:
      unreachable("Unsupported state.");
   }
}
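
/* Usage sketch (illustrative; not a helper defined in this file): building a
 * dynamic state mask from VkPipelineDynamicStateCreateInfo:
 *
 *    uint32_t mask = 0;
 *    for (uint32_t i = 0; i < info->dynamicStateCount; i++)
 *       mask |= pvr_dynamic_state_bit_from_vk(info->pDynamicStates[i]);
 */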
1329
1330static void
1331pvr_graphics_pipeline_destroy(struct pvr_device *const device,
1332                              const VkAllocationCallbacks *const allocator,
1333                              struct pvr_graphics_pipeline *const gfx_pipeline)
1334{
1335   const uint32_t num_vertex_attrib_programs =
1336      ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1337
1338   pvr_pds_descriptor_program_destroy(
1339      device,
1340      allocator,
1341      &gfx_pipeline->fragment_shader_state.descriptor_state);
1342
1343   pvr_pds_descriptor_program_destroy(
1344      device,
1345      allocator,
1346      &gfx_pipeline->vertex_shader_state.descriptor_state);
1347
1348   for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1349      struct pvr_pds_attrib_program *const attrib_program =
1350         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];
1351
1352      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1353   }
1354
1355   pvr_bo_free(device,
1356               gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
1357   pvr_bo_free(device,
1358               gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
1359
1360   pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
1361   pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
1362
1363   pvr_pipeline_finish(&gfx_pipeline->base);
1364
1365   vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1366}
1367
static void
pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                      const struct rogue_common_build_data *common_data,
                      const struct rogue_vs_build_data *vs_data)
{
   struct pvr_vertex_shader_state *vertex_state =
      &gfx_pipeline->vertex_shader_state;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
   vertex_state->stage_state.const_shared_reg_offset = 0;
   vertex_state->stage_state.temps_count = common_data->temps;
   vertex_state->stage_state.coefficient_size = common_data->coeffs;
   vertex_state->stage_state.uses_atomic_ops = false;
   vertex_state->stage_state.uses_texture_rw = false;
   vertex_state->stage_state.uses_barrier = false;
   vertex_state->stage_state.has_side_effects = false;
   vertex_state->stage_state.empty_program = false;

   vertex_state->vertex_input_size = vs_data->num_vertex_input_regs;
   vertex_state->vertex_output_size =
      vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
   vertex_state->user_clip_planes_mask = 0;
   vertex_state->entry_offset = 0;

   /* TODO: The number of varyings should be checked against the fragment
    * shader inputs and assigned in the place where that happens.
    * There will also be an opportunity to cull unused fs inputs/vs outputs.
    */
   pvr_csb_pack (&vertex_state->varying[0], TA_STATE_VARYING0, varying0) {
      varying0.f32_linear = vs_data->num_varyings;
      varying0.f32_flat = 0;
      varying0.f32_npc = 0;
   }

   pvr_csb_pack (&vertex_state->varying[1], TA_STATE_VARYING1, varying1) {
      varying1.f16_linear = 0;
      varying1.f16_flat = 0;
      varying1.f16_npc = 0;
   }
}

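/* Initializes the fragment shader stage state from the compiler's common
 * build data.
 */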
static void
pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                        const struct rogue_common_build_data *common_data)
{
   struct pvr_fragment_shader_state *fragment_state =
      &gfx_pipeline->fragment_shader_state;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   fragment_state->stage_state.const_shared_reg_count = 0;
   fragment_state->stage_state.const_shared_reg_offset = 0;
   fragment_state->stage_state.temps_count = common_data->temps;
   fragment_state->stage_state.coefficient_size = common_data->coeffs;
   fragment_state->stage_state.uses_atomic_ops = false;
   fragment_state->stage_state.uses_texture_rw = false;
   fragment_state->stage_state.uses_barrier = false;
   fragment_state->stage_state.has_side_effects = false;
   fragment_state->stage_state.empty_program = false;

   fragment_state->pass_type = 0;
   fragment_state->entry_offset = 0;
}

/* Compiles and uploads shaders and PDS programs. */
static VkResult
pvr_graphics_pipeline_compile(struct pvr_device *const device,
                              struct pvr_pipeline_cache *pipeline_cache,
                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
{
   /* FIXME: Remove this hard coding. */
   struct pvr_explicit_constant_usage vert_explicit_const_usage = {
      .start_offset = 16,
   };
   struct pvr_explicit_constant_usage frag_explicit_const_usage = {
      .start_offset = 0,
   };
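   /* Selects which set of hard-coded shaders to use; incremented once per
    * successfully compiled pipeline.
    */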
   static uint32_t hard_code_pipeline_n = 0;

   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
      pCreateInfo->pVertexInputState;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct rogue_compiler *compiler = device->pdevice->compiler;
   struct rogue_build_ctx *ctx;
   VkResult result;

   /* Setup shared build context. */
   ctx = rogue_create_build_context(compiler);
   if (!ctx)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* NIR middle-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      const VkPipelineShaderStageCreateInfo *create_info;
      size_t stage_index = gfx_pipeline->stage_indices[stage];

      if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
         if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
            continue;
         }
      }

      /* Skip unused/inactive stages; they are marked with an all-ones
       * sentinel (see the memset() in pvr_graphics_pipeline_init()).
       */
      if (stage_index == ~0)
         continue;

      create_info = &pCreateInfo->pStages[stage_index];

      /* SPIR-V to NIR. */
      ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
      if (!ctx->nir[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   /* Pre-back-end analysis and optimization, driver data extraction. */
   /* TODO: Analyze and cull unused I/O between stages. */
   /* TODO: Allocate UBOs between stages;
    * pipeline->layout->set_{count,layout}.
    */

   /* Back-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
          pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
         const struct pvr_device_info *const dev_info =
            &device->pdevice->dev_info;
         struct pvr_explicit_constant_usage *explicit_const_usage;

         switch (stage) {
         case MESA_SHADER_VERTEX:
            explicit_const_usage = &vert_explicit_const_usage;
            break;

         case MESA_SHADER_FRAGMENT:
            explicit_const_usage = &frag_explicit_const_usage;
            break;

         default:
            unreachable("Unsupported stage.");
         }

         pvr_hard_code_graphics_shader(dev_info,
                                       hard_code_pipeline_n,
                                       stage,
                                       &ctx->binary[stage]);

         pvr_hard_code_graphics_get_build_info(dev_info,
                                               hard_code_pipeline_n,
                                               stage,
                                               &ctx->common_data[stage],
                                               &ctx->stage_data,
                                               explicit_const_usage);

         continue;
      }

      if (!ctx->nir[stage])
         continue;

      ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
      if (!ctx->rogue[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      ctx->binary[stage] = pvr_rogue_to_binary(ctx, ctx->rogue[stage]);
      if (!ctx->binary[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
       pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
          BITFIELD_BIT(MESA_SHADER_VERTEX)) {
      pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
                                          hard_code_pipeline_n,
                                          &gfx_pipeline->vertex_shader_state);
   } else {
      pvr_vertex_state_init(gfx_pipeline,
                            &ctx->common_data[MESA_SHADER_VERTEX],
                            &ctx->stage_data.vs);
   }

   result = pvr_gpu_upload_usc(device,
                               ctx->binary[MESA_SHADER_VERTEX]->data,
                               ctx->binary[MESA_SHADER_VERTEX]->size,
                               cache_line_size,
                               &gfx_pipeline->vertex_shader_state.bo);
   if (result != VK_SUCCESS)
      goto err_free_build_context;

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
       pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
          BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
      pvr_hard_code_graphics_fragment_state(
         &device->pdevice->dev_info,
         hard_code_pipeline_n,
         &gfx_pipeline->fragment_shader_state);
   } else {
      pvr_fragment_state_init(gfx_pipeline,
                              &ctx->common_data[MESA_SHADER_FRAGMENT]);
   }

   result = pvr_gpu_upload_usc(device,
                               ctx->binary[MESA_SHADER_FRAGMENT]->data,
                               ctx->binary[MESA_SHADER_FRAGMENT]->size,
                               cache_line_size,
                               &gfx_pipeline->fragment_shader_state.bo);
   if (result != VK_SUCCESS)
      goto err_free_vertex_bo;

   /* TODO: powervr has an optimization where it attempts to recompile
    * shaders. See PipelineCompileNoISPFeedbackFragmentStage. This is
    * unimplemented here since the optimization is never triggered in our
    * case.
    */

   /* TODO: The programs we use are hard coded for now, but these should be
    * selected dynamically.
    */

   result = pvr_pds_coeff_program_create_and_upload(
      device,
      allocator,
      ctx->stage_data.fs.iterator_args.fpu_iterators,
      ctx->stage_data.fs.iterator_args.num_fpu_iterators,
      ctx->stage_data.fs.iterator_args.destination,
      &gfx_pipeline->fragment_shader_state.pds_coeff_program);
   if (result != VK_SUCCESS)
      goto err_free_fragment_bo;

   result = pvr_pds_fragment_program_create_and_upload(
      device,
      allocator,
      gfx_pipeline->fragment_shader_state.bo,
      ctx->common_data[MESA_SHADER_FRAGMENT].temps,
      ctx->stage_data.fs.msaa_mode,
      ctx->stage_data.fs.phas,
      &gfx_pipeline->fragment_shader_state.pds_fragment_program);
   if (result != VK_SUCCESS)
      goto err_free_coeff_program;

   result = pvr_pds_vertex_attrib_programs_create_and_upload(
      device,
      allocator,
      vertex_input_state,
      ctx->common_data[MESA_SHADER_VERTEX].temps,
      &ctx->stage_data.vs,
      &gfx_pipeline->vertex_shader_state.pds_attrib_programs);
   if (result != VK_SUCCESS)
      goto err_free_frag_program;

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
      &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
      &vert_explicit_const_usage,
      gfx_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
      &gfx_pipeline->vertex_shader_state.descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_attrib_program;

   /* FIXME: When temp_buffer_total_size is non-zero we need to allocate a
    * scratch buffer for both the vertex and fragment stages.
    * Figure out the best place to do this.
    */
   /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
   /* TODO: Implement spilling with the above. */

   /* TODO: Call pvr_pds_descriptor_program_create_and_upload in a loop. */
   /* FIXME: For now we pass in the same explicit_const_usage since all of
    * its entries are invalid. Fix this by hooking it up to the compiler.
    */
   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
      &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
      &frag_explicit_const_usage,
      gfx_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_FRAGMENT,
      &gfx_pipeline->fragment_shader_state.descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_descriptor_program;

   ralloc_free(ctx);

   hard_code_pipeline_n++;

   return VK_SUCCESS;

err_free_vertex_descriptor_program:
   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->vertex_shader_state.descriptor_state);
err_free_vertex_attrib_program:
   for (uint32_t i = 0;
        i < ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
        i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }
err_free_frag_program:
   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
err_free_coeff_program:
   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
err_free_fragment_bo:
   pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
err_free_vertex_bo:
   pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
err_free_build_context:
   ralloc_free(ctx);
   return result;
}

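/* Captures the static depth/stencil state. Disabled tests are stored as
 * their always-pass equivalents (VK_COMPARE_OP_ALWAYS, depth writes off,
 * VK_STENCIL_OP_KEEP), so no separate enable flags need to be tracked.
 */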
static void pvr_graphics_pipeline_init_depth_and_stencil_state(
   struct pvr_graphics_pipeline *gfx_pipeline,
   const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state)
{
   const VkStencilOpState *front;
   const VkStencilOpState *back;

   if (!depth_stencil_state)
      return;

   front = &depth_stencil_state->front;
   back = &depth_stencil_state->back;

   if (depth_stencil_state->depthTestEnable) {
      gfx_pipeline->depth_compare_op = depth_stencil_state->depthCompareOp;
      gfx_pipeline->depth_write_disable =
         !depth_stencil_state->depthWriteEnable;
   } else {
      gfx_pipeline->depth_compare_op = VK_COMPARE_OP_ALWAYS;
      gfx_pipeline->depth_write_disable = true;
   }

   if (depth_stencil_state->stencilTestEnable) {
      gfx_pipeline->stencil_front.compare_op = front->compareOp;
      gfx_pipeline->stencil_front.fail_op = front->failOp;
      gfx_pipeline->stencil_front.depth_fail_op = front->depthFailOp;
      gfx_pipeline->stencil_front.pass_op = front->passOp;

      gfx_pipeline->stencil_back.compare_op = back->compareOp;
      gfx_pipeline->stencil_back.fail_op = back->failOp;
      gfx_pipeline->stencil_back.depth_fail_op = back->depthFailOp;
      gfx_pipeline->stencil_back.pass_op = back->passOp;
   } else {
      gfx_pipeline->stencil_front.compare_op = VK_COMPARE_OP_ALWAYS;
      gfx_pipeline->stencil_front.fail_op = VK_STENCIL_OP_KEEP;
      gfx_pipeline->stencil_front.depth_fail_op = VK_STENCIL_OP_KEEP;
      gfx_pipeline->stencil_front.pass_op = VK_STENCIL_OP_KEEP;

      gfx_pipeline->stencil_back = gfx_pipeline->stencil_front;
   }
}

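/* Builds the dynamic state bitmask from pDynamicStates and snapshots the
 * static values of any state not marked dynamic; dynamic values are expected
 * to be supplied later via the command buffer.
 */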
static void pvr_graphics_pipeline_init_dynamic_state(
   struct pvr_graphics_pipeline *gfx_pipeline,
   const VkPipelineDynamicStateCreateInfo *dynamic_state,
   const VkPipelineViewportStateCreateInfo *viewport_state,
   const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state,
   const VkPipelineColorBlendStateCreateInfo *color_blend_state,
   const VkPipelineRasterizationStateCreateInfo *rasterization_state)
{
   struct pvr_dynamic_state *const internal_dynamic_state =
      &gfx_pipeline->dynamic_state;
   uint32_t dynamic_states = 0;

   if (dynamic_state) {
      for (uint32_t i = 0; i < dynamic_state->dynamicStateCount; i++) {
         dynamic_states |=
            pvr_dynamic_state_bit_from_vk(dynamic_state->pDynamicStates[i]);
      }
   }

   /* TODO: Verify this.
    * We don't zero out the pipeline's static state when the corresponding
    * state is dynamic, since the dynamic values should be set later on in
    * the command buffer.
    */

   /* TODO: Handle rasterizerDiscardEnable. */

   if (rasterization_state) {
      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH))
         internal_dynamic_state->line_width = rasterization_state->lineWidth;

      /* TODO: Do we need the depthBiasEnable check? */
      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
         internal_dynamic_state->depth_bias.constant_factor =
            rasterization_state->depthBiasConstantFactor;
         internal_dynamic_state->depth_bias.clamp =
            rasterization_state->depthBiasClamp;
         internal_dynamic_state->depth_bias.slope_factor =
            rasterization_state->depthBiasSlopeFactor;
      }
   }

   /* TODO: handle viewport state flags. */

   /* TODO: handle static viewport state. */
   /* We assume the viewport state to be dynamic for now. */

   /* TODO: handle static scissor state. */
   /* We assume the scissor state to be dynamic for now. */

   if (depth_stencil_state) {
      const VkStencilOpState *const front = &depth_stencil_state->front;
      const VkStencilOpState *const back = &depth_stencil_state->back;

      /* VkPhysicalDeviceFeatures->depthBounds is false. */
      assert(depth_stencil_state->depthBoundsTestEnable == VK_FALSE);

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
         internal_dynamic_state->compare_mask.front = front->compareMask;
         internal_dynamic_state->compare_mask.back = back->compareMask;
      }

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
         internal_dynamic_state->write_mask.front = front->writeMask;
         internal_dynamic_state->write_mask.back = back->writeMask;
      }

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
         internal_dynamic_state->reference.front = front->reference;
         internal_dynamic_state->reference.back = back->reference;
      }
   }

   if (color_blend_state &&
       !(dynamic_states & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
      STATIC_ASSERT(__same_type(internal_dynamic_state->blend_constants,
                                color_blend_state->blendConstants));

      typed_memcpy(internal_dynamic_state->blend_constants,
                   color_blend_state->blendConstants,
                   ARRAY_SIZE(internal_dynamic_state->blend_constants));
   }

   /* TODO: handle STATIC_STATE_DEPTH_BOUNDS ? */

   internal_dynamic_state->mask = dynamic_states;
}

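/* One-time initialization of a graphics pipeline: captures raster,
 * depth/stencil, dynamic, input assembly and multisample state from the
 * create info, then compiles and uploads the shaders and PDS programs.
 */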
static VkResult
pvr_graphics_pipeline_init(struct pvr_device *device,
                           struct pvr_pipeline_cache *pipeline_cache,
                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *allocator,
                           struct pvr_graphics_pipeline *gfx_pipeline)
{
   /* When rasterizer discard is enabled, the CreateInfo structs that only
    * apply while rasterization is enabled must be ignored.
    */
   const bool raster_discard_enabled =
      pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
   const VkPipelineViewportStateCreateInfo *vs_info =
      !raster_discard_enabled ? pCreateInfo->pViewportState : NULL;
   const VkPipelineDepthStencilStateCreateInfo *dss_info =
      !raster_discard_enabled ? pCreateInfo->pDepthStencilState : NULL;
   const VkPipelineRasterizationStateCreateInfo *rs_info =
      !raster_discard_enabled ? pCreateInfo->pRasterizationState : NULL;
   const VkPipelineColorBlendStateCreateInfo *cbs_info =
      !raster_discard_enabled ? pCreateInfo->pColorBlendState : NULL;
   const VkPipelineMultisampleStateCreateInfo *ms_info =
      !raster_discard_enabled ? pCreateInfo->pMultisampleState : NULL;
   VkResult result;

   pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);

   pvr_finishme("ignoring pCreateInfo flags.");
   pvr_finishme("ignoring pipeline cache.");

   gfx_pipeline->raster_state.discard_enable = raster_discard_enabled;
   gfx_pipeline->raster_state.cull_mode =
      pCreateInfo->pRasterizationState->cullMode;
   gfx_pipeline->raster_state.front_face =
      pCreateInfo->pRasterizationState->frontFace;
   gfx_pipeline->raster_state.depth_bias_enable =
      pCreateInfo->pRasterizationState->depthBiasEnable;
   gfx_pipeline->raster_state.depth_clamp_enable =
      pCreateInfo->pRasterizationState->depthClampEnable;

   /* FIXME: Handle depthClampEnable. */

   pvr_graphics_pipeline_init_depth_and_stencil_state(gfx_pipeline, dss_info);
   pvr_graphics_pipeline_init_dynamic_state(gfx_pipeline,
                                            pCreateInfo->pDynamicState,
                                            vs_info,
                                            dss_info,
                                            cbs_info,
                                            rs_info);

   if (pCreateInfo->pInputAssemblyState) {
      gfx_pipeline->input_asm_state.topology =
         pCreateInfo->pInputAssemblyState->topology;
      gfx_pipeline->input_asm_state.primitive_restart =
         pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
   }

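   /* Mark all stages as unused; the all-ones sentinel is what
    * pvr_graphics_pipeline_compile() checks when skipping inactive stages.
    */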
   memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));

   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
      gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
      /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
       *
       *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
       *    or VK_SHADER_STAGE_ALL."
       *
       * So we don't handle that.
       *
       * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
       * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
       * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
       * structure returned by the driver.
       */
      switch (pCreateInfo->pStages[i].stage) {
      case VK_SHADER_STAGE_VERTEX_BIT:
      case VK_SHADER_STAGE_FRAGMENT_BIT:
         gfx_pipeline->stage_indices[gl_stage] = i;
         break;
      default:
         unreachable("Unsupported stage.");
      }
   }

   gfx_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   if (ms_info) {
      gfx_pipeline->rasterization_samples = ms_info->rasterizationSamples;
      gfx_pipeline->sample_mask =
         (ms_info->pSampleMask) ? ms_info->pSampleMask[0] : 0xFFFFFFFF;
   } else {
      gfx_pipeline->rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
      gfx_pipeline->sample_mask = 0xFFFFFFFF;
   }

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_graphics_pipeline_compile(device,
                                          pipeline_cache,
                                          pCreateInfo,
                                          allocator,
                                          gfx_pipeline);
   if (result != VK_SUCCESS) {
      pvr_pipeline_finish(&gfx_pipeline->base);
      return result;
   }

   return VK_SUCCESS;
}

/* If allocator == NULL, the internal one will be used. */
static VkResult
pvr_graphics_pipeline_create(struct pvr_device *device,
                             struct pvr_pipeline_cache *pipeline_cache,
                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
                             const VkAllocationCallbacks *allocator,
                             VkPipeline *const pipeline_out)
{
   struct pvr_graphics_pipeline *gfx_pipeline;
   VkResult result;

   gfx_pipeline = vk_zalloc2(&device->vk.alloc,
                             allocator,
                             sizeof(*gfx_pipeline),
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!gfx_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs too. */
   result = pvr_graphics_pipeline_init(device,
                                       pipeline_cache,
                                       pCreateInfo,
                                       allocator,
                                       gfx_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);

   return VK_SUCCESS;
}

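/* Attempts to create all createInfoCount pipelines; entries that fail are
 * set to VK_NULL_HANDLE and the most recent failure code is returned, while
 * the remaining pipelines are still created.
 */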
VkResult
pvr_CreateGraphicsPipelines(VkDevice _device,
                            VkPipelineCache pipelineCache,
                            uint32_t createInfoCount,
                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipelines)
{
   PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_graphics_pipeline_create(device,
                                      pipeline_cache,
                                      &pCreateInfos[i],
                                      pAllocator,
                                      &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/*****************************************************************************
   Other functions
*****************************************************************************/

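/* Destroying a VK_NULL_HANDLE pipeline is a no-op; otherwise dispatch to the
 * type-specific destroy function.
 */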
void pvr_DestroyPipeline(VkDevice _device,
                         VkPipeline _pipeline,
                         const VkAllocationCallbacks *pAllocator)
{
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
   PVR_FROM_HANDLE(pvr_device, device, _device);

   if (!pipeline)
      return;

   switch (pipeline->type) {
   case PVR_PIPELINE_TYPE_GRAPHICS: {
      struct pvr_graphics_pipeline *const gfx_pipeline =
         to_pvr_graphics_pipeline(pipeline);

      pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
      break;
   }

   case PVR_PIPELINE_TYPE_COMPUTE: {
      struct pvr_compute_pipeline *const compute_pipeline =
         to_pvr_compute_pipeline(pipeline);

      pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
      break;
   }

   default:
      unreachable("Unknown pipeline type.");
   }
}
