/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "vulkan/util/vk_util.h"
#include "util/blob.h"
#include "nir/nir_serialize.h"

static const bool debug_cache = false;
static const bool dump_stats = false;
static const bool dump_stats_on_destroy = false;
/* Maximum number of entries, shared by the NIR and variant caches */
#define V3DV_MAX_PIPELINE_CACHE_ENTRIES 4096

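/* Both hash tables are keyed by raw 20-byte SHA-1 digests, so the callbacks
 * below hash and compare fixed-size keys rather than NUL-terminated strings.
 */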
static uint32_t
sha1_hash_func(const void *sha1)
{
   return _mesa_hash_data(sha1, 20);
}

static bool
sha1_compare_func(const void *sha1_a, const void *sha1_b)
{
   return memcmp(sha1_a, sha1_b, 20) == 0;
}

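/* NIR shaders are kept in the cache in serialized form: the 20-byte SHA-1
 * key, the size of the serialized NIR, and the data itself, allocated as a
 * single ralloc'ed chunk owned by the nir_cache hash table.
 */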
struct serialized_nir {
   unsigned char sha1_key[20];
   size_t size;
   char data[];
};

static void
cache_dump_stats(struct v3dv_pipeline_cache *cache)
{
   fprintf(stderr, "  NIR cache entries:      %d\n", cache->nir_stats.count);
   fprintf(stderr, "  NIR cache miss count:   %d\n", cache->nir_stats.miss);
   fprintf(stderr, "  NIR cache hit  count:   %d\n", cache->nir_stats.hit);

   fprintf(stderr, "  cache entries:      %d\n", cache->stats.count);
   fprintf(stderr, "  cache miss count:   %d\n", cache->stats.miss);
   fprintf(stderr, "  cache hit  count:   %d\n", cache->stats.hit);

   fprintf(stderr, "  on-disk cache hit  count:   %d\n", cache->stats.on_disk_hit);
}

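/* Caches created with VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT
 * are synchronized by the application, so for them we can skip taking the
 * mutex.
 */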
static void
pipeline_cache_lock(struct v3dv_pipeline_cache *cache)
{
   if (!cache->externally_synchronized)
      mtx_lock(&cache->mutex);
}

static void
pipeline_cache_unlock(struct v3dv_pipeline_cache *cache)
{
   if (!cache->externally_synchronized)
      mtx_unlock(&cache->mutex);
}

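/* Serializes the given NIR shader and adds it to the cache, keyed by
 * sha1_key. If an entry with the same key is already present, nothing is
 * uploaded.
 */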
void
v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline,
                               struct v3dv_pipeline_cache *cache,
                               nir_shader *nir,
                               unsigned char sha1_key[20])
{
   if (!cache || !cache->nir_cache)
      return;

   if (cache->nir_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
      return;

   pipeline_cache_lock(cache);
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->nir_cache, sha1_key);
   pipeline_cache_unlock(cache);
   if (entry)
      return;

   struct blob blob;
   blob_init(&blob);

   nir_serialize(&blob, nir, false);
   if (blob.out_of_memory) {
      blob_finish(&blob);
      return;
   }

   pipeline_cache_lock(cache);
   /* Because ralloc isn't thread-safe, we have to do all this inside the
    * lock.  We could unlock for the big memcpy but it's probably not worth
    * the hassle.
    */
   entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
   if (entry) {
      blob_finish(&blob);
      pipeline_cache_unlock(cache);
      return;
   }

   struct serialized_nir *snir =
      ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
   if (unlikely(!snir)) {
      blob_finish(&blob);
      pipeline_cache_unlock(cache);
      return;
   }
   memcpy(snir->sha1_key, sha1_key, 20);
   snir->size = blob.size;
   memcpy(snir->data, blob.data, blob.size);

   blob_finish(&blob);

   cache->nir_stats.count++;
   if (debug_cache) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, snir->sha1_key);
      fprintf(stderr, "pipeline cache %p, new nir entry %s\n", cache, sha1buf);
      if (dump_stats)
         cache_dump_stats(cache);
   }

   _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);

   pipeline_cache_unlock(cache);
}

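/* Looks up the NIR shader cached under sha1_key and, on a hit, returns a
 * freshly deserialized copy that the caller owns. Returns NULL on a miss or
 * if the cached data is corrupt.
 */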
nir_shader*
v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
                                   struct v3dv_pipeline_cache *cache,
                                   const nir_shader_compiler_options *nir_options,
                                   unsigned char sha1_key[20])
{
   if (!cache || !cache->nir_cache)
      return NULL;

   if (debug_cache) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, sha1_key);

      fprintf(stderr, "pipeline cache %p, search for nir %s\n", cache, sha1buf);
   }

   const struct serialized_nir *snir = NULL;

   pipeline_cache_lock(cache);
   struct hash_entry *entry =
      _mesa_hash_table_search(cache->nir_cache, sha1_key);
   if (entry)
      snir = entry->data;
   pipeline_cache_unlock(cache);

   if (snir) {
      struct blob_reader blob;
      blob_reader_init(&blob, snir->data, snir->size);

      /* We use a NULL context because we want the p_stage to keep the
       * reference to the NIR, as we keep open the possibility of providing
       * a shader variant after cache creation.
       */
      nir_shader *nir = nir_deserialize(NULL, nir_options, &blob);
      if (blob.overrun) {
         ralloc_free(nir);
      } else {
         cache->nir_stats.hit++;
         if (debug_cache) {
            fprintf(stderr, "[v3dv nir cache] hit: %p\n", nir);
            if (dump_stats)
               cache_dump_stats(cache);
         }
         return nir;
      }
   }

   cache->nir_stats.miss++;
   if (debug_cache) {
      fprintf(stderr, "[v3dv nir cache] miss\n");
      if (dump_stats)
         cache_dump_stats(cache);
   }

   return NULL;
}

void
v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache,
                         struct v3dv_device *device,
                         VkPipelineCacheCreateFlags flags,
                         bool cache_enabled)
{
   cache->device = device;
   mtx_init(&cache->mutex, mtx_plain);

   if (cache_enabled) {
      cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
                                                 sha1_compare_func);
      cache->nir_stats.miss = 0;
      cache->nir_stats.hit = 0;
      cache->nir_stats.count = 0;

      cache->cache = _mesa_hash_table_create(NULL, sha1_hash_func,
                                             sha1_compare_func);
      cache->stats.miss = 0;
      cache->stats.hit = 0;
      cache->stats.count = 0;

      cache->externally_synchronized = flags &
         VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT;
   } else {
      cache->nir_cache = NULL;
      cache->cache = NULL;
   }
}

static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
                                           struct blob_reader *blob);

static void
pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache,
                                  struct v3dv_pipeline_shared_data *shared_data,
                                  bool from_disk_cache);

static bool
v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data *cache_entry,
                                        struct blob *blob);

/**
 * Searches the cache for pipeline data and returns a
 * v3dv_pipeline_shared_data with it, or NULL if it is not cached. On a hit
 * the ref_count is increased, so the caller is responsible for unreffing it.
 */
struct v3dv_pipeline_shared_data *
v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
                                        unsigned char sha1_key[20],
                                        bool *cache_hit)
{
   if (!cache || !cache->cache)
      return NULL;

   if (debug_cache) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, sha1_key);

      fprintf(stderr, "pipeline cache %p, search pipeline with key %s\n", cache, sha1buf);
   }

   pipeline_cache_lock(cache);

   struct hash_entry *entry =
      _mesa_hash_table_search(cache->cache, sha1_key);

   if (entry) {
      struct v3dv_pipeline_shared_data *cache_entry =
         (struct v3dv_pipeline_shared_data *) entry->data;
      assert(cache_entry);

      cache->stats.hit++;
      *cache_hit = true;
      if (debug_cache) {
         fprintf(stderr, "[v3dv cache] hit: %p\n", cache_entry);
         if (dump_stats)
            cache_dump_stats(cache);
      }

      v3dv_pipeline_shared_data_ref(cache_entry);

      pipeline_cache_unlock(cache);

      return cache_entry;
   }

   cache->stats.miss++;
   if (debug_cache) {
      fprintf(stderr, "[v3dv cache] miss\n");
      if (dump_stats)
         cache_dump_stats(cache);
   }

   pipeline_cache_unlock(cache);

#ifdef ENABLE_SHADER_CACHE
   struct v3dv_device *device = cache->device;
   struct disk_cache *disk_cache = device->pdevice->disk_cache;
   /* Note that the on-disk cache can be independently disabled, while
    * keeping the pipeline cache working, by using the environment variable
    * MESA_SHADER_CACHE_DISABLE. In that case the calls to disk_cache_put/get
    * will not do anything.
    */
   if (disk_cache && device->instance->pipeline_cache_enabled) {
      cache_key cache_key;
      disk_cache_compute_key(disk_cache, sha1_key, 20, cache_key);

      size_t buffer_size;
      uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
      if (unlikely(V3D_DEBUG & V3D_DEBUG_CACHE)) {
         char sha1buf[41];
         _mesa_sha1_format(sha1buf, cache_key);
         fprintf(stderr, "[v3dv on-disk cache] %s %s\n",
                 buffer ? "hit" : "miss",
                 sha1buf);
      }

      if (buffer) {
         struct blob_reader blob;
         struct v3dv_pipeline_shared_data *shared_data;

         blob_reader_init(&blob, buffer, buffer_size);
         shared_data = v3dv_pipeline_shared_data_create_from_blob(cache, &blob);
         free(buffer);

         if (shared_data) {
            /* Technically we could increase on_disk_hit as soon as we have a
             * buffer, but we are more interested in hits that produced a
             * valid shared_data.
             */
            cache->stats.on_disk_hit++;
            pipeline_cache_upload_shared_data(cache, shared_data, true);
            return shared_data;
         }
      }
   }
#endif

   return NULL;
}

void
v3dv_pipeline_shared_data_destroy(struct v3dv_device *device,
                                  struct v3dv_pipeline_shared_data *shared_data)
{
   assert(shared_data->ref_cnt == 0);

   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      if (shared_data->variants[stage] != NULL)
         v3dv_shader_variant_destroy(device, shared_data->variants[stage]);

      /* We don't free binning descriptor maps as we are sharing them
       * with the render shaders.
       */
      if (shared_data->maps[stage] != NULL &&
          !broadcom_shader_stage_is_binning(stage)) {
         vk_free(&device->vk.alloc, shared_data->maps[stage]);
      }
   }

   if (shared_data->assembly_bo)
      v3dv_bo_free(device, shared_data->assembly_bo);

   vk_free(&device->vk.alloc, shared_data);
}

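/* Allocates a new v3dv_pipeline_shared_data (with a reference count of 1)
 * from the given descriptor maps and variants, and uploads total_assembly
 * into a newly allocated and mapped BO.
 */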
static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache,
                              const unsigned char sha1_key[20],
                              struct v3dv_descriptor_maps **maps,
                              struct v3dv_shader_variant **variants,
                              const uint64_t *total_assembly,
                              const uint32_t total_assembly_size)
{
   size_t size = sizeof(struct v3dv_pipeline_shared_data);
   /* We create new_entry using the device alloc. Right now shared_data is
    * ref'd and unref'd by both the pipeline and the pipeline cache, so we
    * can't ensure that the cache or pipeline alloc will still be available
    * on the last unref.
    */
   struct v3dv_pipeline_shared_data *new_entry =
      vk_zalloc2(&cache->device->vk.alloc, NULL, size, 8,
                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (new_entry == NULL)
      return NULL;

   new_entry->ref_cnt = 1;
   memcpy(new_entry->sha1_key, sha1_key, 20);

   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      new_entry->maps[stage] = maps[stage];
      new_entry->variants[stage] = variants[stage];
   }

   struct v3dv_bo *bo = v3dv_bo_alloc(cache->device, total_assembly_size,
                                      "pipeline shader assembly", true);
   if (!bo) {
      fprintf(stderr, "failed to allocate memory for shader assembly\n");
      goto fail;
   }

   bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size);
   if (!ok) {
      fprintf(stderr, "failed to map shader assembly buffer\n");
      goto fail;
   }

   memcpy(bo->map, total_assembly, total_assembly_size);

   new_entry->assembly_bo = bo;

   return new_entry;

fail:
   v3dv_pipeline_shared_data_unref(cache->device, new_entry);
   return NULL;
}

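/* Adds shared_data to the in-memory cache (taking a reference) and, unless
 * we got here from a disk cache hit, also stores it in the on-disk cache
 * when that is enabled.
 */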
static void
pipeline_cache_upload_shared_data(struct v3dv_pipeline_cache *cache,
                                  struct v3dv_pipeline_shared_data *shared_data,
                                  bool from_disk_cache)
{
   assert(shared_data);

   if (!cache || !cache->cache)
      return;

   if (cache->stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
      return;

   pipeline_cache_lock(cache);
   struct hash_entry *entry = NULL;

   /* If this is being called from a disk cache hit, we already know that
    * the entry is not in the hash table.
    */
   if (!from_disk_cache)
      entry = _mesa_hash_table_search(cache->cache, shared_data->sha1_key);

   if (entry) {
      pipeline_cache_unlock(cache);
      return;
   }

   v3dv_pipeline_shared_data_ref(shared_data);
   _mesa_hash_table_insert(cache->cache, shared_data->sha1_key, shared_data);
   cache->stats.count++;
   if (debug_cache) {
      char sha1buf[41];
      _mesa_sha1_format(sha1buf, shared_data->sha1_key);

      fprintf(stderr, "pipeline cache %p, new cache entry with sha1 key %s:%p\n\n",
              cache, sha1buf, shared_data);
      if (dump_stats)
         cache_dump_stats(cache);
   }

   pipeline_cache_unlock(cache);

#ifdef ENABLE_SHADER_CACHE
   /* If we are being called from an on-disk cache hit, we can skip writing
    * to the disk cache.
    */
   if (from_disk_cache)
      return;

   struct v3dv_device *device = cache->device;
   struct disk_cache *disk_cache = device->pdevice->disk_cache;
   if (disk_cache) {
      struct blob binary;
      blob_init(&binary);
      if (v3dv_pipeline_shared_data_write_to_blob(shared_data, &binary)) {
         cache_key cache_key;
         disk_cache_compute_key(disk_cache, shared_data->sha1_key, 20, cache_key);

         if (unlikely(V3D_DEBUG & V3D_DEBUG_CACHE)) {
            char sha1buf[41];
            _mesa_sha1_format(sha1buf, shared_data->sha1_key);
            fprintf(stderr, "[v3dv on-disk cache] storing %s\n", sha1buf);
         }
         disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
      }

      blob_finish(&binary);
   }
#endif
}

/* Uploads all the "cacheable" or shared data from the pipeline. */
void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
                                    struct v3dv_pipeline_cache *cache)
{
   pipeline_cache_upload_shared_data(cache, pipeline->shared_data, false);
}

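/* Reads one serialized NIR entry (sha1 key, size, data) from the blob, in
 * the same layout written by v3dv_GetPipelineCacheData.
 */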
static struct serialized_nir*
serialized_nir_create_from_blob(struct v3dv_pipeline_cache *cache,
                                struct blob_reader *blob)
{
   const unsigned char *sha1_key = blob_read_bytes(blob, 20);
   uint32_t snir_size = blob_read_uint32(blob);
   const char *snir_data = blob_read_bytes(blob, snir_size);
   if (blob->overrun)
      return NULL;

   struct serialized_nir *snir =
      ralloc_size(cache->nir_cache, sizeof(*snir) + snir_size);
   if (unlikely(!snir))
      return NULL;
   memcpy(snir->sha1_key, sha1_key, 20);
   snir->size = snir_size;
   memcpy(snir->data, snir_data, snir_size);

   return snir;
}

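/* Reads one shader variant from the blob, in the layout written by
 * shader_variant_write_to_blob, rebuilding the v3d_prog_data and its
 * uniform list.
 */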
static struct v3dv_shader_variant*
shader_variant_create_from_blob(struct v3dv_device *device,
                                struct blob_reader *blob)
{
   VkResult result;

   enum broadcom_shader_stage stage = blob_read_uint32(blob);

   uint32_t prog_data_size = blob_read_uint32(blob);
   /* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
   assert(prog_data_size == v3d_prog_data_size(broadcom_shader_stage_to_gl(stage)));

   const void *prog_data = blob_read_bytes(blob, prog_data_size);
   if (blob->overrun)
      return NULL;

   uint32_t ulist_count = blob_read_uint32(blob);
   uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count;
   const void *contents_data = blob_read_bytes(blob, contents_size);
   if (blob->overrun)
      return NULL;

   uint32_t ulist_data_size = sizeof(uint32_t) * ulist_count;
   const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
   if (blob->overrun)
      return NULL;

   uint32_t assembly_offset = blob_read_uint32(blob);
   uint32_t qpu_insts_size = blob_read_uint32(blob);

   /* v3dv_shader_variant_create expects a newly created prog_data of its
    * own, as that is what the v3d compiler returns. So we allocate one here
    * (including the uniform list) and fill it with the data we read from
    * the blob.
    */
   struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
   memcpy(new_prog_data, prog_data, prog_data_size);
   struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
   ulist->count = ulist_count;
   ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count);
   memcpy(ulist->contents, contents_data, contents_size);
   ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
   memcpy(ulist->data, ulist_data_data, ulist_data_size);

   return v3dv_shader_variant_create(device, stage,
                                     new_prog_data, prog_data_size,
                                     assembly_offset,
                                     NULL, qpu_insts_size,
                                     &result);
}

static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
                                           struct blob_reader *blob)
{
   const unsigned char *sha1_key = blob_read_bytes(blob, 20);

   struct v3dv_descriptor_maps *maps[BROADCOM_SHADER_STAGES] = { 0 };
   struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 };

   uint8_t descriptor_maps_count = blob_read_uint8(blob);
   for (uint8_t count = 0; count < descriptor_maps_count; count++) {
      uint8_t stage = blob_read_uint8(blob);

      const struct v3dv_descriptor_maps *current_maps =
         blob_read_bytes(blob, sizeof(struct v3dv_descriptor_maps));

      if (blob->overrun)
         goto fail;

      maps[stage] = vk_zalloc2(&cache->device->vk.alloc, NULL,
                               sizeof(struct v3dv_descriptor_maps), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

      if (maps[stage] == NULL)
         goto fail;

      memcpy(maps[stage], current_maps, sizeof(struct v3dv_descriptor_maps));
      if (broadcom_shader_stage_is_render_with_binning(stage)) {
         enum broadcom_shader_stage bin_stage =
            broadcom_binning_shader_stage_for_render_stage(stage);
         maps[bin_stage] = maps[stage];
      }
   }

   uint8_t variant_count = blob_read_uint8(blob);

   for (uint8_t count = 0; count < variant_count; count++) {
      uint8_t stage = blob_read_uint8(blob);
      struct v3dv_shader_variant *variant =
         shader_variant_create_from_blob(cache->device, blob);
      variants[stage] = variant;
   }

   uint32_t total_assembly_size = blob_read_uint32(blob);
   const uint64_t *total_assembly =
      blob_read_bytes(blob, total_assembly_size);

   if (blob->overrun)
      goto fail;

   struct v3dv_pipeline_shared_data *data =
      v3dv_pipeline_shared_data_new(cache, sha1_key, maps, variants,
                                    total_assembly, total_assembly_size);

   if (!data)
      goto fail;

   return data;

fail:
   for (int i = 0; i < BROADCOM_SHADER_STAGES; i++) {
      /* Binning stages share the descriptor map with their render stage,
       * so skip them here to avoid a double free.
       */
      if (maps[i] && !broadcom_shader_stage_is_binning(i))
         vk_free2(&cache->device->vk.alloc, NULL, maps[i]);
      if (variants[i])
         v3dv_shader_variant_destroy(cache->device, variants[i]);
   }
   return NULL;
}

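/* Loads a full pipeline cache blob, as produced by v3dv_GetPipelineCacheData:
 * a vk_pipeline_cache_header, followed by the serialized NIR entries and the
 * pipeline entries. Data with a mismatched or invalid header is silently
 * ignored.
 */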
static void
pipeline_cache_load(struct v3dv_pipeline_cache *cache,
                    size_t size,
                    const void *data)
{
   struct v3dv_device *device = cache->device;
   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
   struct vk_pipeline_cache_header header;

   if (cache->cache == NULL || cache->nir_cache == NULL)
      return;

   struct blob_reader blob;
   blob_reader_init(&blob, data, size);

   if (size < sizeof(header))
      return;
   blob_copy_bytes(&blob, &header, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != v3dv_physical_device_vendor_id(pdevice))
      return;
   if (header.device_id != v3dv_physical_device_device_id(pdevice))
      return;
   if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
      return;

   uint32_t nir_count = blob_read_uint32(&blob);
   if (blob.overrun)
      return;

   for (uint32_t i = 0; i < nir_count; i++) {
      struct serialized_nir *snir =
         serialized_nir_create_from_blob(cache, &blob);

      if (!snir)
         break;

      _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);
      cache->nir_stats.count++;
   }

   uint32_t count = blob_read_uint32(&blob);
   if (blob.overrun)
      return;

   for (uint32_t i = 0; i < count; i++) {
      struct v3dv_pipeline_shared_data *cache_entry =
         v3dv_pipeline_shared_data_create_from_blob(cache, &blob);
      if (!cache_entry)
         break;

      _mesa_hash_table_insert(cache->cache, cache_entry->sha1_key, cache_entry);
      cache->stats.count++;
   }

   if (debug_cache) {
      fprintf(stderr, "pipeline cache %p, loaded %i nir shaders and "
              "%i entries\n", cache, nir_count, count);
      if (dump_stats)
         cache_dump_stats(cache);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreatePipelineCache(VkDevice _device,
                         const VkPipelineCacheCreateInfo *pCreateInfo,
                         const VkAllocationCallbacks *pAllocator,
                         VkPipelineCache *pPipelineCache)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);

   cache = vk_object_zalloc(&device->vk, pAllocator,
                            sizeof(*cache),
                            VK_OBJECT_TYPE_PIPELINE_CACHE);

   if (cache == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags,
                            device->instance->pipeline_cache_enabled);

   if (pCreateInfo->initialDataSize > 0) {
      pipeline_cache_load(cache,
                          pCreateInfo->initialDataSize,
                          pCreateInfo->pInitialData);
   }

   *pPipelineCache = v3dv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void
v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache)
{
   mtx_destroy(&cache->mutex);

   if (dump_stats_on_destroy)
      cache_dump_stats(cache);

   if (cache->nir_cache) {
      hash_table_foreach(cache->nir_cache, entry)
         ralloc_free(entry->data);

      _mesa_hash_table_destroy(cache->nir_cache, NULL);
   }

   if (cache->cache) {
      hash_table_foreach(cache->cache, entry) {
         struct v3dv_pipeline_shared_data *cache_entry = entry->data;
         if (cache_entry)
            v3dv_pipeline_shared_data_unref(cache->device, cache_entry);
      }

      _mesa_hash_table_destroy(cache->cache, NULL);
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyPipelineCache(VkDevice _device,
                          VkPipelineCache _cache,
                          const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

   if (!cache)
      return;

   v3dv_pipeline_cache_finish(cache);

   vk_object_free(&device->vk, pAllocator, cache);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_MergePipelineCaches(VkDevice device,
                         VkPipelineCache dstCache,
                         uint32_t srcCacheCount,
                         const VkPipelineCache *pSrcCaches)
{
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, dst, dstCache);

   if (!dst->cache || !dst->nir_cache)
      return VK_SUCCESS;

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      V3DV_FROM_HANDLE(v3dv_pipeline_cache, src, pSrcCaches[i]);
      if (!src->cache || !src->nir_cache)
         continue;

      hash_table_foreach(src->nir_cache, entry) {
         struct serialized_nir *src_snir = entry->data;
         assert(src_snir);

         if (_mesa_hash_table_search(dst->nir_cache, src_snir->sha1_key))
            continue;

         /* FIXME: we use serialized NIR shaders because they are convenient
          * to create and store in the cache, but that requires doing a copy
          * here (and in some other places) of the serialized NIR. Perhaps
          * it would make sense to handle the NIR shaders with shared,
          * ref-counted structures, like the variants.
          */
         struct serialized_nir *snir_dst =
            ralloc_size(dst->nir_cache, sizeof(*snir_dst) + src_snir->size);
         memcpy(snir_dst->sha1_key, src_snir->sha1_key, 20);
         snir_dst->size = src_snir->size;
         memcpy(snir_dst->data, src_snir->data, src_snir->size);

         _mesa_hash_table_insert(dst->nir_cache, snir_dst->sha1_key, snir_dst);
         dst->nir_stats.count++;
         if (debug_cache) {
            char sha1buf[41];
            _mesa_sha1_format(sha1buf, snir_dst->sha1_key);

            fprintf(stderr, "pipeline cache %p, added nir entry %s "
                    "from pipeline cache %p\n",
                    dst, sha1buf, src);
            if (dump_stats)
               cache_dump_stats(dst);
         }
      }

      hash_table_foreach(src->cache, entry) {
         struct v3dv_pipeline_shared_data *cache_entry = entry->data;
         assert(cache_entry);

         if (_mesa_hash_table_search(dst->cache, cache_entry->sha1_key))
            continue;

         v3dv_pipeline_shared_data_ref(cache_entry);
         _mesa_hash_table_insert(dst->cache, cache_entry->sha1_key, cache_entry);

         dst->stats.count++;
         if (debug_cache) {
            char sha1buf[41];
            _mesa_sha1_format(sha1buf, cache_entry->sha1_key);

            fprintf(stderr, "pipeline cache %p, added entry %s "
                    "from pipeline cache %p\n",
                    dst, sha1buf, src);
            if (dump_stats)
               cache_dump_stats(dst);
         }
      }
   }

   return VK_SUCCESS;
}

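/* Serializes one shader variant: stage, prog_data (with its uniform list
 * flattened out), the offset of its assembly within the shared BO, and the
 * assembly size. shader_variant_create_from_blob reads the same layout.
 */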
static bool
shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
                             struct blob *blob)
{
   blob_write_uint32(blob, variant->stage);

   blob_write_uint32(blob, variant->prog_data_size);
   blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);

   struct v3d_uniform_list *ulist = &variant->prog_data.base->uniforms;
   blob_write_uint32(blob, ulist->count);
   blob_write_bytes(blob, ulist->contents, sizeof(enum quniform_contents) * ulist->count);
   blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count);

   blob_write_uint32(blob, variant->assembly_offset);
   blob_write_uint32(blob, variant->qpu_insts_size);

   return !blob->out_of_memory;
}

static bool
v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data *cache_entry,
                                        struct blob *blob)
{
   blob_write_bytes(blob, cache_entry->sha1_key, 20);

   uint8_t descriptor_maps_count = 0;
   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      if (broadcom_shader_stage_is_binning(stage))
         continue;
      if (cache_entry->maps[stage] == NULL)
         continue;
      descriptor_maps_count++;
   }

   /* Compute pipelines only have one descriptor map, while graphics
    * pipelines may have 2 (VS+FS) or 3 (VS+GS+FS), since the binning stages
    * take the descriptor map from their render stage.
    */
   assert((descriptor_maps_count >= 2 && descriptor_maps_count <= 3) ||
          (descriptor_maps_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE]));
   blob_write_uint8(blob, descriptor_maps_count);

   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      if (cache_entry->maps[stage] == NULL)
         continue;
      if (broadcom_shader_stage_is_binning(stage))
         continue;

      blob_write_uint8(blob, stage);
      blob_write_bytes(blob, cache_entry->maps[stage],
                       sizeof(struct v3dv_descriptor_maps));
   }

   uint8_t variant_count = 0;
   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      if (cache_entry->variants[stage] == NULL)
         continue;
      variant_count++;
   }

   /* Graphics pipelines with VS+FS have 3 variants, VS+GS+FS have 5, and
    * compute pipelines only have 1.
    */
   assert((variant_count == 5 || variant_count == 3) ||
          (variant_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE]));
   blob_write_uint8(blob, variant_count);

   uint32_t total_assembly_size = 0;
   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      if (cache_entry->variants[stage] == NULL)
         continue;

      blob_write_uint8(blob, stage);
      if (!shader_variant_write_to_blob(cache_entry->variants[stage], blob))
         return false;

      total_assembly_size += cache_entry->variants[stage]->qpu_insts_size;
   }
   blob_write_uint32(blob, total_assembly_size);

   assert(cache_entry->assembly_bo->map);
   assert(cache_entry->assembly_bo->size >= total_assembly_size);
   blob_write_bytes(blob, cache_entry->assembly_bo->map, total_assembly_size);

   return !blob->out_of_memory;
}

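/* Implements the standard Vulkan two-call idiom: when pData is NULL we only
 * report the required size in pDataSize; otherwise we serialize as much as
 * fits in the application's buffer and return VK_INCOMPLETE if we had to
 * stop early.
 */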
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetPipelineCacheData(VkDevice _device,
                          VkPipelineCache _cache,
                          size_t *pDataSize,
                          void *pData)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

   struct blob blob;
   if (pData) {
      blob_init_fixed(&blob, pData, *pDataSize);
   } else {
      blob_init_fixed(&blob, NULL, SIZE_MAX);
   }

   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
   VkResult result = VK_INCOMPLETE;

   pipeline_cache_lock(cache);

   struct vk_pipeline_cache_header header = {
      .header_size = sizeof(struct vk_pipeline_cache_header),
      .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
      .vendor_id = v3dv_physical_device_vendor_id(pdevice),
      .device_id = v3dv_physical_device_device_id(pdevice),
   };
   memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
   blob_write_bytes(&blob, &header, sizeof(header));

   uint32_t nir_count = 0;
   intptr_t nir_count_offset = blob_reserve_uint32(&blob);
   if (nir_count_offset < 0) {
      *pDataSize = 0;
      goto done;
   }

   if (cache->nir_cache) {
      hash_table_foreach(cache->nir_cache, entry) {
         const struct serialized_nir *snir = entry->data;

         size_t save_size = blob.size;

         blob_write_bytes(&blob, snir->sha1_key, 20);
         blob_write_uint32(&blob, snir->size);
         blob_write_bytes(&blob, snir->data, snir->size);

         if (blob.out_of_memory) {
            blob.size = save_size;
            goto done;
         }

         nir_count++;
      }
   }
   blob_overwrite_uint32(&blob, nir_count_offset, nir_count);

   uint32_t count = 0;
   intptr_t count_offset = blob_reserve_uint32(&blob);
   if (count_offset < 0) {
      *pDataSize = 0;
      goto done;
   }

   if (cache->cache) {
      hash_table_foreach(cache->cache, entry) {
         struct v3dv_pipeline_shared_data *cache_entry = entry->data;

         size_t save_size = blob.size;
         if (!v3dv_pipeline_shared_data_write_to_blob(cache_entry, &blob)) {
            /* If it fails reset to the previous size and bail */
            blob.size = save_size;
            goto done;
         }

         count++;
      }
   }

   blob_overwrite_uint32(&blob, count_offset, count);

   *pDataSize = blob.size;

   result = VK_SUCCESS;

   if (debug_cache) {
      assert(count <= cache->stats.count);
      fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
              "%i nir shader entries "
              "%i entries, %u DataSize\n",
              cache, nir_count, count, (uint32_t) *pDataSize);
   }

 done:
   blob_finish(&blob);

   pipeline_cache_unlock(cache);

   return result;
}