1/*
2 * Copyright © 2021 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#include "gen_macros.h"
25
26#include "nir/nir_builder.h"
27#include "pan_encoder.h"
28#include "pan_shader.h"
29
30#include "panvk_private.h"
31
32static mali_ptr
33panvk_meta_copy_img_emit_texture(struct panfrost_device *pdev,
34                                 struct pan_pool *desc_pool,
35                                 const struct pan_image_view *view)
36{
37   struct panfrost_ptr texture =
38      pan_pool_alloc_desc(desc_pool, TEXTURE);
39   size_t payload_size =
40      GENX(panfrost_estimate_texture_payload_size)(view);
41   struct panfrost_ptr surfaces =
42      pan_pool_alloc_aligned(desc_pool, payload_size,
43                             pan_alignment(SURFACE_WITH_STRIDE));
44
45   GENX(panfrost_new_texture)(pdev, view, texture.cpu, &surfaces);
46
47   return texture.gpu;
48}
49
50static mali_ptr
51panvk_meta_copy_img_emit_sampler(struct panfrost_device *pdev,
52                                 struct pan_pool *desc_pool)
53{
54   struct panfrost_ptr sampler =
55      pan_pool_alloc_desc(desc_pool, SAMPLER);
56
57   pan_pack(sampler.cpu, SAMPLER, cfg) {
58      cfg.seamless_cube_map = false;
59      cfg.normalized_coordinates = false;
60      cfg.minify_nearest = true;
61      cfg.magnify_nearest = true;
62   }
63
64   return sampler.gpu;
65}
66
67static void
68panvk_meta_copy_emit_varying(struct pan_pool *pool,
69                             mali_ptr coordinates,
70                             mali_ptr *varying_bufs,
71                             mali_ptr *varyings)
72{
73   struct panfrost_ptr varying =
74      pan_pool_alloc_desc(pool, ATTRIBUTE);
75   struct panfrost_ptr varying_buffer =
76      pan_pool_alloc_desc_array(pool, 2, ATTRIBUTE_BUFFER);
77
78   pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
79      cfg.pointer = coordinates;
80      cfg.stride = 4 * sizeof(uint32_t);
81      cfg.size = cfg.stride * 4;
82   }
83
84   /* Bifrost needs an empty desc to mark end of prefetching */
85   pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
86            ATTRIBUTE_BUFFER, cfg);
87
88   pan_pack(varying.cpu, ATTRIBUTE, cfg) {
89      cfg.buffer_index = 0;
90      cfg.format = pool->dev->formats[PIPE_FORMAT_R32G32B32_FLOAT].hw;
91   }
92
93   *varyings = varying.gpu;
94   *varying_bufs = varying_buffer.gpu;
95}
96
97static void
98panvk_meta_copy_emit_dcd(struct pan_pool *pool,
99                         mali_ptr src_coords, mali_ptr dst_coords,
100                         mali_ptr texture, mali_ptr sampler,
101                         mali_ptr vpd, mali_ptr tsd, mali_ptr rsd,
102                         mali_ptr push_constants, void *out)
103{
104   pan_pack(out, DRAW, cfg) {
105      cfg.thread_storage = tsd;
106      cfg.state = rsd;
107      cfg.push_uniforms = push_constants;
108      cfg.position = dst_coords;
109      if (src_coords) {
110              panvk_meta_copy_emit_varying(pool, src_coords,
111                                           &cfg.varying_buffers,
112                                           &cfg.varyings);
113      }
114      cfg.viewport = vpd;
115      cfg.textures = texture;
116      cfg.samplers = sampler;
117   }
118}
119
/*
 * Emits a tiler job drawing a 4-vertex triangle strip (one quad) covering
 * the destination rectangle, with the copy fragment shader bound through
 * @rsd.  The job is chained on @scoreboard; the descriptor's CPU/GPU
 * pointers are returned so the caller can track it.
 */
static struct panfrost_ptr
panvk_meta_copy_emit_tiler_job(struct pan_pool *desc_pool,
                               struct pan_scoreboard *scoreboard,
                               mali_ptr src_coords, mali_ptr dst_coords,
                               mali_ptr texture, mali_ptr sampler,
                               mali_ptr push_constants,
                               mali_ptr vpd, mali_ptr rsd,
                               mali_ptr tsd, mali_ptr tiler)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(desc_pool, TILER_JOB);

   panvk_meta_copy_emit_dcd(desc_pool, src_coords, dst_coords,
                            texture, sampler, vpd, tsd, rsd, push_constants,
                            pan_section_ptr(job.cpu, TILER_JOB, DRAW));

   /* 4 vertices drawn as a strip -> a single quad. */
   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   /* 1x4x1 invocations (one per quad vertex), 1x1x1 workgroup size. */
   void *invoc = pan_section_ptr(job.cpu,
                                 TILER_JOB,
                                 INVOCATION);
   panfrost_pack_work_groups_compute(invoc, 1, 4,
                                     1, 1, 1, 1, true, false);

   pan_section_pack(job.cpu, TILER_JOB, PADDING, cfg);
   pan_section_pack(job.cpu, TILER_JOB, TILER, cfg) {
      cfg.address = tiler;
   }

   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_TILER,
                    false, false, 0, 0, &job, false);
   return job;
}
161
/*
 * Emits a compute job dispatching @num_wg workgroups of size @wg_sz
 * running the copy shader bound through @rsd.  Used by the non-tiler
 * (buffer-involved) copy paths.  The job is chained on @scoreboard and
 * its descriptor pointers are returned.
 */
static struct panfrost_ptr
panvk_meta_copy_emit_compute_job(struct pan_pool *desc_pool,
                                 struct pan_scoreboard *scoreboard,
                                 const struct pan_compute_dim *num_wg,
                                 const struct pan_compute_dim *wg_sz,
                                 mali_ptr texture, mali_ptr sampler,
                                 mali_ptr push_constants,
                                 mali_ptr rsd, mali_ptr tsd)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(desc_pool, COMPUTE_JOB);

   void *invoc = pan_section_ptr(job.cpu,
                                 COMPUTE_JOB,
                                 INVOCATION);
   panfrost_pack_work_groups_compute(invoc, num_wg->x, num_wg->y, num_wg->z,
                                     wg_sz->x, wg_sz->y, wg_sz->z,
                                     false, false);

   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 8;
   }

   /* No vertex coordinates or viewport for compute jobs: only the
    * texture/sampler, thread storage, RSD and push constants matter. */
   panvk_meta_copy_emit_dcd(desc_pool, 0, 0, texture, sampler,
                            0, tsd, rsd, push_constants,
                            pan_section_ptr(job.cpu, COMPUTE_JOB, DRAW));

   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                    false, false, 0, 0, &job, false);
   return job;
}
193
194
195static uint32_t
196panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)
197{
198   switch (texelsize) {
199   case 6: return MALI_RGB16UI << 12;
200   case 8: return MALI_RG32UI << 12;
201   case 12: return MALI_RGB32UI << 12;
202   case 16: return MALI_RGBA32UI << 12;
203   default: unreachable("Invalid texel size\n");
204   }
205}
206
/*
 * Emits the RENDERER_STATE + BLEND descriptor pair for a copy that writes
 * to an image through the tiler.
 *
 * @wrmask selects which components of @fmt are written.  Formats with
 * texels larger than 4 bytes are copied "raw" (no blend conversion); for
 * those, partial writes are resolved in the shader by reading back the
 * tilebuffer (readstb), while blendable formats resolve partial writes
 * with a destination-loading fixed-function blend (partialwrite).
 */
static mali_ptr
panvk_meta_copy_to_img_emit_rsd(struct panfrost_device *pdev,
                                struct pan_pool *desc_pool,
                                mali_ptr shader,
                                const struct pan_shader_info *shader_info,
                                enum pipe_format fmt, unsigned wrmask,
                                bool from_img)
{
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE),
                                    PAN_DESC_ARRAY(1, BLEND));

   bool raw = util_format_get_blocksize(fmt) > 4;
   unsigned fullmask = (1 << util_format_get_nr_components(fmt)) - 1;
   bool partialwrite = fullmask != wrmask && !raw;
   bool readstb = fullmask != wrmask && raw;

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(shader_info, shader, &cfg);
      if (from_img) {
         /* img -> img copies consume the source-coordinate varying and
          * sample the source through texture/sampler slot 0. */
         cfg.shader.varying_count = 1;
         cfg.shader.texture_count = 1;
         cfg.shader.sampler_count = 1;
      }
      /* Fixed-function depth/stencil always passes; stencil ops are set
       * to REPLACE with full masks. */
      cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
      cfg.multisample_misc.sample_mask = UINT16_MAX;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

      /* Early pixel kill is only allowed when we never need the previous
       * destination value (no partial write, no tilebuffer readback). */
      cfg.properties.allow_forward_pixel_to_be_killed = true;
      cfg.properties.allow_forward_pixel_to_kill =
         !partialwrite && !readstb;
      cfg.properties.zs_update_operation =
         MALI_PIXEL_KILL_STRONG_EARLY;
      cfg.properties.pixel_kill_operation =
         MALI_PIXEL_KILL_FORCE_EARLY;
   }

   pan_pack(rsd_ptr.cpu + pan_size(RENDERER_STATE), BLEND, cfg) {
      /* Plain src-replaces-dst equation; the component mask implements
       * partial writes of blendable formats. */
      cfg.round_to_fb_precision = true;
      cfg.load_destination = partialwrite;
      cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
      cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
      cfg.internal.mode =
         partialwrite ?
         MALI_BLEND_MODE_FIXED_FUNCTION :
         MALI_BLEND_MODE_OPAQUE;
      cfg.equation.color_mask = partialwrite ? wrmask : 0xf;
      cfg.internal.fixed_function.num_comps = 4;
      if (!raw) {
         cfg.internal.fixed_function.conversion.memory_format =
            panfrost_format_to_bifrost_blend(pdev, fmt, false);
         cfg.internal.fixed_function.conversion.register_format =
            MALI_REGISTER_FILE_FORMAT_F32;
      } else {
         unsigned imgtexelsz = util_format_get_blocksize(fmt);

         /* Raw texel sizes are 6/8/12/16 bytes; only 6 (RGB16) has bit 1
          * set, so `& 2` picks U16 registers for 16-bit components. */
         cfg.internal.fixed_function.conversion.memory_format =
            panvk_meta_copy_img_bifrost_raw_format(imgtexelsz);
         cfg.internal.fixed_function.conversion.register_format =
            (imgtexelsz & 2) ?
            MALI_REGISTER_FILE_FORMAT_U16 :
            MALI_REGISTER_FILE_FORMAT_U32;
      }
   }

   return rsd_ptr.gpu;
}
287
288static mali_ptr
289panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device *pdev,
290                                struct pan_pool *desc_pool,
291                                mali_ptr shader,
292                                const struct pan_shader_info *shader_info,
293                                bool from_img)
294{
295   struct panfrost_ptr rsd_ptr =
296      pan_pool_alloc_desc_aggregate(desc_pool,
297                                    PAN_DESC(RENDERER_STATE));
298
299   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
300      pan_shader_prepare_rsd(shader_info, shader, &cfg);
301      if (from_img) {
302         cfg.shader.texture_count = 1;
303         cfg.shader.sampler_count = 1;
304      }
305   }
306
307   return rsd_ptr.gpu;
308}
309
310static mali_ptr
311panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
312                               struct pan_pool *bin_pool,
313                               enum pipe_format srcfmt,
314                               enum pipe_format dstfmt, unsigned dstmask,
315                               unsigned texdim, bool texisarray, bool is_ms,
316                               struct pan_shader_info *shader_info)
317{
318   nir_builder b =
319      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
320                                     GENX(pan_shader_get_compiler_options)(),
321                                     "panvk_meta_copy_img2img(srcfmt=%s,dstfmt=%s,%dD%s%s)",
322                                     util_format_name(srcfmt), util_format_name(dstfmt),
323                                     texdim, texisarray ? "[]" : "", is_ms ? ",ms" : "");
324
325   nir_variable *coord_var =
326      nir_variable_create(b.shader, nir_var_shader_in,
327                          glsl_vector_type(GLSL_TYPE_FLOAT, texdim + texisarray),
328                          "coord");
329   coord_var->data.location = VARYING_SLOT_VAR0;
330   nir_ssa_def *coord = nir_f2u32(&b, nir_load_var(&b, coord_var));
331
332   nir_tex_instr *tex = nir_tex_instr_create(b.shader, is_ms ? 2 : 1);
333   tex->op = is_ms ? nir_texop_txf_ms : nir_texop_txf;
334   tex->texture_index = 0;
335   tex->is_array = texisarray;
336   tex->dest_type = util_format_is_unorm(srcfmt) ?
337                    nir_type_float32 : nir_type_uint32;
338
339   switch (texdim) {
340   case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
341   case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
342   case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
343   default: unreachable("Invalid texture dimension");
344   }
345
346   tex->src[0].src_type = nir_tex_src_coord;
347   tex->src[0].src = nir_src_for_ssa(coord);
348   tex->coord_components = texdim + texisarray;
349
350   if (is_ms) {
351      tex->src[1].src_type = nir_tex_src_ms_index;
352      tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(&b));
353   }
354
355   nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
356                     nir_alu_type_get_type_size(tex->dest_type), NULL);
357   nir_builder_instr_insert(&b, &tex->instr);
358
359   nir_ssa_def *texel = &tex->dest.ssa;
360
361   unsigned dstcompsz =
362      util_format_get_component_bits(dstfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
363   unsigned ndstcomps = util_format_get_nr_components(dstfmt);
364   const struct glsl_type *outtype = NULL;
365
366   if (srcfmt == PIPE_FORMAT_R5G6B5_UNORM && dstfmt == PIPE_FORMAT_R8G8_UNORM) {
367      nir_ssa_def *rgb =
368         nir_f2u32(&b, nir_fmul(&b, texel,
369                                nir_vec3(&b,
370                                         nir_imm_float(&b, 31),
371                                         nir_imm_float(&b, 63),
372                                         nir_imm_float(&b, 31))));
373      nir_ssa_def *rg =
374         nir_vec2(&b,
375                  nir_ior(&b, nir_channel(&b, rgb, 0),
376                          nir_ishl(&b, nir_channel(&b, rgb, 1),
377                                   nir_imm_int(&b, 5))),
378                  nir_ior(&b,
379                          nir_ushr_imm(&b, nir_channel(&b, rgb, 1), 3),
380                          nir_ishl(&b, nir_channel(&b, rgb, 2),
381                                   nir_imm_int(&b, 3))));
382      rg = nir_iand_imm(&b, rg, 255);
383      texel = nir_fmul_imm(&b, nir_u2f32(&b, rg), 1.0 / 255);
384      outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
385   } else if (srcfmt == PIPE_FORMAT_R8G8_UNORM && dstfmt == PIPE_FORMAT_R5G6B5_UNORM) {
386      nir_ssa_def *rg = nir_f2u32(&b, nir_fmul_imm(&b, texel, 255));
387      nir_ssa_def *rgb =
388         nir_vec3(&b,
389                  nir_channel(&b, rg, 0),
390                  nir_ior(&b,
391                          nir_ushr_imm(&b, nir_channel(&b, rg, 0), 5),
392                          nir_ishl(&b, nir_channel(&b, rg, 1),
393                                   nir_imm_int(&b, 3))),
394                  nir_ushr_imm(&b, nir_channel(&b, rg, 1), 3));
395      rgb = nir_iand(&b, rgb,
396                     nir_vec3(&b,
397                              nir_imm_int(&b, 31),
398                              nir_imm_int(&b, 63),
399                              nir_imm_int(&b, 31)));
400      texel = nir_fmul(&b, nir_u2f32(&b, rgb),
401                       nir_vec3(&b,
402                                nir_imm_float(&b, 1.0 / 31),
403                                nir_imm_float(&b, 1.0 / 63),
404                                nir_imm_float(&b, 1.0 / 31)));
405      outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
406   } else {
407      assert(srcfmt == dstfmt);
408      enum glsl_base_type basetype;
409      if (util_format_is_unorm(dstfmt)) {
410         basetype = GLSL_TYPE_FLOAT;
411      } else if (dstcompsz == 16) {
412         basetype = GLSL_TYPE_UINT16;
413      } else {
414         assert(dstcompsz == 32);
415         basetype = GLSL_TYPE_UINT;
416      }
417
418      if (dstcompsz == 16)
419         texel = nir_u2u16(&b, texel);
420
421      texel = nir_channels(&b, texel, (1 << ndstcomps) - 1);
422      outtype = glsl_vector_type(basetype, ndstcomps);
423   }
424
425   nir_variable *out =
426      nir_variable_create(b.shader, nir_var_shader_out, outtype, "out");
427   out->data.location = FRAG_RESULT_DATA0;
428
429   unsigned fullmask = (1 << ndstcomps) - 1;
430   if (dstcompsz > 8 && dstmask != fullmask) {
431      nir_ssa_def *oldtexel = nir_load_var(&b, out);
432      nir_ssa_def *dstcomps[4];
433
434      for (unsigned i = 0; i < ndstcomps; i++) {
435         if (dstmask & BITFIELD_BIT(i))
436            dstcomps[i] = nir_channel(&b, texel, i);
437         else
438            dstcomps[i] = nir_channel(&b, oldtexel, i);
439      }
440
441      texel = nir_vec(&b, dstcomps, ndstcomps);
442   }
443
444   nir_store_var(&b, out, texel, 0xff);
445
446   struct panfrost_compile_inputs inputs = {
447      .gpu_id = pdev->gpu_id,
448      .is_blit = true,
449      .no_ubo_to_push = true,
450   };
451
452   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
453      cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
454      cfg.register_format = dstcompsz == 2 ?
455                            MALI_REGISTER_FILE_FORMAT_U16 :
456                            MALI_REGISTER_FILE_FORMAT_U32;
457   }
458   inputs.bifrost.static_rt_conv = true;
459
460   struct util_dynarray binary;
461
462   util_dynarray_init(&binary, NULL);
463   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
464
465   shader_info->fs.sample_shading = is_ms;
466
467   mali_ptr shader =
468      pan_pool_upload_aligned(bin_pool, binary.data, binary.size, 128);
469
470   util_dynarray_fini(&binary);
471   ralloc_free(b.shader);
472
473   return shader;
474}
475
476static enum pipe_format
477panvk_meta_copy_img_format(enum pipe_format fmt)
478{
479   /* We can't use a non-compressed format when handling a tiled/AFBC
480    * compressed format because the tile size differ (4x4 blocks for
481    * compressed formats and 16x16 texels for non-compressed ones).
482    */
483   assert(!util_format_is_compressed(fmt));
484
485   /* Pick blendable formats when we can, otherwise pick the UINT variant
486    * matching the texel size.
487    */
488   switch (util_format_get_blocksize(fmt)) {
489   case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
490   case 12: return PIPE_FORMAT_R32G32B32_UINT;
491   case 8: return PIPE_FORMAT_R32G32_UINT;
492   case 6: return PIPE_FORMAT_R16G16B16_UINT;
493   case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
494   case 2: return (fmt == PIPE_FORMAT_R5G6B5_UNORM ||
495                   fmt == PIPE_FORMAT_B5G6R5_UNORM) ?
496                  PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
497   case 1: return PIPE_FORMAT_R8_UNORM;
498   default: unreachable("Unsupported format\n");
499   }
500}
501
/* Key identifying an img -> img copy shader variant: canonical source and
 * destination formats plus the destination component writemask.  PACKED
 * so the whole key can be compared with memcmp() (no padding bytes) in
 * panvk_meta_copy_img2img_format_idx().
 */
struct panvk_meta_copy_img2img_format_info {
   enum pipe_format srcfmt;
   enum pipe_format dstfmt;
   unsigned dstmask;
} PACKED;
507
/* Every supported (srcfmt, dstfmt, dstmask) combination.  Formats are the
 * canonical ones produced by panvk_meta_copy_img_format(), masks the ones
 * produced by panvk_meta_copy_img_mask().  Indexed by
 * panvk_meta_copy_img2img_format_idx(); the array length must match
 * PANVK_META_COPY_IMG2IMG_NUM_FORMATS.
 */
static const struct panvk_meta_copy_img2img_format_info panvk_meta_copy_img2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, PIPE_FORMAT_R8_UNORM, 0x1},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   /* Z24S8(depth) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z24S8(stencil) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x3 },
   /* Z32S8X24(depth) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x1 },
   /* Z32S8X24(stencil) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x2 },
   { PIPE_FORMAT_R32G32B32_UINT, PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
};
528
529static unsigned
530panvk_meta_copy_img2img_format_idx(struct panvk_meta_copy_img2img_format_info key)
531{
532   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);
533
534   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
535      if (!memcmp(&key, &panvk_meta_copy_img2img_fmts[i], sizeof(key)))
536         return i;
537   }
538
539   unreachable("Invalid image format\n");
540}
541
542static unsigned
543panvk_meta_copy_img_mask(enum pipe_format imgfmt, VkImageAspectFlags aspectMask)
544{
545   if (aspectMask != VK_IMAGE_ASPECT_DEPTH_BIT &&
546       aspectMask != VK_IMAGE_ASPECT_STENCIL_BIT) {
547      enum pipe_format outfmt = panvk_meta_copy_img_format(imgfmt);
548
549      return (1 << util_format_get_nr_components(outfmt)) - 1;
550   }
551
552   switch (imgfmt) {
553   case PIPE_FORMAT_S8_UINT:
554      return 1;
555   case PIPE_FORMAT_Z16_UNORM:
556      return 3;
557   case PIPE_FORMAT_Z16_UNORM_S8_UINT:
558      return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 3 : 8;
559   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
560      return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 7 : 8;
561   case PIPE_FORMAT_Z24X8_UNORM:
562      assert(aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT);
563      return 7;
564   case PIPE_FORMAT_Z32_FLOAT:
565      return 0xf;
566   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
567      return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 1 : 2;
568   default:
569      unreachable("Invalid depth format\n");
570   }
571}
572
/*
 * Copies one VkImageCopy2 region from @src to @dst with the pre-compiled
 * img -> img fragment shader: each destination layer is drawn as a quad
 * in its own batch, sampling the matching source layer.
 */
static void
panvk_meta_copy_img2img(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_image *src,
                        const struct panvk_image *dst,
                        const VkImageCopy2 *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
   /* Canonicalize formats/writemask to select the shader variant. */
   struct panvk_meta_copy_img2img_format_info key = {
      .srcfmt = panvk_meta_copy_img_format(src->pimage.layout.format),
      .dstfmt = panvk_meta_copy_img_format(dst->pimage.layout.format),
      .dstmask = panvk_meta_copy_img_mask(dst->pimage.layout.format,
                                          region->dstSubresource.aspectMask),
   };

   assert(src->pimage.layout.nr_samples == dst->pimage.layout.nr_samples);

   unsigned texdimidx =
      panvk_meta_copy_tex_type(src->pimage.layout.dim,
                               src->pimage.layout.array_size > 1);
   unsigned fmtidx =
      panvk_meta_copy_img2img_format_idx(key);
   unsigned ms = dst->pimage.layout.nr_samples > 1 ? 1 : 0;

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.img2img[ms][texdimidx][fmtidx].rsd;

   /* Cube maps are sampled as 2D; the layer selects the face. */
   struct pan_image_view srcview = {
      .format = key.srcfmt,
      .dim = src->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
             MALI_TEXTURE_DIMENSION_2D : src->pimage.layout.dim,
      .image = &src->pimage,
      .nr_samples = src->pimage.layout.nr_samples,
      .first_level = region->srcSubresource.mipLevel,
      .last_level = region->srcSubresource.mipLevel,
      .first_layer = region->srcSubresource.baseArrayLayer,
      .last_layer = region->srcSubresource.baseArrayLayer + region->srcSubresource.layerCount - 1,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   /* Rendered one layer at a time: first/last_layer are filled in the
    * per-layer loop below. */
   struct pan_image_view dstview = {
      .format = key.dstfmt,
      .dim = MALI_TEXTURE_DIMENSION_2D,
      .image = &dst->pimage,
      .nr_samples = dst->pimage.layout.nr_samples,
      .first_level = region->dstSubresource.mipLevel,
      .last_level = region->dstSubresource.mipLevel,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   unsigned minx = MAX2(region->dstOffset.x, 0);
   unsigned miny = MAX2(region->dstOffset.y, 0);
   unsigned maxx = MAX2(region->dstOffset.x + region->extent.width - 1, 0);
   unsigned maxy = MAX2(region->dstOffset.y + region->extent.height - 1, 0);

   mali_ptr vpd =
      panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
                                         minx, miny, maxx, maxy);

   /* Destination quad covering [minx, maxx+1) x [miny, maxy+1), one vec4
    * position per vertex; shared by all layers. */
   float dst_rect[] = {
      minx, miny, 0.0, 1.0,
      maxx + 1, miny, 0.0, 1.0,
      minx, maxy + 1, 0.0, 1.0,
      maxx + 1, maxy + 1, 0.0, 1.0,
   };

   mali_ptr dst_coords =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
                              sizeof(dst_rect), 64);

   /* TODO: don't force preloads of dst resources if unneeded */

   unsigned width = u_minify(dst->pimage.layout.width, region->dstSubresource.mipLevel);
   unsigned height = u_minify(dst->pimage.layout.height, region->dstSubresource.mipLevel);
   cmdbuf->state.fb.crc_valid[0] = false;
   /* Single-RT framebuffer over the destination level; the damage extent
    * is aligned down/up to 32-pixel tiles and clamped to the level. */
   *fbinfo = (struct pan_fb_info){
      .width = width,
      .height = height,
      .extent.minx = minx & ~31,
      .extent.miny = miny & ~31,
      .extent.maxx = MIN2(ALIGN_POT(maxx + 1, 32), width) - 1,
      .extent.maxy = MIN2(ALIGN_POT(maxy + 1, 32), height) - 1,
      .nr_samples = dst->pimage.layout.nr_samples,
      .rt_count = 1,
      .rts[0].view = &dstview,
      .rts[0].preload = true,
      .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
   };

   mali_ptr texture =
      panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &srcview);
   mali_ptr sampler =
      panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);

   /* Close the current batch before drawing with the new FB state. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   /* From here on, min/max describe the source rectangle; the z (layer)
    * coordinate is patched per iteration. */
   minx = MAX2(region->srcOffset.x, 0);
   miny = MAX2(region->srcOffset.y, 0);
   maxx = MAX2(region->srcOffset.x + region->extent.width - 1, 0);
   maxy = MAX2(region->srcOffset.y + region->extent.height - 1, 0);
   assert(region->dstOffset.z >= 0);

   unsigned first_src_layer = MAX2(0, region->srcOffset.z);
   unsigned first_dst_layer = MAX2(region->dstSubresource.baseArrayLayer, region->dstOffset.z);
   unsigned nlayers = MAX2(region->dstSubresource.layerCount, region->extent.depth);
   /* One batch and one tiler job per copied layer. */
   for (unsigned l = 0; l < nlayers; l++) {
      unsigned src_l = l + first_src_layer;
      float src_rect[] = {
         minx, miny, src_l, 1.0,
         maxx + 1, miny, src_l, 1.0,
         minx, maxy + 1, src_l, 1.0,
         maxx + 1, maxy + 1, src_l, 1.0,
      };

      mali_ptr src_coords =
         pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
                                 sizeof(src_rect), 64);

      struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

      dstview.first_layer = dstview.last_layer = l + first_dst_layer;
      batch->blit.src = src->pimage.data.bo;
      batch->blit.dst = dst->pimage.data.bo;
      panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
      panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
      panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);

      mali_ptr tsd, tiler;

      tsd = batch->tls.gpu;
      tiler = batch->tiler.descs.gpu;

      struct panfrost_ptr job;

      job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
                                           &batch->scoreboard,
                                           src_coords, dst_coords,
                                           texture, sampler, 0,
                                           vpd, rsd, tsd, tiler);

      util_dynarray_append(&batch->jobs, void *, job.cpu);
      panvk_per_arch(cmd_close_batch)(cmdbuf);
   }
}
717
/*
 * Pre-builds the img -> img copy pipelines (shader binary + renderer
 * state) for every supported format pair and texture dimension, for
 * either the single-sampled (is_ms == false) or multisampled
 * (is_ms == true) table.
 */
static void
panvk_meta_copy_img2img_init(struct panvk_physical_device *dev, bool is_ms)
{
   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);

   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
      for (unsigned texdim = 1; texdim <= 3; texdim++) {
         unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));

         /* No MSAA on 3D textures */
         if (texdim == 3 && is_ms) continue;

         /* Non-array variant first. */
         struct pan_shader_info shader_info;
         mali_ptr shader =
            panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
                                           panvk_meta_copy_img2img_fmts[i].srcfmt,
                                           panvk_meta_copy_img2img_fmts[i].dstfmt,
                                           panvk_meta_copy_img2img_fmts[i].dstmask,
                                           texdim, false, is_ms, &shader_info);
         dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
            panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
                                            shader, &shader_info,
                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
                                            panvk_meta_copy_img2img_fmts[i].dstmask,
                                            true);
         /* 1D/2D also get an array variant; 3D doesn't. */
         if (texdim == 3)
            continue;

         memset(&shader_info, 0, sizeof(shader_info));
         texdimidx = panvk_meta_copy_tex_type(texdim, true);
         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));
         shader =
            panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
                                           panvk_meta_copy_img2img_fmts[i].srcfmt,
                                           panvk_meta_copy_img2img_fmts[i].dstfmt,
                                           panvk_meta_copy_img2img_fmts[i].dstmask,
                                           texdim, true, is_ms, &shader_info);
         dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
            panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
                                            shader, &shader_info,
                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
                                            panvk_meta_copy_img2img_fmts[i].dstmask,
                                            true);
      }
   }
}
765
766void
767panvk_per_arch(CmdCopyImage2)(VkCommandBuffer commandBuffer,
768                              const VkCopyImageInfo2 *pCopyImageInfo)
769{
770   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
771   VK_FROM_HANDLE(panvk_image, dst, pCopyImageInfo->dstImage);
772   VK_FROM_HANDLE(panvk_image, src, pCopyImageInfo->srcImage);
773
774   for (unsigned i = 0; i < pCopyImageInfo->regionCount; i++) {
775      panvk_meta_copy_img2img(cmdbuf, src, dst, &pCopyImageInfo->pRegions[i]);
776   }
777}
778
779static unsigned
780panvk_meta_copy_buf_texelsize(enum pipe_format imgfmt, unsigned mask)
781{
782   unsigned imgtexelsz = util_format_get_blocksize(imgfmt);
783   unsigned nbufcomps = util_bitcount(mask);
784
785   if (nbufcomps == util_format_get_nr_components(imgfmt))
786      return imgtexelsz;
787
788   /* Special case for Z24 buffers which are not tightly packed */
789   if (mask == 7 && imgtexelsz == 4)
790      return 4;
791
792   /* Special case for S8 extraction from Z32_S8X24 */
793   if (mask == 2 && imgtexelsz == 8)
794      return 1;
795
796   unsigned compsz =
797      util_format_get_component_bits(imgfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
798
799   assert(!(compsz % 8));
800
801   return nbufcomps * compsz / 8;
802}
803
804static enum pipe_format
805panvk_meta_copy_buf2img_format(enum pipe_format imgfmt)
806{
807   /* Pick blendable formats when we can, and the FLOAT variant matching the
808    * texelsize otherwise.
809    */
810   switch (util_format_get_blocksize(imgfmt)) {
811   case 1: return PIPE_FORMAT_R8_UNORM;
812   /* AFBC stores things differently for RGB565,
813    * we can't simply map to R8G8 in that case */
814   case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
815                   imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
816                  PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
817   case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
818   case 6: return PIPE_FORMAT_R16G16B16_UINT;
819   case 8: return PIPE_FORMAT_R32G32_UINT;
820   case 12: return PIPE_FORMAT_R32G32B32_UINT;
821   case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
822   default: unreachable("Invalid format\n");
823   }
824}
825
/* Key identifying a meta-copy pipeline variant: the format the image is
 * accessed as for the copy, plus the mask of components actually
 * transferred. Used as a memcmp()-compared lookup key, hence PACKED.
 */
struct panvk_meta_copy_format_info {
   enum pipe_format imgfmt; /* copy view format */
   unsigned mask;           /* bitmask of copied components */
} PACKED;
830
/* All (format, mask) combinations supported by the buffer -> image copy
 * path. Entry count must match PANVK_META_COPY_BUF2IMG_NUM_FORMATS (see the
 * STATIC_ASSERT in panvk_meta_copy_buf2img_init()).
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_buf2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, 0x1 },
   { PIPE_FORMAT_R8G8_UNORM, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
849
/* Push-constant payload consumed by the buffer -> image copy shader. */
struct panvk_meta_copy_buf2img_info {
   struct {
      mali_ptr ptr;     /* GPU address of the source buffer data */
      struct {
         unsigned line; /* bytes between two buffer rows */
         unsigned surf; /* bytes between two buffer layers/slices */
      } stride;
   } buf;
} PACKED;
859
/* Emit a push-constant load of one panvk_meta_copy_buf2img_info field; the
 * load width and offset are derived from the C struct layout, so the push
 * buffer must hold the struct verbatim.
 */
#define panvk_meta_copy_buf2img_get_info_field(b, field) \
        nir_load_push_constant((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2img_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     .base = offsetof(struct panvk_meta_copy_buf2img_info, field), \
                     .range = ~0)
866
/*
 * Build the fragment shader used for buffer -> image copies.
 *
 * The shader reads the destination (x, y, layer) coordinate from a varying,
 * computes the matching byte offset in the source buffer (strides come from
 * push constants), loads the texel, converts it to the render-target
 * register format and writes it out. Returns the GPU address of the
 * uploaded binary; *shader_info is filled in by the compiler.
 */
static mali_ptr
panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               struct panvk_meta_copy_format_info key,
                               struct pan_shader_info *shader_info)
{
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_buf2img(imgfmt=%s,mask=%x)",
                                     util_format_name(key.imgfmt),
                                     key.mask);

   /* Destination coordinate (x, y, layer) interpolated from the quad
    * emitted by panvk_meta_copy_buf2img(). */
   nir_variable *coord_var =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_vector_type(GLSL_TYPE_FLOAT, 3),
                          "coord");
   coord_var->data.location = VARYING_SLOT_VAR0;
   nir_ssa_def *coord = nir_load_var(&b, coord_var);

   coord = nir_f2u32(&b, coord);

   nir_ssa_def *bufptr =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.ptr);
   nir_ssa_def *buflinestride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.line);
   nir_ssa_def *bufsurfstride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.surf);

   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned writemask = key.mask;

   /* Source byte offset: x * texelsize + y * line_stride + z * surf_stride. */
   nir_ssa_def *offset =
      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));

   /* Per-component size of the image format: byte-sized (blendable) formats
    * keep 8-bit components, larger formats use 16/32-bit components. */
   unsigned imgcompsz =
      (imgtexelsz <= 4 && key.imgfmt != PIPE_FORMAT_R5G6B5_UNORM) ?
      1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);

   unsigned nimgcomps = imgtexelsz / imgcompsz;
   unsigned bufcompsz = MIN2(buftexelsz, imgcompsz);
   unsigned nbufcomps = buftexelsz / bufcompsz;

   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
   assert(nbufcomps <= 4 && nimgcomps <= 4);

   nir_ssa_def *texel =
      nir_load_global(&b, bufptr, bufcompsz, nbufcomps, bufcompsz * 8);

   enum glsl_base_type basetype;
   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
      /* Unpack the 16-bit RGB565 texel into normalized floats for the
       * fixed-function blend unit. */
      texel = nir_vec3(&b,
                       nir_iand_imm(&b, texel, BITFIELD_MASK(5)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 5), BITFIELD_MASK(6)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 11), BITFIELD_MASK(5)));
      texel = nir_fmul(&b,
                       nir_u2f32(&b, texel),
                       nir_vec3(&b,
                                nir_imm_float(&b, 1.0f / 31),
                                nir_imm_float(&b, 1.0f / 63),
                                nir_imm_float(&b, 1.0f / 31)));
      nimgcomps = 3;
      basetype = GLSL_TYPE_FLOAT;
   } else if (imgcompsz == 1) {
      assert(bufcompsz == 1);
      /* Blendable formats are unorm and the fixed-function blend unit
       * takes float values.
       */
      texel = nir_fmul(&b, nir_u2f32(&b, texel),
                       nir_imm_float(&b, 1.0f / 255));
      basetype = GLSL_TYPE_FLOAT;
   } else {
      /* Raw integer path: just resize components to the image width. */
      texel = nir_u2uN(&b, texel, imgcompsz * 8);
      basetype = imgcompsz == 2 ? GLSL_TYPE_UINT16 : GLSL_TYPE_UINT;
   }

   /* We always pass the texel using 32-bit regs for now */
   nir_variable *out =
      nir_variable_create(b.shader, nir_var_shader_out,
                          glsl_vector_type(basetype, nimgcomps),
                          "out");
   out->data.location = FRAG_RESULT_DATA0;

   uint16_t fullmask = (1 << nimgcomps) - 1;

   assert(fullmask >= writemask);

   if (fullmask != writemask) {
      /* Partial write: merge the copied components with the existing
       * render-target content (read back through the output variable),
       * zero-filling where no old value is available. */
      unsigned first_written_comp = ffs(writemask) - 1;
      nir_ssa_def *oldtexel = NULL;
      if (imgcompsz > 1)
         oldtexel = nir_load_var(&b, out);

      nir_ssa_def *texel_comps[4];
      for (unsigned i = 0; i < nimgcomps; i++) {
         if (writemask & BITFIELD_BIT(i))
            texel_comps[i] = nir_channel(&b, texel, i - first_written_comp);
         else if (imgcompsz > 1)
            texel_comps[i] = nir_channel(&b, oldtexel, i);
         else
            texel_comps[i] = nir_imm_intN_t(&b, 0, texel->bit_size);
      }

      texel = nir_vec(&b, texel_comps, nimgcomps);
   }

   nir_store_var(&b, out, texel, 0xff);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
      .no_ubo_to_push = true,
   };

   /* Fixed render-target conversion matching the integer paths above. */
   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
      cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
      cfg.register_format = imgcompsz == 2 ?
                            MALI_REGISTER_FILE_FORMAT_U16 :
                            MALI_REGISTER_FILE_FORMAT_U32;
   }
   inputs.bifrost.static_rt_conv = true;

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
   /* Push constants are counted in 32-bit words. */
   shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4);

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size, 128);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
1009
1010static unsigned
1011panvk_meta_copy_buf2img_format_idx(struct panvk_meta_copy_format_info key)
1012{
1013   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1014      if (!memcmp(&key, &panvk_meta_copy_buf2img_fmts[i], sizeof(key)))
1015         return i;
1016   }
1017
1018   unreachable("Invalid image format\n");
1019}
1020
/*
 * Record a buffer -> image copy for one VkBufferImageCopy2 region.
 *
 * The copy is done on the render path: for each layer, a quad covering the
 * destination rectangle is drawn with a fragment shader that fetches texels
 * from the source buffer (see panvk_meta_copy_buf2img_shader). Each layer
 * gets its own batch since the framebuffer targets a single layer at a time.
 */
static void
panvk_meta_copy_buf2img(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy2 *region)
{
   struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
   /* Destination rectangle, clamped to non-negative coordinates. */
   unsigned minx = MAX2(region->imageOffset.x, 0);
   unsigned miny = MAX2(region->imageOffset.y, 0);
   unsigned maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0);
   unsigned maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);

   mali_ptr vpd =
      panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
                                         minx, miny, maxx, maxy);

   /* Quad covering the destination rectangle (x, y, z, w per vertex). */
   float dst_rect[] = {
      minx, miny, 0.0, 1.0,
      maxx + 1, miny, 0.0, 1.0,
      minx, maxy + 1, 0.0, 1.0,
      maxx + 1, maxy + 1, 0.0, 1.0,
   };
   mali_ptr dst_coords =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
                              sizeof(dst_rect), 64);

   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_buf2img_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };

   unsigned fmtidx = panvk_meta_copy_buf2img_format_idx(key);

   /* RSD (with the matching shader) was pre-baked at device init. */
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2img[fmtidx].rsd;

   const struct vk_image_buffer_layout buflayout =
      vk_image_buffer_copy_layout(&img->vk, region);
   struct panvk_meta_copy_buf2img_info info = {
      .buf.ptr = panvk_buffer_gpu_ptr(buf, region->bufferOffset),
      .buf.stride.line = buflayout.row_stride_B,
      .buf.stride.surf = buflayout.image_stride_B,
   };

   mali_ptr pushconsts =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, &info, sizeof(info), 16);

   /* Render-target view of the destination mip level; the layer is patched
    * per iteration below. */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = MALI_TEXTURE_DIMENSION_2D,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   /* TODO: don't force preloads of dst resources if unneeded */
   cmdbuf->state.fb.crc_valid[0] = false;
   *fbinfo = (struct pan_fb_info){
      .width = u_minify(img->pimage.layout.width, region->imageSubresource.mipLevel),
      .height = u_minify(img->pimage.layout.height, region->imageSubresource.mipLevel),
      .extent.minx = minx,
      .extent.maxx = maxx,
      .extent.miny = miny,
      .extent.maxy = maxy,
      .nr_samples = 1,
      .rt_count = 1,
      .rts[0].view = &view,
      .rts[0].preload = true,
      .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
   };

   panvk_per_arch(cmd_close_batch)(cmdbuf);

   /* Either array layers or 3D depth, never both (Vulkan valid usage). */
   assert(region->imageSubresource.layerCount == 1 ||
          region->imageExtent.depth == 1);
   assert(region->imageOffset.z >= 0);
   unsigned first_layer = MAX2(region->imageSubresource.baseArrayLayer, region->imageOffset.z);
   unsigned nlayers = MAX2(region->imageSubresource.layerCount, region->imageExtent.depth);
   for (unsigned l = 0; l < nlayers; l++) {
      /* Source coordinates; z carries the layer index so the fragment
       * shader can compute the buffer surface offset. */
      float src_rect[] = {
         0, 0, l, 1.0,
         region->imageExtent.width, 0, l, 1.0,
         0, region->imageExtent.height, l, 1.0,
         region->imageExtent.width, region->imageExtent.height, l, 1.0,
      };

      mali_ptr src_coords =
         pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
                                 sizeof(src_rect), 64);

      struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

      view.first_layer = view.last_layer = l + first_layer;
      batch->blit.src = buf->bo;
      batch->blit.dst = img->pimage.data.bo;
      panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
      panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
      panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);

      mali_ptr tsd, tiler;

      tsd = batch->tls.gpu;
      tiler = batch->tiler.descs.gpu;

      struct panfrost_ptr job;

      job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
                                           &batch->scoreboard,
                                           src_coords, dst_coords,
                                           0, 0, pushconsts,
                                           vpd, rsd, tsd, tiler);

      util_dynarray_append(&batch->jobs, void *, job.cpu);
      panvk_per_arch(cmd_close_batch)(cmdbuf);
   }
}
1140
1141static void
1142panvk_meta_copy_buf2img_init(struct panvk_physical_device *dev)
1143{
1144   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_buf2img_fmts) == PANVK_META_COPY_BUF2IMG_NUM_FORMATS);
1145
1146   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1147      struct pan_shader_info shader_info;
1148      mali_ptr shader =
1149         panvk_meta_copy_buf2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
1150                                        panvk_meta_copy_buf2img_fmts[i],
1151                                        &shader_info);
1152      dev->meta.copy.buf2img[i].rsd =
1153         panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1154                                         shader, &shader_info,
1155                                         panvk_meta_copy_buf2img_fmts[i].imgfmt,
1156                                         panvk_meta_copy_buf2img_fmts[i].mask,
1157                                         false);
1158   }
1159}
1160
1161void
1162panvk_per_arch(CmdCopyBufferToImage2)(VkCommandBuffer commandBuffer,
1163                                      const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
1164{
1165   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1166   VK_FROM_HANDLE(panvk_buffer, buf, pCopyBufferToImageInfo->srcBuffer);
1167   VK_FROM_HANDLE(panvk_image, img, pCopyBufferToImageInfo->dstImage);
1168
1169   for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; i++) {
1170      panvk_meta_copy_buf2img(cmdbuf, buf, img, &pCopyBufferToImageInfo->pRegions[i]);
1171   }
1172}
1173
/* All (format, mask) combinations supported by the image -> buffer copy
 * path. Entry count must match PANVK_META_COPY_IMG2BUF_NUM_FORMATS (see the
 * STATIC_ASSERT in panvk_meta_copy_img2buf_init()).
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_img2buf_fmts[] = {
   { PIPE_FORMAT_R8_UINT, 0x1 },
   { PIPE_FORMAT_R8G8_UINT, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UINT, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
1192
1193static enum pipe_format
1194panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)
1195{
1196   /* Pick blendable formats when we can, and the FLOAT variant matching the
1197    * texelsize otherwise.
1198    */
1199   switch (util_format_get_blocksize(imgfmt)) {
1200   case 1: return PIPE_FORMAT_R8_UINT;
1201   /* AFBC stores things differently for RGB565,
1202    * we can't simply map to R8G8 in that case */
1203   case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
1204                   imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
1205                  PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UINT;
1206   case 4: return PIPE_FORMAT_R8G8B8A8_UINT;
1207   case 6: return PIPE_FORMAT_R16G16B16_UINT;
1208   case 8: return PIPE_FORMAT_R32G32_UINT;
1209   case 12: return PIPE_FORMAT_R32G32B32_UINT;
1210   case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
1211   default: unreachable("Invalid format\n");
1212   }
1213}
1214
/* Push-constant payload consumed by the image -> buffer copy shader. */
struct panvk_meta_copy_img2buf_info {
   struct {
      mali_ptr ptr;     /* GPU address of the destination buffer data */
      struct {
         unsigned line; /* bytes between two buffer rows */
         unsigned surf; /* bytes between two buffer layers/slices */
      } stride;
   } buf;
   struct {
      struct {
         /* Copy origin in the image, x/y aligned down to the 16-texel
          * dispatch grid (see panvk_meta_copy_img2buf()). */
         unsigned x, y, z;
      } offset;
      struct {
         /* Inclusive copy bounds used by the shader's in-bounds check. */
         unsigned minx, miny, maxx, maxy;
      } extent;
   } img;
} PACKED;
1232
/* Emit a push-constant load of one panvk_meta_copy_img2buf_info field; the
 * load width and offset are derived from the C struct layout, so the push
 * buffer must hold the struct verbatim.
 */
#define panvk_meta_copy_img2buf_get_info_field(b, field) \
        nir_load_push_constant((b), 1, \
                     sizeof(((struct panvk_meta_copy_img2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     .base = offsetof(struct panvk_meta_copy_img2buf_info, field), \
                     .range = ~0)
1239
1240static mali_ptr
1241panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
1242                               struct pan_pool *bin_pool,
1243                               struct panvk_meta_copy_format_info key,
1244                               unsigned texdim, unsigned texisarray,
1245                               struct pan_shader_info *shader_info)
1246{
1247   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
1248   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
1249
1250   /* FIXME: Won't work on compute queues, but we can't do that with
1251    * a compute shader if the destination is an AFBC surface.
1252    */
1253   nir_builder b =
1254      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1255                                     GENX(pan_shader_get_compiler_options)(),
1256                                     "panvk_meta_copy_img2buf(dim=%dD%s,imgfmt=%s,mask=%x)",
1257                                     texdim, texisarray ? "[]" : "",
1258                                     util_format_name(key.imgfmt),
1259                                     key.mask);
1260
1261   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1262   nir_ssa_def *bufptr =
1263      panvk_meta_copy_img2buf_get_info_field(&b, buf.ptr);
1264   nir_ssa_def *buflinestride =
1265      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.line);
1266   nir_ssa_def *bufsurfstride =
1267      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.surf);
1268
1269   nir_ssa_def *imgminx =
1270      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.minx);
1271   nir_ssa_def *imgminy =
1272      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.miny);
1273   nir_ssa_def *imgmaxx =
1274      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxx);
1275   nir_ssa_def *imgmaxy =
1276      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxy);
1277
1278   nir_ssa_def *imgcoords, *inbounds;
1279
1280   switch (texdim + texisarray) {
1281   case 1:
1282      imgcoords =
1283         nir_iadd(&b,
1284                  nir_channel(&b, coord, 0),
1285                  panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x));
1286      inbounds =
1287         nir_iand(&b,
1288                  nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1289                  nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx));
1290      break;
1291   case 2:
1292      imgcoords =
1293         nir_vec2(&b,
1294                  nir_iadd(&b,
1295                           nir_channel(&b, coord, 0),
1296                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1297                  nir_iadd(&b,
1298                           nir_channel(&b, coord, 1),
1299                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1300      inbounds =
1301         nir_iand(&b,
1302                  nir_iand(&b,
1303                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1304                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1305                  nir_iand(&b,
1306                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1307                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1308      break;
1309   case 3:
1310      imgcoords =
1311         nir_vec3(&b,
1312                  nir_iadd(&b,
1313                           nir_channel(&b, coord, 0),
1314                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1315                  nir_iadd(&b,
1316                           nir_channel(&b, coord, 1),
1317                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)),
1318                  nir_iadd(&b,
1319                           nir_channel(&b, coord, 2),
1320                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1321      inbounds =
1322         nir_iand(&b,
1323                  nir_iand(&b,
1324                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1325                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1326                  nir_iand(&b,
1327                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1328                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1329      break;
1330   default:
1331      unreachable("Invalid texture dimension\n");
1332   }
1333
1334   nir_push_if(&b, inbounds);
1335
1336   /* FIXME: doesn't work for tiled+compressed formats since blocks are 4x4
1337    * blocks instead of 16x16 texels in that case, and there's nothing we can
1338    * do to force the tile size to 4x4 in the render path.
1339    * This being said, compressed textures are not compatible with AFBC, so we
1340    * could use a compute shader arranging the blocks properly.
1341    */
1342   nir_ssa_def *offset =
1343      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
1344   offset = nir_iadd(&b, offset,
1345                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
1346   offset = nir_iadd(&b, offset,
1347                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
1348   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));
1349
1350   unsigned imgcompsz = imgtexelsz <= 4 ?
1351                        1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);
1352   unsigned nimgcomps = imgtexelsz / imgcompsz;
1353   assert(nimgcomps <= 4);
1354
1355   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
1356   tex->op = nir_texop_txf;
1357   tex->texture_index = 0;
1358   tex->is_array = texisarray;
1359   tex->dest_type = util_format_is_unorm(key.imgfmt) ?
1360                    nir_type_float32 : nir_type_uint32;
1361
1362   switch (texdim) {
1363   case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
1364   case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
1365   case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
1366   default: unreachable("Invalid texture dimension");
1367   }
1368
1369   tex->src[0].src_type = nir_tex_src_coord;
1370   tex->src[0].src = nir_src_for_ssa(imgcoords);
1371   tex->coord_components = texdim + texisarray;
1372   nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
1373                     nir_alu_type_get_type_size(tex->dest_type), NULL);
1374   nir_builder_instr_insert(&b, &tex->instr);
1375
1376   nir_ssa_def *texel = &tex->dest.ssa;
1377
1378   unsigned fullmask = (1 << util_format_get_nr_components(key.imgfmt)) - 1;
1379   unsigned nbufcomps = util_bitcount(fullmask);
1380   if (key.mask != fullmask) {
1381      nir_ssa_def *bufcomps[4];
1382      nbufcomps = 0;
1383      for (unsigned i = 0; i < nimgcomps; i++) {
1384         if (key.mask & BITFIELD_BIT(i))
1385            bufcomps[nbufcomps++] = nir_channel(&b, texel, i);
1386      }
1387
1388      texel = nir_vec(&b, bufcomps, nbufcomps);
1389   }
1390
1391   unsigned bufcompsz = buftexelsz / nbufcomps;
1392
1393   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
1394      texel = nir_fmul(&b, texel,
1395                       nir_vec3(&b,
1396                                nir_imm_float(&b, 31),
1397                                nir_imm_float(&b, 63),
1398                                nir_imm_float(&b, 31)));
1399      texel = nir_f2u16(&b, texel);
1400      texel = nir_ior(&b, nir_channel(&b, texel, 0),
1401                      nir_ior(&b,
1402                              nir_ishl(&b, nir_channel(&b, texel, 1), nir_imm_int(&b, 5)),
1403                              nir_ishl(&b, nir_channel(&b, texel, 2), nir_imm_int(&b, 11))));
1404      imgcompsz = 2;
1405      bufcompsz = 2;
1406      nbufcomps = 1;
1407      nimgcomps = 1;
1408   } else if (imgcompsz == 1) {
1409      nir_ssa_def *packed = nir_channel(&b, texel, 0);
1410      for (unsigned i = 1; i < nbufcomps; i++) {
1411         packed = nir_ior(&b, packed,
1412                          nir_ishl(&b, nir_iand_imm(&b, nir_channel(&b, texel, i), 0xff),
1413                                   nir_imm_int(&b, i * 8)));
1414      }
1415      texel = packed;
1416
1417      bufcompsz = nbufcomps == 3 ? 4 : nbufcomps;
1418      nbufcomps = 1;
1419   }
1420
1421   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
1422   assert(nbufcomps <= 4 && nimgcomps <= 4);
1423   texel = nir_u2uN(&b, texel, bufcompsz * 8);
1424
1425   nir_store_global(&b, bufptr, bufcompsz, texel, (1 << nbufcomps) - 1);
1426   nir_pop_if(&b, NULL);
1427
1428   struct panfrost_compile_inputs inputs = {
1429      .gpu_id = pdev->gpu_id,
1430      .is_blit = true,
1431      .no_ubo_to_push = true,
1432   };
1433
1434   struct util_dynarray binary;
1435
1436   util_dynarray_init(&binary, NULL);
1437   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1438
1439   shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4);
1440
1441   mali_ptr shader =
1442      pan_pool_upload_aligned(bin_pool, binary.data, binary.size, 128);
1443
1444   util_dynarray_fini(&binary);
1445   ralloc_free(b.shader);
1446
1447   return shader;
1448}
1449
1450static unsigned
1451panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)
1452{
1453   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
1454      if (!memcmp(&key, &panvk_meta_copy_img2buf_fmts[i], sizeof(key)))
1455         return i;
1456   }
1457
1458   unreachable("Invalid texel size\n");
1459}
1460
/*
 * Record an image -> buffer copy for one VkBufferImageCopy2 region.
 *
 * The copy runs as a single compute dispatch: one invocation per texel in a
 * 16x16-aligned grid covering the copy rectangle, with out-of-rectangle
 * invocations discarded in the shader (see panvk_meta_copy_img2buf_shader).
 */
static void
panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy2 *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_img2buf_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned texdimidx =
      panvk_meta_copy_tex_type(img->pimage.layout.dim,
                               img->pimage.layout.array_size > 1);
   unsigned fmtidx = panvk_meta_copy_img2buf_format_idx(key);

   /* RSD (with the matching shader) was pre-baked at device init. */
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].rsd;

   /* Offsets are aligned down to the 16-texel grid; the exact (inclusive)
    * copy bounds go in img.extent for the shader's in-bounds check. */
   struct panvk_meta_copy_img2buf_info info = {
      .buf.ptr = panvk_buffer_gpu_ptr(buf, region->bufferOffset),
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
      .img.offset.x = MAX2(region->imageOffset.x & ~15, 0),
      .img.extent.minx = MAX2(region->imageOffset.x, 0),
      .img.extent.maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0),
   };

   if (img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D) {
      /* 1D arrays: y carries the layer index, no y offset/min needed. */
      info.img.extent.maxy = region->imageSubresource.layerCount - 1;
   } else {
      info.img.offset.y = MAX2(region->imageOffset.y & ~15, 0);
      info.img.offset.z = MAX2(region->imageOffset.z, 0);
      info.img.extent.miny = MAX2(region->imageOffset.y, 0);
      info.img.extent.maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);
   }

   info.buf.stride.surf = (region->bufferImageHeight ? : region->imageExtent.height) *
                          info.buf.stride.line;

   mali_ptr pushconsts =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, &info, sizeof(info), 16);

   /* Texture view of the source subresource; cube maps are sampled as 2D
    * arrays. */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
             MALI_TEXTURE_DIMENSION_2D : img->pimage.layout.dim,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .first_layer = region->imageSubresource.baseArrayLayer,
      .last_layer = region->imageSubresource.baseArrayLayer + region->imageSubresource.layerCount - 1,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   mali_ptr texture =
      panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &view);
   mali_ptr sampler =
      panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);

   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   struct pan_tls_info tlsinfo = { 0 };

   batch->blit.src = img->pimage.data.bo;
   batch->blit.dst = buf->bo;
   batch->tls =
      pan_pool_alloc_desc(&cmdbuf->desc_pool.base, LOCAL_STORAGE);
   GENX(pan_emit_tls)(&tlsinfo, batch->tls.cpu);

   mali_ptr tsd = batch->tls.gpu;

   /* 16x16 workgroups (16x1 for 1D), grid sized to cover the aligned copy
    * rectangle; z covers layers/depth. */
   struct pan_compute_dim wg_sz = {
      16,
      img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ? 1 : 16,
      1,
   };

   struct pan_compute_dim num_wg = {
     (ALIGN_POT(info.img.extent.maxx + 1, 16) - info.img.offset.x) / 16,
     img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ?
        region->imageSubresource.layerCount :
        (ALIGN_POT(info.img.extent.maxy + 1, 16) - info.img.offset.y) / 16,
     img->pimage.layout.dim != MALI_TEXTURE_DIMENSION_1D ?
        MAX2(region->imageSubresource.layerCount, region->imageExtent.depth) : 1,
   };

   struct panfrost_ptr job =
      panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                       &batch->scoreboard, &num_wg, &wg_sz,
                                       texture, sampler,
                                       pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1562
1563static void
1564panvk_meta_copy_img2buf_init(struct panvk_physical_device *dev)
1565{
1566   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) == PANVK_META_COPY_IMG2BUF_NUM_FORMATS);
1567
1568   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
1569      for (unsigned texdim = 1; texdim <= 3; texdim++) {
1570         unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
1571         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
1572
1573         struct pan_shader_info shader_info;
1574         mali_ptr shader =
1575            panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1576                                           panvk_meta_copy_img2buf_fmts[i],
1577                                           texdim, false, &shader_info);
1578         dev->meta.copy.img2buf[texdimidx][i].rsd =
1579            panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
1580                                            &dev->meta.desc_pool.base,
1581                                            shader, &shader_info, true);
1582
1583         if (texdim == 3)
1584            continue;
1585
1586         memset(&shader_info, 0, sizeof(shader_info));
1587         texdimidx = panvk_meta_copy_tex_type(texdim, true);
1588         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
1589         shader =
1590            panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1591                                           panvk_meta_copy_img2buf_fmts[i],
1592                                           texdim, true, &shader_info);
1593         dev->meta.copy.img2buf[texdimidx][i].rsd =
1594            panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
1595                                            &dev->meta.desc_pool.base,
1596                                            shader, &shader_info, true);
1597      }
1598   }
1599}
1600
1601void
1602panvk_per_arch(CmdCopyImageToBuffer2)(VkCommandBuffer commandBuffer,
1603                                      const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
1604{
1605   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1606   VK_FROM_HANDLE(panvk_buffer, buf, pCopyImageToBufferInfo->dstBuffer);
1607   VK_FROM_HANDLE(panvk_image, img, pCopyImageToBufferInfo->srcImage);
1608
1609   for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; i++) {
1610      panvk_meta_copy_img2buf(cmdbuf, buf, img, &pCopyImageToBufferInfo->pRegions[i]);
1611   }
1612}
1613
/* Push-constant payload for the buffer-to-buffer copy shader: GPU
 * addresses of the source and destination, already offset to the start
 * of the copied range. PACKED so the C layout matches the shader's
 * push-constant offsets computed with offsetof().
 */
struct panvk_meta_copy_buf2buf_info {
   mali_ptr src;
   mali_ptr dst;
} PACKED;
1618
/* Emits a NIR push-constant load of one panvk_meta_copy_buf2buf_info
 * field; the load's bit-size and offset are derived from the struct
 * member itself, so shader and C layout stay in sync.
 */
#define panvk_meta_copy_buf2buf_get_info_field(b, field) \
        nir_load_push_constant((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     .base = offsetof(struct panvk_meta_copy_buf2buf_info, field), \
                     .range = ~0)
1625
1626static mali_ptr
1627panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
1628                               struct pan_pool *bin_pool,
1629                               unsigned blksz,
1630                               struct pan_shader_info *shader_info)
1631{
1632   /* FIXME: Won't work on compute queues, but we can't do that with
1633    * a compute shader if the destination is an AFBC surface.
1634    */
1635   nir_builder b =
1636      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1637                                     GENX(pan_shader_get_compiler_options)(),
1638                                     "panvk_meta_copy_buf2buf(blksz=%d)",
1639                                     blksz);
1640
1641   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1642
1643   nir_ssa_def *offset =
1644      nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, blksz)));
1645   nir_ssa_def *srcptr =
1646      nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, src), offset);
1647   nir_ssa_def *dstptr =
1648      nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, dst), offset);
1649
1650   unsigned compsz = blksz < 4 ? blksz : 4;
1651   unsigned ncomps = blksz / compsz;
1652   nir_store_global(&b, dstptr, blksz,
1653                    nir_load_global(&b, srcptr, blksz, ncomps, compsz * 8),
1654                    (1 << ncomps) - 1);
1655
1656   struct panfrost_compile_inputs inputs = {
1657      .gpu_id = pdev->gpu_id,
1658      .is_blit = true,
1659      .no_ubo_to_push = true,
1660   };
1661
1662   struct util_dynarray binary;
1663
1664   util_dynarray_init(&binary, NULL);
1665   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1666
1667   shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4);
1668
1669   mali_ptr shader =
1670      pan_pool_upload_aligned(bin_pool, binary.data, binary.size, 128);
1671
1672   util_dynarray_fini(&binary);
1673   ralloc_free(b.shader);
1674
1675   return shader;
1676}
1677
1678static void
1679panvk_meta_copy_buf2buf_init(struct panvk_physical_device *dev)
1680{
1681   for (unsigned i = 0; i < ARRAY_SIZE(dev->meta.copy.buf2buf); i++) {
1682      struct pan_shader_info shader_info;
1683      mali_ptr shader =
1684         panvk_meta_copy_buf2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1685                                        1 << i, &shader_info);
1686      dev->meta.copy.buf2buf[i].rsd =
1687         panvk_meta_copy_to_buf_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1688                                         shader, &shader_info, false);
1689   }
1690}
1691
/* Records a compute batch that copies `region` bytes from `src` to
 * `dst`. The per-invocation block size (1 << log2blksz, at most 16
 * bytes) is the largest power of two dividing both addresses and the
 * region size, so every invocation moves one whole block.
 */
static void
panvk_meta_copy_buf2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *src,
                        const struct panvk_buffer *dst,
                        const VkBufferCopy2 *region)
{
   struct panvk_meta_copy_buf2buf_info info = {
      .src = panvk_buffer_gpu_ptr(src, region->srcOffset),
      .dst = panvk_buffer_gpu_ptr(dst, region->dstOffset),
   };

   /* ffs() of the low 4 bits gives the limiting alignment among the two
    * addresses and the size; zero means everything is 16-byte aligned,
    * which selects the largest (16-byte) block variant.
    */
   unsigned alignment = ffs((info.src | info.dst | region->size) & 15);
   unsigned log2blksz = alignment ? alignment - 1 : 4;

   assert(log2blksz < ARRAY_SIZE(cmdbuf->device->physical_device->meta.copy.buf2buf));
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;

   mali_ptr pushconsts =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, &info, sizeof(info), 16);

   /* Meta copies run in a batch of their own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One single-invocation workgroup per copied block. */
   unsigned nblocks = region->size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* NOTE(review): blit.src/dst appear to record the BOs this batch
    * touches so they stay referenced — confirm against batch handling. */
   batch->blit.src = src->bo;
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1736
1737void
1738panvk_per_arch(CmdCopyBuffer2)(VkCommandBuffer commandBuffer,
1739                               const VkCopyBufferInfo2 *pCopyBufferInfo)
1740{
1741   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1742   VK_FROM_HANDLE(panvk_buffer, src, pCopyBufferInfo->srcBuffer);
1743   VK_FROM_HANDLE(panvk_buffer, dst, pCopyBufferInfo->dstBuffer);
1744
1745   for (unsigned i = 0; i < pCopyBufferInfo->regionCount; i++) {
1746      panvk_meta_copy_buf2buf(cmdbuf, src, dst, &pCopyBufferInfo->pRegions[i]);
1747   }
1748}
1749
/* Push-constant payload for the buffer-fill shader: GPU address of the
 * first 32-bit word to write and the fill value. PACKED so the C layout
 * matches the shader's push-constant offsets computed with offsetof().
 */
struct panvk_meta_fill_buf_info {
   mali_ptr start;
   uint32_t val;
} PACKED;
1754
/* Emits a NIR push-constant load of one panvk_meta_fill_buf_info field;
 * bit-size and offset are derived from the struct member so shader and
 * C layout stay in sync.
 */
#define panvk_meta_fill_buf_get_info_field(b, field) \
        nir_load_push_constant((b), 1, \
                     sizeof(((struct panvk_meta_fill_buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     .base = offsetof(struct panvk_meta_fill_buf_info, field), \
                     .range = ~0)
1761
1762static mali_ptr
1763panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
1764                           struct pan_pool *bin_pool,
1765                           struct pan_shader_info *shader_info)
1766{
1767   /* FIXME: Won't work on compute queues, but we can't do that with
1768    * a compute shader if the destination is an AFBC surface.
1769    */
1770   nir_builder b =
1771      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1772                                     GENX(pan_shader_get_compiler_options)(),
1773                                     "panvk_meta_fill_buf()");
1774
1775   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1776
1777   nir_ssa_def *offset =
1778      nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, sizeof(uint32_t))));
1779   nir_ssa_def *ptr =
1780      nir_iadd(&b, panvk_meta_fill_buf_get_info_field(&b, start), offset);
1781   nir_ssa_def *val = panvk_meta_fill_buf_get_info_field(&b, val);
1782
1783   nir_store_global(&b, ptr, sizeof(uint32_t), val, 1);
1784
1785   struct panfrost_compile_inputs inputs = {
1786      .gpu_id = pdev->gpu_id,
1787      .is_blit = true,
1788      .no_ubo_to_push = true,
1789   };
1790
1791   struct util_dynarray binary;
1792
1793   util_dynarray_init(&binary, NULL);
1794   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1795
1796   shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4);
1797
1798   mali_ptr shader =
1799      pan_pool_upload_aligned(bin_pool, binary.data, binary.size, 128);
1800
1801   util_dynarray_fini(&binary);
1802   ralloc_free(b.shader);
1803
1804   return shader;
1805}
1806
1807static mali_ptr
1808panvk_meta_fill_buf_emit_rsd(struct panfrost_device *pdev,
1809                             struct pan_pool *bin_pool,
1810                             struct pan_pool *desc_pool)
1811{
1812   struct pan_shader_info shader_info;
1813
1814   mali_ptr shader =
1815      panvk_meta_fill_buf_shader(pdev, bin_pool, &shader_info);
1816
1817   struct panfrost_ptr rsd_ptr =
1818      pan_pool_alloc_desc_aggregate(desc_pool,
1819                                    PAN_DESC(RENDERER_STATE));
1820
1821   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
1822      pan_shader_prepare_rsd(&shader_info, shader, &cfg);
1823   }
1824
1825   return rsd_ptr.gpu;
1826}
1827
/* Pre-bakes the single renderer state descriptor used by
 * panvk_meta_fill_buf(); the fill shader has no per-format or per-size
 * variants.
 */
static void
panvk_meta_fill_buf_init(struct panvk_physical_device *dev)
{
   dev->meta.copy.fillbuf.rsd =
      panvk_meta_fill_buf_emit_rsd(&dev->pdev, &dev->meta.bin_pool.base,
                                   &dev->meta.desc_pool.base);
}
1835
/* Records a compute batch that fills a range of `dst` with the 32-bit
 * pattern `val` (one invocation per word).
 *
 * Note the (size, offset) parameter order — the reverse of the
 * vkCmdFillBuffer (offset, size) order.
 */
static void
panvk_meta_fill_buf(struct panvk_cmd_buffer *cmdbuf,
                    const struct panvk_buffer *dst,
                    VkDeviceSize size, VkDeviceSize offset,
                    uint32_t val)
{
   struct panvk_meta_fill_buf_info info = {
      .start = panvk_buffer_gpu_ptr(dst, offset),
      .val = val,
   };
   /* Resolves VK_WHOLE_SIZE and clamps to the buffer's actual range. */
   size = panvk_buffer_range(dst, offset, size);

   /* From the Vulkan spec:
    *
    *    "size is the number of bytes to fill, and must be either a multiple
    *    of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of
    *    the buffer. If VK_WHOLE_SIZE is used and the remaining size of the
    *    buffer is not a multiple of 4, then the nearest smaller multiple is
    *    used."
    */
   size &= ~3ull;

   assert(!(offset & 3) && !(size & 3));

   unsigned nwords = size / sizeof(uint32_t);
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.fillbuf.rsd;

   mali_ptr pushconsts =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, &info, sizeof(info), 16);

   /* Meta operations run in a batch of their own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One single-invocation workgroup per word written. */
   struct pan_compute_dim num_wg = { nwords, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* NOTE(review): blit.dst appears to record the written BO so it stays
    * referenced by the batch — confirm against batch handling. */
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1888
1889void
1890panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer,
1891                              VkBuffer dstBuffer,
1892                              VkDeviceSize dstOffset,
1893                              VkDeviceSize fillSize,
1894                              uint32_t data)
1895{
1896   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1897   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);
1898
1899   panvk_meta_fill_buf(cmdbuf, dst, fillSize, dstOffset, data);
1900}
1901
/* Implements vkCmdUpdateBuffer: stages `data` in the command buffer's
 * descriptor pool, then records a GPU copy to `dst` reusing the 4-byte
 * buf2buf pipeline (dataSize is required by the Vulkan spec to be a
 * multiple of 4, so the 4-byte-block variant always applies).
 */
static void
panvk_meta_update_buf(struct panvk_cmd_buffer *cmdbuf,
                      const struct panvk_buffer *dst, VkDeviceSize offset,
                      VkDeviceSize size, const void *data)
{
   struct panvk_meta_copy_buf2buf_info info = {
      /* Stage the CPU data in GPU-visible pool memory. */
      .src = pan_pool_upload_aligned(&cmdbuf->desc_pool.base, data, size, 4),
      .dst = panvk_buffer_gpu_ptr(dst, offset),
   };

   /* log2(4) == 2: always use the 4-byte block copy variant. */
   unsigned log2blksz = ffs(sizeof(uint32_t)) - 1;

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;

   mali_ptr pushconsts =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, &info, sizeof(info), 16);

   /* Meta operations run in a batch of their own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One single-invocation workgroup per 4-byte block. */
   unsigned nblocks = size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* NOTE(review): only the destination BO is recorded here; the staging
    * source lives in the desc pool, presumably tracked elsewhere —
    * confirm against batch/pool handling. */
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1942
1943void
1944panvk_per_arch(CmdUpdateBuffer)(VkCommandBuffer commandBuffer,
1945                                VkBuffer dstBuffer,
1946                                VkDeviceSize dstOffset,
1947                                VkDeviceSize dataSize,
1948                                const void *pData)
1949{
1950   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1951   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);
1952
1953   panvk_meta_update_buf(cmdbuf, dst, dstOffset, dataSize, pData);
1954}
1955
/* Physical-device init entry point: pre-compiles every meta-copy shader
 * variant and its renderer state descriptor, so command recording only
 * has to look up the pre-baked pipeline.
 */
void
panvk_per_arch(meta_copy_init)(struct panvk_physical_device *dev)
{
   /* NOTE(review): the bool selects between two img2img variant sets —
    * confirm its meaning against panvk_meta_copy_img2img_init(). */
   panvk_meta_copy_img2img_init(dev, false);
   panvk_meta_copy_img2img_init(dev, true);
   panvk_meta_copy_buf2img_init(dev);
   panvk_meta_copy_img2buf_init(dev);
   panvk_meta_copy_buf2buf_init(dev);
   panvk_meta_fill_buf_init(dev);
}
1966