1 /*
2  * Copyright © 2021 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "v3dv_meta_common.h"
26 
27 #include "broadcom/common/v3d_macros.h"
28 #include "broadcom/common/v3d_tfu.h"
29 #include "broadcom/cle/v3dx_pack.h"
30 #include "broadcom/compiler/v3d_compiler.h"
31 
/* Parameters describing an optional clear executed as part of an RCL job
 * emitted by the helpers in this file.
 */
struct rcl_clear_info {
   const union v3dv_clear_value *clear_value; /* Color or Z/S clear values */
   struct v3dv_image *image;   /* Image being cleared (may be NULL) */
   VkImageAspectFlags aspects; /* Aspects (color/depth/stencil) to clear */
   uint32_t level;             /* Mip level of 'image' targeted by the clear */
};
38 
/* Emits the RCL prologue shared by all meta operations in this file:
 * common tile rendering mode configuration, optional clear color setup,
 * render target configuration, Z/S clear values and the initial tile
 * list block size.
 *
 * Returns the job's RCL, or NULL if the command buffer is flagged OOM
 * after reserving space for the prologue and supertile coordinates.
 */
static struct v3dv_cl *
emit_rcl_prologue(struct v3dv_job *job,
                  struct v3dv_meta_framebuffer *fb,
                  const struct rcl_clear_info *clear_info)
{
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;
   /* Reserve worst-case space up front: a fixed amount for the prologue
    * packets plus room for the per-layer supertile coordinates.
    * NOTE(review): 200 and 256 look like conservative upper bounds — TODO
    * confirm against the packets actually emitted.
    */
   v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                    tiling->layers * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   if (job->cmd_buffer->state.oom)
      return NULL;

   /* MSAA and double-buffering are mutually exclusive tiling modes */
   assert(!tiling->msaa || !tiling->double_buffer);
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
      config.early_z_disable = true;
      config.image_width_pixels = tiling->width;
      config.image_height_pixels = tiling->height;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
      config.internal_depth_type = fb->internal_depth_type;
   }

   if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
      /* For UIF-tiled images, if the image's padded height differs enough
       * from the implicit padding derived from the frame height, pass the
       * padding explicitly with the clear (presumably because the
       * implicitly encodable padding delta is limited — note the >= 15
       * threshold; TODO confirm against the HW documentation).
       */
      uint32_t clear_pad = 0;
      if (clear_info->image) {
         const struct v3dv_image *image = clear_info->image;
         const struct v3d_resource_slice *slice =
            &image->slices[clear_info->level];
         if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
             slice->tiling == V3D_TILING_UIF_XOR) {
            int uif_block_height = v3d_utile_height(image->cpp) * 2;

            uint32_t implicit_padded_height =
               align(tiling->height, uif_block_height) / uif_block_height;

            if (slice->padded_height_of_output_image_in_uif_blocks -
                implicit_padded_height >= 15) {
               clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
            }
         }
      }

      /* The 128-bit clear color is split across up to three packets;
       * PART2/PART3 are only required for wider internal formats (or when
       * we need to emit an explicit UIF padding in PART3).
       */
      const uint32_t *color = &clear_info->clear_value->color[0];
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
         clear.clear_color_low_32_bits = color[0];
         clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
         clear.render_target_number = 0;
      };

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
            clear.clear_color_mid_low_32_bits =
              ((color[1] >> 24) | (color[2] << 8));
            clear.clear_color_mid_high_24_bits =
              ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
            clear.render_target_number = 0;
         };
      }

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
            clear.uif_padded_height_in_uif_blocks = clear_pad;
            clear.clear_color_high_16_bits = color[3] >> 16;
            clear.render_target_number = 0;
         };
      }
   }

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = tiling->internal_bpp;
      rt.render_target_0_internal_type = fb->internal_type;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }

   /* Default Z/S clear values (Z = 1.0, S = 0) when no clear is requested */
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
      clear.stencil_clear_value = clear_info ? clear_info->clear_value->s : 0;
   };

   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
      init.use_auto_chained_tile_lists = true;
      init.size_of_first_block_in_chained_tile_lists =
         TILE_ALLOCATION_BLOCK_SIZE_64B;
   }

   return rcl;
}
130 
/* Emits the per-frame RCL setup: the tile list base address (offset for
 * min_layer), the supertile configuration, and the initial dummy tile
 * passes required by the GFXH-1742 hardware workaround. If clear_value is
 * non-NULL, the tile buffers are also cleared here.
 */
static void
emit_frame_setup(struct v3dv_job *job,
                 uint32_t min_layer,
                 const union v3dv_clear_value *clear_value)
{
   v3dv_return_if_oom(NULL, job);

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;

   /* Skip the tile lists of the layers below min_layer: 64 bytes per tile,
    * matching the 64B initial block size programmed in the prologue.
    */
   const uint32_t tile_alloc_offset =
      64 * min_layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
      config.number_of_bin_tile_lists = 1;
      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
      config.total_frame_height_in_tiles = tiling->draw_tiles_y;

      config.supertile_width_in_tiles = tiling->supertile_width;
      config.supertile_height_in_tiles = tiling->supertile_height;

      config.total_frame_width_in_supertiles =
         tiling->frame_width_in_supertiles;
      config.total_frame_height_in_supertiles =
         tiling->frame_height_in_supertiles;
   }

   /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do
    * it here.
    */
   for (int i = 0; i < 2; i++) {
      cl_emit(rcl, TILE_COORDINATES, coords);
      cl_emit(rcl, END_OF_LOADS, end);
      /* Dummy store: makes the tile pass valid without writing anything */
      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
      /* When using double-buffering, we need to clear both buffers (unless
       * we only have a single tile to render).
       */
      if (clear_value &&
          (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
            clear.clear_z_stencil_buffer = true;
            clear.clear_all_render_targets = true;
         }
      }
      cl_emit(rcl, END_OF_TILE_MARKER, end);
   }

   cl_emit(rcl, FLUSH_VCD_CACHE, flush);
}
186 
187 static void
emit_supertile_coordinates(struct v3dv_job *job, struct v3dv_meta_framebuffer *framebuffer)188 emit_supertile_coordinates(struct v3dv_job *job,
189                            struct v3dv_meta_framebuffer *framebuffer)
190 {
191    v3dv_return_if_oom(NULL, job);
192 
193    struct v3dv_cl *rcl = &job->rcl;
194 
195    const uint32_t min_y = framebuffer->min_y_supertile;
196    const uint32_t max_y = framebuffer->max_y_supertile;
197    const uint32_t min_x = framebuffer->min_x_supertile;
198    const uint32_t max_x = framebuffer->max_x_supertile;
199 
200    for (int y = min_y; y <= max_y; y++) {
201       for (int x = min_x; x <= max_x; x++) {
202          cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
203             coords.column_number_in_supertiles = x;
204             coords.row_number_in_supertiles = y;
205          }
206       }
207    }
208 }
209 
/* Emits a load of a linear (raster-order) BO region into the given tile
 * buffer ('buffer' is a LOAD_TILE_BUFFER_GENERAL buffer id, e.g.
 * RENDER_TARGET_0). 'stride' is the raster stride in bytes and 'format'
 * a V3D_OUTPUT_IMAGE_FORMAT_* value.
 */
static void
emit_linear_load(struct v3dv_cl *cl,
                 uint32_t buffer,
                 struct v3dv_bo *bo,
                 uint32_t offset,
                 uint32_t stride,
                 uint32_t format)
{
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = buffer;
      load.address = v3dv_cl_address(bo, offset);
      load.input_image_format = format;
      load.memory_format = V3D_TILING_RASTER;
      /* For raster memory format this field holds the stride */
      load.height_in_ub_or_stride = stride;
      load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
227 
228 static void
emit_linear_store(struct v3dv_cl *cl, uint32_t buffer, struct v3dv_bo *bo, uint32_t offset, uint32_t stride, bool msaa, uint32_t format)229 emit_linear_store(struct v3dv_cl *cl,
230                   uint32_t buffer,
231                   struct v3dv_bo *bo,
232                   uint32_t offset,
233                   uint32_t stride,
234                   bool msaa,
235                   uint32_t format)
236 {
237    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
238       store.buffer_to_store = RENDER_TARGET_0;
239       store.address = v3dv_cl_address(bo, offset);
240       store.clear_buffer_being_stored = false;
241       store.output_image_format = format;
242       store.memory_format = V3D_TILING_RASTER;
243       store.height_in_ub_or_stride = stride;
244       store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES :
245                                    V3D_DECIMATE_MODE_SAMPLE_0;
246    }
247 }
248 
249 /* This chooses a tile buffer format that is appropriate for the copy operation.
250  * Typically, this is the image render target type, however, if we are copying
251  * depth/stencil to/from a buffer the hardware can't do raster loads/stores, so
252  * we need to load and store to/from a tile color buffer using a compatible
253  * color format.
254  */
255 static uint32_t
choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer, VkImageAspectFlags aspect, bool for_store, bool is_copy_to_buffer, bool is_copy_from_buffer)256 choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
257                   VkImageAspectFlags aspect,
258                   bool for_store,
259                   bool is_copy_to_buffer,
260                   bool is_copy_from_buffer)
261 {
262    if (is_copy_to_buffer || is_copy_from_buffer) {
263       switch (framebuffer->vk_format) {
264       case VK_FORMAT_D16_UNORM:
265          return V3D_OUTPUT_IMAGE_FORMAT_R16UI;
266       case VK_FORMAT_D32_SFLOAT:
267          return V3D_OUTPUT_IMAGE_FORMAT_R32F;
268       case VK_FORMAT_X8_D24_UNORM_PACK32:
269          return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
270       case VK_FORMAT_D24_UNORM_S8_UINT:
271          /* When storing the stencil aspect of a combined depth/stencil image
272           * to a buffer, the Vulkan spec states that the output buffer must
273           * have packed stencil values, so we choose an R8UI format for our
274           * store outputs. For the load input we still want RGBA8UI since the
275           * source image contains 4 channels (including the 3 channels
276           * containing the 24-bit depth value).
277           *
278           * When loading the stencil aspect of a combined depth/stencil image
279           * from a buffer, we read packed 8-bit stencil values from the buffer
280           * that we need to put into the LSB of the 32-bit format (the R
281           * channel), so we use R8UI. For the store, if we used R8UI then we
282           * would write 8-bit stencil values consecutively over depth channels,
283           * so we need to use RGBA8UI. This will write each stencil value in
284           * its correct position, but will overwrite depth values (channels G
285           * B,A) with undefined values. To fix this,  we will have to restore
286           * the depth aspect from the Z tile buffer, which we should pre-load
287           * from the image before the store).
288           */
289          if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) {
290             return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
291          } else {
292             assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT);
293             if (is_copy_to_buffer) {
294                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI :
295                                   V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
296             } else {
297                assert(is_copy_from_buffer);
298                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI :
299                                   V3D_OUTPUT_IMAGE_FORMAT_R8UI;
300             }
301          }
302       default: /* Color formats */
303          return framebuffer->format->rt_type;
304          break;
305       }
306    } else {
307       return framebuffer->format->rt_type;
308    }
309 }
310 
311 static inline bool
format_needs_rb_swap(struct v3dv_device *device, VkFormat format)312 format_needs_rb_swap(struct v3dv_device *device,
313                      VkFormat format)
314 {
315    const uint8_t *swizzle = v3dv_get_format_swizzle(device, format);
316    return v3dv_format_swizzle_needs_rb_swap(swizzle);
317 }
318 
319 static inline bool
format_needs_reverse(struct v3dv_device *device, VkFormat format)320 format_needs_reverse(struct v3dv_device *device,
321                      VkFormat format)
322 {
323    const uint8_t *swizzle = v3dv_get_format_swizzle(device, format);
324    return v3dv_format_swizzle_needs_reverse(swizzle);
325 }
326 
/* Emits a LOAD_TILE_BUFFER_GENERAL packet loading one layer/level of
 * 'image' into the tile buffer, choosing the tile buffer, format, tiling
 * parameters and channel swizzling appropriate for the copy operation.
 */
static void
emit_image_load(struct v3dv_device *device,
                struct v3dv_cl *cl,
                struct v3dv_meta_framebuffer *framebuffer,
                struct v3dv_image *image,
                VkImageAspectFlags aspect,
                uint32_t layer,
                uint32_t mip_level,
                bool is_copy_to_buffer,
                bool is_copy_from_buffer)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* For image to/from buffer copies we always load to and store from RT0,
    * even for depth/stencil aspects, because the hardware can't do raster
    * stores or loads from/to the depth/stencil tile buffers.
    */
   bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                            aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = load_to_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      load.address = v3dv_cl_address(image->mem->bo, layer_offset);

      load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
                                                  is_copy_to_buffer,
                                                  is_copy_from_buffer);
      load.memory_format = slice->tiling;

      /* When copying depth/stencil images to a buffer, for D24 formats Vulkan
       * expects the depth value in the LSB bits of each 32-bit pixel.
       * Unfortunately, the hardware seems to put the S8/X8 bits there and the
       * depth bits on the MSB. To work around that we can reverse the channel
       * order and then swap the R/B channels to get what we want.
       *
       * NOTE: reversing and swapping only gets us the behavior we want if the
       * operations happen in that exact order, which seems to be the case when
       * done on the tile buffer load operations. On the store, it seems the
       * order is not the same. The order on the store is probably reversed so
       * that reversing and swapping on both the load and the store preserves
       * the original order of the channels in memory.
       *
       * Notice that we only need to do this when copying to a buffer, where
       * depth and stencil aspects are copied as separate regions and
       * the spec expects them to be tightly packed.
       */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_to_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         /* This is not a raw data copy (i.e. we are clearing the image),
          * so we need to make sure we respect the format swizzle.
          */
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
         needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
      }

      load.r_b_swap = needs_rb_swap;
      load.channel_reverse = needs_chan_reverse;

      /* For UIF tilings this field holds the padded height in UIF blocks;
       * for raster it holds the stride (see emit_linear_load).
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
410 
/* Emits a STORE_TILE_BUFFER_GENERAL packet storing the tile buffer into
 * one layer/level of 'image'.
 *
 * is_multisample_resolve selects 4x decimation on the store so that a
 * multisampled tile buffer is resolved into a single-sampled image.
 */
static void
emit_image_store(struct v3dv_device *device,
                 struct v3dv_cl *cl,
                 struct v3dv_meta_framebuffer *framebuffer,
                 struct v3dv_image *image,
                 VkImageAspectFlags aspect,
                 uint32_t layer,
                 uint32_t mip_level,
                 bool is_copy_to_buffer,
                 bool is_copy_from_buffer,
                 bool is_multisample_resolve)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* Image<->buffer copies always go through RT0, even for depth/stencil
    * aspects (see emit_image_load).
    */
   bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                               aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = store_from_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      store.address = v3dv_cl_address(image->mem->bo, layer_offset);
      store.clear_buffer_being_stored = false;

      /* See rationale in emit_image_load() */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_from_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
         needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
      }

      store.r_b_swap = needs_rb_swap;
      store.channel_reverse = needs_chan_reverse;

      store.output_image_format = choose_tlb_format(framebuffer, aspect, true,
                                                    is_copy_to_buffer,
                                                    is_copy_from_buffer);
      store.memory_format = slice->tiling;
      /* For UIF tilings this field holds the padded height in UIF blocks;
       * for raster it holds the stride (see emit_linear_store).
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
474 
/* Builds the generic (per-tile) tile list for copying one layer of 'image'
 * to 'buffer': loads the image layer into the TLB, then stores the TLB
 * linearly to the buffer. The list is written to the job's indirect CL and
 * a branch to it is emitted in the job's RCL.
 */
static void
emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_buffer *buffer,
                                        struct v3dv_image *image,
                                        uint32_t layer_offset,
                                        const VkBufferImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* Load image to TLB */
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->imageSubresource.layerCount) ||
          layer_offset < image->vk.extent.depth);

   /* For 3D images 'layer_offset' indexes depth slices from imageOffset.z;
    * otherwise it indexes array layers from baseArrayLayer.
    */
   const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->imageSubresource.baseArrayLayer + layer_offset :
      region->imageOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, image,
                   region->imageSubresource.aspectMask,
                   image_layer,
                   region->imageSubresource.mipLevel,
                   true, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to buffer */
   /* Per Vulkan, bufferRowLength/bufferImageHeight of 0 mean "tightly
    * packed", i.e. use the image extent.
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy from compressed format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* If we are storing stencil from a combined depth/stencil format the
    * Vulkan spec states that the output buffer must have packed stencil
    * values, where each stencil value is 1 byte.
    */
   uint32_t cpp =
      region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
         1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
                            height * buffer_stride * layer_offset;

   uint32_t format = choose_tlb_format(framebuffer,
                                       region->imageSubresource.aspectMask,
                                       true, true, false);
   bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT;

   emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo,
                     buffer_offset, buffer_stride, msaa, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Make the RCL branch into the tile list we just built */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
554 
555 static void
emit_copy_layer_to_buffer(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, const VkBufferImageCopy2 *region)556 emit_copy_layer_to_buffer(struct v3dv_job *job,
557                           struct v3dv_buffer *buffer,
558                           struct v3dv_image *image,
559                           struct v3dv_meta_framebuffer *framebuffer,
560                           uint32_t layer,
561                           const VkBufferImageCopy2 *region)
562 {
563    emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
564                                            image, layer, region);
565    emit_supertile_coordinates(job, framebuffer);
566 }
567 
568 void
meta_emit_copy_image_to_buffer_rcl(struct v3dv_job *job, struct v3dv_buffer *buffer, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, const VkBufferImageCopy2 *region)569 v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
570                                          struct v3dv_buffer *buffer,
571                                          struct v3dv_image *image,
572                                          struct v3dv_meta_framebuffer *framebuffer,
573                                          const VkBufferImageCopy2 *region)
574 {
575    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
576    v3dv_return_if_oom(NULL, job);
577 
578    emit_frame_setup(job, 0, NULL);
579    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
580       emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region);
581    cl_emit(rcl, END_OF_RENDERING, end);
582 }
583 
/* Builds the generic (per-tile) tile list for resolving one layer of the
 * multisampled 'src' image into 'dst': loads the source layer into the TLB
 * and stores it to the destination with multisample resolve enabled. The
 * list is written to the job's indirect CL and a branch to it is emitted
 * in the job's RCL.
 */
static void
emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
                                       struct v3dv_meta_framebuffer *framebuffer,
                                       struct v3dv_image *dst,
                                       struct v3dv_image *src,
                                       uint32_t layer_offset,
                                       const VkImageResolve2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   /* For 3D images 'layer_offset' indexes depth slices; otherwise array
    * layers (same convention as emit_copy_layer_to_buffer_per_tile_list).
    */
   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   /* is_multisample_resolve = true: the store decimates the MSAA tile
    * buffer into the single-sampled destination.
    */
   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, true);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Make the RCL branch into the tile list we just built */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
641 
642 static void
emit_resolve_image_layer(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, const VkImageResolve2 *region)643 emit_resolve_image_layer(struct v3dv_job *job,
644                          struct v3dv_image *dst,
645                          struct v3dv_image *src,
646                          struct v3dv_meta_framebuffer *framebuffer,
647                          uint32_t layer,
648                          const VkImageResolve2 *region)
649 {
650    emit_resolve_image_layer_per_tile_list(job, framebuffer,
651                                           dst, src, layer, region);
652    emit_supertile_coordinates(job, framebuffer);
653 }
654 
655 void
meta_emit_resolve_image_rcl(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, const VkImageResolve2 *region)656 v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
657                                   struct v3dv_image *dst,
658                                   struct v3dv_image *src,
659                                   struct v3dv_meta_framebuffer *framebuffer,
660                                   const VkImageResolve2 *region)
661 {
662    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
663    v3dv_return_if_oom(NULL, job);
664 
665    emit_frame_setup(job, 0, NULL);
666    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
667       emit_resolve_image_layer(job, dst, src, framebuffer, layer, region);
668    cl_emit(rcl, END_OF_RENDERING, end);
669 }
670 
/* Builds the generic (per-tile) tile list for a buffer-to-buffer copy:
 * loads a linear region of 'src' into RT0 and stores it linearly to 'dst'.
 * The list is written to the job's indirect CL and a branch to it is
 * emitted in the job's RCL.
 */
static void
emit_copy_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *dst,
                               struct v3dv_bo *src,
                               uint32_t dst_offset,
                               uint32_t src_offset,
                               uint32_t stride,
                               uint32_t format)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* msaa = false: buffer copies are always single-sampled */
   emit_linear_store(cl, RENDER_TARGET_0,
                     dst, dst_offset, stride, false, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Make the RCL branch into the tile list we just built */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
706 
707 void
meta_emit_copy_buffer(struct v3dv_job *job, struct v3dv_bo *dst, struct v3dv_bo *src, uint32_t dst_offset, uint32_t src_offset, struct v3dv_meta_framebuffer *framebuffer, uint32_t format, uint32_t item_size)708 v3dX(meta_emit_copy_buffer)(struct v3dv_job *job,
709                             struct v3dv_bo *dst,
710                             struct v3dv_bo *src,
711                             uint32_t dst_offset,
712                             uint32_t src_offset,
713                             struct v3dv_meta_framebuffer *framebuffer,
714                             uint32_t format,
715                             uint32_t item_size)
716 {
717    const uint32_t stride = job->frame_tiling.width * item_size;
718    emit_copy_buffer_per_tile_list(job, dst, src,
719                                   dst_offset, src_offset,
720                                   stride, format);
721    emit_supertile_coordinates(job, framebuffer);
722 }
723 
724 void
meta_emit_copy_buffer_rcl(struct v3dv_job *job, struct v3dv_bo *dst, struct v3dv_bo *src, uint32_t dst_offset, uint32_t src_offset, struct v3dv_meta_framebuffer *framebuffer, uint32_t format, uint32_t item_size)725 v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job,
726                                 struct v3dv_bo *dst,
727                                 struct v3dv_bo *src,
728                                 uint32_t dst_offset,
729                                 uint32_t src_offset,
730                                 struct v3dv_meta_framebuffer *framebuffer,
731                                 uint32_t format,
732                                 uint32_t item_size)
733 {
734    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
735    v3dv_return_if_oom(NULL, job);
736 
737    emit_frame_setup(job, 0, NULL);
738 
739    v3dX(meta_emit_copy_buffer)(job, dst, src, dst_offset, src_offset,
740                                framebuffer, format, item_size);
741 
742    cl_emit(rcl, END_OF_RENDERING, end);
743 }
744 
/* Emits the generic tile list that copies one layer (or 3D slice) of 'src'
 * into 'dst' through the TLB. 'layer_offset' is relative to the region's
 * base layer (2D arrays) or base Z offset (3D images).
 */
static void
emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
                                    struct v3dv_meta_framebuffer *framebuffer,
                                    struct v3dv_image *dst,
                                    struct v3dv_image *src,
                                    uint32_t layer_offset,
                                    const VkImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve space up-front so the cl_emit calls below cannot fail. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   /* For 3D images the "layer" is a depth slice selected via srcOffset.z. */
   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   /* Load the source layer into the TLB. */
   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   /* Same layer/slice selection logic on the destination side. */
   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   /* Store the TLB contents out to the destination layer. */
   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Hook the tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
802 
803 static void
emit_copy_image_layer(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, const VkImageCopy2 *region)804 emit_copy_image_layer(struct v3dv_job *job,
805                       struct v3dv_image *dst,
806                       struct v3dv_image *src,
807                       struct v3dv_meta_framebuffer *framebuffer,
808                       uint32_t layer,
809                       const VkImageCopy2 *region)
810 {
811    emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
812    emit_supertile_coordinates(job, framebuffer);
813 }
814 
815 void
meta_emit_copy_image_rcl(struct v3dv_job *job, struct v3dv_image *dst, struct v3dv_image *src, struct v3dv_meta_framebuffer *framebuffer, const VkImageCopy2 *region)816 v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
817                                struct v3dv_image *dst,
818                                struct v3dv_image *src,
819                                struct v3dv_meta_framebuffer *framebuffer,
820                                const VkImageCopy2 *region)
821 {
822    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
823    v3dv_return_if_oom(NULL, job);
824 
825    emit_frame_setup(job, 0, NULL);
826    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
827       emit_copy_image_layer(job, dst, src, framebuffer, layer, region);
828    cl_emit(rcl, END_OF_RENDERING, end);
829 }
830 
/* Builds and queues a TFU (texture formatting unit) job that converts a
 * width x height source image at 'src_offset' in 'src_bo_handle' (with the
 * given tiling and cpp) into the destination at 'dst_offset' in
 * 'dst_bo_handle'. Padded-height/stride parameters are interpreted per the
 * corresponding tiling mode (see the switch below).
 */
void
v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
                        uint32_t dst_bo_handle,
                        uint32_t dst_offset,
                        enum v3d_tiling_mode dst_tiling,
                        uint32_t dst_padded_height_or_stride,
                        uint32_t dst_cpp,
                        uint32_t src_bo_handle,
                        uint32_t src_offset,
                        enum v3d_tiling_mode src_tiling,
                        uint32_t src_padded_height_or_stride,
                        uint32_t src_cpp,
                        uint32_t width,
                        uint32_t height,
                        const struct v3dv_format *format)
{
   struct drm_v3d_submit_tfu tfu = {
      /* Image output size: height in the top 16 bits, width in the low. */
      .ios = (height << 16) | width,
      /* Avoid listing the same BO twice when src and dst share a buffer. */
      .bo_handles = {
         dst_bo_handle,
         src_bo_handle != dst_bo_handle ? src_bo_handle : 0
      },
   };

   /* Input image address: offset of the source data within its BO. */
   tfu.iia |= src_offset;

   /* Input config: source layout. Non-raster tilings map onto consecutive
    * format codes starting at LINEARTILE.
    */
   if (src_tiling == V3D_TILING_RASTER) {
      tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
   } else {
      tfu.icfg = (V3D33_TFU_ICFG_FORMAT_LINEARTILE +
                  (src_tiling - V3D_TILING_LINEARTILE)) <<
                   V3D33_TFU_ICFG_FORMAT_SHIFT;
   }
   tfu.icfg |= format->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;

   /* Output image address: offset of the destination within its BO. */
   tfu.ioa = dst_offset;

   /* Output layout; raster destinations are not allowed (asserted below),
    * so the LINEARTILE-relative encoding always applies.
    */
   tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
               (dst_tiling - V3D_TILING_LINEARTILE)) <<
                V3D33_TFU_IOA_FORMAT_SHIFT;

   /* Input image stride: meaning depends on the source tiling.
    * For UIF it is given in UIF blocks (2 utiles tall); for raster it is
    * given in pixels (byte stride divided by cpp).
    */
   switch (src_tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp));
      break;
   case V3D_TILING_RASTER:
      tfu.iis |= src_padded_height_or_stride / src_cpp;
      break;
   default:
      break;
   }

   /* The TFU can handle raster sources but always produces UIF results */
   assert(dst_tiling != V3D_TILING_RASTER);

   /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
    * OPAD field for the destination (how many extra UIF blocks beyond
    * those necessary to cover the height).
    */
   if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) {
      uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp);
      uint32_t implicit_padded_height = align(height, uif_block_h);
      uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) /
                      uif_block_h;
      tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
   }

   /* Queue the TFU job on the command buffer for submission. */
   v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
901 
/* Emits the generic tile list that clears one layer of 'image' at mip
 * 'level': there is no load phase, so the store writes whatever clear color
 * the frame setup programmed into the TLB.
 */
static void
emit_clear_image_layer_per_tile_list(struct v3dv_job *job,
                                     struct v3dv_meta_framebuffer *framebuffer,
                                     struct v3dv_image *image,
                                     VkImageAspectFlags aspects,
                                     uint32_t layer,
                                     uint32_t level)
{
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve space up-front so the cl_emit calls below cannot fail. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* No loads: the TLB already holds the clear value from frame setup. */
   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store the (cleared) TLB contents out to the image layer. */
   emit_image_store(job->device, cl, framebuffer, image, aspects,
                    layer, level, false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Hook the tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
934 
935 static void
emit_clear_image_layers(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, VkImageAspectFlags aspects, uint32_t min_layer, uint32_t max_layer, uint32_t level)936 emit_clear_image_layers(struct v3dv_job *job,
937                  struct v3dv_image *image,
938                  struct v3dv_meta_framebuffer *framebuffer,
939                  VkImageAspectFlags aspects,
940                  uint32_t min_layer,
941                  uint32_t max_layer,
942                  uint32_t level)
943 {
944    for (uint32_t layer = min_layer; layer < max_layer; layer++) {
945       emit_clear_image_layer_per_tile_list(job, framebuffer, image, aspects,
946                                            layer, level);
947       emit_supertile_coordinates(job, framebuffer);
948    }
949 }
950 
951 void
meta_emit_clear_image_rcl(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_meta_framebuffer *framebuffer, const union v3dv_clear_value *clear_value, VkImageAspectFlags aspects, uint32_t min_layer, uint32_t max_layer, uint32_t level)952 v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job,
953                                 struct v3dv_image *image,
954                                 struct v3dv_meta_framebuffer *framebuffer,
955                                 const union v3dv_clear_value *clear_value,
956                                 VkImageAspectFlags aspects,
957                                 uint32_t min_layer,
958                                 uint32_t max_layer,
959                                 uint32_t level)
960 {
961    const struct rcl_clear_info clear_info = {
962       .clear_value = clear_value,
963       .image = image,
964       .aspects = aspects,
965       .level = level,
966    };
967 
968    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
969    v3dv_return_if_oom(NULL, job);
970 
971    emit_frame_setup(job, 0, clear_value);
972    emit_clear_image_layers(job, image, framebuffer, aspects,
973                            min_layer, max_layer, level);
974    cl_emit(rcl, END_OF_RENDERING, end);
975 }
976 
/* Emits the generic tile list for a buffer fill: no loads, so the store
 * writes out the clear color that frame setup programmed into the TLB,
 * always as RGBA8UI (the fill value is packed into a 32-bit color).
 */
static void
emit_fill_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *bo,
                               uint32_t offset,
                               uint32_t stride)
{
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve space up-front so the cl_emit calls below cannot fail. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* No loads: the TLB already holds the fill value from frame setup. */
   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Raster store of the TLB contents into the destination buffer. */
   emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
                     V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Hook the tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1007 
1008 static void
emit_fill_buffer(struct v3dv_job *job, struct v3dv_bo *bo, uint32_t offset, struct v3dv_meta_framebuffer *framebuffer)1009 emit_fill_buffer(struct v3dv_job *job,
1010                  struct v3dv_bo *bo,
1011                  uint32_t offset,
1012                  struct v3dv_meta_framebuffer *framebuffer)
1013 {
1014    const uint32_t stride = job->frame_tiling.width * 4;
1015    emit_fill_buffer_per_tile_list(job, bo, offset, stride);
1016    emit_supertile_coordinates(job, framebuffer);
1017 }
1018 
1019 void
meta_emit_fill_buffer_rcl(struct v3dv_job *job, struct v3dv_bo *bo, uint32_t offset, struct v3dv_meta_framebuffer *framebuffer, uint32_t data)1020 v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job,
1021                                 struct v3dv_bo *bo,
1022                                 uint32_t offset,
1023                                 struct v3dv_meta_framebuffer *framebuffer,
1024                                 uint32_t data)
1025 {
1026    const union v3dv_clear_value clear_value = {
1027        .color = { data, 0, 0, 0 },
1028    };
1029 
1030    const struct rcl_clear_info clear_info = {
1031       .clear_value = &clear_value,
1032       .image = NULL,
1033       .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
1034       .level = 0,
1035    };
1036 
1037    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
1038    v3dv_return_if_oom(NULL, job);
1039 
1040    emit_frame_setup(job, 0, &clear_value);
1041    emit_fill_buffer(job, bo, offset, framebuffer);
1042    cl_emit(rcl, END_OF_RENDERING, end);
1043 }
1044 
1045 
/* Emits the generic tile list that uploads one layer's worth of buffer data
 * into 'image': a raster load from the buffer into a color TLB, then an
 * image store. For combined D24S8 destinations the untouched aspect is
 * loaded and re-stored so it is not clobbered (see the comment below).
 */
static void
emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_image *image,
                                        struct v3dv_buffer *buffer,
                                        uint32_t layer,
                                        const VkBufferImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   /* Reserve space up-front so the cl_emit calls below cannot fail. */
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) ||
          layer < image->vk.extent.depth);

   /* Load TLB from buffer */
   /* Per Vulkan, bufferRowLength/bufferImageHeight of 0 mean "tightly
    * packed", i.e. use the image extent.
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy to compressed format using a compatible format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* Stencil-only copies transfer 1 byte per pixel regardless of the
    * image's cpp.
    */
   uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
                  1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   /* Each layer's data follows the previous one in the buffer. */
   uint32_t buffer_offset =
      buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;

   uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
                                       false, false, true);

   /* For 3D images the destination "layer" is a depth slice selected via
    * imageOffset.z.
    */
   uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ?
      imgrsc->baseArrayLayer : region->imageOffset.z);

   emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
                    buffer_offset, buffer_stride, format);

   /* Because we can't do raster loads/stores of Z/S formats we need to
    * use a color tile buffer with a compatible RGBA color format instead.
    * However, when we are uploading a single aspect to a combined
    * depth/stencil image we have the problem that our tile buffer stores don't
    * allow us to mask out the other aspect, so we always write all four RGBA
    * channels to the image and we end up overwriting that other aspect with
    * undefined values. To work around that, we first load the aspect we are
    * not copying from the image memory into a proper Z/S tile buffer. Then we
    * do our store from the color buffer for the aspect we are copying, and
    * after that, we do another store from the Z/S tile buffer to restore the
    * other aspect to its original value.
    */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_STENCIL_BIT,
                         image_layer, imgrsc->mipLevel,
                         false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_DEPTH_BIT,
                         image_layer, imgrsc->mipLevel,
                         false, false);
      }
   }

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to image */
   emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask,
                    image_layer, imgrsc->mipLevel,
                    false, true, false);

   /* Restore the aspect we preserved above (see the big comment). */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_STENCIL_BIT,
                          image_layer, imgrsc->mipLevel,
                          false, false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_DEPTH_BIT,
                          image_layer, imgrsc->mipLevel,
                          false, false, false);
      }
   }

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Hook the tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1157 
1158 static void
emit_copy_buffer_to_layer(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, uint32_t layer, const VkBufferImageCopy2 *region)1159 emit_copy_buffer_to_layer(struct v3dv_job *job,
1160                           struct v3dv_image *image,
1161                           struct v3dv_buffer *buffer,
1162                           struct v3dv_meta_framebuffer *framebuffer,
1163                           uint32_t layer,
1164                           const VkBufferImageCopy2 *region)
1165 {
1166    emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
1167                                            layer, region);
1168    emit_supertile_coordinates(job, framebuffer);
1169 }
1170 
1171 void
meta_emit_copy_buffer_to_image_rcl(struct v3dv_job *job, struct v3dv_image *image, struct v3dv_buffer *buffer, struct v3dv_meta_framebuffer *framebuffer, const VkBufferImageCopy2 *region)1172 v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
1173                                          struct v3dv_image *image,
1174                                          struct v3dv_buffer *buffer,
1175                                          struct v3dv_meta_framebuffer *framebuffer,
1176                                          const VkBufferImageCopy2 *region)
1177 {
1178    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1179    v3dv_return_if_oom(NULL, job);
1180 
1181    emit_frame_setup(job, 0, NULL);
1182    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
1183       emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region);
1184    cl_emit(rcl, END_OF_RENDERING, end);
1185 }
1186 
1187 /* Figure out a TLB size configuration for a number of pixels to process.
1188  * Beware that we can't "render" more than MAX_DIMxMAX_DIM pixels in a single
1189  * job, if the pixel count is larger than this, the caller might need to split
1190  * the job and call this function multiple times.
1191  */
1192 static void
framebuffer_size_for_pixel_count(uint32_t num_pixels, uint32_t *width, uint32_t *height)1193 framebuffer_size_for_pixel_count(uint32_t num_pixels,
1194                                  uint32_t *width,
1195                                  uint32_t *height)
1196 {
1197    assert(num_pixels > 0);
1198 
1199    const uint32_t max_dim_pixels = V3D_MAX_IMAGE_DIMENSION;
1200    const uint32_t max_pixels = max_dim_pixels * max_dim_pixels;
1201 
1202    uint32_t w, h;
1203    if (num_pixels > max_pixels) {
1204       w = max_dim_pixels;
1205       h = max_dim_pixels;
1206    } else {
1207       w = num_pixels;
1208       h = 1;
1209       while (w > max_dim_pixels || ((w % 2) == 0 && w > 2 * h)) {
1210          w >>= 1;
1211          h <<= 1;
1212       }
1213    }
1214    assert(w <= max_dim_pixels && h <= max_dim_pixels);
1215    assert(w * h <= num_pixels);
1216    assert(w > 0 && h > 0);
1217 
1218    *width = w;
1219    *height = h;
1220 }
1221 
/* Copies 'region' bytes from 'src' to 'dst' by rendering the data through
 * the TLB as pixels, splitting the work into multiple jobs when the byte
 * count exceeds what one framebuffer can cover. Returns the last job
 * emitted, or NULL if job allocation failed.
 */
struct v3dv_job *
v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
                       struct v3dv_bo *dst,
                       uint32_t dst_offset,
                       struct v3dv_bo *src,
                       uint32_t src_offset,
                       const VkBufferCopy2 *region)
{
   const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
   const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;

   /* Select appropriate pixel format for the copy operation based on the
    * size to copy and the alignment of the source and destination offsets.
    */
   src_offset += region->srcOffset;
   dst_offset += region->dstOffset;
   /* Largest item size (4, 2 or 1 bytes) that both offsets are aligned to. */
   uint32_t item_size = 4;
   while (item_size > 1 &&
          (src_offset % item_size != 0 || dst_offset % item_size != 0)) {
      item_size /= 2;
   }

   /* The copy size must also be a whole number of items. */
   while (item_size > 1 && region->size % item_size != 0)
      item_size /= 2;

   assert(region->size % item_size == 0);
   uint32_t num_items = region->size / item_size;
   assert(num_items > 0);

   /* One "pixel" per item: pick a TLB format matching the item size. */
   uint32_t format;
   VkFormat vk_format;
   switch (item_size) {
   case 4:
      format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
      vk_format = VK_FORMAT_R8G8B8A8_UINT;
      break;
   case 2:
      format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI;
      vk_format = VK_FORMAT_R8G8_UINT;
      break;
   default:
      format = V3D_OUTPUT_IMAGE_FORMAT_R8UI;
      vk_format = VK_FORMAT_R8_UINT;
      break;
   }

   /* Emit one job per framebuffer-sized chunk until everything is copied. */
   struct v3dv_job *job = NULL;
   while (num_items > 0) {
      job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
      if (!job)
         return NULL;

      uint32_t width, height;
      framebuffer_size_for_pixel_count(num_items, &width, &height);

      v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);

      struct v3dv_meta_framebuffer framebuffer;
      v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
                                  &job->frame_tiling);

      v3dX(job_emit_binning_flush)(job);

      v3dX(meta_emit_copy_buffer_rcl)(job, dst, src, dst_offset, src_offset,
                                      &framebuffer, format, item_size);

      v3dv_cmd_buffer_finish_job(cmd_buffer);

      /* Advance both offsets past the chunk this job copied. Note that
       * width * height may be less than num_items (see
       * framebuffer_size_for_pixel_count).
       */
      const uint32_t items_copied = width * height;
      const uint32_t bytes_copied = items_copied * item_size;
      num_items -= items_copied;
      src_offset += bytes_copied;
      dst_offset += bytes_copied;
   }

   return job;
}
1299 
/* Fills 'size' bytes of 'bo' starting at 'offset' with the 32-bit pattern
 * 'data', rendering the fill through the TLB and splitting into multiple
 * jobs when the item count exceeds one framebuffer's coverage.
 */
void
v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
                       struct v3dv_bo *bo,
                       uint32_t offset,
                       uint32_t size,
                       uint32_t data)
{
   /* The fill operates on whole 32-bit items and must stay inside the BO. */
   assert(size > 0 && size % 4 == 0);
   assert(offset + size <= bo->size);

   const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
   const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
   uint32_t num_items = size / 4;

   /* Emit one job per framebuffer-sized chunk until everything is filled. */
   while (num_items > 0) {
      struct v3dv_job *job =
         v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
      if (!job)
         return;

      uint32_t width, height;
      framebuffer_size_for_pixel_count(num_items, &width, &height);

      v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);

      struct v3dv_meta_framebuffer framebuffer;
      v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
                                  internal_type, &job->frame_tiling);

      v3dX(job_emit_binning_flush)(job);

      v3dX(meta_emit_fill_buffer_rcl)(job, bo, offset, &framebuffer, data);

      v3dv_cmd_buffer_finish_job(cmd_buffer);

      /* Advance past the chunk this job filled. Note that width * height
       * may be less than num_items (see framebuffer_size_for_pixel_count).
       */
      const uint32_t items_copied = width * height;
      const uint32_t bytes_copied = items_copied * 4;
      num_items -= items_copied;
      offset += bytes_copied;
   }
}
1341 
1342 void
meta_framebuffer_init(struct v3dv_meta_framebuffer *fb, VkFormat vk_format, uint32_t internal_type, const struct v3dv_frame_tiling *tiling)1343 v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb,
1344                             VkFormat vk_format,
1345                             uint32_t internal_type,
1346                             const struct v3dv_frame_tiling *tiling)
1347 {
1348    fb->internal_type = internal_type;
1349 
1350    /* Supertile coverage always starts at 0,0  */
1351    uint32_t supertile_w_in_pixels =
1352       tiling->tile_width * tiling->supertile_width;
1353    uint32_t supertile_h_in_pixels =
1354       tiling->tile_height * tiling->supertile_height;
1355 
1356    fb->min_x_supertile = 0;
1357    fb->min_y_supertile = 0;
1358    fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels;
1359    fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels;
1360 
1361    fb->vk_format = vk_format;
1362    fb->format = v3dX(get_format)(vk_format);
1363 
1364    fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F;
1365    if (vk_format_is_depth_or_stencil(vk_format))
1366       fb->internal_depth_type = v3dX(get_internal_depth_type)(vk_format);
1367 }
1368