xref: /third_party/mesa3d/src/amd/vulkan/radv_query.c (revision bf215546)
1/*
2 * Copyright © 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include <assert.h>
27#include <fcntl.h>
28#include <stdbool.h>
29#include <string.h>
30
31#include "nir/nir_builder.h"
32#include "util/u_atomic.h"
33#include "vulkan/vulkan_core.h"
34#include "radv_acceleration_structure.h"
35#include "radv_cs.h"
36#include "radv_meta.h"
37#include "radv_private.h"
38#include "sid.h"
39
40#define TIMESTAMP_NOT_READY UINT64_MAX
41
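/* Each pipeline-statistics block written by the hardware holds 11 consecutive
 * 64-bit counters (hence pipelinestat_block_size = 11 * 8), and
 * pipeline_statistics_indices maps the VK_QUERY_PIPELINE_STATISTIC_* bit order
 * onto the order in which the hardware writes those counters.
 */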
42static const int pipelinestat_block_size = 11 * 8;
43static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
44
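/* Writes the availability word for a query when
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is requested, widening the 32-bit
 * value to 64 bits when VK_QUERY_RESULT_64_BIT is also set.
 */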
45static void
46radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
47                        nir_ssa_def *offset, nir_ssa_def *value32)
48{
49   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
50
51   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
52
53   nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
54
55   nir_push_else(b, NULL);
56
57   nir_store_ssbo(b, value32, dst_buf, offset);
58
59   nir_pop_if(b, NULL);
60
61   nir_pop_if(b, NULL);
62}
63
64static nir_shader *
65build_occlusion_query_shader(struct radv_device *device)
66{
67   /* the shader this builds is roughly
68    *
69    * push constants {
70    * 	uint32_t flags;
71    * 	uint32_t dst_stride;
72    * };
73    *
74    * uint32_t src_stride = 16 * db_count;
75    *
76    * location(binding = 0) buffer dst_buf;
77    * location(binding = 1) buffer src_buf;
78    *
79    * void main() {
80    * 	uint64_t result = 0;
81    * 	uint64_t src_offset = src_stride * global_id.x;
82    * 	uint64_t dst_offset = dst_stride * global_id.x;
83    * 	bool available = true;
84    * 	for (int i = 0; i < db_count; ++i) {
85    *		if (enabled_rb_mask & (1 << i)) {
86    *			uint64_t start = src_buf[src_offset + 16 * i];
87    *			uint64_t end = src_buf[src_offset + 16 * i + 8];
88    *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
89    *				result += end - start;
90    *			else
91    *				available = false;
92    *		}
93    * 	}
94    * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
95    * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
96    * 		if (flags & VK_QUERY_RESULT_64_BIT)
97    * 			dst_buf[dst_offset] = result;
98    * 		else
99    * 			dst_buf[dst_offset] = (uint32_t)result;
100    * 	}
101    * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
102    * 		dst_buf[dst_offset + elem_size] = available;
103    * 	}
104    * }
105    */
106   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
107   b.shader->info.workgroup_size[0] = 64;
108
109   nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
110   nir_variable *outer_counter =
111      nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
112   nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
113   nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
114   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
115   unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
116   unsigned db_count = device->physical_device->rad_info.max_render_backends;
117
118   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
119
120   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
121   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
122
123   nir_ssa_def *global_id = get_global_ids(&b, 1);
124
125   nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
126   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
127   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
128   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
129
130   nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
131   nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
132   nir_store_var(&b, available, nir_imm_true(&b), 0x1);
133
134   nir_push_loop(&b);
135
136   nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
137   radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
138
139   nir_ssa_def *enabled_cond =
140      nir_iand_imm(&b, nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count), enabled_rb_mask);
141
142   nir_push_if(&b, nir_i2b(&b, enabled_cond));
143
144   nir_ssa_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
145   load_offset = nir_iadd(&b, input_base, load_offset);
146
147   nir_ssa_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
148
149   nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
150   nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
151
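   /* Bit 63 of each counter is set once the hardware has written it, so a
    * signed "less than zero" test on the 64-bit value is equivalent to the
    * (start & (1ull << 63)) check from the pseudocode above.
    */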
152   nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
153   nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
154
155   nir_push_if(&b, nir_iand(&b, start_done, end_done));
156
157   nir_store_var(&b, result,
158                 nir_iadd(&b, nir_load_var(&b, result),
159                          nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
160                 0x1);
161
162   nir_push_else(&b, NULL);
163
164   nir_store_var(&b, available, nir_imm_false(&b), 0x1);
165
166   nir_pop_if(&b, NULL);
167   nir_pop_if(&b, NULL);
168   nir_pop_loop(&b, NULL);
169
170   /* Store the result if complete or if partial results have been requested. */
171
172   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
173   nir_ssa_def *result_size =
174      nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
175   nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
176                           nir_load_var(&b, available)));
177
178   nir_push_if(&b, result_is_64bit);
179
180   nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
181
182   nir_push_else(&b, NULL);
183
184   nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base,
185                  .align_mul = 8);
186
187   nir_pop_if(&b, NULL);
188   nir_pop_if(&b, NULL);
189
190   radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
191                           nir_b2i32(&b, nir_load_var(&b, available)));
192
193   return b.shader;
194}
195
196static nir_shader *
197build_pipeline_statistics_query_shader(struct radv_device *device)
198{
199   /* the shader this builds is roughly
200    *
201    * push constants {
202    * 	uint32_t flags;
203    * 	uint32_t dst_stride;
204    * 	uint32_t stats_mask;
205    * 	uint32_t avail_offset;
206    * };
207    *
208    * uint32_t src_stride = pipelinestat_block_size * 2;
209    *
210    * location(binding = 0) buffer dst_buf;
211    * location(binding = 1) buffer src_buf;
212    *
213    * void main() {
214    * 	uint64_t src_offset = src_stride * global_id.x;
215    * 	uint64_t dst_base = dst_stride * global_id.x;
216    * 	uint64_t dst_offset = dst_base;
217    * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
218    * 	uint32_t elem_count = stats_mask >> 16;
219    * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
220    * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
221    * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
222    * 	}
223    * 	if ((bool)available32) {
224    * 		// repeat 11 times:
225    * 		if (stats_mask & (1 << 0)) {
226    * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
227    * 			uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
228    * 			uint64_t result = end - start;
229    * 			if (flags & VK_QUERY_RESULT_64_BIT)
230    * 				dst_buf[dst_offset] = result;
231    * 			else dst_buf[dst_offset] = (uint32_t)result;
232    * 			dst_offset += elem_size;
233    * 		}
234    * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
235    *              // Set everything to 0 as we don't know what is valid.
236    * 		for (int i = 0; i < elem_count; ++i)
237    * 			dst_buf[dst_base + elem_size * i] = 0;
238    * 	}
239    * }
240    */
241   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
242   b.shader->info.workgroup_size[0] = 64;
243
244   nir_variable *output_offset =
245      nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
246   nir_variable *result =
247      nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
248
249   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
250   nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
251   nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
252   nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
253
254   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
255   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
256
257   nir_ssa_def *global_id = get_global_ids(&b, 1);
258
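   /* When the pool was created with GDS counting (NGG), two extra 64-bit
    * values (begin/end of the GDS-counted GS primitives) follow the two
    * pipeline-statistics blocks, so the per-query source stride grows by 16
    * bytes to match radv_CreateQueryPool().
    */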
259   nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride");
260   nir_push_if(&b, nir_ine(&b, uses_gds, nir_imm_int(&b, 0)));
261   {
262      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1);
263   }
264   nir_push_else(&b, NULL);
265   {
266      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1);
267   }
268   nir_pop_if(&b, NULL);
269
270   nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id);
271   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
272   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
273
274   avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
275
276   nir_ssa_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
277
278   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
279   nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
280   nir_ssa_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
281
282   radv_store_availability(&b, flags, dst_buf,
283                           nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
284                           available32);
285
286   nir_push_if(&b, nir_i2b(&b, available32));
287
288   nir_store_var(&b, output_offset, output_base, 0x1);
289   for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
290      nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
291
292      nir_ssa_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
293      nir_ssa_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
294
295      nir_ssa_def *end_offset =
296         nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
297      nir_ssa_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
298
299      nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
300
301      nir_push_if(&b, nir_iand(&b, nir_i2b(&b, uses_gds),
302                               nir_ieq(&b, nir_imm_int(&b, 1u << i),
303                                       nir_imm_int(&b, VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))));
304      {
305         /* Compute the GDS result if needed. */
306         nir_ssa_def *gds_start_offset =
307            nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2));
308         nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset);
309
310         nir_ssa_def *gds_end_offset =
311            nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8));
312         nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset);
313
314         nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
315
316         nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
317      }
318      nir_pop_if(&b, NULL);
319
320      /* Store result */
321      nir_push_if(&b, result_is_64bit);
322
323      nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
324
325      nir_push_else(&b, NULL);
326
327      nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
328
329      nir_pop_if(&b, NULL);
330
331      nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size),
332                    0x1);
333
334      nir_pop_if(&b, NULL);
335   }
336
337   nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
338
339   nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
340
341   /* Stores zeros in all outputs. */
342
343   nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
344   nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
345
346   nir_loop *loop = nir_push_loop(&b);
347
348   nir_ssa_def *current_counter = nir_load_var(&b, counter);
349   radv_break_on_count(&b, counter, elem_count);
350
351   nir_ssa_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
352   nir_push_if(&b, result_is_64bit);
353
354   nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
355
356   nir_push_else(&b, NULL);
357
358   nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
359
360   nir_pop_if(&b, NULL);
361
362   nir_pop_loop(&b, loop);
363   nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
364   nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
365   return b.shader;
366}
367
368static nir_shader *
369build_tfb_query_shader(struct radv_device *device)
370{
371   /* the shader this builds is roughly
372    *
373    * uint32_t src_stride = 32;
374    *
375    * location(binding = 0) buffer dst_buf;
376    * location(binding = 1) buffer src_buf;
377    *
378    * void main() {
379    *	uint64_t result[2] = {};
380    *	bool available = false;
381    *	uint64_t src_offset = src_stride * global_id.x;
382    * 	uint64_t dst_offset = dst_stride * global_id.x;
383    * 	uint64_t *src_data = src_buf[src_offset];
384    *	uint32_t avail = (src_data[0] >> 32) &
385    *			 (src_data[1] >> 32) &
386    *			 (src_data[2] >> 32) &
387    *			 (src_data[3] >> 32);
388    *	if (avail & 0x80000000) {
389    *		result[0] = src_data[3] - src_data[1];
390    *		result[1] = src_data[2] - src_data[0];
391    *		available = true;
392    *	}
393    * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
394    * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
395    *		if (flags & VK_QUERY_RESULT_64_BIT) {
396    *			dst_buf[dst_offset] = result;
397    *		} else {
398    *			dst_buf[dst_offset] = (uint32_t)result;
399    *		}
400    *	}
401    *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
402    *		dst_buf[dst_offset + result_size] = available;
403    * 	}
404    * }
405    */
406   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
407   b.shader->info.workgroup_size[0] = 64;
408
409   /* Create and initialize local variables. */
410   nir_variable *result =
411      nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
412   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
413
414   nir_store_var(&b, result, nir_vec2(&b, nir_imm_int64(&b, 0), nir_imm_int64(&b, 0)), 0x3);
415   nir_store_var(&b, available, nir_imm_false(&b), 0x1);
416
417   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
418
419   /* Load resources. */
420   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
421   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
422
423   /* Compute global ID. */
424   nir_ssa_def *global_id = get_global_ids(&b, 1);
425
426   /* Compute src/dst strides. */
427   nir_ssa_def *input_stride = nir_imm_int(&b, 32);
428   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
429   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
430   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
431
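   /* Each transform feedback query occupies 32 bytes: begin/end pairs of the
    * two streamout counters, with bit 31 of every high dword set by the
    * hardware once the corresponding value is valid.
    */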
432   /* Load data from the query pool. */
433   nir_ssa_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
434   nir_ssa_def *load2 =
435      nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
436
437   /* Check if result is available. */
438   nir_ssa_def *avails[2];
439   avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
440   avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
441   nir_ssa_def *result_is_available =
442      nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
443
444   /* Only compute result if available. */
445   nir_push_if(&b, result_is_available);
446
447   /* Pack values. */
448   nir_ssa_def *packed64[4];
449   packed64[0] =
450      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
451   packed64[1] =
452      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
453   packed64[2] =
454      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
455   packed64[3] =
456      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
457
458   /* Compute result. */
459   nir_ssa_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
460   nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
461
462   nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
463   nir_store_var(&b, available, nir_imm_true(&b), 0x1);
464
465   nir_pop_if(&b, NULL);
466
467   /* Determine if result is 64 or 32 bit. */
468   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
469   nir_ssa_def *result_size =
470      nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
471
472   /* Store the result if complete or partial results have been requested. */
473   nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
474                           nir_load_var(&b, available)));
475
476   /* Store result. */
477   nir_push_if(&b, result_is_64bit);
478
479   nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
480
481   nir_push_else(&b, NULL);
482
483   nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
484
485   nir_pop_if(&b, NULL);
486   nir_pop_if(&b, NULL);
487
488   radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
489                           nir_b2i32(&b, nir_load_var(&b, available)));
490
491   return b.shader;
492}
493
494static nir_shader *
495build_timestamp_query_shader(struct radv_device *device)
496{
497   /* the shader this builds is roughly
498    *
499    * uint32_t src_stride = 8;
500    *
501    * location(binding = 0) buffer dst_buf;
502    * location(binding = 1) buffer src_buf;
503    *
504    * void main() {
505    *	uint64_t result = 0;
506    *	bool available = false;
507    *	uint64_t src_offset = src_stride * global_id.x;
508    * 	uint64_t dst_offset = dst_stride * global_id.x;
509    * 	uint64_t timestamp = src_buf[src_offset];
510    *	if (timestamp != TIMESTAMP_NOT_READY) {
511    *		result = timestamp;
512    *		available = true;
513    *	}
514    * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
515    * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
516    *		if (flags & VK_QUERY_RESULT_64_BIT) {
517    *			dst_buf[dst_offset] = result;
518    *		} else {
519    *			dst_buf[dst_offset] = (uint32_t)result;
520    *		}
521    *	}
522    *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
523    *		dst_buf[dst_offset + result_size] = available;
524    * 	}
525    * }
526    */
527   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
528   b.shader->info.workgroup_size[0] = 64;
529
530   /* Create and initialize local variables. */
531   nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
532   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
533
534   nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
535   nir_store_var(&b, available, nir_imm_false(&b), 0x1);
536
537   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
538
539   /* Load resources. */
540   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
541   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
542
543   /* Compute global ID. */
544   nir_ssa_def *global_id = get_global_ids(&b, 1);
545
546   /* Compute src/dst strides. */
547   nir_ssa_def *input_stride = nir_imm_int(&b, 8);
548   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
549   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
550   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
551
552   /* Load data from the query pool. */
553   nir_ssa_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
554
555   /* Pack the timestamp. */
556   nir_ssa_def *timestamp;
557   timestamp =
558      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load, 0), nir_channel(&b, load, 1)));
559
560   /* Check if result is available. */
561   nir_ssa_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
562
563   /* Only store result if available. */
564   nir_push_if(&b, result_is_available);
565
566   nir_store_var(&b, result, timestamp, 0x1);
567   nir_store_var(&b, available, nir_imm_true(&b), 0x1);
568
569   nir_pop_if(&b, NULL);
570
571   /* Determine if result is 64 or 32 bit. */
572   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
573   nir_ssa_def *result_size =
574      nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
575
576   /* Store the result if complete or partial results have been requested. */
577   nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
578                           nir_load_var(&b, available)));
579
580   /* Store result. */
581   nir_push_if(&b, result_is_64bit);
582
583   nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
584
585   nir_push_else(&b, NULL);
586
587   nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
588
589   nir_pop_if(&b, NULL);
590
591   nir_pop_if(&b, NULL);
592
593   radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
594                           nir_b2i32(&b, nir_load_var(&b, available)));
595
596   return b.shader;
597}
598
599static nir_shader *
600build_pg_query_shader(struct radv_device *device)
601{
602   /* the shader this builds is roughly
603    *
604    * uint32_t src_stride = 32;
605    *
606    * location(binding = 0) buffer dst_buf;
607    * location(binding = 1) buffer src_buf;
608    *
609    * void main() {
610    *	uint64_t result = {};
611    *	bool available = false;
612    *	uint64_t src_offset = src_stride * global_id.x;
613    * 	uint64_t dst_offset = dst_stride * global_id.x;
614    * 	uint64_t *src_data = src_buf[src_offset];
615    *	uint32_t avail = (src_data[0] >> 32) &
616    *			 (src_data[2] >> 32);
617    *	if (avail & 0x80000000) {
618    *		result = src_data[2] - src_data[0];
619    *	        if (use_gds) {
620    *			uint64_t ngg_gds_result = 0;
621    *			ngg_gds_result += src_data[5] - src_data[4];
622    *			ngg_gds_result += src_data[7] - src_data[6];
623    *			result += ngg_gds_result;
624    *	        }
625    *		available = true;
626    *	}
627    * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
628    * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
629    *		if (flags & VK_QUERY_RESULT_64_BIT) {
630    *			dst_buf[dst_offset] = result;
631    *		} else {
632    *			dst_buf[dst_offset] = (uint32_t)result;
633    *		}
634    *	}
635    *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
636    *		dst_buf[dst_offset + result_size] = available;
637    * 	}
638    * }
639    */
640   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
641   b.shader->info.workgroup_size[0] = 64;
642
643   /* Create and initialize local variables. */
644   nir_variable *result =
645      nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
646   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
647
648   nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
649   nir_store_var(&b, available, nir_imm_false(&b), 0x1);
650
651   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
652
653   /* Load resources. */
654   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
655   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
656
657   /* Compute global ID. */
658   nir_ssa_def *global_id = get_global_ids(&b, 1);
659
660   /* Compute src/dst strides. */
661   nir_ssa_def *input_stride = nir_imm_int(&b, 32);
662   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
663   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
664   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
665
666   /* Load data from the query pool. */
667   nir_ssa_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
668   nir_ssa_def *load2 = nir_load_ssbo(
669      &b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
670
671   /* Check if result is available. */
672   nir_ssa_def *avails[2];
673   avails[0] = nir_channel(&b, load1, 1);
674   avails[1] = nir_channel(&b, load2, 1);
675   nir_ssa_def *result_is_available =
676      nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000)));
677
678   /* Only compute result if available. */
679   nir_push_if(&b, result_is_available);
680
681   /* Pack values. */
682   nir_ssa_def *packed64[2];
683   packed64[0] =
684      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
685   packed64[1] =
686      nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
687
688   /* Compute result. */
689   nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
690
691   nir_store_var(&b, result, primitive_storage_needed, 0x1);
692
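   /* With NGG, the primitives-generated count is accumulated in GDS; the four
    * extra 64-bit values appended to the 32-byte streamout record hold the
    * begin/end values for NGG GS and NGG VS/TES (see radv_CreateQueryPool).
    */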
693   nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
694   nir_push_if(&b, nir_i2b(&b, uses_gds));
695   {
696      /* NGG GS result */
697      nir_ssa_def *gds_start =
698         nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 8);
699      nir_ssa_def *gds_end =
700         nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 8);
701
702      nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
703
704      /* NGG VS/TES result */
705      gds_start =
706         nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 48)), .align_mul = 8);
707      gds_end =
708         nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 56)), .align_mul = 8);
709
710      ngg_gds_result = nir_iadd(&b, ngg_gds_result, nir_isub(&b, gds_end, gds_start));
711
712      nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
713   }
714   nir_pop_if(&b, NULL);
715
716   nir_store_var(&b, available, nir_imm_true(&b), 0x1);
717
718   nir_pop_if(&b, NULL);
719
720   /* Determine if result is 64 or 32 bit. */
721   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
722   nir_ssa_def *result_size =
723      nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
724
725   /* Store the result if complete or partial results have been requested. */
726   nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
727                           nir_load_var(&b, available)));
728
729   /* Store result. */
730   nir_push_if(&b, result_is_64bit);
731
732   nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
733
734   nir_push_else(&b, NULL);
735
736   nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
737
738   nir_pop_if(&b, NULL);
739   nir_pop_if(&b, NULL);
740
741   radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
742                           nir_b2i32(&b, nir_load_var(&b, available)));
743
744   return b.shader;
745}
746
747static VkResult
748radv_device_init_meta_query_state_internal(struct radv_device *device)
749{
750   VkResult result;
751   nir_shader *occlusion_cs = NULL;
752   nir_shader *pipeline_statistics_cs = NULL;
753   nir_shader *tfb_cs = NULL;
754   nir_shader *timestamp_cs = NULL;
755   nir_shader *pg_cs = NULL;
756
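   /* The meta pipelines may be built lazily from radv_query_shader(); the
    * check below under the mutex keeps concurrent on-demand initialization
    * idempotent.
    */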
757   mtx_lock(&device->meta_state.mtx);
758   if (device->meta_state.query.pipeline_statistics_query_pipeline) {
759      mtx_unlock(&device->meta_state.mtx);
760      return VK_SUCCESS;
761   }
762   occlusion_cs = build_occlusion_query_shader(device);
763   pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
764   tfb_cs = build_tfb_query_shader(device);
765   timestamp_cs = build_timestamp_query_shader(device);
766   pg_cs = build_pg_query_shader(device);
767
768   VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
769      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
770      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
771      .bindingCount = 2,
772      .pBindings = (VkDescriptorSetLayoutBinding[]){
773         {.binding = 0,
774          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
775          .descriptorCount = 1,
776          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
777          .pImmutableSamplers = NULL},
778         {.binding = 1,
779          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
780          .descriptorCount = 1,
781          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
782          .pImmutableSamplers = NULL},
783      }};
784
785   result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info,
786                                           &device->meta_state.alloc,
787                                           &device->meta_state.query.ds_layout);
788   if (result != VK_SUCCESS)
789      goto fail;
790
791   VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
792      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
793      .setLayoutCount = 1,
794      .pSetLayouts = &device->meta_state.query.ds_layout,
795      .pushConstantRangeCount = 1,
796      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20},
797   };
798
799   result =
800      radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info,
801                                &device->meta_state.alloc, &device->meta_state.query.p_layout);
802   if (result != VK_SUCCESS)
803      goto fail;
804
805   VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
806      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
807      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
808      .module = vk_shader_module_handle_from_nir(occlusion_cs),
809      .pName = "main",
810      .pSpecializationInfo = NULL,
811   };
812
813   VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
814      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
815      .stage = occlusion_pipeline_shader_stage,
816      .flags = 0,
817      .layout = device->meta_state.query.p_layout,
818   };
819
820   result = radv_CreateComputePipelines(
821      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
822      &occlusion_vk_pipeline_info, NULL, &device->meta_state.query.occlusion_query_pipeline);
823   if (result != VK_SUCCESS)
824      goto fail;
825
826   VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
827      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
828      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
829      .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
830      .pName = "main",
831      .pSpecializationInfo = NULL,
832   };
833
834   VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
835      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
836      .stage = pipeline_statistics_pipeline_shader_stage,
837      .flags = 0,
838      .layout = device->meta_state.query.p_layout,
839   };
840
841   result = radv_CreateComputePipelines(
842      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
843      &pipeline_statistics_vk_pipeline_info, NULL,
844      &device->meta_state.query.pipeline_statistics_query_pipeline);
845   if (result != VK_SUCCESS)
846      goto fail;
847
848   VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
849      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
850      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
851      .module = vk_shader_module_handle_from_nir(tfb_cs),
852      .pName = "main",
853      .pSpecializationInfo = NULL,
854   };
855
856   VkComputePipelineCreateInfo tfb_pipeline_info = {
857      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
858      .stage = tfb_pipeline_shader_stage,
859      .flags = 0,
860      .layout = device->meta_state.query.p_layout,
861   };
862
863   result = radv_CreateComputePipelines(
864      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
865      &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline);
866   if (result != VK_SUCCESS)
867      goto fail;
868
869   VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
870      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
871      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
872      .module = vk_shader_module_handle_from_nir(timestamp_cs),
873      .pName = "main",
874      .pSpecializationInfo = NULL,
875   };
876
877   VkComputePipelineCreateInfo timestamp_pipeline_info = {
878      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
879      .stage = timestamp_pipeline_shader_stage,
880      .flags = 0,
881      .layout = device->meta_state.query.p_layout,
882   };
883
884   result = radv_CreateComputePipelines(
885      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
886      &timestamp_pipeline_info, NULL, &device->meta_state.query.timestamp_query_pipeline);
887   if (result != VK_SUCCESS)
888      goto fail;
889
890   VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
891      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
892      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
893      .module = vk_shader_module_handle_from_nir(pg_cs),
894      .pName = "main",
895      .pSpecializationInfo = NULL,
896   };
897
898   VkComputePipelineCreateInfo pg_pipeline_info = {
899      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
900      .stage = pg_pipeline_shader_stage,
901      .flags = 0,
902      .layout = device->meta_state.query.p_layout,
903   };
904
905   result = radv_CreateComputePipelines(
906      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
907      &pg_pipeline_info, NULL, &device->meta_state.query.pg_query_pipeline);
908
909fail:
910   ralloc_free(occlusion_cs);
911   ralloc_free(pipeline_statistics_cs);
912   ralloc_free(tfb_cs);
913   ralloc_free(pg_cs);
914   ralloc_free(timestamp_cs);
915   mtx_unlock(&device->meta_state.mtx);
916   return result;
917}
918
919VkResult
920radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
921{
922   if (on_demand)
923      return VK_SUCCESS;
924
925   return radv_device_init_meta_query_state_internal(device);
926}
927
928void
929radv_device_finish_meta_query_state(struct radv_device *device)
930{
931   if (device->meta_state.query.tfb_query_pipeline)
932      radv_DestroyPipeline(radv_device_to_handle(device),
933                           device->meta_state.query.tfb_query_pipeline, &device->meta_state.alloc);
934
935   if (device->meta_state.query.pipeline_statistics_query_pipeline)
936      radv_DestroyPipeline(radv_device_to_handle(device),
937                           device->meta_state.query.pipeline_statistics_query_pipeline,
938                           &device->meta_state.alloc);
939
940   if (device->meta_state.query.occlusion_query_pipeline)
941      radv_DestroyPipeline(radv_device_to_handle(device),
942                           device->meta_state.query.occlusion_query_pipeline,
943                           &device->meta_state.alloc);
944
945   if (device->meta_state.query.timestamp_query_pipeline)
946      radv_DestroyPipeline(radv_device_to_handle(device),
947                           device->meta_state.query.timestamp_query_pipeline,
948                           &device->meta_state.alloc);
949
950   if (device->meta_state.query.pg_query_pipeline)
951      radv_DestroyPipeline(radv_device_to_handle(device),
952                           device->meta_state.query.pg_query_pipeline, &device->meta_state.alloc);
953
954   if (device->meta_state.query.p_layout)
955      radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
956                                 &device->meta_state.alloc);
957
958   if (device->meta_state.query.ds_layout)
959      device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device),
960                                                           device->meta_state.query.ds_layout,
961                                                           &device->meta_state.alloc);
962}
963
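/* Resolves 'count' query results from the pool BO into the destination buffer
 * on the GPU using one of the meta compute pipelines above; each shader
 * invocation handles a single query.
 */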
964static void
965radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline,
966                  struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo,
967                  uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
968                  uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags,
969                  uint32_t pipeline_stats_mask, uint32_t avail_offset, bool uses_gds)
970{
971   struct radv_device *device = cmd_buffer->device;
972   struct radv_meta_saved_state saved_state;
973   struct radv_buffer src_buffer, dst_buffer;
974
975   if (!*pipeline) {
976      VkResult ret = radv_device_init_meta_query_state_internal(device);
977      if (ret != VK_SUCCESS) {
978         cmd_buffer->record_result = ret;
979         return;
980      }
981   }
982
983   /* VK_EXT_conditional_rendering says that copy commands should not be
984    * affected by conditional rendering.
985    */
986   radv_meta_save(&saved_state, cmd_buffer,
987                  RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS |
988                     RADV_META_SAVE_DESCRIPTORS | RADV_META_SUSPEND_PREDICATING);
989
990   uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
991   uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
992
993   radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
994   radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
995
996   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
997                        *pipeline);
998
999   radv_meta_push_descriptor_set(
1000      cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */
1001      2, /* descriptorWriteCount */
1002      (VkWriteDescriptorSet[]){
1003         {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1004          .dstBinding = 0,
1005          .dstArrayElement = 0,
1006          .descriptorCount = 1,
1007          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1008          .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1009                                                   .offset = 0,
1010                                                   .range = VK_WHOLE_SIZE}},
1011         {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1012          .dstBinding = 1,
1013          .dstArrayElement = 0,
1014          .descriptorCount = 1,
1015          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1016          .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1017                                                   .offset = 0,
1018                                                   .range = VK_WHOLE_SIZE}}});
1019
1020   /* Encode the number of elements for easy access by the shader. */
1021   pipeline_stats_mask &= 0x7ff;
1022   pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1023
1024   avail_offset -= src_offset;
1025
1026   struct {
1027      uint32_t flags;
1028      uint32_t dst_stride;
1029      uint32_t pipeline_stats_mask;
1030      uint32_t avail_offset;
1031      uint32_t uses_gds;
1032   } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
1033
1034   radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1035                         VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1036
1037   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1038
1039   if (flags & VK_QUERY_RESULT_WAIT_BIT)
1040      cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1041
1042   radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1043
1044   /* Ensure that the query copy dispatch is complete before a potential vkCmdResetQueryPool because
1045    * there is an implicit execution dependency from each such query command to all query commands
1046    * previously submitted to the same queue.
1047    */
1048   cmd_buffer->active_query_flush_bits |=
1049      RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1050
1051   radv_buffer_finish(&src_buffer);
1052   radv_buffer_finish(&dst_buffer);
1053
1054   radv_meta_restore(&saved_state, cmd_buffer);
1055}
1056
1057static void
1058radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1059                        struct radv_query_pool *pool)
1060{
1061   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1062      radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1063
1064   if (pool->bo)
1065      device->ws->buffer_destroy(device->ws, pool->bo);
1066   vk_object_base_finish(&pool->base);
1067   vk_free2(&device->vk.alloc, pAllocator, pool);
1068}
1069
1070VKAPI_ATTR VkResult VKAPI_CALL
1071radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
1072                     const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1073{
1074   RADV_FROM_HANDLE(radv_device, device, _device);
1075   VkResult result;
1076   size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1077                                ? sizeof(struct radv_pc_query_pool)
1078                                : sizeof(struct radv_query_pool);
1079
1080   struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8,
1081                                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1082
1083   if (!pool)
1084      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1085
1086   vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
1087
1088   pool->type = pCreateInfo->queryType;
1089   pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1090
1091   /* The number of primitives generated by geometry shader invocations is only counted by the
1092    * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1093    * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1094    */
1095   pool->uses_gds = device->physical_device->use_ngg &&
1096                    ((pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) ||
1097                     pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1098
1099   switch (pCreateInfo->queryType) {
1100   case VK_QUERY_TYPE_OCCLUSION:
1101      pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
1102      break;
1103   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1104      pool->stride = pipelinestat_block_size * 2;
1105      if (pool->uses_gds) {
1106         /* When the query pool needs GDS (for counting the number of primitives generated by a
1107          * geometry shader with NGG), allocate 2x64-bit values for begin/end.
1108          */
1109         pool->stride += 8 * 2;
1110      }
1111      break;
1112   case VK_QUERY_TYPE_TIMESTAMP:
1113   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1114   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1115   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1116   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1117      pool->stride = 8;
1118      break;
1119   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1120   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1121      pool->stride = 32;
1122      if (pool->uses_gds) {
1123         /* When the query pool needs GDS, allocate 4x64-bit values for begin/end of NGG GS and
1124          * NGG VS/TES because they use a different offset.
1125          */
1126         pool->stride += 8 * 4;
1127      }
1128      break;
1129   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1130      result = radv_pc_init_query_pool(device->physical_device, pCreateInfo,
1131                                       (struct radv_pc_query_pool *)pool);
1132
1133      if (result != VK_SUCCESS) {
1134         radv_destroy_query_pool(device, pAllocator, pool);
1135         return vk_error(device, result);
1136      }
1137      break;
1138   }
1139   default:
1140      unreachable("creating unhandled query type");
1141   }
1142
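   /* Pool memory layout: queryCount fixed-stride result slots, optionally
    * followed by one 32-bit availability word per query for pipeline
    * statistics queries.
    */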
1143   pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1144   pool->size = pool->availability_offset;
1145   if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1146      pool->size += 4 * pCreateInfo->queryCount;
1147
1148   result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
1149                                      RADEON_FLAG_NO_INTERPROCESS_SHARING,
1150                                      RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
1151   if (result != VK_SUCCESS) {
1152      radv_destroy_query_pool(device, pAllocator, pool);
1153      return vk_error(device, result);
1154   }
1155
1156   pool->ptr = device->ws->buffer_map(pool->bo);
1157   if (!pool->ptr) {
1158      radv_destroy_query_pool(device, pAllocator, pool);
1159      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1160   }
1161
1162   *pQueryPool = radv_query_pool_to_handle(pool);
1163   return VK_SUCCESS;
1164}
1165
1166VKAPI_ATTR void VKAPI_CALL
1167radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1168{
1169   RADV_FROM_HANDLE(radv_device, device, _device);
1170   RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1171
1172   if (!pool)
1173      return;
1174
1175   radv_destroy_query_pool(device, pAllocator, pool);
1176}
1177
1178VKAPI_ATTR VkResult VKAPI_CALL
1179radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1180                         uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride,
1181                         VkQueryResultFlags flags)
1182{
1183   RADV_FROM_HANDLE(radv_device, device, _device);
1184   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1185   char *data = pData;
1186   VkResult result = VK_SUCCESS;
1187
1188   if (vk_device_is_lost(&device->vk))
1189      return VK_ERROR_DEVICE_LOST;
1190
1191   for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1192      char *dest = data;
1193      unsigned query = firstQuery + query_idx;
1194      char *src = pool->ptr + query * pool->stride;
1195      uint32_t available;
1196
1197      switch (pool->type) {
1198      case VK_QUERY_TYPE_TIMESTAMP:
1199      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1200      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1201      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1202      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1203         uint64_t const *src64 = (uint64_t const *)src;
1204         uint64_t value;
1205
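         /* The query slot is assumed to be initialized to TIMESTAMP_NOT_READY
          * on reset, so any other value means the GPU has written the
          * timestamp.
          */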
1206         do {
1207            value = p_atomic_read(src64);
1208         } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT));
1209
1210         available = value != TIMESTAMP_NOT_READY;
1211
1212         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1213            result = VK_NOT_READY;
1214
1215         if (flags & VK_QUERY_RESULT_64_BIT) {
1216            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1217               *(uint64_t *)dest = value;
1218            dest += 8;
1219         } else {
1220            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1221               *(uint32_t *)dest = (uint32_t)value;
1222            dest += 4;
1223         }
1224         break;
1225      }
1226      case VK_QUERY_TYPE_OCCLUSION: {
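         /* Mirror the occlusion copy shader: sum end - start over every
          * enabled render backend, treating bit 63 as the "value has landed"
          * flag, optionally spinning when VK_QUERY_RESULT_WAIT_BIT is set.
          */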
1227         uint64_t const *src64 = (uint64_t const *)src;
1228         uint32_t db_count = device->physical_device->rad_info.max_render_backends;
1229         uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1230         uint64_t sample_count = 0;
1231         available = 1;
1232
1233         for (int i = 0; i < db_count; ++i) {
1234            uint64_t start, end;
1235
1236            if (!(enabled_rb_mask & (1 << i)))
1237               continue;
1238
1239            do {
1240               start = p_atomic_read(src64 + 2 * i);
1241               end = p_atomic_read(src64 + 2 * i + 1);
1242            } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) &&
1243                     (flags & VK_QUERY_RESULT_WAIT_BIT));
1244
1245            if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1246               available = 0;
1247            else {
1248               sample_count += end - start;
1249            }
1250         }
1251
1252         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1253            result = VK_NOT_READY;
1254
1255         if (flags & VK_QUERY_RESULT_64_BIT) {
1256            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1257               *(uint64_t *)dest = sample_count;
1258            dest += 8;
1259         } else {
1260            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1261               *(uint32_t *)dest = sample_count;
1262            dest += 4;
1263         }
1264         break;
1265      }
1266      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1267         const uint32_t *avail_ptr =
1268            (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1269         uint64_t ngg_gds_result = 0;
1270
1271         do {
1272            available = p_atomic_read(avail_ptr);
1273         } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
1274
1275         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1276            result = VK_NOT_READY;
1277
1278         if (pool->uses_gds) {
1279            /* Compute the result that was copied from GDS. */
1280            const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2);
1281            const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
1282
1283            ngg_gds_result = gds_stop[0] - gds_start[0];
1284         }
1285
1286         const uint64_t *start = (uint64_t *)src;
1287         const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1288         if (flags & VK_QUERY_RESULT_64_BIT) {
1289            uint64_t *dst = (uint64_t *)dest;
1290            dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1291            for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1292               if (pool->pipeline_stats_mask & (1u << i)) {
1293                  if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1294                     *dst = stop[pipeline_statistics_indices[i]] -
1295                            start[pipeline_statistics_indices[i]];
1296
1297                     if (pool->uses_gds &&
1298                         (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1299                        *dst += ngg_gds_result;
1300                     }
1301                  }
1302                  dst++;
1303               }
1304            }
1305
1306         } else {
1307            uint32_t *dst = (uint32_t *)dest;
1308            dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1309            for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1310               if (pool->pipeline_stats_mask & (1u << i)) {
1311                  if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1312                     *dst = stop[pipeline_statistics_indices[i]] -
1313                            start[pipeline_statistics_indices[i]];
1314
1315                     if (pool->uses_gds &&
1316                         (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1317                        *dst += ngg_gds_result;
1318                     }
1319                  }
1320                  dst++;
1321               }
1322            }
1323         }
1324         break;
1325      }
1326      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1327         uint64_t const *src64 = (uint64_t const *)src;
1328         uint64_t num_primitives_written;
1329         uint64_t primitive_storage_needed;
1330
1331         /* SAMPLE_STREAMOUTSTATS stores this structure:
1332          * {
1333          *	u64 NumPrimitivesWritten;
1334          *	u64 PrimitiveStorageNeeded;
1335          * }
1336          */
1337         available = 1;
1338         for (int j = 0; j < 4; j++) {
1339            if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL))
1340               available = 0;
1341         }
1342
1343         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1344            result = VK_NOT_READY;
1345
1346         num_primitives_written = src64[3] - src64[1];
1347         primitive_storage_needed = src64[2] - src64[0];
1348
1349         if (flags & VK_QUERY_RESULT_64_BIT) {
1350            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1351               *(uint64_t *)dest = num_primitives_written;
1352            dest += 8;
1353            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1354               *(uint64_t *)dest = primitive_storage_needed;
1355            dest += 8;
1356         } else {
1357            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1358               *(uint32_t *)dest = num_primitives_written;
1359            dest += 4;
1360            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1361               *(uint32_t *)dest = primitive_storage_needed;
1362            dest += 4;
1363         }
1364         break;
1365      }
1366      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1367         uint64_t const *src64 = (uint64_t const *)src;
1368         uint64_t primitive_storage_needed;
1369
1370         /* SAMPLE_STREAMOUTSTATS stores this structure:
1371          * {
1372          *	u64 NumPrimitivesWritten;
1373          *	u64 PrimitiveStorageNeeded;
1374          * }
1375          */
1376         available = 1;
1377         if (!(p_atomic_read(src64 + 0) & 0x8000000000000000UL) ||
1378             !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) {
1379            available = 0;
1380         }
1381
1382         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1383            result = VK_NOT_READY;
1384
1385         primitive_storage_needed = src64[2] - src64[0];
1386
1387         if (pool->uses_gds) {
1388            /* Accumulate the result that was copied from GDS in case NGG GS or NGG VS/TES has been
1389             * used.
1390             */
1391            primitive_storage_needed += src64[5] - src64[4]; /* NGG GS */
1392            primitive_storage_needed += src64[7] - src64[6]; /* NGG VS/TES */
1393         }
1394
1395         if (flags & VK_QUERY_RESULT_64_BIT) {
1396            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1397               *(uint64_t *)dest = primitive_storage_needed;
1398            dest += 8;
1399         } else {
1400            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1401               *(uint32_t *)dest = primitive_storage_needed;
1402            dest += 4;
1403         }
1404         break;
1405      }
1406      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1407         struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1408         const uint64_t *src64 = (const uint64_t *)src;
1409         bool avail;
1410         do {
1411            avail = true;
1412            for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1413               if (!p_atomic_read(src64 + pool->stride / 8 - i - 1))
1414                  avail = false;
1415         } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT));
1416
1417         available = avail;
1418
1419         radv_pc_get_results(pc_pool, src64, dest);
1420         dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1421         break;
1422      }
1423      default:
1424         unreachable("trying to get results of unhandled query type");
1425      }
1426
1427      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1428         if (flags & VK_QUERY_RESULT_64_BIT) {
1429            *(uint64_t *)dest = available;
1430         } else {
1431            *(uint32_t *)dest = available;
1432         }
1433      }
1434   }
1435
1436   return result;
1437}
1438
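/* Flush caches before GPU reads of the pool if a previous reset in this command buffer used
 * the compute shader path; small pools are reset with a CP DMA clear and need no flush.
 */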
1439static void
1440emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1441{
1442   if (cmd_buffer->pending_reset_query) {
1443      if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1444         /* Only need to flush caches if the query pool size is
1445          * large enough to be reset using the compute shader
1446          * path. Small pools don't need any cache flushes
1447          * because we use a CP DMA clear.
1448          */
1449         si_emit_cache_flush(cmd_buffer);
1450      }
1451   }
1452}
1453
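/* Size in bytes of one query result in the destination buffer: the number of result values
 * (plus the optional availability value) times 4 or 8 depending on VK_QUERY_RESULT_64_BIT.
 */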
1454static size_t
1455radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1456{
1457   unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1458   switch (pool->type) {
1459   case VK_QUERY_TYPE_TIMESTAMP:
1460   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1461   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1462   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1463   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1464   case VK_QUERY_TYPE_OCCLUSION:
1465      values += 1;
1466      break;
1467   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1468      values += util_bitcount(pool->pipeline_stats_mask);
1469      break;
1470   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1471      values += 2;
1472      break;
1473   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1474      values += 1;
1475      break;
1476   default:
1477      unreachable("trying to get size of unhandled query type");
1478   }
1479   return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1480}
1481
1482VKAPI_ATTR void VKAPI_CALL
1483radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
1484                             uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer,
1485                             VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)
1486{
1487   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1488   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1489   RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1490   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1491   uint64_t va = radv_buffer_get_va(pool->bo);
1492   uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1493   size_t dst_size = radv_query_result_size(pool, flags);
1494   dest_va += dst_buffer->offset + dstOffset;
1495
1496   if (!queryCount)
1497      return;
1498
1499   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1500   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1501
1502   /* Work around engines that forget to properly specify WAIT_BIT because some drivers implicitly
1503    * synchronize before the query copy.
1504    */
1505   if (cmd_buffer->device->instance->flush_before_query_copy)
1506      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1507
1508   /* From the Vulkan spec 1.1.108:
1509    *
1510    * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1511    *  previous uses of vkCmdResetQueryPool in the same queue, without any
1512    *  additional synchronization."
1513    *
1514    * So, we have to flush the caches if the compute shader path was used.
1515    */
1516   emit_query_flush(cmd_buffer, pool);
1517
1518   switch (pool->type) {
1519   case VK_QUERY_TYPE_OCCLUSION:
1520      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1521         unsigned enabled_rb_mask = cmd_buffer->device->physical_device->rad_info.enabled_rb_mask;
1522         uint32_t rb_avail_offset = 16 * util_last_bit(enabled_rb_mask) - 4;
1523         for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1524            unsigned query = firstQuery + i;
1525            uint64_t src_va = va + query * pool->stride + rb_avail_offset;
1526
1527            radeon_check_space(cmd_buffer->device->ws, cs, 7);
1528
1529            /* Waits on the upper word of the last DB entry */
1530            radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
1531         }
1532      }
1533      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1534                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1535                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1536                        flags, 0, 0, false);
1537      break;
1538   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1539      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1540         for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1541            unsigned query = firstQuery + i;
1542
1543            radeon_check_space(cmd_buffer->device->ws, cs, 7);
1544
1545            uint64_t avail_va = va + pool->availability_offset + 4 * query;
1546
1547            /* This waits on the ME. All copies below are done on the ME */
1548            radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1549         }
1550      }
1551      radv_query_shader(
1552         cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1553         pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset,
1554         pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask,
1555         pool->availability_offset + 4 * firstQuery, pool->uses_gds);
1556      break;
1557   case VK_QUERY_TYPE_TIMESTAMP:
1558   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1559   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1560   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1561   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1562      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1563         for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1564            unsigned query = firstQuery + i;
1565            uint64_t local_src_va = va + query * pool->stride;
1566
1567            radeon_check_space(cmd_buffer->device->ws, cs, 7);
1568
1569            /* Wait on the high 32 bits of the timestamp in
1570             * case the low part is 0xffffffff.
1571             */
1572            radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4,
1573                             TIMESTAMP_NOT_READY >> 32, 0xffffffff);
1574         }
1575      }
1576
1577      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
1578                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1579                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1580                        flags, 0, 0, false);
1581      break;
1582   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1583      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1584         for (unsigned i = 0; i < queryCount; i++) {
1585            unsigned query = firstQuery + i;
1586            uint64_t src_va = va + query * pool->stride;
1587
1588            radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1589
1590            /* Wait on the upper word of all results. */
1591            for (unsigned j = 0; j < 4; j++, src_va += 8) {
1592               radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000,
1593                                0xffffffff);
1594            }
1595         }
1596      }
1597
1598      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
1599                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1600                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1601                        flags, 0, 0, false);
1602      break;
1603   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1604      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1605         for (unsigned i = 0; i < queryCount; i++) {
1606            unsigned query = firstQuery + i;
1607            uint64_t src_va = va + query * pool->stride;
1608
1609            radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1610
1611            /* Wait on the upper word of the begin/end PrimitiveStorageNeeded results. */
1612            radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1613            radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1614         }
1615      }
1616
1617      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline,
1618                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1619                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1620                        flags, 0, 0, pool->uses_gds);
1621      break;
1622   default:
1623      unreachable("trying to get results of unhandled query type");
1624   }
1625}
1626
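/* Value the query pool memory is filled with on reset: timestamp-like queries use
 * TIMESTAMP_NOT_READY to mark slots as not ready, everything else is zeroed.
 */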
1627static uint32_t
1628query_clear_value(VkQueryType type)
1629{
1630   switch (type) {
1631   case VK_QUERY_TYPE_TIMESTAMP:
1632   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1633   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1634   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1635   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1636      return (uint32_t)TIMESTAMP_NOT_READY;
1637   default:
1638      return 0;
1639   }
1640}
1641
1642VKAPI_ATTR void VKAPI_CALL
1643radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1644                       uint32_t queryCount)
1645{
1646   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1647   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1648   uint32_t value = query_clear_value(pool->type);
1649   uint32_t flush_bits = 0;
1650
1651   /* Make sure to sync all previous work if the given command buffer has
1652    * pending active queries. Otherwise the GPU might write query data
1653    * after the reset operation.
1654    */
1655   cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1656
1657   flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1658                                  radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1659                                  queryCount * pool->stride, value);
1660
1661   if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1662      flush_bits |=
1663         radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1664                          radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4,
1665                          queryCount * 4, 0);
1666   }
1667
1668   if (flush_bits) {
1669      /* Only need to flush caches for the compute shader path. */
1670      cmd_buffer->pending_reset_query = true;
1671      cmd_buffer->state.flush_bits |= flush_bits;
1672   }
1673}
1674
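/* Host-side reset: fill the query slots with the clear value and zero the availability words
 * for pipeline statistics pools.
 */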
1675VKAPI_ATTR void VKAPI_CALL
1676radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1677                    uint32_t queryCount)
1678{
1679   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1680
1681   uint32_t value = query_clear_value(pool->type);
1682   uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1683   uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1684
1685   for (uint32_t *p = data; p != data_end; ++p)
1686      *p = value;
1687
1688   if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1689      memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1690   }
1691}
1692
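/* Map a streamout stream index to the corresponding SAMPLE_STREAMOUTSTATS* event type. */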
1693static unsigned
1694event_type_for_stream(unsigned stream)
1695{
1696   switch (stream) {
1697   default:
1698   case 0:
1699      return V_028A90_SAMPLE_STREAMOUTSTATS;
1700   case 1:
1701      return V_028A90_SAMPLE_STREAMOUTSTATS1;
1702   case 2:
1703      return V_028A90_SAMPLE_STREAMOUTSTATS2;
1704   case 3:
1705      return V_028A90_SAMPLE_STREAMOUTSTATS3;
1706   }
1707}
1708
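/* Sample the streamout statistics for the given stream into the query buffer at 'va'. */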
1709static void
1710emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1711{
1712   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1713
1714   radeon_check_space(cmd_buffer->device->ws, cs, 4);
1715
1716   assert(index < MAX_SO_STREAMS);
1717
1718   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1719   radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1720   radeon_emit(cs, va);
1721   radeon_emit(cs, va >> 32);
1722}
1723
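/* Copy the GDS value at 'gds_offset' to memory at 'va' with COPY_DATA, after making sure GDS
 * is idle.
 */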
1724static void
1725gfx10_copy_gds_query(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
1726{
1727   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1728
1729   /* Make sure GDS is idle before copying the value. */
1730   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
1731   si_emit_cache_flush(cmd_buffer);
1732
1733   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1734   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
1735                   COPY_DATA_WR_CONFIRM);
1736   radeon_emit(cs, gds_offset);
1737   radeon_emit(cs, 0);
1738   radeon_emit(cs, va);
1739   radeon_emit(cs, va >> 32);
1740}
1741
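/* Emit the packets that start sampling a query of the given type at 'va' and update the
 * command buffer state (active query counts, DB_COUNT_CONTROL, streamout enable, GDS usage).
 */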
1742static void
1743emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1744                 VkQueryType query_type, VkQueryControlFlags flags, uint32_t index)
1745{
1746   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1747   switch (query_type) {
1748   case VK_QUERY_TYPE_OCCLUSION:
1749      radeon_check_space(cmd_buffer->device->ws, cs, 7);
1750
1751      ++cmd_buffer->state.active_occlusion_queries;
1752      if (cmd_buffer->state.active_occlusion_queries == 1) {
1753         if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1754            /* This is the first occlusion query, so enable
1755             * the hint if the precision bit is set.
1756             */
1757            cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1758         }
1759
1760         radv_set_db_count_control(cmd_buffer, true);
1761      } else {
1762         if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1763             !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1764            /* This is not the first query, but this one
1765             * needs precise results, so DB_COUNT_CONTROL
1766             * has to be updated accordingly.
1767             */
1768            cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1769
1770            radv_set_db_count_control(cmd_buffer, true);
1771         }
1772      }
1773
1774      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1775         uint64_t rb_mask =
1776            BITFIELD64_MASK(cmd_buffer->device->physical_device->rad_info.max_render_backends);
1777
1778         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1779         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
1780         radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
1781                         PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
1782                         PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
1783         radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
1784      }
1785
1786      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1787
1788      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1789         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1790      } else {
1791         radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1792      }
1793
1794      radeon_emit(cs, va);
1795      radeon_emit(cs, va >> 32);
1796      break;
1797   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1798      radeon_check_space(cmd_buffer->device->ws, cs, 4);
1799
1800      ++cmd_buffer->state.active_pipeline_queries;
1801      if (cmd_buffer->state.active_pipeline_queries == 1) {
1802         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1803         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1804      }
1805
1806      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1807      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1808      radeon_emit(cs, va);
1809      radeon_emit(cs, va >> 32);
1810
1811      if (pool->uses_gds) {
1812         va += pipelinestat_block_size * 2;
1813
1814         gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */
1815
1816         /* Record that the command buffer needs GDS. */
1817         cmd_buffer->gds_needed = true;
1818
1819         cmd_buffer->state.active_pipeline_gds_queries++;
1820      }
1821      break;
1822   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1823      emit_sample_streamout(cmd_buffer, va, index);
1824      break;
1825   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1826      if (!cmd_buffer->state.prims_gen_query_enabled) {
1827         bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1828
1829         cmd_buffer->state.prims_gen_query_enabled = true;
1830
1831         if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1832            radv_emit_streamout_enable(cmd_buffer);
1833         }
1834      }
1835
1836      emit_sample_streamout(cmd_buffer, va, index);
1837
1838      if (pool->uses_gds) {
1839         gfx10_copy_gds_query(cmd_buffer, 0, va + 32); /* NGG GS */
1840         gfx10_copy_gds_query(cmd_buffer, 4, va + 48); /* NGG VS/TES */
1841
1842         /* Record that the command buffer needs GDS. */
1843         cmd_buffer->gds_needed = true;
1844
1845         cmd_buffer->state.active_pipeline_gds_queries++;
1846      }
1847      break;
1848   }
1849   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1850      radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1851      break;
1852   }
1853   default:
1854      unreachable("beginning unhandled query type");
1855   }
1856}
1857
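/* Emit the packets that stop sampling a query at 'va', write the availability value where the
 * query type requires it, and record the flushes needed before results can be copied.
 */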
1858static void
1859emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1860               uint64_t avail_va, VkQueryType query_type, uint32_t index)
1861{
1862   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1863   switch (query_type) {
1864   case VK_QUERY_TYPE_OCCLUSION:
1865      radeon_check_space(cmd_buffer->device->ws, cs, 14);
1866
1867      cmd_buffer->state.active_occlusion_queries--;
1868      if (cmd_buffer->state.active_occlusion_queries == 0) {
1869         radv_set_db_count_control(cmd_buffer, false);
1870
1871         /* Reset the perfect occlusion queries hint now that no
1872          * queries are active.
1873          */
1874         cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1875      }
1876
1877      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1878      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1879         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1880      } else {
1881         radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1882      }
1883      radeon_emit(cs, va + 8);
1884      radeon_emit(cs, (va + 8) >> 32);
1885
1886      break;
1887   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1888      radeon_check_space(cmd_buffer->device->ws, cs, 16);
1889
1890      cmd_buffer->state.active_pipeline_queries--;
1891      if (cmd_buffer->state.active_pipeline_queries == 0) {
1892         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1893         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1894      }
1895      va += pipelinestat_block_size;
1896
1897      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1898      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1899      radeon_emit(cs, va);
1900      radeon_emit(cs, va >> 32);
1901
1902      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
1903                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
1904                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
1905                                 cmd_buffer->gfx9_eop_bug_va);
1906
1907      if (pool->uses_gds) {
1908         va += pipelinestat_block_size + 8;
1909
1910         gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */
1911
1912         cmd_buffer->state.active_pipeline_gds_queries--;
1913      }
1914      break;
1915   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1916      emit_sample_streamout(cmd_buffer, va + 16, index);
1917      break;
1918   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1919      if (cmd_buffer->state.prims_gen_query_enabled) {
1920         bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1921
1922         cmd_buffer->state.prims_gen_query_enabled = false;
1923
1924         if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1925            radv_emit_streamout_enable(cmd_buffer);
1926         }
1927      }
1928
1929      emit_sample_streamout(cmd_buffer, va + 16, index);
1930
1931      if (pool->uses_gds) {
1932         gfx10_copy_gds_query(cmd_buffer, 0, va + 40); /* NGG GS */
1933         gfx10_copy_gds_query(cmd_buffer, 4, va + 56); /* NGG VS/TES */
1934
1935         cmd_buffer->state.active_pipeline_gds_queries--;
1936      }
1937      break;
1938   }
1939   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1940      radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1941      break;
1942   }
1943   default:
1944      unreachable("ending unhandled query type");
1945   }
1946
1947   cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1948                                          RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
1949                                          RADV_CMD_FLAG_INV_VCACHE;
1950   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
1951      cmd_buffer->active_query_flush_bits |=
1952         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
1953   }
1954}
1955
1956VKAPI_ATTR void VKAPI_CALL
1957radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1958                             VkQueryControlFlags flags, uint32_t index)
1959{
1960   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1961   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1962   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1963   uint64_t va = radv_buffer_get_va(pool->bo);
1964
1965   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1966
1967   emit_query_flush(cmd_buffer, pool);
1968
1969   va += pool->stride * query;
1970
1971   emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
1972}
1973
1974VKAPI_ATTR void VKAPI_CALL
1975radv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1976                   VkQueryControlFlags flags)
1977{
1978   radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
1979}
1980
1981VKAPI_ATTR void VKAPI_CALL
1982radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1983                           uint32_t index)
1984{
1985   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1986   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1987   uint64_t va = radv_buffer_get_va(pool->bo);
1988   uint64_t avail_va = va + pool->availability_offset + 4 * query;
1989   va += pool->stride * query;
1990
1991   /* There is no need to add the pool BO to the list because the query must
1992    * currently be active, which means the BO is already in the list.
1993    */
1994   emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
1995
1996   /*
1997    * For multiview we have to emit a query for each bit in the mask;
1998    * however, the first query we emit will get the totals for all the
1999    * views, so we don't want a real value in the other queries. This
2000    * emits a fake begin/end sequence so the waiting code gets a
2001    * completed query value and doesn't hang, but the query itself
2002    * returns 0.
2003    */
2004   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
2005      for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
2006         va += pool->stride;
2007         avail_va += 4;
2008         emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
2009         emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
2010      }
2011   }
2012}
2013
2014VKAPI_ATTR void VKAPI_CALL
2015radv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)
2016{
2017   radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
2018}
2019
2020VKAPI_ATTR void VKAPI_CALL
2021radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
2022                        VkQueryPool queryPool, uint32_t query)
2023{
2024   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2025   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2026   bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
2027   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2028   uint64_t va = radv_buffer_get_va(pool->bo);
2029   uint64_t query_va = va + pool->stride * query;
2030
2031   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2032
2033   emit_query_flush(cmd_buffer, pool);
2034
2035   int num_queries = 1;
2036   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
2037      num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
2038
2039   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2040
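   /* For TOP_OF_PIPE, the timestamp is written with COPY_DATA from the GPU timestamp counter;
    * any other stage uses a bottom-of-pipe EOP event to write the timestamp.
    */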
2041   for (unsigned i = 0; i < num_queries; i++) {
2042      if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2043         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2044         radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
2045                            COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | COPY_DATA_DST_SEL(V_370_MEM));
2046         radeon_emit(cs, 0);
2047         radeon_emit(cs, 0);
2048         radeon_emit(cs, query_va);
2049         radeon_emit(cs, query_va >> 32);
2050      } else {
2051         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
2052                                    mec, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
2053                                    EOP_DATA_SEL_TIMESTAMP, query_va, 0,
2054                                    cmd_buffer->gfx9_eop_bug_va);
2055      }
2056      query_va += pool->stride;
2057   }
2058
2059   cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
2060                                          RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
2061                                          RADV_CMD_FLAG_INV_VCACHE;
2062   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2063      cmd_buffer->active_query_flush_bits |=
2064         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2065   }
2066
2067   assert(cmd_buffer->cs->cdw <= cdw_max);
2068}
2069
2070VKAPI_ATTR void VKAPI_CALL
2071radv_CmdWriteAccelerationStructuresPropertiesKHR(
2072   VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2073   const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,
2074   VkQueryPool queryPool, uint32_t firstQuery)
2075{
2076   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2077   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2078   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2079   uint64_t pool_va = radv_buffer_get_va(pool->bo);
2080   uint64_t query_va = pool_va + pool->stride * firstQuery;
2081
2082   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2083
2084   emit_query_flush(cmd_buffer, pool);
2085
2086   ASSERTED unsigned cdw_max =
2087      radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);
2088
2089   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2090      RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2091      uint64_t va = radv_accel_struct_get_va(accel_struct);
2092
2093      switch (queryType) {
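      /* Each queried property lives in the acceleration structure header; copy the requested
       * field into the query slot.
       */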
2094      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2095         va += offsetof(struct radv_accel_struct_header, compacted_size);
2096         break;
2097      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2098         va += offsetof(struct radv_accel_struct_header, serialization_size);
2099         break;
2100      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2101         va += offsetof(struct radv_accel_struct_header, instance_count);
2102         break;
2103      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2104         va += offsetof(struct radv_accel_struct_header, size);
2105         break;
2106      default:
2107         unreachable("Unhandled accel struct query type.");
2108      }
2109
2110      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2111      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2112                         COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2113      radeon_emit(cs, va);
2114      radeon_emit(cs, va >> 32);
2115      radeon_emit(cs, query_va);
2116      radeon_emit(cs, query_va >> 32);
2117
2118      query_va += pool->stride;
2119   }
2120
2121   assert(cmd_buffer->cs->cdw <= cdw_max);
2122}
2123