/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>

#include "nir/nir_builder.h"
#include "util/u_atomic.h"
#include "vulkan/vulkan_core.h"
#include "radv_acceleration_structure.h"
#include "radv_cs.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "sid.h"

#define TIMESTAMP_NOT_READY UINT64_MAX

static const int pipelinestat_block_size = 11 * 8;
static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};

static void
radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
                        nir_ssa_def *offset, nir_ssa_def *value32)
{
   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));

   nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);

   nir_push_else(b, NULL);

   nir_store_ssbo(b, value32, dst_buf, offset);

   nir_pop_if(b, NULL);

   nir_pop_if(b, NULL);
}

static nir_shader *
build_occlusion_query_shader(struct radv_device *device)
{
   /* the shader this builds is roughly
    *
    * push constants {
    *   uint32_t flags;
    *   uint32_t dst_stride;
    * };
    *
    * uint32_t src_stride = 16 * db_count;
    *
    * location(binding = 0) buffer dst_buf;
    * location(binding = 1) buffer src_buf;
    *
    * void main() {
    *   uint64_t result = 0;
    *   uint64_t src_offset = src_stride * global_id.x;
    *   uint64_t dst_offset = dst_stride * global_id.x;
    *   bool available = true;
    *   for (int i = 0; i < db_count; ++i) {
    *      if (enabled_rb_mask & (1 << i)) {
    *         uint64_t start = src_buf[src_offset + 16 * i];
    *         uint64_t end = src_buf[src_offset + 16 * i + 8];
    *         if ((start & (1ull << 63)) && (end & (1ull << 63)))
    *            result += end - start;
    *         else
    *            available = false;
    *      }
    *   }
    *   uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
    *   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
    *      if (flags & VK_QUERY_RESULT_64_BIT)
    *         dst_buf[dst_offset] = result;
    *      else
    *         dst_buf[dst_offset] = (uint32_t)result;
    *   }
    *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
    *      dst_buf[dst_offset + elem_size] = available;
    *   }
    * }
    */
   nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
   b.shader->info.workgroup_size[0] = 64;

   nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
   nir_variable *outer_counter =
      nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
   nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
   nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
   unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
   unsigned db_count = device->physical_device->rad_info.max_render_backends;

   nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);

   nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
   nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);

   nir_ssa_def *global_id = get_global_ids(&b, 1);

   nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
   nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
   nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);

   nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
   nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, available, nir_imm_true(&b), 0x1);

   nir_push_loop(&b);

   nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
   radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));

   nir_ssa_def *enabled_cond =
      nir_iand_imm(&b, nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count), enabled_rb_mask);

   nir_push_if(&b, nir_i2b(&b, enabled_cond));

   nir_ssa_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
   load_offset = nir_iadd(&b, input_base, load_offset);

   nir_ssa_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);

   nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
   nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);

   nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
   nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));

   nir_push_if(&b, nir_iand(&b, start_done, end_done));

   nir_store_var(&b, result,
                 nir_iadd(&b, nir_load_var(&b, result),
                          nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
                 0x1);

   nir_push_else(&b, NULL);

   nir_store_var(&b, available, nir_imm_false(&b), 0x1);

   nir_pop_if(&b, NULL);
   nir_pop_if(&b, NULL);
   nir_pop_loop(&b, NULL);

   /* Store the result if complete or if partial results have been requested. */

   nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
   nir_ssa_def *result_size =
      nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
   nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
                           nir_load_var(&b, available)));

   nir_push_if(&b, result_is_64bit);

   nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);

   nir_push_else(&b, NULL);

   nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base,
                  .align_mul = 8);

   nir_pop_if(&b, NULL);
   nir_pop_if(&b, NULL);

   radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
                           nir_b2i32(&b, nir_load_var(&b, available)));

   return b.shader;
}

static nir_shader *
build_pipeline_statistics_query_shader(struct radv_device *device)
{
   /* the shader this builds is roughly
    *
    * push constants {
    *   uint32_t flags;
    *   uint32_t dst_stride;
    *   uint32_t stats_mask;
    *   uint32_t avail_offset;
    * };
    *
    * uint32_t src_stride = pipelinestat_block_size * 2;
    *
    * location(binding = 0) buffer dst_buf;
    * location(binding = 1) buffer src_buf;
    *
    * void main() {
    *   uint64_t src_offset = src_stride * global_id.x;
    *   uint64_t dst_base = dst_stride * global_id.x;
    *   uint64_t dst_offset = dst_base;
    *   uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
    *   uint32_t elem_count = stats_mask >> 16;
    *   uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
    *   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
    *      dst_buf[dst_offset + elem_count * elem_size] = available32;
    *   }
    *   if ((bool)available32) {
    *      // repeat 11 times:
    *      if (stats_mask & (1 << 0)) {
    *         uint64_t start = src_buf[src_offset + 8 * indices[0]];
    *         uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
    *         uint64_t result = end - start;
    *         if (flags & VK_QUERY_RESULT_64_BIT)
    *            dst_buf[dst_offset] = result;
    *         else
    *            dst_buf[dst_offset] = (uint32_t)result;
    *         dst_offset += elem_size;
    *      }
    *   } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
    *      // Set everything to 0 as we don't know what is valid.
236 * for (int i = 0; i < elem_count; ++i) 237 * dst_buf[dst_base + elem_size * i] = 0; 238 * } 239 * } 240 */ 241 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query"); 242 b.shader->info.workgroup_size[0] = 64; 243 244 nir_variable *output_offset = 245 nir_local_variable_create(b.impl, glsl_int_type(), "output_offset"); 246 nir_variable *result = 247 nir_local_variable_create(b.impl, glsl_int64_t_type(), "result"); 248 249 nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4); 250 nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12); 251 nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16); 252 nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20); 253 254 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 255 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 256 257 nir_ssa_def *global_id = get_global_ids(&b, 1); 258 259 nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride"); 260 nir_push_if(&b, nir_ine(&b, uses_gds, nir_imm_int(&b, 0))); 261 { 262 nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1); 263 } 264 nir_push_else(&b, NULL); 265 { 266 nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1); 267 } 268 nir_pop_if(&b, NULL); 269 270 nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id); 271 nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8); 272 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 273 274 avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4)); 275 276 nir_ssa_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset); 277 278 nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT); 279 nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 280 nir_ssa_def *elem_count = nir_ushr_imm(&b, stats_mask, 16); 281 282 radv_store_availability(&b, flags, dst_buf, 283 nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)), 284 available32); 285 286 nir_push_if(&b, nir_i2b(&b, available32)); 287 288 nir_store_var(&b, output_offset, output_base, 0x1); 289 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 290 nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i))); 291 292 nir_ssa_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8); 293 nir_ssa_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset); 294 295 nir_ssa_def *end_offset = 296 nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size); 297 nir_ssa_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset); 298 299 nir_store_var(&b, result, nir_isub(&b, end, start), 0x1); 300 301 nir_push_if(&b, nir_iand(&b, nir_i2b(&b, uses_gds), 302 nir_ieq(&b, nir_imm_int(&b, 1u << i), 303 nir_imm_int(&b, VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)))); 304 { 305 /* Compute the GDS result if needed. 
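       * (For reference: when uses_gds is set, the NGG GS primitive count is stored as a
       *  begin/end pair right after the two pipeline-statistics blocks, i.e. at offsets
       *  pipelinestat_block_size * 2 and pipelinestat_block_size * 2 + 8, which is what
       *  the two loads below read back before adding the difference to the result.)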
*/ 306 nir_ssa_def *gds_start_offset = 307 nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2)); 308 nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset); 309 310 nir_ssa_def *gds_end_offset = 311 nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8)); 312 nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset); 313 314 nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start); 315 316 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1); 317 } 318 nir_pop_if(&b, NULL); 319 320 /* Store result */ 321 nir_push_if(&b, result_is_64bit); 322 323 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset)); 324 325 nir_push_else(&b, NULL); 326 327 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset)); 328 329 nir_pop_if(&b, NULL); 330 331 nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 332 0x1); 333 334 nir_pop_if(&b, NULL); 335 } 336 337 nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */ 338 339 nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT)); 340 341 /* Stores zeros in all outputs. */ 342 343 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter"); 344 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1); 345 346 nir_loop *loop = nir_push_loop(&b); 347 348 nir_ssa_def *current_counter = nir_load_var(&b, counter); 349 radv_break_on_count(&b, counter, elem_count); 350 351 nir_ssa_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter)); 352 nir_push_if(&b, result_is_64bit); 353 354 nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem); 355 356 nir_push_else(&b, NULL); 357 358 nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem); 359 360 nir_pop_if(&b, NULL); 361 362 nir_pop_loop(&b, loop); 363 nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */ 364 nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */ 365 return b.shader; 366} 367 368static nir_shader * 369build_tfb_query_shader(struct radv_device *device) 370{ 371 /* the shader this builds is roughly 372 * 373 * uint32_t src_stride = 32; 374 * 375 * location(binding = 0) buffer dst_buf; 376 * location(binding = 1) buffer src_buf; 377 * 378 * void main() { 379 * uint64_t result[2] = {}; 380 * bool available = false; 381 * uint64_t src_offset = src_stride * global_id.x; 382 * uint64_t dst_offset = dst_stride * global_id.x; 383 * uint64_t *src_data = src_buf[src_offset]; 384 * uint32_t avail = (src_data[0] >> 32) & 385 * (src_data[1] >> 32) & 386 * (src_data[2] >> 32) & 387 * (src_data[3] >> 32); 388 * if (avail & 0x80000000) { 389 * result[0] = src_data[3] - src_data[1]; 390 * result[1] = src_data[2] - src_data[0]; 391 * available = true; 392 * } 393 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8; 394 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 395 * if (flags & VK_QUERY_RESULT_64_BIT) { 396 * dst_buf[dst_offset] = result; 397 * } else { 398 * dst_buf[dst_offset] = (uint32_t)result; 399 * } 400 * } 401 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 402 * dst_buf[dst_offset + result_size] = available; 403 * } 404 * } 405 */ 406 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query"); 407 b.shader->info.workgroup_size[0] = 64; 408 409 /* Create and initialize local variables. 
*/ 410 nir_variable *result = 411 nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result"); 412 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 413 414 nir_store_var(&b, result, nir_vec2(&b, nir_imm_int64(&b, 0), nir_imm_int64(&b, 0)), 0x3); 415 nir_store_var(&b, available, nir_imm_false(&b), 0x1); 416 417 nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4); 418 419 /* Load resources. */ 420 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 421 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 422 423 /* Compute global ID. */ 424 nir_ssa_def *global_id = get_global_ids(&b, 1); 425 426 /* Compute src/dst strides. */ 427 nir_ssa_def *input_stride = nir_imm_int(&b, 32); 428 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 429 nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8); 430 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 431 432 /* Load data from the query pool. */ 433 nir_ssa_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32); 434 nir_ssa_def *load2 = 435 nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16); 436 437 /* Check if result is available. */ 438 nir_ssa_def *avails[2]; 439 avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3)); 440 avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3)); 441 nir_ssa_def *result_is_available = 442 nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000); 443 444 /* Only compute result if available. */ 445 nir_push_if(&b, result_is_available); 446 447 /* Pack values. */ 448 nir_ssa_def *packed64[4]; 449 packed64[0] = 450 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1))); 451 packed64[1] = 452 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3))); 453 packed64[2] = 454 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1))); 455 packed64[3] = 456 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3))); 457 458 /* Compute result. */ 459 nir_ssa_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]); 460 nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]); 461 462 nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3); 463 nir_store_var(&b, available, nir_imm_true(&b), 0x1); 464 465 nir_pop_if(&b, NULL); 466 467 /* Determine if result is 64 or 32 bit. */ 468 nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT); 469 nir_ssa_def *result_size = 470 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8)); 471 472 /* Store the result if complete or partial results have been requested. */ 473 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 474 nir_load_var(&b, available))); 475 476 /* Store result. 
*/ 477 nir_push_if(&b, result_is_64bit); 478 479 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base); 480 481 nir_push_else(&b, NULL); 482 483 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base); 484 485 nir_pop_if(&b, NULL); 486 nir_pop_if(&b, NULL); 487 488 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 489 nir_b2i32(&b, nir_load_var(&b, available))); 490 491 return b.shader; 492} 493 494static nir_shader * 495build_timestamp_query_shader(struct radv_device *device) 496{ 497 /* the shader this builds is roughly 498 * 499 * uint32_t src_stride = 8; 500 * 501 * location(binding = 0) buffer dst_buf; 502 * location(binding = 1) buffer src_buf; 503 * 504 * void main() { 505 * uint64_t result = 0; 506 * bool available = false; 507 * uint64_t src_offset = src_stride * global_id.x; 508 * uint64_t dst_offset = dst_stride * global_id.x; 509 * uint64_t timestamp = src_buf[src_offset]; 510 * if (timestamp != TIMESTAMP_NOT_READY) { 511 * result = timestamp; 512 * available = true; 513 * } 514 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4; 515 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 516 * if (flags & VK_QUERY_RESULT_64_BIT) { 517 * dst_buf[dst_offset] = result; 518 * } else { 519 * dst_buf[dst_offset] = (uint32_t)result; 520 * } 521 * } 522 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 523 * dst_buf[dst_offset + result_size] = available; 524 * } 525 * } 526 */ 527 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query"); 528 b.shader->info.workgroup_size[0] = 64; 529 530 /* Create and initialize local variables. */ 531 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result"); 532 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 533 534 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1); 535 nir_store_var(&b, available, nir_imm_false(&b), 0x1); 536 537 nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4); 538 539 /* Load resources. */ 540 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 541 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 542 543 /* Compute global ID. */ 544 nir_ssa_def *global_id = get_global_ids(&b, 1); 545 546 /* Compute src/dst strides. */ 547 nir_ssa_def *input_stride = nir_imm_int(&b, 8); 548 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 549 nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8); 550 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 551 552 /* Load data from the query pool. */ 553 nir_ssa_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8); 554 555 /* Pack the timestamp. */ 556 nir_ssa_def *timestamp; 557 timestamp = 558 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load, 0), nir_channel(&b, load, 1))); 559 560 /* Check if result is available. */ 561 nir_ssa_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY)); 562 563 /* Only store result if available. */ 564 nir_push_if(&b, result_is_available); 565 566 nir_store_var(&b, result, timestamp, 0x1); 567 nir_store_var(&b, available, nir_imm_true(&b), 0x1); 568 569 nir_pop_if(&b, NULL); 570 571 /* Determine if result is 64 or 32 bit. 
*/ 572 nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT); 573 nir_ssa_def *result_size = 574 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4)); 575 576 /* Store the result if complete or partial results have been requested. */ 577 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 578 nir_load_var(&b, available))); 579 580 /* Store result. */ 581 nir_push_if(&b, result_is_64bit); 582 583 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base); 584 585 nir_push_else(&b, NULL); 586 587 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base); 588 589 nir_pop_if(&b, NULL); 590 591 nir_pop_if(&b, NULL); 592 593 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 594 nir_b2i32(&b, nir_load_var(&b, available))); 595 596 return b.shader; 597} 598 599static nir_shader * 600build_pg_query_shader(struct radv_device *device) 601{ 602 /* the shader this builds is roughly 603 * 604 * uint32_t src_stride = 32; 605 * 606 * location(binding = 0) buffer dst_buf; 607 * location(binding = 1) buffer src_buf; 608 * 609 * void main() { 610 * uint64_t result = {}; 611 * bool available = false; 612 * uint64_t src_offset = src_stride * global_id.x; 613 * uint64_t dst_offset = dst_stride * global_id.x; 614 * uint64_t *src_data = src_buf[src_offset]; 615 * uint32_t avail = (src_data[0] >> 32) & 616 * (src_data[2] >> 32); 617 * if (avail & 0x80000000) { 618 * result = src_data[2] - src_data[0]; 619 * if (use_gds) { 620 * uint64_t ngg_gds_result = 0; 621 * ngg_gds_result += src_data[5] - src_data[4]; 622 * ngg_gds_result += src_data[7] - src_data[6]; 623 * result += ngg_gds_result; 624 * } 625 * available = true; 626 * } 627 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8; 628 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { 629 * if (flags & VK_QUERY_RESULT_64_BIT) { 630 * dst_buf[dst_offset] = result; 631 * } else { 632 * dst_buf[dst_offset] = (uint32_t)result; 633 * } 634 * } 635 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 636 * dst_buf[dst_offset + result_size] = available; 637 * } 638 * } 639 */ 640 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query"); 641 b.shader->info.workgroup_size[0] = 64; 642 643 /* Create and initialize local variables. */ 644 nir_variable *result = 645 nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result"); 646 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available"); 647 648 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1); 649 nir_store_var(&b, available, nir_imm_false(&b), 0x1); 650 651 nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16); 652 653 /* Load resources. */ 654 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); 655 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); 656 657 /* Compute global ID. */ 658 nir_ssa_def *global_id = get_global_ids(&b, 1); 659 660 /* Compute src/dst strides. */ 661 nir_ssa_def *input_stride = nir_imm_int(&b, 32); 662 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); 663 nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16); 664 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); 665 666 /* Load data from the query pool. 
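    * (Note: the two 2x32 loads below correspond to src_data[0] and src_data[2] in the
    *  pseudocode above, i.e. the begin/end values of PrimitiveStorageNeeded at offsets 0
    *  and 16 of the 32-byte streamout-stats slot.)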
*/ 667 nir_ssa_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32); 668 nir_ssa_def *load2 = nir_load_ssbo( 669 &b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16); 670 671 /* Check if result is available. */ 672 nir_ssa_def *avails[2]; 673 avails[0] = nir_channel(&b, load1, 1); 674 avails[1] = nir_channel(&b, load2, 1); 675 nir_ssa_def *result_is_available = 676 nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000))); 677 678 /* Only compute result if available. */ 679 nir_push_if(&b, result_is_available); 680 681 /* Pack values. */ 682 nir_ssa_def *packed64[2]; 683 packed64[0] = 684 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1))); 685 packed64[1] = 686 nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1))); 687 688 /* Compute result. */ 689 nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]); 690 691 nir_store_var(&b, result, primitive_storage_needed, 0x1); 692 693 nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20); 694 nir_push_if(&b, nir_i2b(&b, uses_gds)); 695 { 696 /* NGG GS result */ 697 nir_ssa_def *gds_start = 698 nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 8); 699 nir_ssa_def *gds_end = 700 nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 8); 701 702 nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start); 703 704 /* NGG VS/TES result */ 705 gds_start = 706 nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 48)), .align_mul = 8); 707 gds_end = 708 nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 56)), .align_mul = 8); 709 710 ngg_gds_result = nir_iadd(&b, ngg_gds_result, nir_isub(&b, gds_end, gds_start)); 711 712 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1); 713 } 714 nir_pop_if(&b, NULL); 715 716 nir_store_var(&b, available, nir_imm_true(&b), 0x1); 717 718 nir_pop_if(&b, NULL); 719 720 /* Determine if result is 64 or 32 bit. */ 721 nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT); 722 nir_ssa_def *result_size = 723 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8)); 724 725 /* Store the result if complete or partial results have been requested. */ 726 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), 727 nir_load_var(&b, available))); 728 729 /* Store result. 
*/ 730 nir_push_if(&b, result_is_64bit); 731 732 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base); 733 734 nir_push_else(&b, NULL); 735 736 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base); 737 738 nir_pop_if(&b, NULL); 739 nir_pop_if(&b, NULL); 740 741 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base), 742 nir_b2i32(&b, nir_load_var(&b, available))); 743 744 return b.shader; 745} 746 747static VkResult 748radv_device_init_meta_query_state_internal(struct radv_device *device) 749{ 750 VkResult result; 751 nir_shader *occlusion_cs = NULL; 752 nir_shader *pipeline_statistics_cs = NULL; 753 nir_shader *tfb_cs = NULL; 754 nir_shader *timestamp_cs = NULL; 755 nir_shader *pg_cs = NULL; 756 757 mtx_lock(&device->meta_state.mtx); 758 if (device->meta_state.query.pipeline_statistics_query_pipeline) { 759 mtx_unlock(&device->meta_state.mtx); 760 return VK_SUCCESS; 761 } 762 occlusion_cs = build_occlusion_query_shader(device); 763 pipeline_statistics_cs = build_pipeline_statistics_query_shader(device); 764 tfb_cs = build_tfb_query_shader(device); 765 timestamp_cs = build_timestamp_query_shader(device); 766 pg_cs = build_pg_query_shader(device); 767 768 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = { 769 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, 770 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, 771 .bindingCount = 2, 772 .pBindings = (VkDescriptorSetLayoutBinding[]){ 773 {.binding = 0, 774 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 775 .descriptorCount = 1, 776 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 777 .pImmutableSamplers = NULL}, 778 {.binding = 1, 779 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 780 .descriptorCount = 1, 781 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, 782 .pImmutableSamplers = NULL}, 783 }}; 784 785 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info, 786 &device->meta_state.alloc, 787 &device->meta_state.query.ds_layout); 788 if (result != VK_SUCCESS) 789 goto fail; 790 791 VkPipelineLayoutCreateInfo occlusion_pl_create_info = { 792 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, 793 .setLayoutCount = 1, 794 .pSetLayouts = &device->meta_state.query.ds_layout, 795 .pushConstantRangeCount = 1, 796 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20}, 797 }; 798 799 result = 800 radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info, 801 &device->meta_state.alloc, &device->meta_state.query.p_layout); 802 if (result != VK_SUCCESS) 803 goto fail; 804 805 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = { 806 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 807 .stage = VK_SHADER_STAGE_COMPUTE_BIT, 808 .module = vk_shader_module_handle_from_nir(occlusion_cs), 809 .pName = "main", 810 .pSpecializationInfo = NULL, 811 }; 812 813 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = { 814 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 815 .stage = occlusion_pipeline_shader_stage, 816 .flags = 0, 817 .layout = device->meta_state.query.p_layout, 818 }; 819 820 result = radv_CreateComputePipelines( 821 radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, 822 &occlusion_vk_pipeline_info, NULL, &device->meta_state.query.occlusion_query_pipeline); 823 if (result != VK_SUCCESS) 824 goto fail; 825 826 VkPipelineShaderStageCreateInfo 
pipeline_statistics_pipeline_shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
      .pName = "main",
      .pSpecializationInfo = NULL,
   };

   VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = pipeline_statistics_pipeline_shader_stage,
      .flags = 0,
      .layout = device->meta_state.query.p_layout,
   };

   result = radv_CreateComputePipelines(
      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
      &pipeline_statistics_vk_pipeline_info, NULL,
      &device->meta_state.query.pipeline_statistics_query_pipeline);
   if (result != VK_SUCCESS)
      goto fail;

   VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = vk_shader_module_handle_from_nir(tfb_cs),
      .pName = "main",
      .pSpecializationInfo = NULL,
   };

   VkComputePipelineCreateInfo tfb_pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = tfb_pipeline_shader_stage,
      .flags = 0,
      .layout = device->meta_state.query.p_layout,
   };

   result = radv_CreateComputePipelines(
      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
      &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline);
   if (result != VK_SUCCESS)
      goto fail;

   VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = vk_shader_module_handle_from_nir(timestamp_cs),
      .pName = "main",
      .pSpecializationInfo = NULL,
   };

   VkComputePipelineCreateInfo timestamp_pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = timestamp_pipeline_shader_stage,
      .flags = 0,
      .layout = device->meta_state.query.p_layout,
   };

   result = radv_CreateComputePipelines(
      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
      &timestamp_pipeline_info, NULL, &device->meta_state.query.timestamp_query_pipeline);
   if (result != VK_SUCCESS)
      goto fail;

   VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = vk_shader_module_handle_from_nir(pg_cs),
      .pName = "main",
      .pSpecializationInfo = NULL,
   };

   VkComputePipelineCreateInfo pg_pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = pg_pipeline_shader_stage,
      .flags = 0,
      .layout = device->meta_state.query.p_layout,
   };

   result = radv_CreateComputePipelines(
      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
      &pg_pipeline_info, NULL, &device->meta_state.query.pg_query_pipeline);

fail:
   ralloc_free(occlusion_cs);
   ralloc_free(pipeline_statistics_cs);
   ralloc_free(tfb_cs);
   ralloc_free(pg_cs);
   ralloc_free(timestamp_cs);
   mtx_unlock(&device->meta_state.mtx);
   return result;
}

VkResult
radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
{
   if
(on_demand) 923 return VK_SUCCESS; 924 925 return radv_device_init_meta_query_state_internal(device); 926} 927 928void 929radv_device_finish_meta_query_state(struct radv_device *device) 930{ 931 if (device->meta_state.query.tfb_query_pipeline) 932 radv_DestroyPipeline(radv_device_to_handle(device), 933 device->meta_state.query.tfb_query_pipeline, &device->meta_state.alloc); 934 935 if (device->meta_state.query.pipeline_statistics_query_pipeline) 936 radv_DestroyPipeline(radv_device_to_handle(device), 937 device->meta_state.query.pipeline_statistics_query_pipeline, 938 &device->meta_state.alloc); 939 940 if (device->meta_state.query.occlusion_query_pipeline) 941 radv_DestroyPipeline(radv_device_to_handle(device), 942 device->meta_state.query.occlusion_query_pipeline, 943 &device->meta_state.alloc); 944 945 if (device->meta_state.query.timestamp_query_pipeline) 946 radv_DestroyPipeline(radv_device_to_handle(device), 947 device->meta_state.query.timestamp_query_pipeline, 948 &device->meta_state.alloc); 949 950 if (device->meta_state.query.pg_query_pipeline) 951 radv_DestroyPipeline(radv_device_to_handle(device), 952 device->meta_state.query.pg_query_pipeline, &device->meta_state.alloc); 953 954 if (device->meta_state.query.p_layout) 955 radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout, 956 &device->meta_state.alloc); 957 958 if (device->meta_state.query.ds_layout) 959 device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device), 960 device->meta_state.query.ds_layout, 961 &device->meta_state.alloc); 962} 963 964static void 965radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, 966 struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo, 967 uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride, 968 uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, 969 uint32_t pipeline_stats_mask, uint32_t avail_offset, bool uses_gds) 970{ 971 struct radv_device *device = cmd_buffer->device; 972 struct radv_meta_saved_state saved_state; 973 struct radv_buffer src_buffer, dst_buffer; 974 975 if (!*pipeline) { 976 VkResult ret = radv_device_init_meta_query_state_internal(device); 977 if (ret != VK_SUCCESS) { 978 cmd_buffer->record_result = ret; 979 return; 980 } 981 } 982 983 /* VK_EXT_conditional_rendering says that copy commands should not be 984 * affected by conditional rendering. 
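    * Predication is therefore suspended around this meta dispatch via the
    * RADV_META_SUSPEND_PREDICATING save flag below.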
985 */ 986 radv_meta_save(&saved_state, cmd_buffer, 987 RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | 988 RADV_META_SAVE_DESCRIPTORS | RADV_META_SUSPEND_PREDICATING); 989 990 uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset); 991 uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size; 992 993 radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset); 994 radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset); 995 996 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, 997 *pipeline); 998 999 radv_meta_push_descriptor_set( 1000 cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */ 1001 2, /* descriptorWriteCount */ 1002 (VkWriteDescriptorSet[]){ 1003 {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 1004 .dstBinding = 0, 1005 .dstArrayElement = 0, 1006 .descriptorCount = 1, 1007 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1008 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer), 1009 .offset = 0, 1010 .range = VK_WHOLE_SIZE}}, 1011 {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, 1012 .dstBinding = 1, 1013 .dstArrayElement = 0, 1014 .descriptorCount = 1, 1015 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1016 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer), 1017 .offset = 0, 1018 .range = VK_WHOLE_SIZE}}}); 1019 1020 /* Encode the number of elements for easy access by the shader. */ 1021 pipeline_stats_mask &= 0x7ff; 1022 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16; 1023 1024 avail_offset -= src_offset; 1025 1026 struct { 1027 uint32_t flags; 1028 uint32_t dst_stride; 1029 uint32_t pipeline_stats_mask; 1030 uint32_t avail_offset; 1031 uint32_t uses_gds; 1032 } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds}; 1033 1034 radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout, 1035 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); 1036 1037 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE; 1038 1039 if (flags & VK_QUERY_RESULT_WAIT_BIT) 1040 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER; 1041 1042 radv_unaligned_dispatch(cmd_buffer, count, 1, 1); 1043 1044 /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because 1045 * there is an implicit execution dependency from each such query command to all query commands 1046 * previously submitted to the same queue. 
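    * (These bits are consumed later in this file: radv_CmdResetQueryPool ORs
    *  active_query_flush_bits into state.flush_bits, and radv_CmdCopyQueryPoolResults
    *  does the same when the flush_before_query_copy workaround is enabled.)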
1047 */ 1048 cmd_buffer->active_query_flush_bits |= 1049 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE; 1050 1051 radv_buffer_finish(&src_buffer); 1052 radv_buffer_finish(&dst_buffer); 1053 1054 radv_meta_restore(&saved_state, cmd_buffer); 1055} 1056 1057static void 1058radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator, 1059 struct radv_query_pool *pool) 1060{ 1061 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) 1062 radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool); 1063 1064 if (pool->bo) 1065 device->ws->buffer_destroy(device->ws, pool->bo); 1066 vk_object_base_finish(&pool->base); 1067 vk_free2(&device->vk.alloc, pAllocator, pool); 1068} 1069 1070VKAPI_ATTR VkResult VKAPI_CALL 1071radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, 1072 const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool) 1073{ 1074 RADV_FROM_HANDLE(radv_device, device, _device); 1075 VkResult result; 1076 size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR 1077 ? sizeof(struct radv_pc_query_pool) 1078 : sizeof(struct radv_query_pool); 1079 1080 struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, 1081 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 1082 1083 if (!pool) 1084 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1085 1086 vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL); 1087 1088 pool->type = pCreateInfo->queryType; 1089 pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics; 1090 1091 /* The number of primitives generated by geometry shader invocations is only counted by the 1092 * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number 1093 * of generated primitives and we have to increment it from the shader using a plain GDS atomic. 1094 */ 1095 pool->uses_gds = device->physical_device->use_ngg && 1096 ((pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) || 1097 pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); 1098 1099 switch (pCreateInfo->queryType) { 1100 case VK_QUERY_TYPE_OCCLUSION: 1101 pool->stride = 16 * device->physical_device->rad_info.max_render_backends; 1102 break; 1103 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1104 pool->stride = pipelinestat_block_size * 2; 1105 if (pool->uses_gds) { 1106 /* When the query pool needs GDS (for counting the number of primitives generated by a 1107 * geometry shader with NGG), allocate 2x64-bit values for begin/end. 1108 */ 1109 pool->stride += 8 * 2; 1110 } 1111 break; 1112 case VK_QUERY_TYPE_TIMESTAMP: 1113 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 1114 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 1115 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: 1116 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: 1117 pool->stride = 8; 1118 break; 1119 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1120 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1121 pool->stride = 32; 1122 if (pool->uses_gds) { 1123 /* When the query pool needs GDS, allocate 4x64-bit values for begin/end of NGG GS and 1124 * NGG VS/TES because they use a different offset. 
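       * The resulting per-query layout, as consumed by build_pg_query_shader, is then:
       *   bytes  0-31: SAMPLE_STREAMOUTSTATS begin/end pairs
       *   bytes 32-47: NGG GS begin/end (copied from GDS)
       *   bytes 48-63: NGG VS/TES begin/end (copied from GDS)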
1125 */ 1126 pool->stride += 8 * 4; 1127 } 1128 break; 1129 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 1130 result = radv_pc_init_query_pool(device->physical_device, pCreateInfo, 1131 (struct radv_pc_query_pool *)pool); 1132 1133 if (result != VK_SUCCESS) { 1134 radv_destroy_query_pool(device, pAllocator, pool); 1135 return vk_error(device, result); 1136 } 1137 break; 1138 } 1139 default: 1140 unreachable("creating unhandled query type"); 1141 } 1142 1143 pool->availability_offset = pool->stride * pCreateInfo->queryCount; 1144 pool->size = pool->availability_offset; 1145 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS) 1146 pool->size += 4 * pCreateInfo->queryCount; 1147 1148 result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT, 1149 RADEON_FLAG_NO_INTERPROCESS_SHARING, 1150 RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo); 1151 if (result != VK_SUCCESS) { 1152 radv_destroy_query_pool(device, pAllocator, pool); 1153 return vk_error(device, result); 1154 } 1155 1156 pool->ptr = device->ws->buffer_map(pool->bo); 1157 if (!pool->ptr) { 1158 radv_destroy_query_pool(device, pAllocator, pool); 1159 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); 1160 } 1161 1162 *pQueryPool = radv_query_pool_to_handle(pool); 1163 return VK_SUCCESS; 1164} 1165 1166VKAPI_ATTR void VKAPI_CALL 1167radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator) 1168{ 1169 RADV_FROM_HANDLE(radv_device, device, _device); 1170 RADV_FROM_HANDLE(radv_query_pool, pool, _pool); 1171 1172 if (!pool) 1173 return; 1174 1175 radv_destroy_query_pool(device, pAllocator, pool); 1176} 1177 1178VKAPI_ATTR VkResult VKAPI_CALL 1179radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, 1180 uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride, 1181 VkQueryResultFlags flags) 1182{ 1183 RADV_FROM_HANDLE(radv_device, device, _device); 1184 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1185 char *data = pData; 1186 VkResult result = VK_SUCCESS; 1187 1188 if (vk_device_is_lost(&device->vk)) 1189 return VK_ERROR_DEVICE_LOST; 1190 1191 for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) { 1192 char *dest = data; 1193 unsigned query = firstQuery + query_idx; 1194 char *src = pool->ptr + query * pool->stride; 1195 uint32_t available; 1196 1197 switch (pool->type) { 1198 case VK_QUERY_TYPE_TIMESTAMP: 1199 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 1200 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 1201 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: 1202 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: { 1203 uint64_t const *src64 = (uint64_t const *)src; 1204 uint64_t value; 1205 1206 do { 1207 value = p_atomic_read(src64); 1208 } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT)); 1209 1210 available = value != TIMESTAMP_NOT_READY; 1211 1212 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1213 result = VK_NOT_READY; 1214 1215 if (flags & VK_QUERY_RESULT_64_BIT) { 1216 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1217 *(uint64_t *)dest = value; 1218 dest += 8; 1219 } else { 1220 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1221 *(uint32_t *)dest = (uint32_t)value; 1222 dest += 4; 1223 } 1224 break; 1225 } 1226 case VK_QUERY_TYPE_OCCLUSION: { 1227 uint64_t const *src64 = (uint64_t const *)src; 1228 uint32_t db_count = 
device->physical_device->rad_info.max_render_backends; 1229 uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask; 1230 uint64_t sample_count = 0; 1231 available = 1; 1232 1233 for (int i = 0; i < db_count; ++i) { 1234 uint64_t start, end; 1235 1236 if (!(enabled_rb_mask & (1 << i))) 1237 continue; 1238 1239 do { 1240 start = p_atomic_read(src64 + 2 * i); 1241 end = p_atomic_read(src64 + 2 * i + 1); 1242 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && 1243 (flags & VK_QUERY_RESULT_WAIT_BIT)); 1244 1245 if (!(start & (1ull << 63)) || !(end & (1ull << 63))) 1246 available = 0; 1247 else { 1248 sample_count += end - start; 1249 } 1250 } 1251 1252 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1253 result = VK_NOT_READY; 1254 1255 if (flags & VK_QUERY_RESULT_64_BIT) { 1256 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1257 *(uint64_t *)dest = sample_count; 1258 dest += 8; 1259 } else { 1260 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1261 *(uint32_t *)dest = sample_count; 1262 dest += 4; 1263 } 1264 break; 1265 } 1266 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 1267 const uint32_t *avail_ptr = 1268 (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query); 1269 uint64_t ngg_gds_result = 0; 1270 1271 do { 1272 available = p_atomic_read(avail_ptr); 1273 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)); 1274 1275 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1276 result = VK_NOT_READY; 1277 1278 if (pool->uses_gds) { 1279 /* Compute the result that was copied from GDS. */ 1280 const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2); 1281 const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8); 1282 1283 ngg_gds_result = gds_stop[0] - gds_start[0]; 1284 } 1285 1286 const uint64_t *start = (uint64_t *)src; 1287 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size); 1288 if (flags & VK_QUERY_RESULT_64_BIT) { 1289 uint64_t *dst = (uint64_t *)dest; 1290 dest += util_bitcount(pool->pipeline_stats_mask) * 8; 1291 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 1292 if (pool->pipeline_stats_mask & (1u << i)) { 1293 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { 1294 *dst = stop[pipeline_statistics_indices[i]] - 1295 start[pipeline_statistics_indices[i]]; 1296 1297 if (pool->uses_gds && 1298 (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) { 1299 *dst += ngg_gds_result; 1300 } 1301 } 1302 dst++; 1303 } 1304 } 1305 1306 } else { 1307 uint32_t *dst = (uint32_t *)dest; 1308 dest += util_bitcount(pool->pipeline_stats_mask) * 4; 1309 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) { 1310 if (pool->pipeline_stats_mask & (1u << i)) { 1311 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { 1312 *dst = stop[pipeline_statistics_indices[i]] - 1313 start[pipeline_statistics_indices[i]]; 1314 1315 if (pool->uses_gds && 1316 (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) { 1317 *dst += ngg_gds_result; 1318 } 1319 } 1320 dst++; 1321 } 1322 } 1323 } 1324 break; 1325 } 1326 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { 1327 uint64_t const *src64 = (uint64_t const *)src; 1328 uint64_t num_primitives_written; 1329 uint64_t primitive_storage_needed; 1330 1331 /* SAMPLE_STREAMOUTSTATS stores this structure: 1332 * { 1333 * u64 NumPrimitivesWritten; 1334 * u64 PrimitiveStorageNeeded; 1335 * } 1336 */ 1337 available = 1; 1338 for (int j = 0; j < 4; j++) { 1339 
if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL)) 1340 available = 0; 1341 } 1342 1343 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1344 result = VK_NOT_READY; 1345 1346 num_primitives_written = src64[3] - src64[1]; 1347 primitive_storage_needed = src64[2] - src64[0]; 1348 1349 if (flags & VK_QUERY_RESULT_64_BIT) { 1350 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1351 *(uint64_t *)dest = num_primitives_written; 1352 dest += 8; 1353 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1354 *(uint64_t *)dest = primitive_storage_needed; 1355 dest += 8; 1356 } else { 1357 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1358 *(uint32_t *)dest = num_primitives_written; 1359 dest += 4; 1360 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1361 *(uint32_t *)dest = primitive_storage_needed; 1362 dest += 4; 1363 } 1364 break; 1365 } 1366 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { 1367 uint64_t const *src64 = (uint64_t const *)src; 1368 uint64_t primitive_storage_needed; 1369 1370 /* SAMPLE_STREAMOUTSTATS stores this structure: 1371 * { 1372 * u64 NumPrimitivesWritten; 1373 * u64 PrimitiveStorageNeeded; 1374 * } 1375 */ 1376 available = 1; 1377 if (!(p_atomic_read(src64 + 0) & 0x8000000000000000UL) || 1378 !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) { 1379 available = 0; 1380 } 1381 1382 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1383 result = VK_NOT_READY; 1384 1385 primitive_storage_needed = src64[2] - src64[0]; 1386 1387 if (pool->uses_gds) { 1388 /* Accumulate the result that was copied from GDS in case NGG GS or NGG VS/TES have been 1389 * used. 1390 */ 1391 primitive_storage_needed += src64[5] - src64[4]; /* NGG GS */ 1392 primitive_storage_needed += src64[7] - src64[6]; /* NGG VS/TES */ 1393 } 1394 1395 if (flags & VK_QUERY_RESULT_64_BIT) { 1396 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1397 *(uint64_t *)dest = primitive_storage_needed; 1398 dest += 8; 1399 } else { 1400 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) 1401 *(uint32_t *)dest = primitive_storage_needed; 1402 dest += 4; 1403 } 1404 break; 1405 } 1406 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 1407 struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool; 1408 const uint64_t *src64 = (const uint64_t *)src; 1409 bool avail; 1410 do { 1411 avail = true; 1412 for (unsigned i = 0; i < pc_pool->num_passes; ++i) 1413 if (!p_atomic_read(src64 + pool->stride / 8 - i - 1)) 1414 avail = false; 1415 } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT)); 1416 1417 available = avail; 1418 1419 radv_pc_get_results(pc_pool, src64, dest); 1420 dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR); 1421 break; 1422 } 1423 default: 1424 unreachable("trying to get results of unhandled query type"); 1425 } 1426 1427 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 1428 if (flags & VK_QUERY_RESULT_64_BIT) { 1429 *(uint64_t *)dest = available; 1430 } else { 1431 *(uint32_t *)dest = available; 1432 } 1433 } 1434 } 1435 1436 return result; 1437} 1438 1439static void 1440emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool) 1441{ 1442 if (cmd_buffer->pending_reset_query) { 1443 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) { 1444 /* Only need to flush caches if the query pool size is 1445 * large enough to be resetted using the compute shader 1446 * path. Small pools don't need any cache flushes 1447 * because we use a CP dma clear. 
1448 */ 1449 si_emit_cache_flush(cmd_buffer); 1450 } 1451 } 1452} 1453 1454static size_t 1455radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags) 1456{ 1457 unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0; 1458 switch (pool->type) { 1459 case VK_QUERY_TYPE_TIMESTAMP: 1460 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 1461 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 1462 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: 1463 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: 1464 case VK_QUERY_TYPE_OCCLUSION: 1465 values += 1; 1466 break; 1467 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1468 values += util_bitcount(pool->pipeline_stats_mask); 1469 break; 1470 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1471 values += 2; 1472 break; 1473 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: 1474 values += 1; 1475 break; 1476 default: 1477 unreachable("trying to get size of unhandled query type"); 1478 } 1479 return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4); 1480} 1481 1482VKAPI_ATTR void VKAPI_CALL 1483radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, 1484 uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer, 1485 VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags) 1486{ 1487 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 1488 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); 1489 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); 1490 struct radeon_cmdbuf *cs = cmd_buffer->cs; 1491 uint64_t va = radv_buffer_get_va(pool->bo); 1492 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo); 1493 size_t dst_size = radv_query_result_size(pool, flags); 1494 dest_va += dst_buffer->offset + dstOffset; 1495 1496 if (!queryCount) 1497 return; 1498 1499 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo); 1500 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo); 1501 1502 /* Workaround engines that forget to properly specify WAIT_BIT because some driver implicitly 1503 * synchronizes before query copy. 1504 */ 1505 if (cmd_buffer->device->instance->flush_before_query_copy) 1506 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; 1507 1508 /* From the Vulkan spec 1.1.108: 1509 * 1510 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of 1511 * previous uses of vkCmdResetQueryPool in the same queue, without any 1512 * additional synchronization." 1513 * 1514 * So, we have to flush the caches if the compute shader path was used. 
1515 */ 1516 emit_query_flush(cmd_buffer, pool); 1517 1518 switch (pool->type) { 1519 case VK_QUERY_TYPE_OCCLUSION: 1520 if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1521 unsigned enabled_rb_mask = cmd_buffer->device->physical_device->rad_info.enabled_rb_mask; 1522 uint32_t rb_avail_offset = 16 * util_last_bit(enabled_rb_mask) - 4; 1523 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1524 unsigned query = firstQuery + i; 1525 uint64_t src_va = va + query * pool->stride + rb_avail_offset; 1526 1527 radeon_check_space(cmd_buffer->device->ws, cs, 7); 1528 1529 /* Waits on the upper word of the last DB entry */ 1530 radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff); 1531 } 1532 } 1533 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline, 1534 pool->bo, dst_buffer->bo, firstQuery * pool->stride, 1535 dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, 1536 flags, 0, 0, false); 1537 break; 1538 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 1539 if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1540 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1541 unsigned query = firstQuery + i; 1542 1543 radeon_check_space(cmd_buffer->device->ws, cs, 7); 1544 1545 uint64_t avail_va = va + pool->availability_offset + 4 * query; 1546 1547 /* This waits on the ME. All copies below are done on the ME */ 1548 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff); 1549 } 1550 } 1551 radv_query_shader( 1552 cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, 1553 pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, 1554 pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask, 1555 pool->availability_offset + 4 * firstQuery, pool->uses_gds); 1556 break; 1557 case VK_QUERY_TYPE_TIMESTAMP: 1558 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: 1559 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: 1560 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: 1561 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: 1562 if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1563 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { 1564 unsigned query = firstQuery + i; 1565 uint64_t local_src_va = va + query * pool->stride; 1566 1567 radeon_check_space(cmd_buffer->device->ws, cs, 7); 1568 1569 /* Wait on the high 32 bits of the timestamp in 1570 * case the low part is 0xffffffff. 1571 */ 1572 radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, 1573 TIMESTAMP_NOT_READY >> 32, 0xffffffff); 1574 } 1575 } 1576 1577 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline, 1578 pool->bo, dst_buffer->bo, firstQuery * pool->stride, 1579 dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount, 1580 flags, 0, 0, false); 1581 break; 1582 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: 1583 if (flags & VK_QUERY_RESULT_WAIT_BIT) { 1584 for (unsigned i = 0; i < queryCount; i++) { 1585 unsigned query = firstQuery + i; 1586 uint64_t src_va = va + query * pool->stride; 1587 1588 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4); 1589 1590 /* Wait on the upper word of all results. 
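          * (Each of the four 64-bit STREAMOUTSTATS values gets bit 63 set once it has been
          *  written, so the WAIT_REG_MEM below compares the high dword of each value
          *  against 0x80000000.)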
            for (unsigned j = 0; j < 4; j++, src_va += 8) {
               radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000,
                                0xffffffff);
            }
         }
      }

      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
                        flags, 0, 0, false);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         for (unsigned i = 0; i < queryCount; i++) {
            unsigned query = firstQuery + i;
            uint64_t src_va = va + query * pool->stride;

            radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);

            /* Wait on the upper word of the PrimitiveStorageNeeded result. */
            radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
            radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
         }
      }

      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline,
                        pool->bo, dst_buffer->bo, firstQuery * pool->stride,
                        dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
                        flags, 0, 0, pool->uses_gds);
      break;
   default:
      unreachable("trying to get results of unhandled query type");
   }
}

static uint32_t
query_clear_value(VkQueryType type)
{
   switch (type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
      return (uint32_t)TIMESTAMP_NOT_READY;
   default:
      return 0;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
                       uint32_t queryCount)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
   uint32_t value = query_clear_value(pool->type);
   uint32_t flush_bits = 0;

   /* Make sure to sync all previous work if the given command buffer has
    * pending active queries. Otherwise the GPU might write query data
    * after the reset operation.
    */
   cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

   flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo,
                                  radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
                                  queryCount * pool->stride, value);

   if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
      flush_bits |=
         radv_fill_buffer(cmd_buffer, NULL, pool->bo,
                          radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4,
                          queryCount * 4, 0);
   }

   if (flush_bits) {
      /* Only need to flush caches for the compute shader path. */
      cmd_buffer->pending_reset_query = true;
      cmd_buffer->state.flush_bits |= flush_bits;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
                    uint32_t queryCount)
{
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);

   uint32_t value = query_clear_value(pool->type);
   uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
   uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);

   for (uint32_t *p = data; p != data_end; ++p)
      *p = value;

   if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
      memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
   }
}

static unsigned
event_type_for_stream(unsigned stream)
{
   switch (stream) {
   default:
   case 0:
      return V_028A90_SAMPLE_STREAMOUTSTATS;
   case 1:
      return V_028A90_SAMPLE_STREAMOUTSTATS1;
   case 2:
      return V_028A90_SAMPLE_STREAMOUTSTATS2;
   case 3:
      return V_028A90_SAMPLE_STREAMOUTSTATS3;
   }
}

static void
emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4);

   assert(index < MAX_SO_STREAMS);

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
   radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
}

/* Copy a 32-bit query value from GDS to memory (used for the GDS-emulated NGG query counters). */
static void
gfx10_copy_gds_query(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   /* Make sure GDS is idle before copying the value. */
   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
   si_emit_cache_flush(cmd_buffer);

   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
   radeon_emit(cs, gds_offset);
   radeon_emit(cs, 0);
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
}

static void
emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
                 VkQueryType query_type, VkQueryControlFlags flags, uint32_t index)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   switch (query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      radeon_check_space(cmd_buffer->device->ws, cs, 7);

      ++cmd_buffer->state.active_occlusion_queries;
      if (cmd_buffer->state.active_occlusion_queries == 1) {
         if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
            /* This is the first occlusion query, enable
             * the hint if the precision bit is set.
             */
            cmd_buffer->state.perfect_occlusion_queries_enabled = true;
         }

         radv_set_db_count_control(cmd_buffer, true);
      } else {
         if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
             !cmd_buffer->state.perfect_occlusion_queries_enabled) {
            /* This is not the first query, but this one
             * needs to enable precision, DB_COUNT_CONTROL
             * has to be updated accordingly.
             */
            cmd_buffer->state.perfect_occlusion_queries_enabled = true;

            radv_set_db_count_control(cmd_buffer, true);
         }
      }

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         uint64_t rb_mask =
            BITFIELD64_MASK(cmd_buffer->device->physical_device->rad_info.max_render_backends);

         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
         radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
                            PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
                            PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
         radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
      }

      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
      } else {
         radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      }

      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      radeon_check_space(cmd_buffer->device->ws, cs, 4);

      ++cmd_buffer->state.active_pipeline_queries;
      if (cmd_buffer->state.active_pipeline_queries == 1) {
         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
      }

      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      if (pool->uses_gds) {
         va += pipelinestat_block_size * 2;

         gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */

         /* Record that the command buffer needs GDS. */
         cmd_buffer->gds_needed = true;

         cmd_buffer->state.active_pipeline_gds_queries++;
      }
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_sample_streamout(cmd_buffer, va, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      if (!cmd_buffer->state.prims_gen_query_enabled) {
         bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);

         cmd_buffer->state.prims_gen_query_enabled = true;

         if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
            radv_emit_streamout_enable(cmd_buffer);
         }
      }

      emit_sample_streamout(cmd_buffer, va, index);

      if (pool->uses_gds) {
         gfx10_copy_gds_query(cmd_buffer, 0, va + 32); /* NGG GS */
         gfx10_copy_gds_query(cmd_buffer, 4, va + 48); /* NGG VS/TES */

         /* Record that the command buffer needs GDS. */
         cmd_buffer->gds_needed = true;

         cmd_buffer->state.active_pipeline_gds_queries++;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
      break;
   }
   default:
      unreachable("beginning unhandled query type");
   }
}

static void
emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
               uint64_t avail_va, VkQueryType query_type, uint32_t index)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   switch (query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      radeon_check_space(cmd_buffer->device->ws, cs, 14);

      cmd_buffer->state.active_occlusion_queries--;
      if (cmd_buffer->state.active_occlusion_queries == 0) {
         radv_set_db_count_control(cmd_buffer, false);

         /* Reset the perfect occlusion queries hint now that no
          * queries are active.
          */
         cmd_buffer->state.perfect_occlusion_queries_enabled = false;
      }

      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
      } else {
         radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      }
      radeon_emit(cs, va + 8);
      radeon_emit(cs, (va + 8) >> 32);

      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      radeon_check_space(cmd_buffer->device->ws, cs, 16);

      cmd_buffer->state.active_pipeline_queries--;
      if (cmd_buffer->state.active_pipeline_queries == 0) {
         cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
      }
      va += pipelinestat_block_size;

      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
                                 cmd_buffer->gfx9_eop_bug_va);

      if (pool->uses_gds) {
         va += pipelinestat_block_size + 8;

         gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */

         cmd_buffer->state.active_pipeline_gds_queries--;
      }
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_sample_streamout(cmd_buffer, va + 16, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      if (cmd_buffer->state.prims_gen_query_enabled) {
         bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);

         cmd_buffer->state.prims_gen_query_enabled = false;

         if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
            radv_emit_streamout_enable(cmd_buffer);
         }
      }

      emit_sample_streamout(cmd_buffer, va + 16, index);

      if (pool->uses_gds) {
         gfx10_copy_gds_query(cmd_buffer, 0, va + 40); /* NGG GS */
         gfx10_copy_gds_query(cmd_buffer, 4, va + 56); /* NGG VS/TES */

         cmd_buffer->state.active_pipeline_gds_queries--;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
      break;
   }
   default:
      unreachable("ending unhandled query type");
   }

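   /* Remember which caches a later vkCmdCopyQueryPoolResults() or vkCmdResetQueryPool()
    * recorded in this command buffer might have to flush before reading or clearing the
    * results written above.
    */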
   cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                          RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
                                          RADV_CMD_FLAG_INV_VCACHE;
   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
      cmd_buffer->active_query_flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
                             VkQueryControlFlags flags, uint32_t index)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(pool->bo);

   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);

   emit_query_flush(cmd_buffer, pool);

   va += pool->stride * query;

   emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
                   VkQueryControlFlags flags)
{
   radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
                           uint32_t index)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
   uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t avail_va = va + pool->availability_offset + 4 * query;
   va += pool->stride * query;

   /* Do not need to add the pool BO to the list because the query must
    * currently be active, which means the BO is already in the list.
    */
   emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);

   /*
    * For multiview we have to emit a query for each bit in the mask;
    * however, the first query we emit will get the totals for all the
    * operations, so we don't want to get a real value in the other
    * queries. This emits a fake begin/end sequence so the waiting
    * code gets a completed query value and doesn't hang, but the
    * query returns 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
         va += pool->stride;
         avail_va += 4;
         emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
         emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)
{
   radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
                        VkQueryPool queryPool, uint32_t query)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
   bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = va + pool->stride * query;

   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);

   emit_query_flush(cmd_buffer, pool);

   int num_queries = 1;
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
      num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);

   for (unsigned i = 0; i < num_queries; i++) {
      if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
                            COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | COPY_DATA_DST_SEL(V_370_MEM));
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);
         radeon_emit(cs, query_va);
         radeon_emit(cs, query_va >> 32);
      } else {
         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                    mec, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                                    EOP_DATA_SEL_TIMESTAMP, query_va, 0,
                                    cmd_buffer->gfx9_eop_bug_va);
      }
      query_va += pool->stride;
   }

   cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                          RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
                                          RADV_CMD_FLAG_INV_VCACHE;
   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
      cmd_buffer->active_query_flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteAccelerationStructuresPropertiesKHR(
   VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
   const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,
   VkQueryPool queryPool, uint32_t firstQuery)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t pool_va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = pool_va + pool->stride * firstQuery;

   radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);

   emit_query_flush(cmd_buffer, pool);

   ASSERTED unsigned cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);

   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
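      /* Copy the requested field of each acceleration structure header into the
       * corresponding query slot with the CP.
       */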
      RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]);
      uint64_t va = radv_accel_struct_get_va(accel_struct);

      switch (queryType) {
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, compacted_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, serialization_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
         va += offsetof(struct radv_accel_struct_header, instance_count);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, size);
         break;
      default:
         unreachable("Unhandled accel struct query type.");
      }

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, query_va);
      radeon_emit(cs, query_va >> 32);

      query_va += pool->stride;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}