1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2021 Igalia S.L. 3bf215546Sopenharmony_ci * SPDX-License-Identifier: MIT 4bf215546Sopenharmony_ci */ 5bf215546Sopenharmony_ci 6bf215546Sopenharmony_ci#include "tu_autotune.h" 7bf215546Sopenharmony_ci 8bf215546Sopenharmony_ci#include "tu_cmd_buffer.h" 9bf215546Sopenharmony_ci#include "tu_cs.h" 10bf215546Sopenharmony_ci#include "tu_device.h" 11bf215546Sopenharmony_ci#include "tu_image.h" 12bf215546Sopenharmony_ci#include "tu_pass.h" 13bf215546Sopenharmony_ci 14bf215546Sopenharmony_ci/* How does it work? 15bf215546Sopenharmony_ci * 16bf215546Sopenharmony_ci * - For each renderpass we calculate the number of samples passed 17bf215546Sopenharmony_ci * by storing the number before and after in GPU memory. 18bf215546Sopenharmony_ci * - To store the values each command buffer holds GPU memory which 19bf215546Sopenharmony_ci * expands with more renderpasses being written. 20bf215546Sopenharmony_ci * - For each renderpass we create tu_renderpass_result entry which 21bf215546Sopenharmony_ci * points to the results in GPU memory. 22bf215546Sopenharmony_ci * - Later on tu_renderpass_result would be added to the 23bf215546Sopenharmony_ci * tu_renderpass_history entry which aggregate results for a 24bf215546Sopenharmony_ci * given renderpass. 25bf215546Sopenharmony_ci * - On submission: 26bf215546Sopenharmony_ci * - Process results which fence was signalled. 27bf215546Sopenharmony_ci * - Free per-submission data which we now don't need. 28bf215546Sopenharmony_ci * 29bf215546Sopenharmony_ci * - Create a command stream to write a fence value. This way we would 30bf215546Sopenharmony_ci * know when we could safely read the results. 31bf215546Sopenharmony_ci * - We cannot rely on the command buffer's lifetime when referencing 32bf215546Sopenharmony_ci * its resources since the buffer could be destroyed before we process 33bf215546Sopenharmony_ci * the results. 34bf215546Sopenharmony_ci * - For each command buffer: 35bf215546Sopenharmony_ci * - Reference its GPU memory. 36bf215546Sopenharmony_ci * - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue. 37bf215546Sopenharmony_ci * 38bf215546Sopenharmony_ci * Since the command buffers could be recorded on different threads 39bf215546Sopenharmony_ci * we have to maintaining some amount of locking history table, 40bf215546Sopenharmony_ci * however we change the table only in a single thread at the submission 41bf215546Sopenharmony_ci * time, so in most cases there will be no locking. 42bf215546Sopenharmony_ci */ 43bf215546Sopenharmony_ci 44bf215546Sopenharmony_civoid 45bf215546Sopenharmony_citu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results); 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci#define TU_AUTOTUNE_DEBUG_LOG 0 48bf215546Sopenharmony_ci/* Dump history entries on autotuner finish, 49bf215546Sopenharmony_ci * could be used to gather data from traces. 50bf215546Sopenharmony_ci */ 51bf215546Sopenharmony_ci#define TU_AUTOTUNE_LOG_AT_FINISH 0 52bf215546Sopenharmony_ci 53bf215546Sopenharmony_ci/* How many last renderpass stats are taken into account. */ 54bf215546Sopenharmony_ci#define MAX_HISTORY_RESULTS 5 55bf215546Sopenharmony_ci/* For how many submissions we store renderpass stats. */ 56bf215546Sopenharmony_ci#define MAX_HISTORY_LIFETIME 128 57bf215546Sopenharmony_ci 58bf215546Sopenharmony_ci 59bf215546Sopenharmony_ci/** 60bf215546Sopenharmony_ci * Tracks results for a given renderpass key 61bf215546Sopenharmony_ci */ 62bf215546Sopenharmony_cistruct tu_renderpass_history { 63bf215546Sopenharmony_ci uint64_t key; 64bf215546Sopenharmony_ci 65bf215546Sopenharmony_ci /* We would delete old history entries */ 66bf215546Sopenharmony_ci uint32_t last_fence; 67bf215546Sopenharmony_ci 68bf215546Sopenharmony_ci /** 69bf215546Sopenharmony_ci * List of recent fd_renderpass_result's 70bf215546Sopenharmony_ci */ 71bf215546Sopenharmony_ci struct list_head results; 72bf215546Sopenharmony_ci uint32_t num_results; 73bf215546Sopenharmony_ci 74bf215546Sopenharmony_ci uint32_t avg_samples; 75bf215546Sopenharmony_ci}; 76bf215546Sopenharmony_ci 77bf215546Sopenharmony_ci/* Holds per-submission cs which writes the fence. */ 78bf215546Sopenharmony_cistruct tu_submission_data { 79bf215546Sopenharmony_ci struct list_head node; 80bf215546Sopenharmony_ci uint32_t fence; 81bf215546Sopenharmony_ci 82bf215546Sopenharmony_ci struct tu_cs fence_cs; 83bf215546Sopenharmony_ci uint32_t buffers_count; 84bf215546Sopenharmony_ci}; 85bf215546Sopenharmony_ci 86bf215546Sopenharmony_cistatic uint32_t 87bf215546Sopenharmony_ciget_autotune_fence(struct tu_autotune *at) 88bf215546Sopenharmony_ci{ 89bf215546Sopenharmony_ci const struct tu6_global *global = at->device->global_bo->map; 90bf215546Sopenharmony_ci return global->autotune_fence; 91bf215546Sopenharmony_ci} 92bf215546Sopenharmony_ci 93bf215546Sopenharmony_cistatic struct tu_submission_data * 94bf215546Sopenharmony_cicreate_submission_data(struct tu_device *dev, struct tu_autotune *at) 95bf215546Sopenharmony_ci{ 96bf215546Sopenharmony_ci struct tu_submission_data *submission_data = 97bf215546Sopenharmony_ci calloc(1, sizeof(struct tu_submission_data)); 98bf215546Sopenharmony_ci submission_data->fence = at->fence_counter; 99bf215546Sopenharmony_ci 100bf215546Sopenharmony_ci struct tu_cs* fence_cs = &submission_data->fence_cs; 101bf215546Sopenharmony_ci tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5); 102bf215546Sopenharmony_ci tu_cs_begin(fence_cs); 103bf215546Sopenharmony_ci 104bf215546Sopenharmony_ci tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4); 105bf215546Sopenharmony_ci tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); 106bf215546Sopenharmony_ci tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence)); 107bf215546Sopenharmony_ci tu_cs_emit(fence_cs, at->fence_counter); 108bf215546Sopenharmony_ci 109bf215546Sopenharmony_ci tu_cs_end(fence_cs); 110bf215546Sopenharmony_ci 111bf215546Sopenharmony_ci list_addtail(&submission_data->node, &at->pending_submission_data); 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci return submission_data; 114bf215546Sopenharmony_ci} 115bf215546Sopenharmony_ci 116bf215546Sopenharmony_cistatic void 117bf215546Sopenharmony_cifree_submission_data(struct tu_submission_data *data) 118bf215546Sopenharmony_ci{ 119bf215546Sopenharmony_ci list_del(&data->node); 120bf215546Sopenharmony_ci tu_cs_finish(&data->fence_cs); 121bf215546Sopenharmony_ci 122bf215546Sopenharmony_ci free(data); 123bf215546Sopenharmony_ci} 124bf215546Sopenharmony_ci 125bf215546Sopenharmony_ci#define APPEND_TO_HASH(state, field) \ 126bf215546Sopenharmony_ci XXH64_update(state, &field, sizeof(field)); 127bf215546Sopenharmony_ci 128bf215546Sopenharmony_cistatic uint64_t 129bf215546Sopenharmony_cihash_renderpass_instance(const struct tu_render_pass *pass, 130bf215546Sopenharmony_ci const struct tu_framebuffer *framebuffer, 131bf215546Sopenharmony_ci const struct tu_cmd_buffer *cmd) { 132bf215546Sopenharmony_ci XXH64_state_t hash_state; 133bf215546Sopenharmony_ci XXH64_reset(&hash_state, 0); 134bf215546Sopenharmony_ci 135bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, framebuffer->width); 136bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, framebuffer->height); 137bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, framebuffer->layers); 138bf215546Sopenharmony_ci 139bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->attachment_count); 140bf215546Sopenharmony_ci XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0])); 141bf215546Sopenharmony_ci 142bf215546Sopenharmony_ci for (unsigned i = 0; i < pass->attachment_count; i++) { 143bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width); 144bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height); 145bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format); 146bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers); 147bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels); 148bf215546Sopenharmony_ci } 149bf215546Sopenharmony_ci 150bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->subpass_count); 151bf215546Sopenharmony_ci for (unsigned i = 0; i < pass->subpass_count; i++) { 152bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples); 153bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count); 154bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count); 155bf215546Sopenharmony_ci APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count); 156bf215546Sopenharmony_ci } 157bf215546Sopenharmony_ci 158bf215546Sopenharmony_ci return XXH64_digest(&hash_state); 159bf215546Sopenharmony_ci} 160bf215546Sopenharmony_ci 161bf215546Sopenharmony_cistatic void 162bf215546Sopenharmony_cifree_result(struct tu_device *dev, struct tu_renderpass_result *result) 163bf215546Sopenharmony_ci{ 164bf215546Sopenharmony_ci tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo); 165bf215546Sopenharmony_ci list_del(&result->node); 166bf215546Sopenharmony_ci free(result); 167bf215546Sopenharmony_ci} 168bf215546Sopenharmony_ci 169bf215546Sopenharmony_cistatic void 170bf215546Sopenharmony_cifree_history(struct tu_device *dev, struct tu_renderpass_history *history) 171bf215546Sopenharmony_ci{ 172bf215546Sopenharmony_ci tu_autotune_free_results_locked(dev, &history->results); 173bf215546Sopenharmony_ci free(history); 174bf215546Sopenharmony_ci} 175bf215546Sopenharmony_ci 176bf215546Sopenharmony_cistatic bool 177bf215546Sopenharmony_ciget_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples) 178bf215546Sopenharmony_ci{ 179bf215546Sopenharmony_ci bool has_history = false; 180bf215546Sopenharmony_ci 181bf215546Sopenharmony_ci /* If the lock contantion would be found in the wild - 182bf215546Sopenharmony_ci * we could use try_lock here. 183bf215546Sopenharmony_ci */ 184bf215546Sopenharmony_ci u_rwlock_rdlock(&at->ht_lock); 185bf215546Sopenharmony_ci struct hash_entry *entry = 186bf215546Sopenharmony_ci _mesa_hash_table_search(at->ht, &rp_key); 187bf215546Sopenharmony_ci if (entry) { 188bf215546Sopenharmony_ci struct tu_renderpass_history *history = entry->data; 189bf215546Sopenharmony_ci if (history->num_results > 0) { 190bf215546Sopenharmony_ci *avg_samples = p_atomic_read(&history->avg_samples); 191bf215546Sopenharmony_ci has_history = true; 192bf215546Sopenharmony_ci } 193bf215546Sopenharmony_ci } 194bf215546Sopenharmony_ci u_rwlock_rdunlock(&at->ht_lock); 195bf215546Sopenharmony_ci 196bf215546Sopenharmony_ci return has_history; 197bf215546Sopenharmony_ci} 198bf215546Sopenharmony_ci 199bf215546Sopenharmony_cistatic struct tu_renderpass_result * 200bf215546Sopenharmony_cicreate_history_result(struct tu_autotune *at, uint64_t rp_key) 201bf215546Sopenharmony_ci{ 202bf215546Sopenharmony_ci struct tu_renderpass_result *result = calloc(1, sizeof(*result)); 203bf215546Sopenharmony_ci result->rp_key = rp_key; 204bf215546Sopenharmony_ci 205bf215546Sopenharmony_ci return result; 206bf215546Sopenharmony_ci} 207bf215546Sopenharmony_ci 208bf215546Sopenharmony_cistatic void 209bf215546Sopenharmony_cihistory_add_result(struct tu_device *dev, struct tu_renderpass_history *history, 210bf215546Sopenharmony_ci struct tu_renderpass_result *result) 211bf215546Sopenharmony_ci{ 212bf215546Sopenharmony_ci list_delinit(&result->node); 213bf215546Sopenharmony_ci list_add(&result->node, &history->results); 214bf215546Sopenharmony_ci 215bf215546Sopenharmony_ci if (history->num_results < MAX_HISTORY_RESULTS) { 216bf215546Sopenharmony_ci history->num_results++; 217bf215546Sopenharmony_ci } else { 218bf215546Sopenharmony_ci /* Once above the limit, start popping old results off the 219bf215546Sopenharmony_ci * tail of the list: 220bf215546Sopenharmony_ci */ 221bf215546Sopenharmony_ci struct tu_renderpass_result *old_result = 222bf215546Sopenharmony_ci list_last_entry(&history->results, struct tu_renderpass_result, node); 223bf215546Sopenharmony_ci mtx_lock(&dev->autotune_mutex); 224bf215546Sopenharmony_ci free_result(dev, old_result); 225bf215546Sopenharmony_ci mtx_unlock(&dev->autotune_mutex); 226bf215546Sopenharmony_ci } 227bf215546Sopenharmony_ci 228bf215546Sopenharmony_ci /* Do calculations here to avoid locking history in tu_autotune_use_bypass */ 229bf215546Sopenharmony_ci uint32_t total_samples = 0; 230bf215546Sopenharmony_ci list_for_each_entry(struct tu_renderpass_result, result, 231bf215546Sopenharmony_ci &history->results, node) { 232bf215546Sopenharmony_ci total_samples += result->samples_passed; 233bf215546Sopenharmony_ci } 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci float avg_samples = (float)total_samples / (float)history->num_results; 236bf215546Sopenharmony_ci p_atomic_set(&history->avg_samples, (uint32_t)avg_samples); 237bf215546Sopenharmony_ci} 238bf215546Sopenharmony_ci 239bf215546Sopenharmony_cistatic void 240bf215546Sopenharmony_ciprocess_results(struct tu_autotune *at, uint32_t current_fence) 241bf215546Sopenharmony_ci{ 242bf215546Sopenharmony_ci struct tu_device *dev = at->device; 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_renderpass_result, result, 245bf215546Sopenharmony_ci &at->pending_results, node) { 246bf215546Sopenharmony_ci if (result->fence > current_fence) 247bf215546Sopenharmony_ci break; 248bf215546Sopenharmony_ci 249bf215546Sopenharmony_ci struct tu_renderpass_history *history = result->history; 250bf215546Sopenharmony_ci result->samples_passed = 251bf215546Sopenharmony_ci result->samples->samples_end - result->samples->samples_start; 252bf215546Sopenharmony_ci 253bf215546Sopenharmony_ci history_add_result(dev, history, result); 254bf215546Sopenharmony_ci } 255bf215546Sopenharmony_ci 256bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_submission_data, submission_data, 257bf215546Sopenharmony_ci &at->pending_submission_data, node) { 258bf215546Sopenharmony_ci if (submission_data->fence > current_fence) 259bf215546Sopenharmony_ci break; 260bf215546Sopenharmony_ci 261bf215546Sopenharmony_ci free_submission_data(submission_data); 262bf215546Sopenharmony_ci } 263bf215546Sopenharmony_ci} 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_cistatic void 266bf215546Sopenharmony_ciqueue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf) 267bf215546Sopenharmony_ci{ 268bf215546Sopenharmony_ci bool one_time_submit = cmdbuf->usage_flags & 269bf215546Sopenharmony_ci VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 270bf215546Sopenharmony_ci 271bf215546Sopenharmony_ci if (one_time_submit) { 272bf215546Sopenharmony_ci /* We can just steal the list since it won't be resubmitted again */ 273bf215546Sopenharmony_ci list_splicetail(&cmdbuf->renderpass_autotune_results, 274bf215546Sopenharmony_ci &at->pending_results); 275bf215546Sopenharmony_ci list_inithead(&cmdbuf->renderpass_autotune_results); 276bf215546Sopenharmony_ci } else { 277bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_renderpass_result, result, 278bf215546Sopenharmony_ci &cmdbuf->renderpass_autotune_results, node) { 279bf215546Sopenharmony_ci /* TODO: copying each result isn't nice */ 280bf215546Sopenharmony_ci struct tu_renderpass_result *copy = malloc(sizeof(*result)); 281bf215546Sopenharmony_ci *copy = *result; 282bf215546Sopenharmony_ci tu_bo_get_ref(copy->bo.bo); 283bf215546Sopenharmony_ci list_addtail(©->node, &at->pending_results); 284bf215546Sopenharmony_ci } 285bf215546Sopenharmony_ci } 286bf215546Sopenharmony_ci} 287bf215546Sopenharmony_ci 288bf215546Sopenharmony_cistruct tu_cs * 289bf215546Sopenharmony_citu_autotune_on_submit(struct tu_device *dev, 290bf215546Sopenharmony_ci struct tu_autotune *at, 291bf215546Sopenharmony_ci struct tu_cmd_buffer **cmd_buffers, 292bf215546Sopenharmony_ci uint32_t cmd_buffer_count) 293bf215546Sopenharmony_ci{ 294bf215546Sopenharmony_ci /* We are single-threaded here */ 295bf215546Sopenharmony_ci 296bf215546Sopenharmony_ci const uint32_t gpu_fence = get_autotune_fence(at); 297bf215546Sopenharmony_ci 298bf215546Sopenharmony_ci process_results(at, gpu_fence); 299bf215546Sopenharmony_ci 300bf215546Sopenharmony_ci /* pre-increment so zero isn't valid fence */ 301bf215546Sopenharmony_ci uint32_t new_fence = ++at->fence_counter; 302bf215546Sopenharmony_ci uint32_t result_buffers = 0; 303bf215546Sopenharmony_ci 304bf215546Sopenharmony_ci /* Create history entries here to minimize work and locking being 305bf215546Sopenharmony_ci * done on renderpass end. 306bf215546Sopenharmony_ci */ 307bf215546Sopenharmony_ci for (uint32_t i = 0; i < cmd_buffer_count; i++) { 308bf215546Sopenharmony_ci struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; 309bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_renderpass_result, result, 310bf215546Sopenharmony_ci &cmdbuf->renderpass_autotune_results, node) { 311bf215546Sopenharmony_ci struct tu_renderpass_history *history; 312bf215546Sopenharmony_ci struct hash_entry *entry = 313bf215546Sopenharmony_ci _mesa_hash_table_search(at->ht, &result->rp_key); 314bf215546Sopenharmony_ci if (!entry) { 315bf215546Sopenharmony_ci history = calloc(1, sizeof(*history)); 316bf215546Sopenharmony_ci history->key = result->rp_key; 317bf215546Sopenharmony_ci list_inithead(&history->results); 318bf215546Sopenharmony_ci 319bf215546Sopenharmony_ci u_rwlock_wrlock(&at->ht_lock); 320bf215546Sopenharmony_ci _mesa_hash_table_insert(at->ht, &history->key, history); 321bf215546Sopenharmony_ci u_rwlock_wrunlock(&at->ht_lock); 322bf215546Sopenharmony_ci } else { 323bf215546Sopenharmony_ci history = (struct tu_renderpass_history *) entry->data; 324bf215546Sopenharmony_ci } 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_ci history->last_fence = new_fence; 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_ci result->fence = new_fence; 329bf215546Sopenharmony_ci result->history = history; 330bf215546Sopenharmony_ci } 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) { 333bf215546Sopenharmony_ci result_buffers++; 334bf215546Sopenharmony_ci } 335bf215546Sopenharmony_ci } 336bf215546Sopenharmony_ci 337bf215546Sopenharmony_ci struct tu_submission_data *submission_data = 338bf215546Sopenharmony_ci create_submission_data(dev, at); 339bf215546Sopenharmony_ci submission_data->buffers_count = result_buffers; 340bf215546Sopenharmony_ci 341bf215546Sopenharmony_ci for (uint32_t i = 0; i < cmd_buffer_count; i++) { 342bf215546Sopenharmony_ci struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; 343bf215546Sopenharmony_ci if (list_is_empty(&cmdbuf->renderpass_autotune_results)) 344bf215546Sopenharmony_ci continue; 345bf215546Sopenharmony_ci 346bf215546Sopenharmony_ci queue_pending_results(at, cmdbuf); 347bf215546Sopenharmony_ci } 348bf215546Sopenharmony_ci 349bf215546Sopenharmony_ci if (TU_AUTOTUNE_DEBUG_LOG) 350bf215546Sopenharmony_ci mesa_logi("Total history entries: %u", at->ht->entries); 351bf215546Sopenharmony_ci 352bf215546Sopenharmony_ci /* Cleanup old entries from history table. The assumption 353bf215546Sopenharmony_ci * here is that application doesn't hold many old unsubmitted 354bf215546Sopenharmony_ci * command buffers, otherwise this table may grow big. 355bf215546Sopenharmony_ci */ 356bf215546Sopenharmony_ci hash_table_foreach(at->ht, entry) { 357bf215546Sopenharmony_ci struct tu_renderpass_history *history = entry->data; 358bf215546Sopenharmony_ci if (history->last_fence == 0 || 359bf215546Sopenharmony_ci gpu_fence < history->last_fence || 360bf215546Sopenharmony_ci (gpu_fence - history->last_fence) <= MAX_HISTORY_LIFETIME) 361bf215546Sopenharmony_ci continue; 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci if (TU_AUTOTUNE_DEBUG_LOG) 364bf215546Sopenharmony_ci mesa_logi("Removed old history entry %016"PRIx64"", history->key); 365bf215546Sopenharmony_ci 366bf215546Sopenharmony_ci u_rwlock_wrlock(&at->ht_lock); 367bf215546Sopenharmony_ci _mesa_hash_table_remove_key(at->ht, &history->key); 368bf215546Sopenharmony_ci u_rwlock_wrunlock(&at->ht_lock); 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_ci mtx_lock(&dev->autotune_mutex); 371bf215546Sopenharmony_ci free_history(dev, history); 372bf215546Sopenharmony_ci mtx_unlock(&dev->autotune_mutex); 373bf215546Sopenharmony_ci } 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci return &submission_data->fence_cs; 376bf215546Sopenharmony_ci} 377bf215546Sopenharmony_ci 378bf215546Sopenharmony_cistatic bool 379bf215546Sopenharmony_cirenderpass_key_equals(const void *_a, const void *_b) 380bf215546Sopenharmony_ci{ 381bf215546Sopenharmony_ci return *(uint64_t *)_a == *(uint64_t *)_b; 382bf215546Sopenharmony_ci} 383bf215546Sopenharmony_ci 384bf215546Sopenharmony_cistatic uint32_t 385bf215546Sopenharmony_cirenderpass_key_hash(const void *_a) 386bf215546Sopenharmony_ci{ 387bf215546Sopenharmony_ci return *((uint64_t *) _a) & 0xffffffff; 388bf215546Sopenharmony_ci} 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ciVkResult 391bf215546Sopenharmony_citu_autotune_init(struct tu_autotune *at, struct tu_device *dev) 392bf215546Sopenharmony_ci{ 393bf215546Sopenharmony_ci at->enabled = true; 394bf215546Sopenharmony_ci at->device = dev; 395bf215546Sopenharmony_ci at->ht = _mesa_hash_table_create(NULL, 396bf215546Sopenharmony_ci renderpass_key_hash, 397bf215546Sopenharmony_ci renderpass_key_equals); 398bf215546Sopenharmony_ci u_rwlock_init(&at->ht_lock); 399bf215546Sopenharmony_ci 400bf215546Sopenharmony_ci list_inithead(&at->pending_results); 401bf215546Sopenharmony_ci list_inithead(&at->pending_submission_data); 402bf215546Sopenharmony_ci 403bf215546Sopenharmony_ci return VK_SUCCESS; 404bf215546Sopenharmony_ci} 405bf215546Sopenharmony_ci 406bf215546Sopenharmony_civoid 407bf215546Sopenharmony_citu_autotune_fini(struct tu_autotune *at, struct tu_device *dev) 408bf215546Sopenharmony_ci{ 409bf215546Sopenharmony_ci if (TU_AUTOTUNE_LOG_AT_FINISH) { 410bf215546Sopenharmony_ci while (!list_is_empty(&at->pending_results)) { 411bf215546Sopenharmony_ci const uint32_t gpu_fence = get_autotune_fence(at); 412bf215546Sopenharmony_ci process_results(at, gpu_fence); 413bf215546Sopenharmony_ci } 414bf215546Sopenharmony_ci 415bf215546Sopenharmony_ci hash_table_foreach(at->ht, entry) { 416bf215546Sopenharmony_ci struct tu_renderpass_history *history = entry->data; 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u", 419bf215546Sopenharmony_ci history->key, history->avg_samples, history->num_results); 420bf215546Sopenharmony_ci } 421bf215546Sopenharmony_ci } 422bf215546Sopenharmony_ci 423bf215546Sopenharmony_ci tu_autotune_free_results(dev, &at->pending_results); 424bf215546Sopenharmony_ci 425bf215546Sopenharmony_ci mtx_lock(&dev->autotune_mutex); 426bf215546Sopenharmony_ci hash_table_foreach(at->ht, entry) { 427bf215546Sopenharmony_ci struct tu_renderpass_history *history = entry->data; 428bf215546Sopenharmony_ci free_history(dev, history); 429bf215546Sopenharmony_ci } 430bf215546Sopenharmony_ci mtx_unlock(&dev->autotune_mutex); 431bf215546Sopenharmony_ci 432bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_submission_data, submission_data, 433bf215546Sopenharmony_ci &at->pending_submission_data, node) { 434bf215546Sopenharmony_ci free_submission_data(submission_data); 435bf215546Sopenharmony_ci } 436bf215546Sopenharmony_ci 437bf215546Sopenharmony_ci _mesa_hash_table_destroy(at->ht, NULL); 438bf215546Sopenharmony_ci u_rwlock_destroy(&at->ht_lock); 439bf215546Sopenharmony_ci} 440bf215546Sopenharmony_ci 441bf215546Sopenharmony_cibool 442bf215546Sopenharmony_citu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers, 443bf215546Sopenharmony_ci uint32_t cmd_buffer_count) 444bf215546Sopenharmony_ci{ 445bf215546Sopenharmony_ci for (uint32_t i = 0; i < cmd_buffer_count; i++) { 446bf215546Sopenharmony_ci struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; 447bf215546Sopenharmony_ci if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) 448bf215546Sopenharmony_ci return true; 449bf215546Sopenharmony_ci } 450bf215546Sopenharmony_ci 451bf215546Sopenharmony_ci return false; 452bf215546Sopenharmony_ci} 453bf215546Sopenharmony_ci 454bf215546Sopenharmony_civoid 455bf215546Sopenharmony_citu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results) 456bf215546Sopenharmony_ci{ 457bf215546Sopenharmony_ci list_for_each_entry_safe(struct tu_renderpass_result, result, 458bf215546Sopenharmony_ci results, node) { 459bf215546Sopenharmony_ci free_result(dev, result); 460bf215546Sopenharmony_ci } 461bf215546Sopenharmony_ci} 462bf215546Sopenharmony_ci 463bf215546Sopenharmony_civoid 464bf215546Sopenharmony_citu_autotune_free_results(struct tu_device *dev, struct list_head *results) 465bf215546Sopenharmony_ci{ 466bf215546Sopenharmony_ci mtx_lock(&dev->autotune_mutex); 467bf215546Sopenharmony_ci tu_autotune_free_results_locked(dev, results); 468bf215546Sopenharmony_ci mtx_unlock(&dev->autotune_mutex); 469bf215546Sopenharmony_ci} 470bf215546Sopenharmony_ci 471bf215546Sopenharmony_cistatic bool 472bf215546Sopenharmony_cifallback_use_bypass(const struct tu_render_pass *pass, 473bf215546Sopenharmony_ci const struct tu_framebuffer *framebuffer, 474bf215546Sopenharmony_ci const struct tu_cmd_buffer *cmd_buffer) 475bf215546Sopenharmony_ci{ 476bf215546Sopenharmony_ci if (cmd_buffer->state.rp.drawcall_count > 5) 477bf215546Sopenharmony_ci return false; 478bf215546Sopenharmony_ci 479bf215546Sopenharmony_ci for (unsigned i = 0; i < pass->subpass_count; i++) { 480bf215546Sopenharmony_ci if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT) 481bf215546Sopenharmony_ci return false; 482bf215546Sopenharmony_ci } 483bf215546Sopenharmony_ci 484bf215546Sopenharmony_ci return true; 485bf215546Sopenharmony_ci} 486bf215546Sopenharmony_ci 487bf215546Sopenharmony_cistatic uint32_t 488bf215546Sopenharmony_ciget_render_pass_pixel_count(const struct tu_cmd_buffer *cmd) 489bf215546Sopenharmony_ci{ 490bf215546Sopenharmony_ci const VkExtent2D *extent = &cmd->state.render_area.extent; 491bf215546Sopenharmony_ci return extent->width * extent->height; 492bf215546Sopenharmony_ci} 493bf215546Sopenharmony_ci 494bf215546Sopenharmony_cistatic uint64_t 495bf215546Sopenharmony_ciestimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd, 496bf215546Sopenharmony_ci uint32_t avg_renderpass_sample_count) 497bf215546Sopenharmony_ci{ 498bf215546Sopenharmony_ci const struct tu_cmd_state *state = &cmd->state; 499bf215546Sopenharmony_ci 500bf215546Sopenharmony_ci if (!state->rp.drawcall_count) 501bf215546Sopenharmony_ci return 0; 502bf215546Sopenharmony_ci 503bf215546Sopenharmony_ci /* sample count times drawcall_bandwidth_per_sample */ 504bf215546Sopenharmony_ci return (uint64_t)avg_renderpass_sample_count * 505bf215546Sopenharmony_ci state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count; 506bf215546Sopenharmony_ci} 507bf215546Sopenharmony_ci 508bf215546Sopenharmony_cibool 509bf215546Sopenharmony_citu_autotune_use_bypass(struct tu_autotune *at, 510bf215546Sopenharmony_ci struct tu_cmd_buffer *cmd_buffer, 511bf215546Sopenharmony_ci struct tu_renderpass_result **autotune_result) 512bf215546Sopenharmony_ci{ 513bf215546Sopenharmony_ci const struct tu_render_pass *pass = cmd_buffer->state.pass; 514bf215546Sopenharmony_ci const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_ci for (unsigned i = 0; i < pass->subpass_count; i++) { 517bf215546Sopenharmony_ci const struct tu_subpass *subpass = &pass->subpasses[i]; 518bf215546Sopenharmony_ci /* GMEM works much faster in this case */ 519bf215546Sopenharmony_ci if (subpass->raster_order_attachment_access) 520bf215546Sopenharmony_ci return false; 521bf215546Sopenharmony_ci 522bf215546Sopenharmony_ci /* Would be very slow in sysmem mode because we have to enable 523bf215546Sopenharmony_ci * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) 524bf215546Sopenharmony_ci */ 525bf215546Sopenharmony_ci if (subpass->feedback_loop_color || subpass->feedback_loop_ds) 526bf215546Sopenharmony_ci return false; 527bf215546Sopenharmony_ci } 528bf215546Sopenharmony_ci 529bf215546Sopenharmony_ci /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers 530bf215546Sopenharmony_ci * we would have to allocate GPU memory at the submit time and copy 531bf215546Sopenharmony_ci * results into it. 532bf215546Sopenharmony_ci * Native games ususally don't use it, Zink and DXVK don't use it, 533bf215546Sopenharmony_ci * D3D12 doesn't have such concept. 534bf215546Sopenharmony_ci */ 535bf215546Sopenharmony_ci bool simultaneous_use = 536bf215546Sopenharmony_ci cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; 537bf215546Sopenharmony_ci 538bf215546Sopenharmony_ci if (!at->enabled || simultaneous_use) 539bf215546Sopenharmony_ci return fallback_use_bypass(pass, framebuffer, cmd_buffer); 540bf215546Sopenharmony_ci 541bf215546Sopenharmony_ci /* We use 64bit hash as a key since we don't fear rare hash collision, 542bf215546Sopenharmony_ci * the worst that would happen is sysmem being selected when it should 543bf215546Sopenharmony_ci * have not, and with 64bit it would be extremely rare. 544bf215546Sopenharmony_ci * 545bf215546Sopenharmony_ci * Q: Why not make the key from framebuffer + renderpass pointers? 546bf215546Sopenharmony_ci * A: At least DXVK creates new framebuffers each frame while keeping 547bf215546Sopenharmony_ci * renderpasses the same. Also we want to support replaying a single 548bf215546Sopenharmony_ci * frame in a loop for testing. 549bf215546Sopenharmony_ci */ 550bf215546Sopenharmony_ci uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer); 551bf215546Sopenharmony_ci 552bf215546Sopenharmony_ci *autotune_result = create_history_result(at, renderpass_key); 553bf215546Sopenharmony_ci 554bf215546Sopenharmony_ci uint32_t avg_samples = 0; 555bf215546Sopenharmony_ci if (get_history(at, renderpass_key, &avg_samples)) { 556bf215546Sopenharmony_ci const uint32_t pass_pixel_count = 557bf215546Sopenharmony_ci get_render_pass_pixel_count(cmd_buffer); 558bf215546Sopenharmony_ci uint64_t sysmem_bandwidth = 559bf215546Sopenharmony_ci (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count; 560bf215546Sopenharmony_ci uint64_t gmem_bandwidth = 561bf215546Sopenharmony_ci (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count; 562bf215546Sopenharmony_ci 563bf215546Sopenharmony_ci const uint64_t total_draw_call_bandwidth = 564bf215546Sopenharmony_ci estimate_drawcall_bandwidth(cmd_buffer, avg_samples); 565bf215546Sopenharmony_ci 566bf215546Sopenharmony_ci /* drawcalls access the memory in sysmem rendering (ignoring CCU) */ 567bf215546Sopenharmony_ci sysmem_bandwidth += total_draw_call_bandwidth; 568bf215546Sopenharmony_ci 569bf215546Sopenharmony_ci /* drawcalls access gmem in gmem rendering, but we do not want to ignore 570bf215546Sopenharmony_ci * them completely. The state changes between tiles also have an 571bf215546Sopenharmony_ci * overhead. The magic numbers of 11 and 10 are randomly chosen. 572bf215546Sopenharmony_ci */ 573bf215546Sopenharmony_ci gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10; 574bf215546Sopenharmony_ci 575bf215546Sopenharmony_ci const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth; 576bf215546Sopenharmony_ci if (TU_AUTOTUNE_DEBUG_LOG) { 577bf215546Sopenharmony_ci const VkExtent2D *extent = &cmd_buffer->state.render_area.extent; 578bf215546Sopenharmony_ci const float drawcall_bandwidth_per_sample = 579bf215546Sopenharmony_ci (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum / 580bf215546Sopenharmony_ci cmd_buffer->state.rp.drawcall_count; 581bf215546Sopenharmony_ci 582bf215546Sopenharmony_ci mesa_logi("autotune %016" PRIx64 ":%u selecting %s", 583bf215546Sopenharmony_ci renderpass_key, 584bf215546Sopenharmony_ci cmd_buffer->state.rp.drawcall_count, 585bf215546Sopenharmony_ci select_sysmem ? "sysmem" : "gmem"); 586bf215546Sopenharmony_ci mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64, 587bf215546Sopenharmony_ci avg_samples, 588bf215546Sopenharmony_ci drawcall_bandwidth_per_sample, 589bf215546Sopenharmony_ci total_draw_call_bandwidth); 590bf215546Sopenharmony_ci mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u", 591bf215546Sopenharmony_ci extent->width, extent->height, 592bf215546Sopenharmony_ci pass->sysmem_bandwidth_per_pixel, 593bf215546Sopenharmony_ci pass->gmem_bandwidth_per_pixel); 594bf215546Sopenharmony_ci mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64, 595bf215546Sopenharmony_ci sysmem_bandwidth, gmem_bandwidth); 596bf215546Sopenharmony_ci } 597bf215546Sopenharmony_ci 598bf215546Sopenharmony_ci return select_sysmem; 599bf215546Sopenharmony_ci } 600bf215546Sopenharmony_ci 601bf215546Sopenharmony_ci return fallback_use_bypass(pass, framebuffer, cmd_buffer); 602bf215546Sopenharmony_ci} 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_civoid 605bf215546Sopenharmony_citu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd, 606bf215546Sopenharmony_ci struct tu_cs *cs, 607bf215546Sopenharmony_ci struct tu_renderpass_result *autotune_result) 608bf215546Sopenharmony_ci{ 609bf215546Sopenharmony_ci if (!autotune_result) 610bf215546Sopenharmony_ci return; 611bf215546Sopenharmony_ci 612bf215546Sopenharmony_ci struct tu_device *dev = cmd->device; 613bf215546Sopenharmony_ci 614bf215546Sopenharmony_ci static const uint32_t size = sizeof(struct tu_renderpass_samples); 615bf215546Sopenharmony_ci 616bf215546Sopenharmony_ci mtx_lock(&dev->autotune_mutex); 617bf215546Sopenharmony_ci VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size); 618bf215546Sopenharmony_ci mtx_unlock(&dev->autotune_mutex); 619bf215546Sopenharmony_ci if (ret != VK_SUCCESS) { 620bf215546Sopenharmony_ci autotune_result->bo.iova = 0; 621bf215546Sopenharmony_ci return; 622bf215546Sopenharmony_ci } 623bf215546Sopenharmony_ci 624bf215546Sopenharmony_ci uint64_t result_iova = autotune_result->bo.iova; 625bf215546Sopenharmony_ci 626bf215546Sopenharmony_ci autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo); 627bf215546Sopenharmony_ci 628bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); 629bf215546Sopenharmony_ci 630bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova)); 631bf215546Sopenharmony_ci 632bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); 633bf215546Sopenharmony_ci tu_cs_emit(cs, ZPASS_DONE); 634bf215546Sopenharmony_ci} 635bf215546Sopenharmony_ci 636bf215546Sopenharmony_civoid tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd, 637bf215546Sopenharmony_ci struct tu_cs *cs, 638bf215546Sopenharmony_ci struct tu_renderpass_result *autotune_result) 639bf215546Sopenharmony_ci{ 640bf215546Sopenharmony_ci if (!autotune_result) 641bf215546Sopenharmony_ci return; 642bf215546Sopenharmony_ci 643bf215546Sopenharmony_ci if (!autotune_result->bo.iova) 644bf215546Sopenharmony_ci return; 645bf215546Sopenharmony_ci 646bf215546Sopenharmony_ci uint64_t result_iova = autotune_result->bo.iova + 647bf215546Sopenharmony_ci offsetof(struct tu_renderpass_samples, samples_end); 648bf215546Sopenharmony_ci 649bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); 650bf215546Sopenharmony_ci 651bf215546Sopenharmony_ci tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova)); 652bf215546Sopenharmony_ci 653bf215546Sopenharmony_ci tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); 654bf215546Sopenharmony_ci tu_cs_emit(cs, ZPASS_DONE); 655bf215546Sopenharmony_ci} 656