/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample count before and after the pass in GPU memory.
 * - To store the values, each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 *   - Later on, the tu_renderpass_result is added to the
 *     tu_renderpass_history entry which aggregates results for a
 *     given renderpass.
 * - On submission:
 *   - Process results whose fence was signalled.
 *   - Free per-submission data which we no longer need.
 *
 *   - Create a command stream which writes a fence value. This way we
 *     know when it is safe to read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result entries
 *       to the queue.
 *
 * Since command buffers could be recorded on different threads,
 * we have to maintain some amount of locking around the history table.
 * However, the table is only modified from a single thread at submission
 * time, so in most cases there will be no lock contention.
 */
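
/* In rough pseudocode, the decision made in tu_autotune_use_bypass() below
 * boils down to:
 *
 *   pixels         = render_area.width * render_area.height
 *   draw_bandwidth = avg_samples * drawcall_bandwidth_per_sample_sum /
 *                    drawcall_count
 *   sysmem_cost    = sysmem_bandwidth_per_pixel * pixels + draw_bandwidth
 *   gmem_cost      = (gmem_bandwidth_per_pixel * pixels * 11 +
 *                     draw_bandwidth) / 10
 *
 *   use sysmem (bypass) iff sysmem_cost <= gmem_cost
 *
 * where avg_samples is the rolling average of samples passed for this
 * renderpass key, taken from its tu_renderpass_history entry.
 */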

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner finishes;
 * can be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Fence of the most recent submission that used this renderpass;
    * used to expire old history entries.
    */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds the per-submission CS which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
   uint32_t buffers_count;
};

static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   const struct tu6_global *global = at->device->global_bo->map;
   return global->autotune_fence;
}

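/* Allocates the per-submission bookkeeping and records a small CS which
 * writes the submission's fence value into the global BO (after a
 * CACHE_FLUSH_TS event), so we later know when its results are readable.
 */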
static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
{
   struct tu_submission_data *submission_data =
      calloc(1, sizeof(struct tu_submission_data));
   submission_data->fence = at->fence_counter;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, at->fence_counter);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));

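/* Hashes the renderpass "shape": framebuffer dimensions, attachment
 * descriptions and their image properties, and per-subpass sample/attachment
 * counts. The hash is used as the history key instead of object pointers,
 * which may be recreated every frame (see tu_autotune_use_bypass()).
 */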
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd)
{
   XXH64_state_t hash_state;
   XXH64_reset(&hash_state, 0);

   APPEND_TO_HASH(&hash_state, framebuffer->width);
   APPEND_TO_HASH(&hash_state, framebuffer->height);
   APPEND_TO_HASH(&hash_state, framebuffer->layers);

   APPEND_TO_HASH(&hash_state, pass->attachment_count);
   XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels);
   }

   APPEND_TO_HASH(&hash_state, pass->subpass_count);
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
   }

   return XXH64_digest(&hash_state);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result = calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

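/* Appends a processed result to its history, evicting the oldest result once
 * MAX_HISTORY_RESULTS is reached, and recomputes the rolling average of
 * samples passed so tu_autotune_use_bypass() can read it without locking.
 */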
static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                      struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

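/* Consumes all pending results and per-submission data whose fence has
 * already been written by the GPU; both lists are kept in submission order,
 * so we can stop at the first entry with a newer fence.
 */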
static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (submission_data->fence > current_fence)
         break;

      free_submission_data(submission_data);
   }
}

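/* Transfers a command buffer's results onto the autotuner's pending list so
 * they outlive the command buffer: moved wholesale for ONE_TIME_SUBMIT
 * buffers, otherwise copied (with an extra BO reference) since the buffer
 * may be resubmitted while the results are still pending.
 */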
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                        &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                              &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy = malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

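/* Called from the submit path (single-threaded): processes results whose
 * fence has signalled, binds this submission's results to their history
 * entries under the new fence, queues them for later processing, prunes
 * stale history entries, and returns the fence-writing CS that must be
 * appended to the submission.
 */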
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);

   process_results(at, gpu_fence);

   /* pre-increment so zero isn't a valid fence */
   uint32_t new_fence = ++at->fence_counter;
   uint32_t result_buffers = 0;

   /* Create history entries here to minimize the work and locking done
    * at renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                          &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history = calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }

      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
         result_buffers++;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at);
   submission_data->buffers_count = result_buffers;

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption here is
    * that the application doesn't hold many old unsubmitted command buffers,
    * otherwise this table may grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->last_fence == 0 ||
          gpu_fence < history->last_fence ||
          (gpu_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016"PRIx64"", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history = entry->data;

         mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

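/* Fallback heuristic used when autotuning is disabled or no history exists
 * yet: prefer sysmem (bypass) only for renderpasses with very few drawcalls
 * and no multisampled subpasses.
 */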
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

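/* Estimates the total memory traffic generated by the renderpass' drawcalls:
 * the expected number of samples (from history) times the average
 * bandwidth-per-sample accumulated across the recorded drawcalls.
 */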
static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* Average sample count times the average drawcall_bandwidth_per_sample. */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      const struct tu_subpass *subpass = &pass->subpasses[i];
      /* GMEM works much faster in this case */
      if (subpass->raster_order_attachment_access)
         return false;

      /* Would be very slow in sysmem mode because we have to enable
       * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE)
       */
      if (subpass->feedback_loop_color || subpass->feedback_loop_ds)
         return false;
   }

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected when
    * it should not have been, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely.  The state changes between tiles also have an
       * overhead.  The magic numbers of 11 and 10 are randomly chosen.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
               renderpass_key,
               cmd_buffer->state.rp.drawcall_count,
               select_sysmem ? "sysmem" : "gmem");
         mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
               avg_samples,
               drawcall_bandwidth_per_sample,
               total_draw_call_bandwidth);
         mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
               extent->width, extent->height,
               pass->sysmem_bandwidth_per_pixel,
               pass->gmem_bandwidth_per_pixel);
         mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
               sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

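/* Emitted at renderpass begin/end: each snapshots the sample counter into the
 * suballocated tu_renderpass_samples BO (samples_start at begin, samples_end
 * at end) via RB_SAMPLE_COUNT_ADDR and a ZPASS_DONE event, so the difference
 * gives the number of samples passed for this renderpass.
 */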
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}