1/*
2 * Copyright © 2020-2021 Collabora, Ltd.
3 * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4 * Author: Corentin Noël <corentin.noel@collabora.com>
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9#include "intel_pps_driver.h"
10
11#include <dirent.h>
12#include <fcntl.h>
13#include <math.h>
14#include <poll.h>
15#include <strings.h>
16#include <sys/ioctl.h>
17#include <unistd.h>
18
19#include "drm-uapi/i915_drm.h"
20
21#include "common/intel_gem.h"
22#include "dev/intel_device_info.h"
23#include "perf/intel_perf.h"
24#include "perf/intel_perf_query.h"
25
26#include <pps/pps.h>
27#include <pps/pps_algorithm.h>
28
29#include "intel_pps_perf.h"
30#include "intel_pps_priv.h"
31
32namespace pps
33{
34
35// The HW sampling period is programmed using period_exponent following this
36// formula:
37//    sample_period = timestamp_period * 2^(period_exponent + 1)
38// So our minimum sampling period is twice the timestamp period
39
40uint64_t IntelDriver::get_min_sampling_period_ns()
41{
42   return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
43}
44
45IntelDriver::IntelDriver()
46{
47}
48
49IntelDriver::~IntelDriver()
50{
51}
52
53void IntelDriver::enable_counter(uint32_t counter_id)
54{
55   auto &counter = counters[counter_id];
56
57   enabled_counters.emplace_back(counter);
58}
59
60void IntelDriver::enable_all_counters()
61{
62   // We should only have one group
63   assert(groups.size() == 1);
64   for (uint32_t counter_id : groups[0].counters) {
65      auto &counter = counters[counter_id];
66      enabled_counters.emplace_back(counter);
67   }
68}
69
70bool IntelDriver::init_perfcnt()
71{
72   /* Note: clock_id's below 128 are reserved.. for custom clock sources,
73    * using the hash of a namespaced string is the recommended approach.
74    * See: https://perfetto.dev/docs/concepts/clock-sync
75    */
76   this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
77
78   assert(!perf && "Intel perf should not be initialized at this point");
79
80   perf = std::make_unique<IntelPerf>(drm_device.fd);
81
82   const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
83
84   struct intel_perf_query_info *default_query = nullptr;
85   selected_query = nullptr;
86   for (auto &query : perf->get_queries()) {
87      if (!strcmp(query->symbol_name, "RenderBasic"))
88         default_query = query;
89      if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
90         selected_query = query;
91   }
92
93   assert(default_query);
94
95   if (!selected_query) {
96      if (metric_set_name) {
97         PPS_LOG_ERROR("Available metric sets:");
98         for (auto &query : perf->get_queries())
99            PPS_LOG_ERROR("   %s", query->symbol_name);
100         PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
101      }
102      selected_query = default_query;
103   }
104
105   PPS_LOG("Using metric set '%s': %s",
106           selected_query->symbol_name, selected_query->name);
107
108   // Create group
109   CounterGroup group = {};
110   group.id = groups.size();
111   group.name = selected_query->symbol_name;
112
113   for (int i = 0; i < selected_query->n_counters; ++i) {
114      intel_perf_query_counter &counter = selected_query->counters[i];
115
116      // Create counter
117      Counter counter_desc = {};
118      counter_desc.id = counters.size();
119      counter_desc.name = counter.symbol_name;
120      counter_desc.group = group.id;
121      counter_desc.getter = [counter, this](
122         const Counter &c, const Driver &dri) -> Counter::Value {
123         switch (counter.data_type) {
124         case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
125         case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
126         case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
127            return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
128                                                           selected_query,
129                                                           &perf->result);
130            break;
131         case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
132         case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
133            return counter.oa_counter_read_float(perf->cfg,
134                                                 selected_query,
135                                                 &perf->result);
136            break;
137         }
138
139         return {};
140      };
141
142      // Add counter id to the group
143      group.counters.emplace_back(counter_desc.id);
144
145      // Store counter
146      counters.emplace_back(std::move(counter_desc));
147   }
148
149   // Store group
150   groups.emplace_back(std::move(group));
151
152   assert(counters.size() && "Failed to query counters");
153
154   // Clear accumulations
155   intel_perf_query_result_clear(&perf->result);
156
157   return true;
158}
159
160void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
161{
162   this->sampling_period_ns = sampling_period_ns;
163
164   gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;
165   if (!perf->open(sampling_period_ns, selected_query)) {
166      PPS_LOG_FATAL("Failed to open intel perf");
167   }
168}
169
170void IntelDriver::disable_perfcnt()
171{
172   gpu_timestamp_udw = 0;
173   perf = nullptr;
174   groups.clear();
175   counters.clear();
176   enabled_counters.clear();
177}
178
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record, in the
/// same unit as @p sampling_period
/// @param sampling_period Sampling period the stream was opened with
/// @return True if the duration is at least close to the sampling period,
/// i.e. it falls short of it by less than 100000 units
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   // How much shorter than the sampling period a record is allowed to be
   constexpr uint64_t tolerance = 100000;

   // With a period below the tolerance the unsigned subtraction would wrap
   // around and reject every record; any duration is close enough there.
   if (sampling_period < tolerance)
      return true;

   return duration > sampling_period - tolerance;
}
185
186/// @brief Transforms the raw data received in from the driver into records
187std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
188   const size_t byte_count)
189{
190   std::vector<PerfRecord> records;
191   records.reserve(128);
192
193   PerfRecord record;
194   record.data.reserve(512);
195
196   const uint8_t *iter = data.data();
197   const uint8_t *end = iter + byte_count;
198
199   uint64_t prev_gpu_timestamp = last_gpu_timestamp;
200
201   while (iter < end) {
202      // Iterate a record at a time
203      auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
204
205      if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
206         // Report is next to the header
207         const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
208         uint64_t gpu_timestamp_ldw =
209            intel_perf_report_timestamp(selected_query, report);
210
211         /* Our HW only provides us with the lower 32 bits of the 36bits
212          * timestamp counter value. If we haven't captured the top bits yet,
213          * do it now. If we see a roll over the lower 32bits capture it
214          * again.
215          */
216         if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp)
217            gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;
218
219         uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;
220
221         auto duration = intel_device_info_timebase_scale(&perf->devinfo,
222                                                          gpu_timestamp - prev_gpu_timestamp);
223
224         // Skip perf-records that are too short by checking
225         // the distance between last report and this one
226         if (close_enough(duration, sampling_period_ns)) {
227            prev_gpu_timestamp = gpu_timestamp;
228
229            // Add the new record to the list
230            record.timestamp = gpu_timestamp;
231            record.data.resize(header->size); // Possibly 264?
232            memcpy(record.data.data(), iter, header->size);
233            records.emplace_back(record);
234         }
235      }
236
237      // Go to the next record
238      iter += header->size;
239   }
240
241   return records;
242}
243
244/// @brief Read all the available data from the metric set currently in use
245void IntelDriver::read_data_from_metric_set()
246{
247   assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
248
249   ssize_t bytes_read = 0;
250   while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
251              metric_buffer.size() - total_bytes_read)) > 0 ||
252      errno == EINTR) {
253      total_bytes_read += std::max(ssize_t(0), bytes_read);
254
255      // Increase size of the buffer for the next read
256      if (metric_buffer.size() / 2 < total_bytes_read) {
257         metric_buffer.resize(metric_buffer.size() * 2);
258      }
259   }
260
261   assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
262}
263
264bool IntelDriver::dump_perfcnt()
265{
266   if (!perf->oa_stream_ready()) {
267      return false;
268   }
269
270   read_data_from_metric_set();
271
272   auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
273   if (new_records.empty()) {
274      // No new records from the GPU yet
275      return false;
276   } else {
277      // Records are parsed correctly, so we can reset the
278      // number of bytes read so far from the metric set
279      total_bytes_read = 0;
280   }
281
282   APPEND(records, new_records);
283
284   if (records.size() < 2) {
285      // Not enough records to accumulate
286      return false;
287   }
288
289   return true;
290}
291
292uint64_t IntelDriver::gpu_next()
293{
294   if (records.size() < 2) {
295      // Not enough records to accumulate
296      return 0;
297   }
298
299   // Get first and second
300   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
301   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
302
303   intel_perf_query_result_accumulate_fields(&perf->result,
304                                             selected_query,
305                                             record_a + 1,
306                                             record_b + 1,
307                                             false /* no_oa_accumulate */);
308
309   // Get last timestamp
310   auto gpu_timestamp = records[1].timestamp;
311
312   // Consume first record
313   records.erase(std::begin(records), std::begin(records) + 1);
314
315   return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
316}
317
318uint64_t IntelDriver::next()
319{
320   // Reset accumulation
321   intel_perf_query_result_clear(&perf->result);
322   return gpu_next();
323}
324
325uint32_t IntelDriver::gpu_clock_id() const
326{
327   return this->clock_id;
328}
329
330uint64_t IntelDriver::gpu_timestamp() const
331{
332   return intel_device_info_timebase_scale(&perf->devinfo,
333                                           intel_read_gpu_timestamp(drm_device.fd));
334}
335
336} // namespace pps
337