/*
 * Copyright © 2020-2021 Collabora, Ltd.
 * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
 * Author: Corentin Noël <corentin.noel@collabora.com>
 *
 * SPDX-License-Identifier: MIT
 */

#include "intel_pps_driver.h"

#include <dirent.h>
#include <fcntl.h>
#include <math.h>
#include <poll.h>
#include <strings.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "drm-uapi/i915_drm.h"

#include "common/intel_gem.h"
#include "dev/intel_device_info.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_query.h"

#include <pps/pps.h>
#include <pps/pps_algorithm.h>

#include "intel_pps_perf.h"
#include "intel_pps_priv.h"

namespace pps
{

// The HW sampling period is programmed using period_exponent following this
// formula:
// sample_period = timestamp_period * 2^(period_exponent + 1)
// So our minimum sampling period is twice the timestamp period

uint64_t IntelDriver::get_min_sampling_period_ns()
{
   return (2.f * 1000000000ull) / perf->devinfo.timestamp_frequency;
}

IntelDriver::IntelDriver()
{
}

IntelDriver::~IntelDriver()
{
}

void IntelDriver::enable_counter(uint32_t counter_id)
{
   auto &counter = counters[counter_id];

   enabled_counters.emplace_back(counter);
}

void IntelDriver::enable_all_counters()
{
   // We should only have one group
   assert(groups.size() == 1);
   for (uint32_t counter_id : groups[0].counters) {
      auto &counter = counters[counter_id];
      enabled_counters.emplace_back(counter);
   }
}

bool IntelDriver::init_perfcnt()
{
   /* Note: clock_ids below 128 are reserved. For custom clock sources,
    * using the hash of a namespaced string is the recommended approach.
    * See: https://perfetto.dev/docs/concepts/clock-sync
    */
   this->clock_id = intel_pps_clock_id(drm_device.gpu_num);

   assert(!perf && "Intel perf should not be initialized at this point");

   perf = std::make_unique<IntelPerf>(drm_device.fd);

   const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");

   struct intel_perf_query_info *default_query = nullptr;
   selected_query = nullptr;
   for (auto &query : perf->get_queries()) {
      if (!strcmp(query->symbol_name, "RenderBasic"))
         default_query = query;
      if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
         selected_query = query;
   }

   assert(default_query);

   if (!selected_query) {
      if (metric_set_name) {
         PPS_LOG_ERROR("Available metric sets:");
         for (auto &query : perf->get_queries())
            PPS_LOG_ERROR(" %s", query->symbol_name);
         PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
      }
      selected_query = default_query;
   }

   PPS_LOG("Using metric set '%s': %s",
           selected_query->symbol_name, selected_query->name);

   // Create group
   CounterGroup group = {};
   group.id = groups.size();
   group.name = selected_query->symbol_name;

   for (int i = 0; i < selected_query->n_counters; ++i) {
      intel_perf_query_counter &counter = selected_query->counters[i];

      // Create counter
      Counter counter_desc = {};
      counter_desc.id = counters.size();
      counter_desc.name = counter.symbol_name;
      counter_desc.group = group.id;
      counter_desc.getter = [counter, this](
         const Counter &c, const Driver &dri) -> Counter::Value {
         switch (counter.data_type) {
         case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
         case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
         case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
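            /* Counter values are not read from the HW here: they are
             * computed from perf->result, which gpu_next() fills by
             * accumulating the delta between two consecutive OA reports
             * with intel_perf_query_result_accumulate_fields(). */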
            return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
                                                           selected_query,
                                                           &perf->result);
            break;
         case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
         case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
            return counter.oa_counter_read_float(perf->cfg,
                                                 selected_query,
                                                 &perf->result);
            break;
         }

         return {};
      };

      // Add counter id to the group
      group.counters.emplace_back(counter_desc.id);

      // Store counter
      counters.emplace_back(std::move(counter_desc));
   }

   // Store group
   groups.emplace_back(std::move(group));

   assert(counters.size() && "Failed to query counters");

   // Clear accumulations
   intel_perf_query_result_clear(&perf->result);

   return true;
}

void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
{
   this->sampling_period_ns = sampling_period_ns;

   gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;
   if (!perf->open(sampling_period_ns, selected_query)) {
      PPS_LOG_FATAL("Failed to open intel perf");
   }
}

void IntelDriver::disable_perfcnt()
{
   gpu_timestamp_udw = 0;
   perf = nullptr;
   groups.clear();
   counters.clear();
   enabled_counters.clear();
}

/// @brief Some perf record durations can be really short
/// @return True if the duration is at least close to the sampling period
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   return duration > sampling_period - 100000;
}

/// @brief Transforms the raw data received from the driver into records
std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
                                                        const size_t byte_count)
{
   std::vector<PerfRecord> records;
   records.reserve(128);

   PerfRecord record;
   record.data.reserve(512);

   const uint8_t *iter = data.data();
   const uint8_t *end = iter + byte_count;

   uint64_t prev_gpu_timestamp = last_gpu_timestamp;

   while (iter < end) {
      // Iterate a record at a time
      auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);

      if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
         // Report is next to the header
         const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
         uint64_t gpu_timestamp_ldw =
            intel_perf_report_timestamp(selected_query, report);

         /* Our HW only provides us with the lower 32 bits of the 36-bit
          * timestamp counter value. If we haven't captured the top bits yet,
          * do it now. If we see a roll over of the lower 32 bits, capture it
          * again.
          */
         if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp)
            gpu_timestamp_udw = intel_read_gpu_timestamp(drm_device.fd) & ~perf->cfg->oa_timestamp_mask;

         uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;

         auto duration = intel_device_info_timebase_scale(&perf->devinfo,
                                                          gpu_timestamp - prev_gpu_timestamp);

         // Skip perf-records that are too short by checking
         // the distance between last report and this one
         if (close_enough(duration, sampling_period_ns)) {
            prev_gpu_timestamp = gpu_timestamp;

            // Add the new record to the list
            record.timestamp = gpu_timestamp;
            record.data.resize(header->size); // Possibly 264?
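            /* The record size covers the drm_i915_perf_record_header plus
             * the raw OA report that follows it, so the whole record is
             * copied here; gpu_next() later skips the header again to hand
             * the report to the accumulation code. */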
            memcpy(record.data.data(), iter, header->size);
            records.emplace_back(record);
         }
      }

      // Go to the next record
      iter += header->size;
   }

   return records;
}

/// @brief Read all the available data from the metric set currently in use
void IntelDriver::read_data_from_metric_set()
{
   assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");

   ssize_t bytes_read = 0;
   while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
                                             metric_buffer.size() - total_bytes_read)) > 0 ||
          errno == EINTR) {
      total_bytes_read += std::max(ssize_t(0), bytes_read);

      // Increase size of the buffer for the next read
      if (metric_buffer.size() / 2 < total_bytes_read) {
         metric_buffer.resize(metric_buffer.size() * 2);
      }
   }

   assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
}

bool IntelDriver::dump_perfcnt()
{
   if (!perf->oa_stream_ready()) {
      return false;
   }

   read_data_from_metric_set();

   auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
   if (new_records.empty()) {
      // No new records from the GPU yet
      return false;
   } else {
      // Records are parsed correctly, so we can reset the
      // number of bytes read so far from the metric set
      total_bytes_read = 0;
   }

   APPEND(records, new_records);

   if (records.size() < 2) {
      // Not enough records to accumulate
      return false;
   }

   return true;
}

uint64_t IntelDriver::gpu_next()
{
   if (records.size() < 2) {
      // Not enough records to accumulate
      return 0;
   }

   // Get first and second
   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());

   intel_perf_query_result_accumulate_fields(&perf->result,
                                             selected_query,
                                             record_a + 1,
                                             record_b + 1,
                                             false /* no_oa_accumulate */);

   // Get last timestamp
   auto gpu_timestamp = records[1].timestamp;

   // Consume first record
   records.erase(std::begin(records), std::begin(records) + 1);

   return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
}

uint64_t IntelDriver::next()
{
   // Reset accumulation
   intel_perf_query_result_clear(&perf->result);
   return gpu_next();
}

uint32_t IntelDriver::gpu_clock_id() const
{
   return this->clock_id;
}

uint64_t IntelDriver::gpu_timestamp() const
{
   return intel_device_info_timebase_scale(&perf->devinfo,
                                           intel_read_gpu_timestamp(drm_device.fd));
}

} // namespace pps