1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Hypervisor supplied "24x7" performance counter support
4 *
5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
6 * Copyright 2014 IBM Corporation.
7 */
8
9 #define pr_fmt(fmt) "hv-24x7: " fmt
10
11 #include <linux/perf_event.h>
12 #include <linux/rbtree.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/vmalloc.h>
16
17 #include <asm/cputhreads.h>
18 #include <asm/firmware.h>
19 #include <asm/hvcall.h>
20 #include <asm/io.h>
21 #include <linux/byteorder/generic.h>
22
23 #include <asm/rtas.h>
24 #include "hv-24x7.h"
25 #include "hv-24x7-catalog.h"
26 #include "hv-common.h"
27
/* Version of the 24x7 hypervisor API that we should use in this machine. */
static int interface_version;

/* Whether we have to aggregate result data for some domains. */
static bool aggregate_result_elements;

/* CPU mask printed by the "cpumask" sysfs attribute (see cpumask_show()). */
static cpumask_t hv_24x7_cpumask;
35
domain_is_valid(unsigned domain)36 static bool domain_is_valid(unsigned domain)
37 {
38 switch (domain) {
39 #define DOMAIN(n, v, x, c) \
40 case HV_PERF_DOMAIN_##n: \
41 /* fall through */
42 #include "hv-24x7-domains.h"
43 #undef DOMAIN
44 return true;
45 default:
46 return false;
47 }
48 }
49
is_physical_domain(unsigned domain)50 static bool is_physical_domain(unsigned domain)
51 {
52 switch (domain) {
53 #define DOMAIN(n, v, x, c) \
54 case HV_PERF_DOMAIN_##n: \
55 return c;
56 #include "hv-24x7-domains.h"
57 #undef DOMAIN
58 default:
59 return false;
60 }
61 }
62
/*
 * The Processor Module Information system parameter allows transferring
 * of certain processor module information from the platform to the OS.
 * Refer to the PAPR+ document for the parameter token value, '43'.
 */

#define PROCESSOR_MODULE_INFO   43

/*
 * Topology values filled in by read_24x7_sys_info() and exposed read-only
 * through the sockets/chipspersocket/coresperchip sysfs attributes.
 */
static u32 phys_sockets;	/* Physical sockets */
static u32 phys_chipspersocket;	/* Physical chips per socket*/
static u32 phys_coresperchip; /* Physical cores per chip */
74
75 /*
76 * read_24x7_sys_info()
77 * Retrieve the number of sockets and chips per socket and cores per
78 * chip details through the get-system-parameter rtas call.
79 */
read_24x7_sys_info(void)80 void read_24x7_sys_info(void)
81 {
82 const s32 token = rtas_token("ibm,get-system-parameter");
83 int call_status;
84
85 /*
86 * Making system parameter: chips and sockets and cores per chip
87 * default to 1.
88 */
89 phys_sockets = 1;
90 phys_chipspersocket = 1;
91 phys_coresperchip = 1;
92
93 do {
94 spin_lock(&rtas_data_buf_lock);
95 call_status = rtas_call(token, 3, 1, NULL, PROCESSOR_MODULE_INFO,
96 __pa(rtas_data_buf), RTAS_DATA_BUF_SIZE);
97 if (call_status == 0) {
98 int ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]);
99 int len = be16_to_cpup((__be16 *)&rtas_data_buf[0]);
100
101 if (len >= 8 && ntypes != 0) {
102 phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]);
103 phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]);
104 phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]);
105 }
106 }
107 spin_unlock(&rtas_data_buf_lock);
108 } while (rtas_busy_delay(call_status));
109
110 if (call_status != 0) {
111 pr_err("Error calling get-system-parameter %d\n",
112 call_status);
113 }
114 }
115
116 /* Domains for which more than one result element are returned for each event. */
domain_needs_aggregation(unsigned int domain)117 static bool domain_needs_aggregation(unsigned int domain)
118 {
119 return aggregate_result_elements &&
120 (domain == HV_PERF_DOMAIN_PHYS_CORE ||
121 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
122 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
123 }
124
domain_name(unsigned domain)125 static const char *domain_name(unsigned domain)
126 {
127 if (!domain_is_valid(domain))
128 return NULL;
129
130 switch (domain) {
131 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip";
132 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core";
133 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core";
134 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip";
135 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node";
136 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node";
137 }
138
139 WARN_ON_ONCE(domain);
140 return NULL;
141 }
142
catalog_entry_domain_is_valid(unsigned domain)143 static bool catalog_entry_domain_is_valid(unsigned domain)
144 {
145 /* POWER8 doesn't support virtual domains. */
146 if (interface_version == 1)
147 return is_physical_domain(domain);
148 else
149 return domain_is_valid(domain);
150 }
151
152 /*
153 * TODO: Merging events:
154 * - Think of the hcall as an interface to a 4d array of counters:
155 * - x = domains
156 * - y = indexes in the domain (core, chip, vcpu, node, etc)
157 * - z = offset into the counter space
158 * - w = lpars (guest vms, "logical partitions")
159 * - A single request is: x,y,y_last,z,z_last,w,w_last
160 * - this means we can retrieve a rectangle of counters in y,z for a single x.
161 *
162 * - Things to consider (ignoring w):
163 * - input cost_per_request = 16
164 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs
165 * - limited number of requests per hcall (must fit into 4K bytes)
 *    - 4k = 16 [buffer header] + 16 [request size] * request_count
167 * - 255 requests per hcall
168 * - sometimes it will be more efficient to read extra data and discard
169 */
170
171 /*
172 * Example usage:
173 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
174 */
175
/*
 * Event field layout. Each line names a field, the perf attr word it lives
 * in (config, config1 or config2) and the two range bounds passed to the
 * macro. NOTE(review): the macros come from hv-common.h, which is not
 * visible here; the bounds are presumably first/last bit - confirm.
 */

/* u3 0-6, one of HV_24X7_PERF_DOMAIN */
EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
/* u16; core, chip and vcpu all use the same range of config */
EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
/* u32, see "data_offset" */
EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);

/* Ranges not assigned to any field above; none has a format attribute. */
EVENT_DEFINE_RANGE(reserved1, config,   4, 15);
EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
EVENT_DEFINE_RANGE(reserved3, config2,  0, 63);
190
/* NULL-terminated list of the field attributes placed in the "format" group. */
static struct attribute *format_attrs[] = {
	&format_attr_domain.attr,
	&format_attr_offset.attr,
	&format_attr_core.attr,
	&format_attr_chip.attr,
	&format_attr_vcpu.attr,
	&format_attr_lpar.attr,
	NULL,
};

/* sysfs group "format": fixed contents, known at build time. */
static struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

/* sysfs group "events": its attribute list is filled in at init time. */
static struct attribute_group event_group = {
	.name = "events",
	/* .attrs is set in init */
};

/* sysfs group "event_descs": its attribute list is filled in at init time. */
static struct attribute_group event_desc_group = {
	.name = "event_descs",
	/* .attrs is set in init */
};

/* sysfs group "event_long_descs": attribute list is filled in at init time. */
static struct attribute_group event_long_desc_group = {
	.name = "event_long_descs",
	/* .attrs is set in init */
};
220
/* Slab cache that supplies the buffers catalog pages are fetched into. */
static struct kmem_cache *hv_page_cache;

/*
 * Per-CPU transaction state.
 * NOTE(review): the code using these two variables is outside this view.
 */
DEFINE_PER_CPU(int, hv_24x7_txn_flags);
DEFINE_PER_CPU(int, hv_24x7_txn_err);

/*
 * Per-CPU table of event pointers.
 * NOTE(review): 255 presumably mirrors the "255 requests per hcall" limit
 * mentioned in the TODO comment earlier in this file - confirm.
 */
struct hv_24x7_hw {
	struct perf_event *events[255];
};

DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);

/*
 * request_buffer and result_buffer are not required to be 4k aligned,
 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
 * the simplest way to ensure that.
 */
#define H24x7_DATA_BUFFER_SIZE	4096
DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
240
max_num_requests(int interface_version)241 static unsigned int max_num_requests(int interface_version)
242 {
243 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
244 / H24x7_REQUEST_SIZE(interface_version);
245 }
246
event_name(struct hv_24x7_event_data *ev, int *len)247 static char *event_name(struct hv_24x7_event_data *ev, int *len)
248 {
249 *len = be16_to_cpu(ev->event_name_len) - 2;
250 return (char *)ev->remainder;
251 }
252
event_desc(struct hv_24x7_event_data *ev, int *len)253 static char *event_desc(struct hv_24x7_event_data *ev, int *len)
254 {
255 unsigned nl = be16_to_cpu(ev->event_name_len);
256 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
257
258 *len = be16_to_cpu(*desc_len) - 2;
259 return (char *)ev->remainder + nl;
260 }
261
event_long_desc(struct hv_24x7_event_data *ev, int *len)262 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
263 {
264 unsigned nl = be16_to_cpu(ev->event_name_len);
265 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
266 unsigned desc_len = be16_to_cpu(*desc_len_);
267 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
268
269 *len = be16_to_cpu(*long_desc_len) - 2;
270 return (char *)ev->remainder + nl + desc_len;
271 }
272
event_fixed_portion_is_within(struct hv_24x7_event_data *ev, void *end)273 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
274 void *end)
275 {
276 void *start = ev;
277
278 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
279 }
280
281 /*
282 * Things we don't check:
283 * - padding for desc, name, and long/detailed desc is required to be '\0'
284 * bytes.
285 *
286 * Return NULL if we pass end,
287 * Otherwise return the address of the byte just following the event.
288 */
event_end(struct hv_24x7_event_data *ev, void *end)289 static void *event_end(struct hv_24x7_event_data *ev, void *end)
290 {
291 void *start = ev;
292 __be16 *dl_, *ldl_;
293 unsigned dl, ldl;
294 unsigned nl = be16_to_cpu(ev->event_name_len);
295
296 if (nl < 2) {
297 pr_debug("%s: name length too short: %d", __func__, nl);
298 return NULL;
299 }
300
301 if (start + nl > end) {
302 pr_debug("%s: start=%p + nl=%u > end=%p",
303 __func__, start, nl, end);
304 return NULL;
305 }
306
307 dl_ = (__be16 *)(ev->remainder + nl - 2);
308 if (!IS_ALIGNED((uintptr_t)dl_, 2))
309 pr_warn("desc len not aligned %p", dl_);
310 dl = be16_to_cpu(*dl_);
311 if (dl < 2) {
312 pr_debug("%s: desc len too short: %d", __func__, dl);
313 return NULL;
314 }
315
316 if (start + nl + dl > end) {
317 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
318 __func__, start, nl, dl, start + nl + dl, end);
319 return NULL;
320 }
321
322 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
323 if (!IS_ALIGNED((uintptr_t)ldl_, 2))
324 pr_warn("long desc len not aligned %p", ldl_);
325 ldl = be16_to_cpu(*ldl_);
326 if (ldl < 2) {
327 pr_debug("%s: long desc len too short (ldl=%u)",
328 __func__, ldl);
329 return NULL;
330 }
331
332 if (start + nl + dl + ldl > end) {
333 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
334 __func__, start, nl, dl, ldl, end);
335 return NULL;
336 }
337
338 return start + nl + dl + ldl;
339 }
340
h_get_24x7_catalog_page_(unsigned long phys_4096, unsigned long version, unsigned long index)341 static long h_get_24x7_catalog_page_(unsigned long phys_4096,
342 unsigned long version, unsigned long index)
343 {
344 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
345 phys_4096, version, index);
346
347 WARN_ON(!IS_ALIGNED(phys_4096, 4096));
348
349 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
350 phys_4096, version, index);
351 }
352
/*
 * Convenience wrapper around h_get_24x7_catalog_page_() that takes the
 * destination buffer as a kernel virtual address.
 */
static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
{
	unsigned long phys = virt_to_phys(page);

	return h_get_24x7_catalog_page_(phys, version, index);
}
358
359 /*
360 * Each event we find in the catalog, will have a sysfs entry. Format the
361 * data for this sysfs entry based on the event's domain.
362 *
363 * Events belonging to the Chip domain can only be monitored in that domain.
 * i.e. the domain for these events is a fixed/known value.
365 *
366 * Events belonging to the Core domain can be monitored either in the physical
367 * core or in one of the virtual CPU domains. So the domain value for these
368 * events must be specified by the user (i.e is a required parameter). Format
369 * the Core events with 'domain=?' so the perf-tool can error check required
370 * parameters.
371 *
372 * NOTE: For the Core domain events, rather than making domain a required
 *	 parameter we could default it to PHYS_CORE and allow users to
374 * override the domain to one of the VCPU domains.
375 *
376 * However, this can make the interface a little inconsistent.
377 *
378 * If we set domain=2 (PHYS_CHIP) and allow user to override this field
 *	 the user may be tempted to also modify the "offset=x" field, which
 *	 can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
381 * HPM_INST (offset=0x20) events. With:
382 *
383 * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
384 *
385 * we end up monitoring HPM_INST, while the command line has HPM_PCYC.
386 *
387 * By not assigning a default value to the domain for the Core events,
388 * we can have simple guidelines:
389 *
390 * - Specifying values for parameters with "=?" is required.
391 *
392 * - Specifying (i.e overriding) values for other parameters
393 * is undefined.
394 */
event_fmt(struct hv_24x7_event_data *event, unsigned domain)395 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
396 {
397 const char *sindex;
398 const char *lpar;
399 const char *domain_str;
400 char buf[8];
401
402 switch (domain) {
403 case HV_PERF_DOMAIN_PHYS_CHIP:
404 snprintf(buf, sizeof(buf), "%d", domain);
405 domain_str = buf;
406 lpar = "0x0";
407 sindex = "chip";
408 break;
409 case HV_PERF_DOMAIN_PHYS_CORE:
410 domain_str = "?";
411 lpar = "0x0";
412 sindex = "core";
413 break;
414 default:
415 domain_str = "?";
416 lpar = "?";
417 sindex = "vcpu";
418 }
419
420 return kasprintf(GFP_KERNEL,
421 "domain=%s,offset=0x%x,%s=?,lpar=%s",
422 domain_str,
423 be16_to_cpu(event->event_counter_offs) +
424 be16_to_cpu(event->event_group_record_offs),
425 sindex,
426 lpar);
427 }
428
429 /* Avoid trusting fw to NUL terminate strings */
memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)430 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
431 {
432 return kasprintf(gfp, "%.*s", max_len, maybe_str);
433 }
434
device_show_string(struct device *dev, struct device_attribute *attr, char *buf)435 static ssize_t device_show_string(struct device *dev,
436 struct device_attribute *attr, char *buf)
437 {
438 struct dev_ext_attribute *d;
439
440 d = container_of(attr, struct dev_ext_attribute, attr);
441
442 return sprintf(buf, "%s\n", (char *)d->var);
443 }
444
cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)445 static ssize_t cpumask_show(struct device *dev,
446 struct device_attribute *attr, char *buf)
447 {
448 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
449 }
450
sockets_show(struct device *dev, struct device_attribute *attr, char *buf)451 static ssize_t sockets_show(struct device *dev,
452 struct device_attribute *attr, char *buf)
453 {
454 return sprintf(buf, "%d\n", phys_sockets);
455 }
456
chipspersocket_show(struct device *dev, struct device_attribute *attr, char *buf)457 static ssize_t chipspersocket_show(struct device *dev,
458 struct device_attribute *attr, char *buf)
459 {
460 return sprintf(buf, "%d\n", phys_chipspersocket);
461 }
462
coresperchip_show(struct device *dev, struct device_attribute *attr, char *buf)463 static ssize_t coresperchip_show(struct device *dev,
464 struct device_attribute *attr, char *buf)
465 {
466 return sprintf(buf, "%d\n", phys_coresperchip);
467 }
468
device_str_attr_create_(char *name, char *str)469 static struct attribute *device_str_attr_create_(char *name, char *str)
470 {
471 struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
472
473 if (!attr)
474 return NULL;
475
476 sysfs_attr_init(&attr->attr.attr);
477
478 attr->var = str;
479 attr->attr.attr.name = name;
480 attr->attr.attr.mode = 0444;
481 attr->attr.show = device_show_string;
482
483 return &attr->attr.attr;
484 }
485
486 /*
487 * Allocate and initialize strings representing event attributes.
488 *
489 * NOTE: The strings allocated here are never destroyed and continue to
490 * exist till shutdown. This is to allow us to create as many events
491 * from the catalog as possible, even if we encounter errors with some.
492 * In case of changes to error paths in future, these may need to be
493 * freed by the caller.
494 */
device_str_attr_create(char *name, int name_max, int name_nonce, char *str, size_t str_max)495 static struct attribute *device_str_attr_create(char *name, int name_max,
496 int name_nonce,
497 char *str, size_t str_max)
498 {
499 char *n;
500 char *s = memdup_to_str(str, str_max, GFP_KERNEL);
501 struct attribute *a;
502
503 if (!s)
504 return NULL;
505
506 if (!name_nonce)
507 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
508 else
509 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
510 name_nonce);
511 if (!n)
512 goto out_s;
513
514 a = device_str_attr_create_(n, s);
515 if (!a)
516 goto out_n;
517
518 return a;
519 out_n:
520 kfree(n);
521 out_s:
522 kfree(s);
523 return NULL;
524 }
525
event_to_attr(unsigned ix, struct hv_24x7_event_data *event, unsigned domain, int nonce)526 static struct attribute *event_to_attr(unsigned ix,
527 struct hv_24x7_event_data *event,
528 unsigned domain,
529 int nonce)
530 {
531 int event_name_len;
532 char *ev_name, *a_ev_name, *val;
533 struct attribute *attr;
534
535 if (!domain_is_valid(domain)) {
536 pr_warn("catalog event %u has invalid domain %u\n",
537 ix, domain);
538 return NULL;
539 }
540
541 val = event_fmt(event, domain);
542 if (!val)
543 return NULL;
544
545 ev_name = event_name(event, &event_name_len);
546 if (!nonce)
547 a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
548 (int)event_name_len, ev_name);
549 else
550 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
551 (int)event_name_len, ev_name, nonce);
552
553 if (!a_ev_name)
554 goto out_val;
555
556 attr = device_str_attr_create_(a_ev_name, val);
557 if (!attr)
558 goto out_name;
559
560 return attr;
561 out_name:
562 kfree(a_ev_name);
563 out_val:
564 kfree(val);
565 return NULL;
566 }
567
/*
 * Create the description attribute for @event, named like the event
 * (including the "__<nonce>" suffix for a non-zero @nonce).
 * Returns NULL when the description is empty or an allocation fails.
 */
static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
					    int nonce)
{
	int name_len, desc_len;
	char *name = event_name(event, &name_len);
	char *desc = event_desc(event, &desc_len);

	/* If there isn't a description, don't create the sysfs file */
	if (desc_len == 0)
		return NULL;

	return device_str_attr_create(name, name_len, nonce, desc, desc_len);
}
581
/*
 * Create the long-description attribute for @event, named like the event
 * (including the "__<nonce>" suffix for a non-zero @nonce).
 * Returns NULL when the long description is empty or an allocation fails.
 */
static struct attribute *
event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
{
	int name_len, desc_len;
	char *name = event_name(event, &name_len);
	char *long_desc = event_long_desc(event, &desc_len);

	/* If there isn't a description, don't create the sysfs file */
	if (desc_len == 0)
		return NULL;

	return device_str_attr_create(name, name_len, nonce, long_desc,
				      desc_len);
}
595
event_data_to_attrs(unsigned ix, struct attribute **attrs, struct hv_24x7_event_data *event, int nonce)596 static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
597 struct hv_24x7_event_data *event, int nonce)
598 {
599 *attrs = event_to_attr(ix, event, event->domain, nonce);
600 if (!*attrs)
601 return -1;
602
603 return 0;
604 }
605
/*
 * One node per distinct (name, domain) pair seen in the catalog, kept in an
 * rbtree so duplicate event names can be detected and numbered.
 */
struct event_uniq {
	struct rb_node node;	/* linkage into the rbtree */
	const char *name;	/* event name, points into the catalog data; not owned */
	int nl;			/* length of name in bytes */
	unsigned ct;		/* number of duplicates found so far */
	unsigned domain;	/* domain the event belongs to */
};
614
/*
 * Order two byte buffers, by length first and contents second.
 *
 * Returns 1 when the first buffer is shorter, -1 when it is longer, and
 * the memcmp() result over the common length when both sizes match.
 */
static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
{
	if (s1 != s2)
		return s1 < s2 ? 1 : -1;

	return memcmp(d1, d2, s1);
}
624
/*
 * Ordering used by the event uniqueness tree: compare the names with
 * memord() first and break ties with the domain numbers.
 * Returns a negative, zero or positive value.
 */
static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
		       size_t s2, unsigned d2)
{
	int order = memord(v1, s1, v2, s2);

	if (order != 0)
		return order;
	if (d1 == d2)
		return 0;

	return d1 > d2 ? 1 : -1;
}
638
event_uniq_add(struct rb_root *root, const char *name, int nl, unsigned domain)639 static int event_uniq_add(struct rb_root *root, const char *name, int nl,
640 unsigned domain)
641 {
642 struct rb_node **new = &(root->rb_node), *parent = NULL;
643 struct event_uniq *data;
644
645 /* Figure out where to put new node */
646 while (*new) {
647 struct event_uniq *it;
648 int result;
649
650 it = rb_entry(*new, struct event_uniq, node);
651 result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
652 it->domain);
653
654 parent = *new;
655 if (result < 0)
656 new = &((*new)->rb_left);
657 else if (result > 0)
658 new = &((*new)->rb_right);
659 else {
660 it->ct++;
661 pr_info("found a duplicate event %.*s, ct=%u\n", nl,
662 name, it->ct);
663 return it->ct;
664 }
665 }
666
667 data = kmalloc(sizeof(*data), GFP_KERNEL);
668 if (!data)
669 return -ENOMEM;
670
671 *data = (struct event_uniq) {
672 .name = name,
673 .nl = nl,
674 .ct = 0,
675 .domain = domain,
676 };
677
678 /* Add new node and rebalance tree. */
679 rb_link_node(&data->node, parent, new);
680 rb_insert_color(&data->node, root);
681
682 /* data->ct */
683 return 0;
684 }
685
event_uniq_destroy(struct rb_root *root)686 static void event_uniq_destroy(struct rb_root *root)
687 {
688 /*
689 * the strings we point to are in the giant block of memory filled by
690 * the catalog, and are freed separately.
691 */
692 struct event_uniq *pos, *n;
693
694 rbtree_postorder_for_each_entry_safe(pos, n, root, node)
695 kfree(pos);
696 }
697
698
699 /*
700 * ensure the event structure's sizes are self consistent and don't cause us to
701 * read outside of the event
702 *
703 * On success, return the event length in bytes.
704 * Otherwise, return -1 (and print as appropriate).
705 */
catalog_event_len_validate(struct hv_24x7_event_data *event, size_t event_idx, size_t event_data_bytes, size_t event_entry_count, size_t offset, void *end)706 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
707 size_t event_idx,
708 size_t event_data_bytes,
709 size_t event_entry_count,
710 size_t offset, void *end)
711 {
712 ssize_t ev_len;
713 void *ev_end, *calc_ev_end;
714
715 if (offset >= event_data_bytes)
716 return -1;
717
718 if (event_idx >= event_entry_count) {
719 pr_devel("catalog event data has %zu bytes of padding after last event\n",
720 event_data_bytes - offset);
721 return -1;
722 }
723
724 if (!event_fixed_portion_is_within(event, end)) {
725 pr_warn("event %zu fixed portion is not within range\n",
726 event_idx);
727 return -1;
728 }
729
730 ev_len = be16_to_cpu(event->length);
731
732 if (ev_len % 16)
733 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
734 event_idx, ev_len, event);
735
736 ev_end = (__u8 *)event + ev_len;
737 if (ev_end > end) {
738 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
739 event_idx, ev_len, ev_end, end,
740 offset);
741 return -1;
742 }
743
744 calc_ev_end = event_end(event, end);
745 if (!calc_ev_end) {
746 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
747 event_idx, event_data_bytes, event, end,
748 offset);
749 return -1;
750 }
751
752 if (calc_ev_end > ev_end) {
753 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
754 event_idx, event, ev_end, offset, calc_ev_end);
755 return -1;
756 }
757
758 return ev_len;
759 }
760
/* Largest number of 4096-byte pages whose size in bytes still fits in a size_t. */
#define MAX_4K (SIZE_MAX / 4096)
762
create_events_from_catalog(struct attribute ***events_, struct attribute ***event_descs_, struct attribute ***event_long_descs_)763 static int create_events_from_catalog(struct attribute ***events_,
764 struct attribute ***event_descs_,
765 struct attribute ***event_long_descs_)
766 {
767 long hret;
768 size_t catalog_len, catalog_page_len, event_entry_count,
769 event_data_len, event_data_offs,
770 event_data_bytes, junk_events, event_idx, event_attr_ct, i,
771 attr_max, event_idx_last, desc_ct, long_desc_ct;
772 ssize_t ct, ev_len;
773 uint64_t catalog_version_num;
774 struct attribute **events, **event_descs, **event_long_descs;
775 struct hv_24x7_catalog_page_0 *page_0 =
776 kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
777 void *page = page_0;
778 void *event_data, *end;
779 struct hv_24x7_event_data *event;
780 struct rb_root ev_uniq = RB_ROOT;
781 int ret = 0;
782
783 if (!page) {
784 ret = -ENOMEM;
785 goto e_out;
786 }
787
788 hret = h_get_24x7_catalog_page(page, 0, 0);
789 if (hret) {
790 ret = -EIO;
791 goto e_free;
792 }
793
794 catalog_version_num = be64_to_cpu(page_0->version);
795 catalog_page_len = be32_to_cpu(page_0->length);
796
797 if (MAX_4K < catalog_page_len) {
798 pr_err("invalid page count: %zu\n", catalog_page_len);
799 ret = -EIO;
800 goto e_free;
801 }
802
803 catalog_len = catalog_page_len * 4096;
804
805 event_entry_count = be16_to_cpu(page_0->event_entry_count);
806 event_data_offs = be16_to_cpu(page_0->event_data_offs);
807 event_data_len = be16_to_cpu(page_0->event_data_len);
808
809 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
810 catalog_version_num, catalog_len,
811 event_entry_count, event_data_offs, event_data_len);
812
813 if ((MAX_4K < event_data_len)
814 || (MAX_4K < event_data_offs)
815 || (MAX_4K - event_data_offs < event_data_len)) {
816 pr_err("invalid event data offs %zu and/or len %zu\n",
817 event_data_offs, event_data_len);
818 ret = -EIO;
819 goto e_free;
820 }
821
822 if ((event_data_offs + event_data_len) > catalog_page_len) {
823 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
824 event_data_offs,
825 event_data_offs + event_data_len,
826 catalog_page_len);
827 ret = -EIO;
828 goto e_free;
829 }
830
831 if (SIZE_MAX - 1 < event_entry_count) {
832 pr_err("event_entry_count %zu is invalid\n", event_entry_count);
833 ret = -EIO;
834 goto e_free;
835 }
836
837 event_data_bytes = event_data_len * 4096;
838
839 /*
840 * event data can span several pages, events can cross between these
841 * pages. Use vmalloc to make this easier.
842 */
843 event_data = vmalloc(event_data_bytes);
844 if (!event_data) {
845 pr_err("could not allocate event data\n");
846 ret = -ENOMEM;
847 goto e_free;
848 }
849
850 end = event_data + event_data_bytes;
851
852 /*
853 * using vmalloc_to_phys() like this only works if PAGE_SIZE is
854 * divisible by 4096
855 */
856 BUILD_BUG_ON(PAGE_SIZE % 4096);
857
858 for (i = 0; i < event_data_len; i++) {
859 hret = h_get_24x7_catalog_page_(
860 vmalloc_to_phys(event_data + i * 4096),
861 catalog_version_num,
862 i + event_data_offs);
863 if (hret) {
864 pr_err("Failed to get event data in page %zu: rc=%ld\n",
865 i + event_data_offs, hret);
866 ret = -EIO;
867 goto e_event_data;
868 }
869 }
870
871 /*
872 * scan the catalog to determine the number of attributes we need, and
873 * verify it at the same time.
874 */
875 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
876 ;
877 event_idx++, event = (void *)event + ev_len) {
878 size_t offset = (void *)event - (void *)event_data;
879 char *name;
880 int nl;
881
882 ev_len = catalog_event_len_validate(event, event_idx,
883 event_data_bytes,
884 event_entry_count,
885 offset, end);
886 if (ev_len < 0)
887 break;
888
889 name = event_name(event, &nl);
890
891 if (event->event_group_record_len == 0) {
892 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
893 event_idx, nl, name);
894 junk_events++;
895 continue;
896 }
897
898 if (!catalog_entry_domain_is_valid(event->domain)) {
899 pr_info("event %zu (%.*s) has invalid domain %d\n",
900 event_idx, nl, name, event->domain);
901 junk_events++;
902 continue;
903 }
904
905 attr_max++;
906 }
907
908 event_idx_last = event_idx;
909 if (event_idx_last != event_entry_count)
910 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
911 event_idx_last, event_entry_count, junk_events);
912
913 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
914 if (!events) {
915 ret = -ENOMEM;
916 goto e_event_data;
917 }
918
919 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
920 GFP_KERNEL);
921 if (!event_descs) {
922 ret = -ENOMEM;
923 goto e_event_attrs;
924 }
925
926 event_long_descs = kmalloc_array(event_idx + 1,
927 sizeof(*event_long_descs), GFP_KERNEL);
928 if (!event_long_descs) {
929 ret = -ENOMEM;
930 goto e_event_descs;
931 }
932
933 /* Iterate over the catalog filling in the attribute vector */
934 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
935 event = event_data, event_idx = 0;
936 event_idx < event_idx_last;
937 event_idx++, ev_len = be16_to_cpu(event->length),
938 event = (void *)event + ev_len) {
939 char *name;
940 int nl;
941 int nonce;
942 /*
943 * these are the only "bad" events that are intermixed and that
944 * we can ignore without issue. make sure to skip them here
945 */
946 if (event->event_group_record_len == 0)
947 continue;
948 if (!catalog_entry_domain_is_valid(event->domain))
949 continue;
950
951 name = event_name(event, &nl);
952 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
953 ct = event_data_to_attrs(event_idx, events + event_attr_ct,
954 event, nonce);
955 if (ct < 0) {
956 pr_warn("event %zu (%.*s) creation failure, skipping\n",
957 event_idx, nl, name);
958 junk_events++;
959 } else {
960 event_attr_ct++;
961 event_descs[desc_ct] = event_to_desc_attr(event, nonce);
962 if (event_descs[desc_ct])
963 desc_ct++;
964 event_long_descs[long_desc_ct] =
965 event_to_long_desc_attr(event, nonce);
966 if (event_long_descs[long_desc_ct])
967 long_desc_ct++;
968 }
969 }
970
971 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
972 event_idx, event_attr_ct, junk_events, desc_ct);
973
974 events[event_attr_ct] = NULL;
975 event_descs[desc_ct] = NULL;
976 event_long_descs[long_desc_ct] = NULL;
977
978 event_uniq_destroy(&ev_uniq);
979 vfree(event_data);
980 kmem_cache_free(hv_page_cache, page);
981
982 *events_ = events;
983 *event_descs_ = event_descs;
984 *event_long_descs_ = event_long_descs;
985 return 0;
986
987 e_event_descs:
988 kfree(event_descs);
989 e_event_attrs:
990 kfree(events);
991 e_event_data:
992 vfree(event_data);
993 e_free:
994 kmem_cache_free(hv_page_cache, page);
995 e_out:
996 *events_ = NULL;
997 *event_descs_ = NULL;
998 *event_long_descs_ = NULL;
999 return ret;
1000 }
1001
catalog_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count)1002 static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
1003 struct bin_attribute *bin_attr, char *buf,
1004 loff_t offset, size_t count)
1005 {
1006 long hret;
1007 ssize_t ret = 0;
1008 size_t catalog_len = 0, catalog_page_len = 0;
1009 loff_t page_offset = 0;
1010 loff_t offset_in_page;
1011 size_t copy_len;
1012 uint64_t catalog_version_num = 0;
1013 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
1014 struct hv_24x7_catalog_page_0 *page_0 = page;
1015
1016 if (!page)
1017 return -ENOMEM;
1018
1019 hret = h_get_24x7_catalog_page(page, 0, 0);
1020 if (hret) {
1021 ret = -EIO;
1022 goto e_free;
1023 }
1024
1025 catalog_version_num = be64_to_cpu(page_0->version);
1026 catalog_page_len = be32_to_cpu(page_0->length);
1027 catalog_len = catalog_page_len * 4096;
1028
1029 page_offset = offset / 4096;
1030 offset_in_page = offset % 4096;
1031
1032 if (page_offset >= catalog_page_len)
1033 goto e_free;
1034
1035 if (page_offset != 0) {
1036 hret = h_get_24x7_catalog_page(page, catalog_version_num,
1037 page_offset);
1038 if (hret) {
1039 ret = -EIO;
1040 goto e_free;
1041 }
1042 }
1043
1044 copy_len = 4096 - offset_in_page;
1045 if (copy_len > count)
1046 copy_len = count;
1047
1048 memcpy(buf, page+offset_in_page, copy_len);
1049 ret = copy_len;
1050
1051 e_free:
1052 if (hret)
1053 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
1054 " rc=%ld\n",
1055 catalog_version_num, page_offset, hret);
1056 kmem_cache_free(hv_page_cache, page);
1057
1058 pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
1059 "catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
1060 count, catalog_len, catalog_page_len, ret);
1061
1062 return ret;
1063 }
1064
domains_show(struct device *dev, struct device_attribute *attr, char *page)1065 static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
1066 char *page)
1067 {
1068 int d, n, count = 0;
1069 const char *str;
1070
1071 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
1072 str = domain_name(d);
1073 if (!str)
1074 continue;
1075
1076 n = sprintf(page, "%d: %s\n", d, str);
1077 if (n < 0)
1078 break;
1079
1080 count += n;
1081 page += n;
1082 }
1083 return count;
1084 }
1085
/*
 * PAGE_0_ATTR(_name, _fmt, _expr) - define a read-only device attribute
 * whose value is derived from catalog page 0.
 *
 * The generated _name##_show() allocates a page buffer, fetches catalog
 * page 0 from the hypervisor and formats @_expr into the output buffer
 * using @_fmt.  @_expr may refer to the local 'page_0', which points at the
 * fetched page viewed as a struct hv_24x7_catalog_page_0.
 *
 * The show function returns the sprintf() result on success, -ENOMEM if the
 * buffer could not be allocated, or -EIO if the hypervisor call failed.
 */
#define PAGE_0_ATTR(_name, _fmt, _expr)				\
static ssize_t _name##_show(struct device *dev,			\
			    struct device_attribute *dev_attr,	\
			    char *buf)				\
{								\
	struct hv_24x7_catalog_page_0 *page_0;			\
	ssize_t ret;						\
	void *page;						\
								\
	page = kmem_cache_alloc(hv_page_cache, GFP_USER);	\
	if (!page)						\
		return -ENOMEM;					\
	page_0 = page;						\
	if (h_get_24x7_catalog_page(page, 0, 0))		\
		ret = -EIO;					\
	else							\
		ret = sprintf(buf, _fmt, _expr);		\
	kmem_cache_free(hv_page_cache, page);			\
	return ret;						\
}								\
static DEVICE_ATTR_RO(_name)
1108
/* catalog_version: the 64-bit big-endian 'version' field of catalog page 0. */
PAGE_0_ATTR(catalog_version, "%lld\n",
		(unsigned long long)be64_to_cpu(page_0->version));
/* catalog_len: page 0's big-endian 'length' (a page count) times 4096. */
PAGE_0_ATTR(catalog_len, "%lld\n",
		(unsigned long long)be32_to_cpu(page_0->length) * 4096);
/* Binary attribute "catalog"; declared with size 0, see the inline note. */
static BIN_ATTR_RO(catalog, 0/* real length varies */);
/*
 * Remaining read-only attributes.  domains_show() is defined in this file;
 * the show handlers for the others are not visible in this part of the file.
 */
static DEVICE_ATTR_RO(domains);
static DEVICE_ATTR_RO(sockets);
static DEVICE_ATTR_RO(chipspersocket);
static DEVICE_ATTR_RO(coresperchip);
static DEVICE_ATTR_RO(cpumask);
1119
/* Binary attributes of the "interface" group: only the catalog itself. */
static struct bin_attribute *if_bin_attrs[] = {
	&bin_attr_catalog,
	NULL,
};

/* The cpumask attribute lives in its own, unnamed group. */
static struct attribute *cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group cpumask_attr_group = {
	.attrs = cpumask_attrs,
};

/* Text attributes of the "interface" group. */
static struct attribute *if_attrs[] = {
	&dev_attr_catalog_len.attr,
	&dev_attr_catalog_version.attr,
	&dev_attr_domains.attr,
	&dev_attr_sockets.attr,
	&dev_attr_chipspersocket.attr,
	&dev_attr_coresperchip.attr,
	NULL,
};

/* Group named "interface", combining the text and binary attributes above. */
static struct attribute_group if_group = {
	.name = "interface",
	.bin_attrs = if_bin_attrs,
	.attrs = if_attrs,
};

/*
 * NULL-terminated list of every attribute group, handed to the PMU through
 * h_24x7_pmu.attr_groups.
 */
static const struct attribute_group *attr_groups[] = {
	&format_group,
	&event_group,
	&event_desc_group,
	&event_long_desc_group,
	&if_group,
	&cpumask_attr_group,
	NULL,
};
1159
1160 /*
1161 * Start the process for a new H_GET_24x7_DATA hcall.
1162 */
init_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer)1163 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1164 struct hv_24x7_data_result_buffer *result_buffer)
1165 {
1166
1167 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1168 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1169
1170 request_buffer->interface_version = interface_version;
1171 /* memset above set request_buffer->num_requests to 0 */
1172 }
1173
1174 /*
1175 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected
1176 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
1177 */
make_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer)1178 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1179 struct hv_24x7_data_result_buffer *result_buffer)
1180 {
1181 long ret;
1182
1183 /*
1184 * NOTE: Due to variable number of array elements in request and
1185 * result buffer(s), sizeof() is not reliable. Use the actual
1186 * allocated buffer size, H24x7_DATA_BUFFER_SIZE.
1187 */
1188 ret = plpar_hcall_norets(H_GET_24X7_DATA,
1189 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
1190 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE);
1191
1192 if (ret) {
1193 struct hv_24x7_request *req;
1194
1195 req = request_buffer->requests;
1196 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
1197 req->performance_domain, req->data_offset,
1198 req->starting_ix, req->starting_lpar_ix,
1199 ret, ret, result_buffer->detailed_rc,
1200 result_buffer->failing_request_ix);
1201 return -EIO;
1202 }
1203
1204 return 0;
1205 }
1206
1207 /*
1208 * Add the given @event to the next slot in the 24x7 request_buffer.
1209 *
1210 * Note that H_GET_24X7_DATA hcall allows reading several counters'
1211 * values in a single HCALL. We expect the caller to add events to the
1212 * request buffer one by one, make the HCALL and process the results.
1213 */
add_event_to_24x7_request(struct perf_event *event, struct hv_24x7_request_buffer *request_buffer)1214 static int add_event_to_24x7_request(struct perf_event *event,
1215 struct hv_24x7_request_buffer *request_buffer)
1216 {
1217 u16 idx;
1218 int i;
1219 size_t req_size;
1220 struct hv_24x7_request *req;
1221
1222 if (request_buffer->num_requests >=
1223 max_num_requests(request_buffer->interface_version)) {
1224 pr_devel("Too many requests for 24x7 HCALL %d\n",
1225 request_buffer->num_requests);
1226 return -EINVAL;
1227 }
1228
1229 switch (event_get_domain(event)) {
1230 case HV_PERF_DOMAIN_PHYS_CHIP:
1231 idx = event_get_chip(event);
1232 break;
1233 case HV_PERF_DOMAIN_PHYS_CORE:
1234 idx = event_get_core(event);
1235 break;
1236 default:
1237 idx = event_get_vcpu(event);
1238 }
1239
1240 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
1241
1242 i = request_buffer->num_requests++;
1243 req = (void *) request_buffer->requests + i * req_size;
1244
1245 req->performance_domain = event_get_domain(event);
1246 req->data_size = cpu_to_be16(8);
1247 req->data_offset = cpu_to_be32(event_get_offset(event));
1248 req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
1249 req->max_num_lpars = cpu_to_be16(1);
1250 req->starting_ix = cpu_to_be16(idx);
1251 req->max_ix = cpu_to_be16(1);
1252
1253 if (request_buffer->interface_version > 1) {
1254 if (domain_needs_aggregation(req->performance_domain))
1255 req->max_num_thread_groups = -1;
1256 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
1257 req->starting_thread_group_ix = idx % 2;
1258 req->max_num_thread_groups = 1;
1259 }
1260 }
1261
1262 return 0;
1263 }
1264
1265 /**
1266 * get_count_from_result - get event count from all result elements in result
1267 *
1268 * If the event corresponding to this result needs aggregation of the result
1269 * element values, then this function does that.
1270 *
1271 * @event: Event associated with @res.
1272 * @resb: Result buffer containing @res.
1273 * @res: Result to work on.
1274 * @countp: Output variable containing the event count.
1275 * @next: Optional output variable pointing to the next result in @resb.
1276 */
get_count_from_result(struct perf_event *event, struct hv_24x7_data_result_buffer *resb, struct hv_24x7_result *res, u64 *countp, struct hv_24x7_result **next)1277 static int get_count_from_result(struct perf_event *event,
1278 struct hv_24x7_data_result_buffer *resb,
1279 struct hv_24x7_result *res, u64 *countp,
1280 struct hv_24x7_result **next)
1281 {
1282 u16 num_elements = be16_to_cpu(res->num_elements_returned);
1283 u16 data_size = be16_to_cpu(res->result_element_data_size);
1284 unsigned int data_offset;
1285 void *element_data;
1286 int i;
1287 u64 count;
1288
1289 /*
1290 * We can bail out early if the result is empty.
1291 */
1292 if (!num_elements) {
1293 pr_debug("Result of request %hhu is empty, nothing to do\n",
1294 res->result_ix);
1295
1296 if (next)
1297 *next = (struct hv_24x7_result *) res->elements;
1298
1299 return -ENODATA;
1300 }
1301
1302 /*
1303 * Since we always specify 1 as the maximum for the smallest resource
1304 * we're requesting, there should to be only one element per result.
1305 * Except when an event needs aggregation, in which case there are more.
1306 */
1307 if (num_elements != 1 &&
1308 !domain_needs_aggregation(event_get_domain(event))) {
1309 pr_err("Error: result of request %hhu has %hu elements\n",
1310 res->result_ix, num_elements);
1311
1312 return -EIO;
1313 }
1314
1315 if (data_size != sizeof(u64)) {
1316 pr_debug("Error: result of request %hhu has data of %hu bytes\n",
1317 res->result_ix, data_size);
1318
1319 return -ENOTSUPP;
1320 }
1321
1322 if (resb->interface_version == 1)
1323 data_offset = offsetof(struct hv_24x7_result_element_v1,
1324 element_data);
1325 else
1326 data_offset = offsetof(struct hv_24x7_result_element_v2,
1327 element_data);
1328
1329 /* Go through the result elements in the result. */
1330 for (i = count = 0, element_data = res->elements + data_offset;
1331 i < num_elements;
1332 i++, element_data += data_size + data_offset)
1333 count += be64_to_cpu(*((u64 *) element_data));
1334
1335 *countp = count;
1336
1337 /* The next result is after the last result element. */
1338 if (next)
1339 *next = element_data - data_offset;
1340
1341 return 0;
1342 }
1343
/*
 * Read the counter for a single @event with its own hcall.
 *
 * Uses this CPU's request and result buffers: initialises them, queues the
 * one event, performs the hcall and extracts the count into @count.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int single_24x7_request(struct perf_event *event, u64 *count)
{
	struct hv_24x7_data_result_buffer *resb;
	struct hv_24x7_request_buffer *reqb;
	int ret;

	BUILD_BUG_ON(sizeof(*reqb) > 4096);
	BUILD_BUG_ON(sizeof(*resb) > 4096);

	reqb = (void *)get_cpu_var(hv_24x7_reqb);
	resb = (void *)get_cpu_var(hv_24x7_resb);

	init_24x7_request(reqb, resb);

	/* Each step runs only if everything before it succeeded. */
	ret = add_event_to_24x7_request(event, reqb);
	if (!ret)
		ret = make_24x7_request(reqb, resb);
	if (!ret)
		ret = get_count_from_result(event, resb, resb->results,
					    count, NULL);

	put_cpu_var(hv_24x7_reqb);
	put_cpu_var(hv_24x7_resb);
	return ret;
}
1375
1376
h_24x7_event_init(struct perf_event *event)1377 static int h_24x7_event_init(struct perf_event *event)
1378 {
1379 struct hv_perf_caps caps;
1380 unsigned domain;
1381 unsigned long hret;
1382 u64 ct;
1383
1384 /* Not our event */
1385 if (event->attr.type != event->pmu->type)
1386 return -ENOENT;
1387
1388 /* Unused areas must be 0 */
1389 if (event_get_reserved1(event) ||
1390 event_get_reserved2(event) ||
1391 event_get_reserved3(event)) {
1392 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
1393 event->attr.config,
1394 event_get_reserved1(event),
1395 event->attr.config1,
1396 event_get_reserved2(event),
1397 event->attr.config2,
1398 event_get_reserved3(event));
1399 return -EINVAL;
1400 }
1401
1402 /* no branch sampling */
1403 if (has_branch_stack(event))
1404 return -EOPNOTSUPP;
1405
1406 /* offset must be 8 byte aligned */
1407 if (event_get_offset(event) % 8) {
1408 pr_devel("bad alignment\n");
1409 return -EINVAL;
1410 }
1411
1412 domain = event_get_domain(event);
1413 if (domain == 0 || domain >= HV_PERF_DOMAIN_MAX) {
1414 pr_devel("invalid domain %d\n", domain);
1415 return -EINVAL;
1416 }
1417
1418 hret = hv_perf_caps_get(&caps);
1419 if (hret) {
1420 pr_devel("could not get capabilities: rc=%ld\n", hret);
1421 return -EIO;
1422 }
1423
1424 /* Physical domains & other lpars require extra capabilities */
1425 if (!caps.collect_privileged && (is_physical_domain(domain) ||
1426 (event_get_lpar(event) != event_get_lpar_max()))) {
1427 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
1428 is_physical_domain(domain),
1429 event_get_lpar(event));
1430 return -EACCES;
1431 }
1432
1433 /* Get the initial value of the counter for this event */
1434 if (single_24x7_request(event, &ct)) {
1435 pr_devel("test hcall failed\n");
1436 return -EIO;
1437 }
1438 (void)local64_xchg(&event->hw.prev_count, ct);
1439
1440 return 0;
1441 }
1442
h_24x7_get_value(struct perf_event *event)1443 static u64 h_24x7_get_value(struct perf_event *event)
1444 {
1445 u64 ct;
1446
1447 if (single_24x7_request(event, &ct))
1448 /* We checked this in event init, shouldn't fail here... */
1449 return 0;
1450
1451 return ct;
1452 }
1453
/*
 * Fold a fresh counter reading into @event.
 *
 * @now replaces the stored previous reading, and the difference between
 * the two readings is added to the event's running count.
 */
static void update_event_count(struct perf_event *event, u64 now)
{
	s64 previous = local64_xchg(&event->hw.prev_count, now);

	local64_add(now - previous, &event->count);
}
1461
h_24x7_event_read(struct perf_event *event)1462 static void h_24x7_event_read(struct perf_event *event)
1463 {
1464 u64 now;
1465 struct hv_24x7_request_buffer *request_buffer;
1466 struct hv_24x7_hw *h24x7hw;
1467 int txn_flags;
1468
1469 txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1470
1471 /*
1472 * If in a READ transaction, add this counter to the list of
1473 * counters to read during the next HCALL (i.e commit_txn()).
1474 * If not in a READ transaction, go ahead and make the HCALL
1475 * to read this counter by itself.
1476 */
1477
1478 if (txn_flags & PERF_PMU_TXN_READ) {
1479 int i;
1480 int ret;
1481
1482 if (__this_cpu_read(hv_24x7_txn_err))
1483 return;
1484
1485 request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1486
1487 ret = add_event_to_24x7_request(event, request_buffer);
1488 if (ret) {
1489 __this_cpu_write(hv_24x7_txn_err, ret);
1490 } else {
1491 /*
1492 * Associate the event with the HCALL request index,
1493 * so ->commit_txn() can quickly find/update count.
1494 */
1495 i = request_buffer->num_requests - 1;
1496
1497 h24x7hw = &get_cpu_var(hv_24x7_hw);
1498 h24x7hw->events[i] = event;
1499 put_cpu_var(h24x7hw);
1500 }
1501
1502 put_cpu_var(hv_24x7_reqb);
1503 } else {
1504 now = h_24x7_get_value(event);
1505 update_event_count(event, now);
1506 }
1507 }
1508
h_24x7_event_start(struct perf_event *event, int flags)1509 static void h_24x7_event_start(struct perf_event *event, int flags)
1510 {
1511 if (flags & PERF_EF_RELOAD)
1512 local64_set(&event->hw.prev_count, h_24x7_get_value(event));
1513 }
1514
/*
 * PMU ->stop() callback, also installed as ->del() in h_24x7_pmu.
 *
 * Performs one more read so the event's count is brought up to date;
 * @flags is not used.
 */
static void h_24x7_event_stop(struct perf_event *event, int flags)
{
	h_24x7_event_read(event);
}
1519
/*
 * PMU ->add() callback.
 *
 * Starts the event when PERF_EF_START is requested, passing @flags through
 * to h_24x7_event_start().  Always returns 0.
 */
static int h_24x7_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		h_24x7_event_start(event, flags);

	return 0;
}
1527
1528 /*
1529 * 24x7 counters only support READ transactions. They are
1530 * always counting and dont need/support ADD transactions.
1531 * Cache the flags, but otherwise ignore transactions that
1532 * are not PERF_PMU_TXN_READ.
1533 */
h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)1534 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
1535 {
1536 struct hv_24x7_request_buffer *request_buffer;
1537 struct hv_24x7_data_result_buffer *result_buffer;
1538
1539 /* We should not be called if we are already in a txn */
1540 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
1541
1542 __this_cpu_write(hv_24x7_txn_flags, flags);
1543 if (flags & ~PERF_PMU_TXN_READ)
1544 return;
1545
1546 request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1547 result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1548
1549 init_24x7_request(request_buffer, result_buffer);
1550
1551 put_cpu_var(hv_24x7_resb);
1552 put_cpu_var(hv_24x7_reqb);
1553 }
1554
/*
 * Clean up transaction state by clearing this CPU's cached transaction
 * flags and transaction error.
 *
 * NOTE: Ignore state of request and result buffers for now.
 *	 We will initialize them during the next read/txn.
 */
static void reset_txn(void)
{
	__this_cpu_write(hv_24x7_txn_flags, 0);
	__this_cpu_write(hv_24x7_txn_err, 0);
}
1566
1567 /*
1568 * 24x7 counters only support READ transactions. They are always counting
1569 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise
1570 * ignore transactions that are not of type PERF_PMU_TXN_READ.
1571 *
1572 * For READ transactions, submit all pending 24x7 requests (i.e requests
1573 * that were queued by h_24x7_event_read()), to the hypervisor and update
1574 * the event counts.
1575 */
h_24x7_event_commit_txn(struct pmu *pmu)1576 static int h_24x7_event_commit_txn(struct pmu *pmu)
1577 {
1578 struct hv_24x7_request_buffer *request_buffer;
1579 struct hv_24x7_data_result_buffer *result_buffer;
1580 struct hv_24x7_result *res, *next_res;
1581 u64 count;
1582 int i, ret, txn_flags;
1583 struct hv_24x7_hw *h24x7hw;
1584
1585 txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1586 WARN_ON_ONCE(!txn_flags);
1587
1588 ret = 0;
1589 if (txn_flags & ~PERF_PMU_TXN_READ)
1590 goto out;
1591
1592 ret = __this_cpu_read(hv_24x7_txn_err);
1593 if (ret)
1594 goto out;
1595
1596 request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1597 result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1598
1599 ret = make_24x7_request(request_buffer, result_buffer);
1600 if (ret)
1601 goto put_reqb;
1602
1603 h24x7hw = &get_cpu_var(hv_24x7_hw);
1604
1605 /* Go through results in the result buffer to update event counts. */
1606 for (i = 0, res = result_buffer->results;
1607 i < result_buffer->num_results; i++, res = next_res) {
1608 struct perf_event *event = h24x7hw->events[res->result_ix];
1609
1610 ret = get_count_from_result(event, result_buffer, res, &count,
1611 &next_res);
1612 if (ret)
1613 break;
1614
1615 update_event_count(event, count);
1616 }
1617
1618 put_cpu_var(hv_24x7_hw);
1619
1620 put_reqb:
1621 put_cpu_var(hv_24x7_resb);
1622 put_cpu_var(hv_24x7_reqb);
1623 out:
1624 reset_txn();
1625 return ret;
1626 }
1627
/*
 * PMU ->cancel_txn() callback.
 *
 * 24x7 counters only support READ transactions. They are always counting
 * and don't need/support ADD transactions. However, regardless of type
 * of transaction, all we need to do is cleanup, so we don't have to check
 * the type of transaction.
 */
static void h_24x7_event_cancel_txn(struct pmu *pmu)
{
	/* Cancelling when no transaction was started is unexpected. */
	WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
	reset_txn();
}
1639
/*
 * The "hv_24x7" PMU description handed to perf_pmu_register().
 *
 * ->del shares h_24x7_event_stop() with ->stop.  hv_24x7_init() additionally
 * ORs PERF_PMU_CAP_NO_INTERRUPT into ->capabilities before registration.
 */
static struct pmu h_24x7_pmu = {
	.task_ctx_nr = perf_invalid_context,

	.name = "hv_24x7",
	.attr_groups = attr_groups,
	.event_init = h_24x7_event_init,
	.add = h_24x7_event_add,
	.del = h_24x7_event_stop,
	.start = h_24x7_event_start,
	.stop = h_24x7_event_stop,
	.read = h_24x7_event_read,
	.start_txn = h_24x7_event_start_txn,
	.commit_txn = h_24x7_event_commit_txn,
	.cancel_txn = h_24x7_event_cancel_txn,
	.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
1656
/*
 * CPU hotplug "online" callback.
 *
 * If hv_24x7_cpumask is empty, mark @cpu in it; otherwise leave the mask
 * untouched.  Always returns 0.
 */
static int ppc_hv_24x7_cpu_online(unsigned int cpu)
{
	if (cpumask_empty(&hv_24x7_cpumask))
		cpumask_set_cpu(cpu, &hv_24x7_cpumask);

	return 0;
}
1664
ppc_hv_24x7_cpu_offline(unsigned int cpu)1665 static int ppc_hv_24x7_cpu_offline(unsigned int cpu)
1666 {
1667 int target;
1668
1669 /* Check if exiting cpu is used for collecting 24x7 events */
1670 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask))
1671 return 0;
1672
1673 /* Find a new cpu to collect 24x7 events */
1674 target = cpumask_last(cpu_active_mask);
1675
1676 if (target < 0 || target >= nr_cpu_ids) {
1677 pr_err("hv_24x7: CPU hotplug init failed\n");
1678 return -1;
1679 }
1680
1681 /* Migrate 24x7 events to the new target */
1682 cpumask_set_cpu(target, &hv_24x7_cpumask);
1683 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target);
1684
1685 return 0;
1686 }
1687
/*
 * Register the online/offline callbacks above for the
 * CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE hotplug state.
 *
 * Returns whatever cpuhp_setup_state() returns.
 */
static int hv_24x7_cpu_hotplug_init(void)
{
	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
			  "perf/powerpc/hv_24x7:online",
			  ppc_hv_24x7_cpu_online,
			  ppc_hv_24x7_cpu_offline);
}
1695
hv_24x7_init(void)1696 static int hv_24x7_init(void)
1697 {
1698 int r;
1699 unsigned long hret;
1700 struct hv_perf_caps caps;
1701
1702 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
1703 pr_debug("not a virtualized system, not enabling\n");
1704 return -ENODEV;
1705 } else if (!cur_cpu_spec->oprofile_cpu_type)
1706 return -ENODEV;
1707
1708 /* POWER8 only supports v1, while POWER9 only supports v2. */
1709 if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
1710 interface_version = 1;
1711 else {
1712 interface_version = 2;
1713
1714 /* SMT8 in POWER9 needs to aggregate result elements. */
1715 if (threads_per_core == 8)
1716 aggregate_result_elements = true;
1717 }
1718
1719 hret = hv_perf_caps_get(&caps);
1720 if (hret) {
1721 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
1722 hret);
1723 return -ENODEV;
1724 }
1725
1726 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
1727 if (!hv_page_cache)
1728 return -ENOMEM;
1729
1730 /* sampling not supported */
1731 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1732
1733 r = create_events_from_catalog(&event_group.attrs,
1734 &event_desc_group.attrs,
1735 &event_long_desc_group.attrs);
1736
1737 if (r)
1738 return r;
1739
1740 /* init cpuhotplug */
1741 r = hv_24x7_cpu_hotplug_init();
1742 if (r)
1743 return r;
1744
1745 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
1746 if (r)
1747 return r;
1748
1749 read_24x7_sys_info();
1750
1751 return 0;
1752 }
1753
1754 device_initcall(hv_24x7_init);
1755