1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Hypervisor supplied "24x7" performance counter support
4 *
5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
6 * Copyright 2014 IBM Corporation.
7 */
8
9#define pr_fmt(fmt) "hv-24x7: " fmt
10
11#include <linux/perf_event.h>
12#include <linux/rbtree.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16
17#include <asm/cputhreads.h>
18#include <asm/firmware.h>
19#include <asm/hvcall.h>
20#include <asm/io.h>
21#include <linux/byteorder/generic.h>
22
23#include <asm/rtas.h>
24#include "hv-24x7.h"
25#include "hv-24x7-catalog.h"
26#include "hv-common.h"
27
/* Version of the 24x7 hypervisor API that we should use on this machine. */
29static int interface_version;
30
31/* Whether we have to aggregate result data for some domains. */
32static bool aggregate_result_elements;
33
34static cpumask_t hv_24x7_cpumask;
35
36static bool domain_is_valid(unsigned domain)
37{
38	switch (domain) {
39#define DOMAIN(n, v, x, c)		\
40	case HV_PERF_DOMAIN_##n:	\
41		/* fall through */
42#include "hv-24x7-domains.h"
43#undef DOMAIN
44		return true;
45	default:
46		return false;
47	}
48}
49
50static bool is_physical_domain(unsigned domain)
51{
52	switch (domain) {
53#define DOMAIN(n, v, x, c)		\
54	case HV_PERF_DOMAIN_##n:	\
55		return c;
56#include "hv-24x7-domains.h"
57#undef DOMAIN
58	default:
59		return false;
60	}
61}
62
63/*
 * The Processor Module Information system parameter allows certain
 * processor module information to be transferred from the platform to
 * the OS. Refer to the PAPR+ document; the parameter token value is 43.
67 */
68
69#define PROCESSOR_MODULE_INFO   43
70
71static u32 phys_sockets;	/* Physical sockets */
static u32 phys_chipspersocket;	/* Physical chips per socket */
73static u32 phys_coresperchip; /* Physical cores per chip */
74
75/*
76 * read_24x7_sys_info()
 * Retrieve the number of sockets, chips per socket and cores per chip
 * via the ibm,get-system-parameter RTAS call (see the buffer layout
 * sketch after this function).
79 */
80void read_24x7_sys_info(void)
81{
82	const s32 token = rtas_token("ibm,get-system-parameter");
83	int call_status;
84
85	/*
	 * Default the system parameters (sockets, chips per socket and
	 * cores per chip) to 1.
88	 */
89	phys_sockets = 1;
90	phys_chipspersocket = 1;
91	phys_coresperchip = 1;
92
93	do {
94		spin_lock(&rtas_data_buf_lock);
95		call_status = rtas_call(token, 3, 1, NULL, PROCESSOR_MODULE_INFO,
96					__pa(rtas_data_buf), RTAS_DATA_BUF_SIZE);
97		if (call_status == 0) {
98			int ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]);
99			int len = be16_to_cpup((__be16 *)&rtas_data_buf[0]);
100
101			if (len >= 8 && ntypes != 0) {
102				phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]);
103				phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]);
104				phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]);
105			}
106		}
107		spin_unlock(&rtas_data_buf_lock);
108	} while (rtas_busy_delay(call_status));
109
110	if (call_status != 0) {
111		pr_err("Error calling get-system-parameter %d\n",
112		       call_status);
113	}
114}
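
/*
 * Rough layout of rtas_data_buf as parsed above (a sketch based on the
 * reads in read_24x7_sys_info(); all fields are big-endian u16):
 *
 *   bytes 0-1: length of the returned parameter data
 *   bytes 2-3: number of module types (must be non-zero)
 *   bytes 4-5: number of sockets
 *   bytes 6-7: chips per socket
 *   bytes 8-9: cores per chip
 */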
115
/* Domains for which more than one result element is returned for each event. */
117static bool domain_needs_aggregation(unsigned int domain)
118{
119	return aggregate_result_elements &&
120			(domain == HV_PERF_DOMAIN_PHYS_CORE ||
121			 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
122			  domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
123}
124
125static const char *domain_name(unsigned domain)
126{
127	if (!domain_is_valid(domain))
128		return NULL;
129
130	switch (domain) {
131	case HV_PERF_DOMAIN_PHYS_CHIP:		return "Physical Chip";
132	case HV_PERF_DOMAIN_PHYS_CORE:		return "Physical Core";
133	case HV_PERF_DOMAIN_VCPU_HOME_CORE:	return "VCPU Home Core";
134	case HV_PERF_DOMAIN_VCPU_HOME_CHIP:	return "VCPU Home Chip";
135	case HV_PERF_DOMAIN_VCPU_HOME_NODE:	return "VCPU Home Node";
136	case HV_PERF_DOMAIN_VCPU_REMOTE_NODE:	return "VCPU Remote Node";
137	}
138
139	WARN_ON_ONCE(domain);
140	return NULL;
141}
142
143static bool catalog_entry_domain_is_valid(unsigned domain)
144{
145	/* POWER8 doesn't support virtual domains. */
146	if (interface_version == 1)
147		return is_physical_domain(domain);
148	else
149		return domain_is_valid(domain);
150}
151
152/*
153 * TODO: Merging events:
154 * - Think of the hcall as an interface to a 4d array of counters:
155 *   - x = domains
156 *   - y = indexes in the domain (core, chip, vcpu, node, etc)
157 *   - z = offset into the counter space
158 *   - w = lpars (guest vms, "logical partitions")
159 * - A single request is: x,y,y_last,z,z_last,w,w_last
160 *   - this means we can retrieve a rectangle of counters in y,z for a single x.
161 *
162 * - Things to consider (ignoring w):
163 *   - input  cost_per_request = 16
164 *   - output cost_per_result(ys,zs)  = 8 + 8 * ys + ys * zs
165 *   - limited number of requests per hcall (must fit into 4K bytes)
 *     - 4k - 16 [buffer header] = 16 [request size] * request_count
 *     - so at most 255 requests per hcall
168 *   - sometimes it will be more efficient to read extra data and discard
169 */
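
/*
 * Illustrative arithmetic only, using the costs above: a single request
 * covering ys = 8 indexes and zs = 16 bytes of counter data (two 8-byte
 * counters) costs 16 bytes of input and 8 + 8 * 8 + 8 * 16 = 200 bytes
 * of output.
 */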
170
171/*
172 * Example usage:
173 *  perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
174 */
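
/*
 * Catalog events are also exposed by name (availability depends on the
 * installed catalog), so something like the following should work too:
 *  perf stat -e 'hv_24x7/HPM_PCYC,domain=2,core=0/'
 */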
175
/* u4 (config bits 0-3), one of HV_24X7_PERF_DOMAIN */
177EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
178/* u16 */
179EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
180EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
181EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
182/* u32, see "data_offset" */
183EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
184/* u16 */
185EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);
186
187EVENT_DEFINE_RANGE(reserved1, config,   4, 15);
188EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
189EVENT_DEFINE_RANGE(reserved3, config2,  0, 63);
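
/*
 * Illustrative encoding, derived from the bit ranges above: the event
 * string "domain=2,offset=8,core=5,lpar=0" corresponds to
 *   attr.config  = 2 | (5ULL << 16) | (8ULL << 32)
 *   attr.config1 = 0
 * which event_get_domain()/_core()/_offset()/_lpar() then decode.
 */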
190
191static struct attribute *format_attrs[] = {
192	&format_attr_domain.attr,
193	&format_attr_offset.attr,
194	&format_attr_core.attr,
195	&format_attr_chip.attr,
196	&format_attr_vcpu.attr,
197	&format_attr_lpar.attr,
198	NULL,
199};
200
201static struct attribute_group format_group = {
202	.name = "format",
203	.attrs = format_attrs,
204};
205
206static struct attribute_group event_group = {
207	.name = "events",
208	/* .attrs is set in init */
209};
210
211static struct attribute_group event_desc_group = {
212	.name = "event_descs",
213	/* .attrs is set in init */
214};
215
216static struct attribute_group event_long_desc_group = {
217	.name = "event_long_descs",
218	/* .attrs is set in init */
219};
220
221static struct kmem_cache *hv_page_cache;
222
223DEFINE_PER_CPU(int, hv_24x7_txn_flags);
224DEFINE_PER_CPU(int, hv_24x7_txn_err);
225
226struct hv_24x7_hw {
227	struct perf_event *events[255];
228};
229
230DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);
231
232/*
233 * request_buffer and result_buffer are not required to be 4k aligned,
234 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
235 * the simplest way to ensure that.
236 */
237#define H24x7_DATA_BUFFER_SIZE	4096
238DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
239DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
240
241static unsigned int max_num_requests(int interface_version)
242{
243	return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
244		/ H24x7_REQUEST_SIZE(interface_version);
245}
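
/*
 * For example, with a 16-byte buffer header and 16-byte requests (the
 * sizes the event-merging TODO above assumes), this works out to
 * (4096 - 16) / 16 = 255 requests per hcall.
 */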
246
247static char *event_name(struct hv_24x7_event_data *ev, int *len)
248{
249	*len = be16_to_cpu(ev->event_name_len) - 2;
250	return (char *)ev->remainder;
251}
252
253static char *event_desc(struct hv_24x7_event_data *ev, int *len)
254{
255	unsigned nl = be16_to_cpu(ev->event_name_len);
256	__be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
257
258	*len = be16_to_cpu(*desc_len) - 2;
259	return (char *)ev->remainder + nl;
260}
261
262static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
263{
264	unsigned nl = be16_to_cpu(ev->event_name_len);
265	__be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
266	unsigned desc_len = be16_to_cpu(*desc_len_);
267	__be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
268
269	*len = be16_to_cpu(*long_desc_len) - 2;
270	return (char *)ev->remainder + nl + desc_len;
271}
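
/*
 * Sketch of the variable-length portion walked by the helpers above
 * (each length is a big-endian u16 that includes its own two bytes;
 * event_name_len lives in the fixed portion of the entry):
 *
 *   remainder: [name (nl - 2 bytes)]
 *              [desc_len (2 bytes)][desc (dl - 2 bytes)]
 *              [long_desc_len (2 bytes)][long desc (ldl - 2 bytes)]
 */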
272
273static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
274					  void *end)
275{
276	void *start = ev;
277
278	return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
279}
280
281/*
282 * Things we don't check:
283 *  - padding for desc, name, and long/detailed desc is required to be '\0'
284 *    bytes.
285 *
 * Return NULL if we pass end; otherwise return the address of the byte
 * just following the event.
288 */
289static void *event_end(struct hv_24x7_event_data *ev, void *end)
290{
291	void *start = ev;
292	__be16 *dl_, *ldl_;
293	unsigned dl, ldl;
294	unsigned nl = be16_to_cpu(ev->event_name_len);
295
296	if (nl < 2) {
297		pr_debug("%s: name length too short: %d", __func__, nl);
298		return NULL;
299	}
300
301	if (start + nl > end) {
302		pr_debug("%s: start=%p + nl=%u > end=%p",
303				__func__, start, nl, end);
304		return NULL;
305	}
306
307	dl_ = (__be16 *)(ev->remainder + nl - 2);
308	if (!IS_ALIGNED((uintptr_t)dl_, 2))
309		pr_warn("desc len not aligned %p", dl_);
310	dl = be16_to_cpu(*dl_);
311	if (dl < 2) {
312		pr_debug("%s: desc len too short: %d", __func__, dl);
313		return NULL;
314	}
315
316	if (start + nl + dl > end) {
317		pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
318				__func__, start, nl, dl, start + nl + dl, end);
319		return NULL;
320	}
321
322	ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
323	if (!IS_ALIGNED((uintptr_t)ldl_, 2))
324		pr_warn("long desc len not aligned %p", ldl_);
325	ldl = be16_to_cpu(*ldl_);
326	if (ldl < 2) {
327		pr_debug("%s: long desc len too short (ldl=%u)",
328				__func__, ldl);
329		return NULL;
330	}
331
332	if (start + nl + dl + ldl > end) {
333		pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
334				__func__, start, nl, dl, ldl, end);
335		return NULL;
336	}
337
338	return start + nl + dl + ldl;
339}
340
341static long h_get_24x7_catalog_page_(unsigned long phys_4096,
342				     unsigned long version, unsigned long index)
343{
344	pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
345			phys_4096, version, index);
346
347	WARN_ON(!IS_ALIGNED(phys_4096, 4096));
348
349	return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
350			phys_4096, version, index);
351}
352
353static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
354{
355	return h_get_24x7_catalog_page_(virt_to_phys(page),
356					version, index);
357}
358
359/*
 * Each event we find in the catalog will have a sysfs entry. Format the
 * data for this sysfs entry based on the event's domain.
 *
 * Events belonging to the Chip domain can only be monitored in that domain,
 * i.e. the domain for these events is a fixed/known value.
365 *
 * Events belonging to the Core domain can be monitored either in the physical
 * core or in one of the virtual CPU domains. So the domain value for these
 * events must be specified by the user (i.e. it is a required parameter).
 * Format the Core events with 'domain=?' so the perf tool can error-check
 * required parameters.
371 *
 * NOTE: For the Core domain events, rather than making domain a required
 *	 parameter we could default it to PHYS_CORE and allow users to
 *	 override the domain to one of the VCPU domains.
375 *
376 *	 However, this can make the interface a little inconsistent.
377 *
 *	 If we set domain=2 (PHYS_CORE) and allow the user to override this
 *	 field, the user may be tempted to also modify the "offset=x" field,
 *	 which can lead to confusing usage. Consider the HPM_PCYC (offset=0x18)
 *	 and HPM_INST (offset=0x20) events. With:
382 *
383 *		perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
384 *
385 *	we end up monitoring HPM_INST, while the command line has HPM_PCYC.
386 *
387 *	By not assigning a default value to the domain for the Core events,
388 *	we can have simple guidelines:
389 *
390 *		- Specifying values for parameters with "=?" is required.
391 *
392 *		- Specifying (i.e overriding) values for other parameters
393 *		  is undefined.
394 */
395static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
396{
397	const char *sindex;
398	const char *lpar;
399	const char *domain_str;
400	char buf[8];
401
402	switch (domain) {
403	case HV_PERF_DOMAIN_PHYS_CHIP:
404		snprintf(buf, sizeof(buf), "%d", domain);
405		domain_str = buf;
406		lpar = "0x0";
407		sindex = "chip";
408		break;
409	case HV_PERF_DOMAIN_PHYS_CORE:
410		domain_str = "?";
411		lpar = "0x0";
412		sindex = "core";
413		break;
414	default:
415		domain_str = "?";
416		lpar = "?";
417		sindex = "vcpu";
418	}
419
420	return kasprintf(GFP_KERNEL,
421			"domain=%s,offset=0x%x,%s=?,lpar=%s",
422			domain_str,
423			be16_to_cpu(event->event_counter_offs) +
424				be16_to_cpu(event->event_group_record_offs),
425			sindex,
426			lpar);
427}
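
/*
 * For illustration: a Core-domain catalog event whose counter offset and
 * group record offset sum to 0x18 is published to sysfs as
 *   "domain=?,offset=0x18,core=?,lpar=0x0"
 * so the user must supply the domain (and core index) explicitly.
 */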
428
429/* Avoid trusting fw to NUL terminate strings */
430static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
431{
432	return kasprintf(gfp, "%.*s", max_len, maybe_str);
433}
434
435static ssize_t device_show_string(struct device *dev,
436		struct device_attribute *attr, char *buf)
437{
438	struct dev_ext_attribute *d;
439
440	d = container_of(attr, struct dev_ext_attribute, attr);
441
442	return sprintf(buf, "%s\n", (char *)d->var);
443}
444
445static ssize_t cpumask_show(struct device *dev,
446			    struct device_attribute *attr, char *buf)
447{
448	return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
449}
450
451static ssize_t sockets_show(struct device *dev,
452			    struct device_attribute *attr, char *buf)
453{
454	return sprintf(buf, "%d\n", phys_sockets);
455}
456
457static ssize_t chipspersocket_show(struct device *dev,
458				   struct device_attribute *attr, char *buf)
459{
460	return sprintf(buf, "%d\n", phys_chipspersocket);
461}
462
463static ssize_t coresperchip_show(struct device *dev,
464				 struct device_attribute *attr, char *buf)
465{
466	return sprintf(buf, "%d\n", phys_coresperchip);
467}
468
469static struct attribute *device_str_attr_create_(char *name, char *str)
470{
471	struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
472
473	if (!attr)
474		return NULL;
475
476	sysfs_attr_init(&attr->attr.attr);
477
478	attr->var = str;
479	attr->attr.attr.name = name;
480	attr->attr.attr.mode = 0444;
481	attr->attr.show = device_show_string;
482
483	return &attr->attr.attr;
484}
485
486/*
487 * Allocate and initialize strings representing event attributes.
488 *
489 * NOTE: The strings allocated here are never destroyed and continue to
490 *	 exist till shutdown. This is to allow us to create as many events
491 *	 from the catalog as possible, even if we encounter errors with some.
492 *	 In case of changes to error paths in future, these may need to be
493 *	 freed by the caller.
494 */
495static struct attribute *device_str_attr_create(char *name, int name_max,
496						int name_nonce,
497						char *str, size_t str_max)
498{
499	char *n;
500	char *s = memdup_to_str(str, str_max, GFP_KERNEL);
501	struct attribute *a;
502
503	if (!s)
504		return NULL;
505
506	if (!name_nonce)
507		n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
508	else
509		n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
510					name_nonce);
511	if (!n)
512		goto out_s;
513
514	a = device_str_attr_create_(n, s);
515	if (!a)
516		goto out_n;
517
518	return a;
519out_n:
520	kfree(n);
521out_s:
522	kfree(s);
523	return NULL;
524}
525
526static struct attribute *event_to_attr(unsigned ix,
527				       struct hv_24x7_event_data *event,
528				       unsigned domain,
529				       int nonce)
530{
531	int event_name_len;
532	char *ev_name, *a_ev_name, *val;
533	struct attribute *attr;
534
535	if (!domain_is_valid(domain)) {
536		pr_warn("catalog event %u has invalid domain %u\n",
537				ix, domain);
538		return NULL;
539	}
540
541	val = event_fmt(event, domain);
542	if (!val)
543		return NULL;
544
545	ev_name = event_name(event, &event_name_len);
546	if (!nonce)
547		a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
548				(int)event_name_len, ev_name);
549	else
550		a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
551				(int)event_name_len, ev_name, nonce);
552
553	if (!a_ev_name)
554		goto out_val;
555
556	attr = device_str_attr_create_(a_ev_name, val);
557	if (!attr)
558		goto out_name;
559
560	return attr;
561out_name:
562	kfree(a_ev_name);
563out_val:
564	kfree(val);
565	return NULL;
566}
567
568static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
569					    int nonce)
570{
571	int nl, dl;
572	char *name = event_name(event, &nl);
573	char *desc = event_desc(event, &dl);
574
575	/* If there isn't a description, don't create the sysfs file */
576	if (!dl)
577		return NULL;
578
579	return device_str_attr_create(name, nl, nonce, desc, dl);
580}
581
582static struct attribute *
583event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
584{
585	int nl, dl;
586	char *name = event_name(event, &nl);
587	char *desc = event_long_desc(event, &dl);
588
589	/* If there isn't a description, don't create the sysfs file */
590	if (!dl)
591		return NULL;
592
593	return device_str_attr_create(name, nl, nonce, desc, dl);
594}
595
596static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
597				   struct hv_24x7_event_data *event, int nonce)
598{
599	*attrs = event_to_attr(ix, event, event->domain, nonce);
600	if (!*attrs)
601		return -1;
602
603	return 0;
604}
605
/*
 * Used to track event names (per domain) seen so far, so that duplicate
 * catalog entries can be disambiguated in sysfs with a "__<nonce>" suffix.
 */
607struct event_uniq {
608	struct rb_node node;
609	const char *name;
610	int nl;
611	unsigned ct;
612	unsigned domain;
613};
614
615static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
616{
617	if (s1 < s2)
618		return 1;
619	if (s1 > s2)
620		return -1;
621
622	return memcmp(d1, d2, s1);
623}
624
625static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
626		       size_t s2, unsigned d2)
627{
628	int r = memord(v1, s1, v2, s2);
629
630	if (r)
631		return r;
632	if (d1 > d2)
633		return 1;
634	if (d2 > d1)
635		return -1;
636	return 0;
637}
638
639static int event_uniq_add(struct rb_root *root, const char *name, int nl,
640			  unsigned domain)
641{
642	struct rb_node **new = &(root->rb_node), *parent = NULL;
643	struct event_uniq *data;
644
645	/* Figure out where to put new node */
646	while (*new) {
647		struct event_uniq *it;
648		int result;
649
650		it = rb_entry(*new, struct event_uniq, node);
651		result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
652					it->domain);
653
654		parent = *new;
655		if (result < 0)
656			new = &((*new)->rb_left);
657		else if (result > 0)
658			new = &((*new)->rb_right);
659		else {
660			it->ct++;
661			pr_info("found a duplicate event %.*s, ct=%u\n", nl,
662						name, it->ct);
663			return it->ct;
664		}
665	}
666
667	data = kmalloc(sizeof(*data), GFP_KERNEL);
668	if (!data)
669		return -ENOMEM;
670
671	*data = (struct event_uniq) {
672		.name = name,
673		.nl = nl,
674		.ct = 0,
675		.domain = domain,
676	};
677
678	/* Add new node and rebalance tree. */
679	rb_link_node(&data->node, parent, new);
680	rb_insert_color(&data->node, root);
681
682	/* data->ct */
683	return 0;
684}
685
686static void event_uniq_destroy(struct rb_root *root)
687{
688	/*
689	 * the strings we point to are in the giant block of memory filled by
690	 * the catalog, and are freed separately.
691	 */
692	struct event_uniq *pos, *n;
693
694	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
695		kfree(pos);
696}
697
698
699/*
 * Ensure the event structure's sizes are self-consistent and don't cause
 * us to read outside of the event.
702 *
703 * On success, return the event length in bytes.
704 * Otherwise, return -1 (and print as appropriate).
705 */
706static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
707					  size_t event_idx,
708					  size_t event_data_bytes,
709					  size_t event_entry_count,
710					  size_t offset, void *end)
711{
712	ssize_t ev_len;
713	void *ev_end, *calc_ev_end;
714
715	if (offset >= event_data_bytes)
716		return -1;
717
718	if (event_idx >= event_entry_count) {
719		pr_devel("catalog event data has %zu bytes of padding after last event\n",
720				event_data_bytes - offset);
721		return -1;
722	}
723
724	if (!event_fixed_portion_is_within(event, end)) {
725		pr_warn("event %zu fixed portion is not within range\n",
726				event_idx);
727		return -1;
728	}
729
730	ev_len = be16_to_cpu(event->length);
731
732	if (ev_len % 16)
733		pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
734				event_idx, ev_len, event);
735
736	ev_end = (__u8 *)event + ev_len;
737	if (ev_end > end) {
738		pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
739				event_idx, ev_len, ev_end, end,
740				offset);
741		return -1;
742	}
743
744	calc_ev_end = event_end(event, end);
745	if (!calc_ev_end) {
746		pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
747			event_idx, event_data_bytes, event, end,
748			offset);
749		return -1;
750	}
751
752	if (calc_ev_end > ev_end) {
		pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
754			event_idx, event, ev_end, offset, calc_ev_end);
755		return -1;
756	}
757
758	return ev_len;
759}
760
761#define MAX_4K (SIZE_MAX / 4096)
762
763static int create_events_from_catalog(struct attribute ***events_,
764				      struct attribute ***event_descs_,
765				      struct attribute ***event_long_descs_)
766{
767	long hret;
768	size_t catalog_len, catalog_page_len, event_entry_count,
769	       event_data_len, event_data_offs,
770	       event_data_bytes, junk_events, event_idx, event_attr_ct, i,
771	       attr_max, event_idx_last, desc_ct, long_desc_ct;
772	ssize_t ct, ev_len;
773	uint64_t catalog_version_num;
774	struct attribute **events, **event_descs, **event_long_descs;
775	struct hv_24x7_catalog_page_0 *page_0 =
776		kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
777	void *page = page_0;
778	void *event_data, *end;
779	struct hv_24x7_event_data *event;
780	struct rb_root ev_uniq = RB_ROOT;
781	int ret = 0;
782
783	if (!page) {
784		ret = -ENOMEM;
785		goto e_out;
786	}
787
788	hret = h_get_24x7_catalog_page(page, 0, 0);
789	if (hret) {
790		ret = -EIO;
791		goto e_free;
792	}
793
794	catalog_version_num = be64_to_cpu(page_0->version);
795	catalog_page_len = be32_to_cpu(page_0->length);
796
797	if (MAX_4K < catalog_page_len) {
798		pr_err("invalid page count: %zu\n", catalog_page_len);
799		ret = -EIO;
800		goto e_free;
801	}
802
803	catalog_len = catalog_page_len * 4096;
804
805	event_entry_count = be16_to_cpu(page_0->event_entry_count);
806	event_data_offs   = be16_to_cpu(page_0->event_data_offs);
807	event_data_len    = be16_to_cpu(page_0->event_data_len);
808
809	pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
810			catalog_version_num, catalog_len,
811			event_entry_count, event_data_offs, event_data_len);
812
813	if ((MAX_4K < event_data_len)
814			|| (MAX_4K < event_data_offs)
815			|| (MAX_4K - event_data_offs < event_data_len)) {
816		pr_err("invalid event data offs %zu and/or len %zu\n",
817				event_data_offs, event_data_len);
818		ret = -EIO;
819		goto e_free;
820	}
821
822	if ((event_data_offs + event_data_len) > catalog_page_len) {
823		pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
824				event_data_offs,
825				event_data_offs + event_data_len,
826				catalog_page_len);
827		ret = -EIO;
828		goto e_free;
829	}
830
831	if (SIZE_MAX - 1 < event_entry_count) {
832		pr_err("event_entry_count %zu is invalid\n", event_entry_count);
833		ret = -EIO;
834		goto e_free;
835	}
836
837	event_data_bytes = event_data_len * 4096;
838
839	/*
	 * Event data can span several pages, and events can cross page
	 * boundaries. Use vmalloc to make this easier.
842	 */
843	event_data = vmalloc(event_data_bytes);
844	if (!event_data) {
845		pr_err("could not allocate event data\n");
846		ret = -ENOMEM;
847		goto e_free;
848	}
849
850	end = event_data + event_data_bytes;
851
852	/*
853	 * using vmalloc_to_phys() like this only works if PAGE_SIZE is
854	 * divisible by 4096
855	 */
856	BUILD_BUG_ON(PAGE_SIZE % 4096);
857
858	for (i = 0; i < event_data_len; i++) {
859		hret = h_get_24x7_catalog_page_(
860				vmalloc_to_phys(event_data + i * 4096),
861				catalog_version_num,
862				i + event_data_offs);
863		if (hret) {
864			pr_err("Failed to get event data in page %zu: rc=%ld\n",
865			       i + event_data_offs, hret);
866			ret = -EIO;
867			goto e_event_data;
868		}
869	}
870
871	/*
872	 * scan the catalog to determine the number of attributes we need, and
873	 * verify it at the same time.
874	 */
875	for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
876	     ;
877	     event_idx++, event = (void *)event + ev_len) {
878		size_t offset = (void *)event - (void *)event_data;
879		char *name;
880		int nl;
881
882		ev_len = catalog_event_len_validate(event, event_idx,
883						    event_data_bytes,
884						    event_entry_count,
885						    offset, end);
886		if (ev_len < 0)
887			break;
888
889		name = event_name(event, &nl);
890
891		if (event->event_group_record_len == 0) {
892			pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
893					event_idx, nl, name);
894			junk_events++;
895			continue;
896		}
897
898		if (!catalog_entry_domain_is_valid(event->domain)) {
899			pr_info("event %zu (%.*s) has invalid domain %d\n",
900					event_idx, nl, name, event->domain);
901			junk_events++;
902			continue;
903		}
904
905		attr_max++;
906	}
907
908	event_idx_last = event_idx;
909	if (event_idx_last != event_entry_count)
910		pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
911				event_idx_last, event_entry_count, junk_events);
912
913	events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
914	if (!events) {
915		ret = -ENOMEM;
916		goto e_event_data;
917	}
918
919	event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
920				GFP_KERNEL);
921	if (!event_descs) {
922		ret = -ENOMEM;
923		goto e_event_attrs;
924	}
925
926	event_long_descs = kmalloc_array(event_idx + 1,
927			sizeof(*event_long_descs), GFP_KERNEL);
928	if (!event_long_descs) {
929		ret = -ENOMEM;
930		goto e_event_descs;
931	}
932
933	/* Iterate over the catalog filling in the attribute vector */
934	for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
935				event = event_data, event_idx = 0;
936			event_idx < event_idx_last;
937			event_idx++, ev_len = be16_to_cpu(event->length),
938				event = (void *)event + ev_len) {
939		char *name;
940		int nl;
941		int nonce;
942		/*
		 * These are the only "bad" events that are intermixed and that
		 * we can ignore without issue. Make sure to skip them here.
945		 */
946		if (event->event_group_record_len == 0)
947			continue;
948		if (!catalog_entry_domain_is_valid(event->domain))
949			continue;
950
951		name  = event_name(event, &nl);
952		nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
953		ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
954					    event, nonce);
955		if (ct < 0) {
956			pr_warn("event %zu (%.*s) creation failure, skipping\n",
957				event_idx, nl, name);
958			junk_events++;
959		} else {
960			event_attr_ct++;
961			event_descs[desc_ct] = event_to_desc_attr(event, nonce);
962			if (event_descs[desc_ct])
963				desc_ct++;
964			event_long_descs[long_desc_ct] =
965					event_to_long_desc_attr(event, nonce);
966			if (event_long_descs[long_desc_ct])
967				long_desc_ct++;
968		}
969	}
970
971	pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
972			event_idx, event_attr_ct, junk_events, desc_ct);
973
974	events[event_attr_ct] = NULL;
975	event_descs[desc_ct] = NULL;
976	event_long_descs[long_desc_ct] = NULL;
977
978	event_uniq_destroy(&ev_uniq);
979	vfree(event_data);
980	kmem_cache_free(hv_page_cache, page);
981
982	*events_ = events;
983	*event_descs_ = event_descs;
984	*event_long_descs_ = event_long_descs;
985	return 0;
986
987e_event_descs:
988	kfree(event_descs);
989e_event_attrs:
990	kfree(events);
991e_event_data:
992	vfree(event_data);
993e_free:
994	kmem_cache_free(hv_page_cache, page);
995e_out:
996	*events_ = NULL;
997	*event_descs_ = NULL;
998	*event_long_descs_ = NULL;
999	return ret;
1000}
1001
1002static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
1003			    struct bin_attribute *bin_attr, char *buf,
1004			    loff_t offset, size_t count)
1005{
1006	long hret;
1007	ssize_t ret = 0;
1008	size_t catalog_len = 0, catalog_page_len = 0;
1009	loff_t page_offset = 0;
1010	loff_t offset_in_page;
1011	size_t copy_len;
1012	uint64_t catalog_version_num = 0;
1013	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
1014	struct hv_24x7_catalog_page_0 *page_0 = page;
1015
1016	if (!page)
1017		return -ENOMEM;
1018
1019	hret = h_get_24x7_catalog_page(page, 0, 0);
1020	if (hret) {
1021		ret = -EIO;
1022		goto e_free;
1023	}
1024
1025	catalog_version_num = be64_to_cpu(page_0->version);
1026	catalog_page_len = be32_to_cpu(page_0->length);
1027	catalog_len = catalog_page_len * 4096;
1028
1029	page_offset = offset / 4096;
1030	offset_in_page = offset % 4096;
1031
1032	if (page_offset >= catalog_page_len)
1033		goto e_free;
1034
1035	if (page_offset != 0) {
1036		hret = h_get_24x7_catalog_page(page, catalog_version_num,
1037					       page_offset);
1038		if (hret) {
1039			ret = -EIO;
1040			goto e_free;
1041		}
1042	}
1043
1044	copy_len = 4096 - offset_in_page;
1045	if (copy_len > count)
1046		copy_len = count;
1047
1048	memcpy(buf, page+offset_in_page, copy_len);
1049	ret = copy_len;
1050
1051e_free:
1052	if (hret)
1053		pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
1054		       " rc=%ld\n",
1055		       catalog_version_num, page_offset, hret);
1056	kmem_cache_free(hv_page_cache, page);
1057
1058	pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
1059			"catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
1060			count, catalog_len, catalog_page_len, ret);
1061
1062	return ret;
1063}
1064
1065static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
1066			    char *page)
1067{
1068	int d, n, count = 0;
1069	const char *str;
1070
1071	for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
1072		str = domain_name(d);
1073		if (!str)
1074			continue;
1075
1076		n = sprintf(page, "%d: %s\n", d, str);
1077		if (n < 0)
1078			break;
1079
1080		count += n;
1081		page += n;
1082	}
1083	return count;
1084}
1085
1086#define PAGE_0_ATTR(_name, _fmt, _expr)				\
1087static ssize_t _name##_show(struct device *dev,			\
1088			    struct device_attribute *dev_attr,	\
1089			    char *buf)				\
1090{								\
1091	long hret;						\
1092	ssize_t ret = 0;					\
1093	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);	\
1094	struct hv_24x7_catalog_page_0 *page_0 = page;		\
1095	if (!page)						\
1096		return -ENOMEM;					\
1097	hret = h_get_24x7_catalog_page(page, 0, 0);		\
1098	if (hret) {						\
1099		ret = -EIO;					\
1100		goto e_free;					\
1101	}							\
1102	ret = sprintf(buf, _fmt, _expr);			\
1103e_free:								\
1104	kmem_cache_free(hv_page_cache, page);			\
1105	return ret;						\
1106}								\
1107static DEVICE_ATTR_RO(_name)
1108
1109PAGE_0_ATTR(catalog_version, "%lld\n",
1110		(unsigned long long)be64_to_cpu(page_0->version));
1111PAGE_0_ATTR(catalog_len, "%lld\n",
1112		(unsigned long long)be32_to_cpu(page_0->length) * 4096);
1113static BIN_ATTR_RO(catalog, 0/* real length varies */);
1114static DEVICE_ATTR_RO(domains);
1115static DEVICE_ATTR_RO(sockets);
1116static DEVICE_ATTR_RO(chipspersocket);
1117static DEVICE_ATTR_RO(coresperchip);
1118static DEVICE_ATTR_RO(cpumask);
1119
1120static struct bin_attribute *if_bin_attrs[] = {
1121	&bin_attr_catalog,
1122	NULL,
1123};
1124
1125static struct attribute *cpumask_attrs[] = {
1126	&dev_attr_cpumask.attr,
1127	NULL,
1128};
1129
1130static struct attribute_group cpumask_attr_group = {
1131	.attrs = cpumask_attrs,
1132};
1133
1134static struct attribute *if_attrs[] = {
1135	&dev_attr_catalog_len.attr,
1136	&dev_attr_catalog_version.attr,
1137	&dev_attr_domains.attr,
1138	&dev_attr_sockets.attr,
1139	&dev_attr_chipspersocket.attr,
1140	&dev_attr_coresperchip.attr,
1141	NULL,
1142};
1143
1144static struct attribute_group if_group = {
1145	.name = "interface",
1146	.bin_attrs = if_bin_attrs,
1147	.attrs = if_attrs,
1148};
1149
1150static const struct attribute_group *attr_groups[] = {
1151	&format_group,
1152	&event_group,
1153	&event_desc_group,
1154	&event_long_desc_group,
1155	&if_group,
1156	&cpumask_attr_group,
1157	NULL,
1158};
1159
1160/*
1161 * Start the process for a new H_GET_24x7_DATA hcall.
1162 */
1163static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1164			      struct hv_24x7_data_result_buffer *result_buffer)
1165{
1166
1167	memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1168	memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1169
1170	request_buffer->interface_version = interface_version;
1171	/* memset above set request_buffer->num_requests to 0 */
1172}
1173
1174/*
1175 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected
1176 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
1177 */
1178static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1179			     struct hv_24x7_data_result_buffer *result_buffer)
1180{
1181	long ret;
1182
1183	/*
1184	 * NOTE: Due to variable number of array elements in request and
1185	 *	 result buffer(s), sizeof() is not reliable. Use the actual
1186	 *	 allocated buffer size, H24x7_DATA_BUFFER_SIZE.
1187	 */
1188	ret = plpar_hcall_norets(H_GET_24X7_DATA,
1189			virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
1190			virt_to_phys(result_buffer),  H24x7_DATA_BUFFER_SIZE);
1191
1192	if (ret) {
1193		struct hv_24x7_request *req;
1194
1195		req = request_buffer->requests;
1196		pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
1197				      req->performance_domain, req->data_offset,
1198				      req->starting_ix, req->starting_lpar_ix,
1199				      ret, ret, result_buffer->detailed_rc,
1200				      result_buffer->failing_request_ix);
1201		return -EIO;
1202	}
1203
1204	return 0;
1205}
1206
1207/*
1208 * Add the given @event to the next slot in the 24x7 request_buffer.
1209 *
1210 * Note that H_GET_24X7_DATA hcall allows reading several counters'
1211 * values in a single HCALL. We expect the caller to add events to the
1212 * request buffer one by one, make the HCALL and process the results.
1213 */
1214static int add_event_to_24x7_request(struct perf_event *event,
1215				struct hv_24x7_request_buffer *request_buffer)
1216{
1217	u16 idx;
1218	int i;
1219	size_t req_size;
1220	struct hv_24x7_request *req;
1221
1222	if (request_buffer->num_requests >=
1223	    max_num_requests(request_buffer->interface_version)) {
1224		pr_devel("Too many requests for 24x7 HCALL %d\n",
1225				request_buffer->num_requests);
1226		return -EINVAL;
1227	}
1228
1229	switch (event_get_domain(event)) {
1230	case HV_PERF_DOMAIN_PHYS_CHIP:
1231		idx = event_get_chip(event);
1232		break;
1233	case HV_PERF_DOMAIN_PHYS_CORE:
1234		idx = event_get_core(event);
1235		break;
1236	default:
1237		idx = event_get_vcpu(event);
1238	}
1239
1240	req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
1241
1242	i = request_buffer->num_requests++;
1243	req = (void *) request_buffer->requests + i * req_size;
1244
1245	req->performance_domain = event_get_domain(event);
1246	req->data_size = cpu_to_be16(8);
1247	req->data_offset = cpu_to_be32(event_get_offset(event));
1248	req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
1249	req->max_num_lpars = cpu_to_be16(1);
1250	req->starting_ix = cpu_to_be16(idx);
1251	req->max_ix = cpu_to_be16(1);
1252
1253	if (request_buffer->interface_version > 1) {
1254		if (domain_needs_aggregation(req->performance_domain))
1255			req->max_num_thread_groups = -1;
1256		else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
1257			req->starting_thread_group_ix = idx % 2;
1258			req->max_num_thread_groups = 1;
1259		}
1260	}
1261
1262	return 0;
1263}
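
/*
 * Typical calling sequence for these helpers (see single_24x7_request()
 * and the READ-transaction paths below): init_24x7_request(), then one
 * add_event_to_24x7_request() per counter, then make_24x7_request(),
 * then get_count_from_result() for each returned result.
 */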
1264
1265/**
1266 * get_count_from_result - get event count from all result elements in result
1267 *
1268 * If the event corresponding to this result needs aggregation of the result
1269 * element values, then this function does that.
1270 *
1271 * @event:	Event associated with @res.
1272 * @resb:	Result buffer containing @res.
1273 * @res:	Result to work on.
1274 * @countp:	Output variable containing the event count.
1275 * @next:	Optional output variable pointing to the next result in @resb.
1276 */
1277static int get_count_from_result(struct perf_event *event,
1278				 struct hv_24x7_data_result_buffer *resb,
1279				 struct hv_24x7_result *res, u64 *countp,
1280				 struct hv_24x7_result **next)
1281{
1282	u16 num_elements = be16_to_cpu(res->num_elements_returned);
1283	u16 data_size = be16_to_cpu(res->result_element_data_size);
1284	unsigned int data_offset;
1285	void *element_data;
1286	int i;
1287	u64 count;
1288
1289	/*
1290	 * We can bail out early if the result is empty.
1291	 */
1292	if (!num_elements) {
1293		pr_debug("Result of request %hhu is empty, nothing to do\n",
1294			 res->result_ix);
1295
1296		if (next)
1297			*next = (struct hv_24x7_result *) res->elements;
1298
1299		return -ENODATA;
1300	}
1301
1302	/*
1303	 * Since we always specify 1 as the maximum for the smallest resource
	 * we're requesting, there should be only one element per result,
	 * except when an event needs aggregation, in which case there are more.
1306	 */
1307	if (num_elements != 1 &&
1308	    !domain_needs_aggregation(event_get_domain(event))) {
1309		pr_err("Error: result of request %hhu has %hu elements\n",
1310		       res->result_ix, num_elements);
1311
1312		return -EIO;
1313	}
1314
1315	if (data_size != sizeof(u64)) {
1316		pr_debug("Error: result of request %hhu has data of %hu bytes\n",
1317			 res->result_ix, data_size);
1318
1319		return -ENOTSUPP;
1320	}
1321
1322	if (resb->interface_version == 1)
1323		data_offset = offsetof(struct hv_24x7_result_element_v1,
1324				       element_data);
1325	else
1326		data_offset = offsetof(struct hv_24x7_result_element_v2,
1327				       element_data);
1328
1329	/* Go through the result elements in the result. */
1330	for (i = count = 0, element_data = res->elements + data_offset;
1331	     i < num_elements;
1332	     i++, element_data += data_size + data_offset)
1333		count += be64_to_cpu(*((u64 *) element_data));
1334
1335	*countp = count;
1336
1337	/* The next result is after the last result element. */
1338	if (next)
1339		*next = element_data - data_offset;
1340
1341	return 0;
1342}
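
/*
 * Sketch of a result as walked above: res->elements holds num_elements
 * entries back to back, each a version-dependent header of 'data_offset'
 * bytes (struct hv_24x7_result_element_v1/v2 up to element_data) followed
 * by 'data_size' bytes of big-endian counter data (8 bytes here). The
 * next struct hv_24x7_result starts right after the last element.
 */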
1343
1344static int single_24x7_request(struct perf_event *event, u64 *count)
1345{
1346	int ret;
1347	struct hv_24x7_request_buffer *request_buffer;
1348	struct hv_24x7_data_result_buffer *result_buffer;
1349
1350	BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
1351	BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
1352
1353	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1354	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1355
1356	init_24x7_request(request_buffer, result_buffer);
1357
1358	ret = add_event_to_24x7_request(event, request_buffer);
1359	if (ret)
1360		goto out;
1361
1362	ret = make_24x7_request(request_buffer, result_buffer);
1363	if (ret)
1364		goto out;
1365
1366	/* process result from hcall */
1367	ret = get_count_from_result(event, result_buffer,
1368				    result_buffer->results, count, NULL);
1369
1370out:
1371	put_cpu_var(hv_24x7_reqb);
1372	put_cpu_var(hv_24x7_resb);
1373	return ret;
1374}
1375
1376
1377static int h_24x7_event_init(struct perf_event *event)
1378{
1379	struct hv_perf_caps caps;
1380	unsigned domain;
1381	unsigned long hret;
1382	u64 ct;
1383
1384	/* Not our event */
1385	if (event->attr.type != event->pmu->type)
1386		return -ENOENT;
1387
1388	/* Unused areas must be 0 */
1389	if (event_get_reserved1(event) ||
1390	    event_get_reserved2(event) ||
1391	    event_get_reserved3(event)) {
1392		pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
1393				event->attr.config,
1394				event_get_reserved1(event),
1395				event->attr.config1,
1396				event_get_reserved2(event),
1397				event->attr.config2,
1398				event_get_reserved3(event));
1399		return -EINVAL;
1400	}
1401
1402	/* no branch sampling */
1403	if (has_branch_stack(event))
1404		return -EOPNOTSUPP;
1405
1406	/* offset must be 8 byte aligned */
1407	if (event_get_offset(event) % 8) {
1408		pr_devel("bad alignment\n");
1409		return -EINVAL;
1410	}
1411
1412	domain = event_get_domain(event);
1413	if (domain  == 0 || domain >= HV_PERF_DOMAIN_MAX) {
1414		pr_devel("invalid domain %d\n", domain);
1415		return -EINVAL;
1416	}
1417
1418	hret = hv_perf_caps_get(&caps);
1419	if (hret) {
1420		pr_devel("could not get capabilities: rc=%ld\n", hret);
1421		return -EIO;
1422	}
1423
1424	/* Physical domains & other lpars require extra capabilities */
1425	if (!caps.collect_privileged && (is_physical_domain(domain) ||
1426		(event_get_lpar(event) != event_get_lpar_max()))) {
1427		pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
1428				is_physical_domain(domain),
1429				event_get_lpar(event));
1430		return -EACCES;
1431	}
1432
1433	/* Get the initial value of the counter for this event */
1434	if (single_24x7_request(event, &ct)) {
1435		pr_devel("test hcall failed\n");
1436		return -EIO;
1437	}
1438	(void)local64_xchg(&event->hw.prev_count, ct);
1439
1440	return 0;
1441}
1442
1443static u64 h_24x7_get_value(struct perf_event *event)
1444{
1445	u64 ct;
1446
1447	if (single_24x7_request(event, &ct))
1448		/* We checked this in event init, shouldn't fail here... */
1449		return 0;
1450
1451	return ct;
1452}
1453
1454static void update_event_count(struct perf_event *event, u64 now)
1455{
1456	s64 prev;
1457
1458	prev = local64_xchg(&event->hw.prev_count, now);
1459	local64_add(now - prev, &event->count);
1460}
1461
1462static void h_24x7_event_read(struct perf_event *event)
1463{
1464	u64 now;
1465	struct hv_24x7_request_buffer *request_buffer;
1466	struct hv_24x7_hw *h24x7hw;
1467	int txn_flags;
1468
1469	txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1470
1471	/*
1472	 * If in a READ transaction, add this counter to the list of
1473	 * counters to read during the next HCALL (i.e commit_txn()).
1474	 * If not in a READ transaction, go ahead and make the HCALL
1475	 * to read this counter by itself.
1476	 */
1477
1478	if (txn_flags & PERF_PMU_TXN_READ) {
1479		int i;
1480		int ret;
1481
1482		if (__this_cpu_read(hv_24x7_txn_err))
1483			return;
1484
1485		request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1486
1487		ret = add_event_to_24x7_request(event, request_buffer);
1488		if (ret) {
1489			__this_cpu_write(hv_24x7_txn_err, ret);
1490		} else {
1491			/*
1492			 * Associate the event with the HCALL request index,
1493			 * so ->commit_txn() can quickly find/update count.
1494			 */
1495			i = request_buffer->num_requests - 1;
1496
1497			h24x7hw = &get_cpu_var(hv_24x7_hw);
1498			h24x7hw->events[i] = event;
1499			put_cpu_var(h24x7hw);
1500		}
1501
1502		put_cpu_var(hv_24x7_reqb);
1503	} else {
1504		now = h_24x7_get_value(event);
1505		update_event_count(event, now);
1506	}
1507}
1508
1509static void h_24x7_event_start(struct perf_event *event, int flags)
1510{
1511	if (flags & PERF_EF_RELOAD)
1512		local64_set(&event->hw.prev_count, h_24x7_get_value(event));
1513}
1514
1515static void h_24x7_event_stop(struct perf_event *event, int flags)
1516{
1517	h_24x7_event_read(event);
1518}
1519
1520static int h_24x7_event_add(struct perf_event *event, int flags)
1521{
1522	if (flags & PERF_EF_START)
1523		h_24x7_event_start(event, flags);
1524
1525	return 0;
1526}
1527
1528/*
1529 * 24x7 counters only support READ transactions. They are
 * always counting and don't need/support ADD transactions.
1531 * Cache the flags, but otherwise ignore transactions that
1532 * are not PERF_PMU_TXN_READ.
1533 */
1534static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
1535{
1536	struct hv_24x7_request_buffer *request_buffer;
1537	struct hv_24x7_data_result_buffer *result_buffer;
1538
1539	/* We should not be called if we are already in a txn */
1540	WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
1541
1542	__this_cpu_write(hv_24x7_txn_flags, flags);
1543	if (flags & ~PERF_PMU_TXN_READ)
1544		return;
1545
1546	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1547	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1548
1549	init_24x7_request(request_buffer, result_buffer);
1550
1551	put_cpu_var(hv_24x7_resb);
1552	put_cpu_var(hv_24x7_reqb);
1553}
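
/*
 * Rough shape of a group read as seen from this PMU (a sketch of how the
 * perf core drives these callbacks, not a guarantee of its exact order):
 *
 *   pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 *   pmu->read(event);      // once per event: queue a 24x7 request
 *   ...
 *   pmu->commit_txn(pmu);  // one hcall, then update all event counts
 */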
1554
1555/*
1556 * Clean up transaction state.
1557 *
1558 * NOTE: Ignore state of request and result buffers for now.
1559 *	 We will initialize them during the next read/txn.
1560 */
1561static void reset_txn(void)
1562{
1563	__this_cpu_write(hv_24x7_txn_flags, 0);
1564	__this_cpu_write(hv_24x7_txn_err, 0);
1565}
1566
1567/*
1568 * 24x7 counters only support READ transactions. They are always counting
 * and don't need/support ADD transactions. Clear ->txn_flags but otherwise
1570 * ignore transactions that are not of type PERF_PMU_TXN_READ.
1571 *
1572 * For READ transactions, submit all pending 24x7 requests (i.e requests
1573 * that were queued by h_24x7_event_read()), to the hypervisor and update
1574 * the event counts.
1575 */
1576static int h_24x7_event_commit_txn(struct pmu *pmu)
1577{
1578	struct hv_24x7_request_buffer *request_buffer;
1579	struct hv_24x7_data_result_buffer *result_buffer;
1580	struct hv_24x7_result *res, *next_res;
1581	u64 count;
1582	int i, ret, txn_flags;
1583	struct hv_24x7_hw *h24x7hw;
1584
1585	txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1586	WARN_ON_ONCE(!txn_flags);
1587
1588	ret = 0;
1589	if (txn_flags & ~PERF_PMU_TXN_READ)
1590		goto out;
1591
1592	ret = __this_cpu_read(hv_24x7_txn_err);
1593	if (ret)
1594		goto out;
1595
1596	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1597	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1598
1599	ret = make_24x7_request(request_buffer, result_buffer);
1600	if (ret)
1601		goto put_reqb;
1602
1603	h24x7hw = &get_cpu_var(hv_24x7_hw);
1604
1605	/* Go through results in the result buffer to update event counts. */
1606	for (i = 0, res = result_buffer->results;
1607	     i < result_buffer->num_results; i++, res = next_res) {
1608		struct perf_event *event = h24x7hw->events[res->result_ix];
1609
1610		ret = get_count_from_result(event, result_buffer, res, &count,
1611					    &next_res);
1612		if (ret)
1613			break;
1614
1615		update_event_count(event, count);
1616	}
1617
1618	put_cpu_var(hv_24x7_hw);
1619
1620put_reqb:
1621	put_cpu_var(hv_24x7_resb);
1622	put_cpu_var(hv_24x7_reqb);
1623out:
1624	reset_txn();
1625	return ret;
1626}
1627
1628/*
1629 * 24x7 counters only support READ transactions. They are always counting
 * and don't need/support ADD transactions. However, regardless of the type
 * of transaction, all we need to do is clean up, so we don't have to check
 * the type of transaction.
1633 */
1634static void h_24x7_event_cancel_txn(struct pmu *pmu)
1635{
1636	WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
1637	reset_txn();
1638}
1639
1640static struct pmu h_24x7_pmu = {
1641	.task_ctx_nr = perf_invalid_context,
1642
1643	.name = "hv_24x7",
1644	.attr_groups = attr_groups,
1645	.event_init  = h_24x7_event_init,
1646	.add         = h_24x7_event_add,
1647	.del         = h_24x7_event_stop,
1648	.start       = h_24x7_event_start,
1649	.stop        = h_24x7_event_stop,
1650	.read        = h_24x7_event_read,
1651	.start_txn   = h_24x7_event_start_txn,
1652	.commit_txn  = h_24x7_event_commit_txn,
1653	.cancel_txn  = h_24x7_event_cancel_txn,
1654	.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
1655};
1656
1657static int ppc_hv_24x7_cpu_online(unsigned int cpu)
1658{
1659	if (cpumask_empty(&hv_24x7_cpumask))
1660		cpumask_set_cpu(cpu, &hv_24x7_cpumask);
1661
1662	return 0;
1663}
1664
1665static int ppc_hv_24x7_cpu_offline(unsigned int cpu)
1666{
1667	int target;
1668
1669	/* Check if exiting cpu is used for collecting 24x7 events */
1670	if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask))
1671		return 0;
1672
1673	/* Find a new cpu to collect 24x7 events */
1674	target = cpumask_last(cpu_active_mask);
1675
1676	if (target < 0 || target >= nr_cpu_ids) {
1677		pr_err("hv_24x7: CPU hotplug init failed\n");
1678		return -1;
1679	}
1680
1681	/* Migrate 24x7 events to the new target */
1682	cpumask_set_cpu(target, &hv_24x7_cpumask);
1683	perf_pmu_migrate_context(&h_24x7_pmu, cpu, target);
1684
1685	return 0;
1686}
1687
1688static int hv_24x7_cpu_hotplug_init(void)
1689{
1690	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
1691			  "perf/powerpc/hv_24x7:online",
1692			  ppc_hv_24x7_cpu_online,
1693			  ppc_hv_24x7_cpu_offline);
1694}
1695
1696static int hv_24x7_init(void)
1697{
1698	int r;
1699	unsigned long hret;
1700	struct hv_perf_caps caps;
1701
1702	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
1703		pr_debug("not a virtualized system, not enabling\n");
1704		return -ENODEV;
1705	} else if (!cur_cpu_spec->oprofile_cpu_type)
1706		return -ENODEV;
1707
1708	/* POWER8 only supports v1, while POWER9 only supports v2. */
1709	if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
1710		interface_version = 1;
1711	else {
1712		interface_version = 2;
1713
1714		/* SMT8 in POWER9 needs to aggregate result elements. */
1715		if (threads_per_core == 8)
1716			aggregate_result_elements = true;
1717	}
1718
1719	hret = hv_perf_caps_get(&caps);
1720	if (hret) {
1721		pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
1722				hret);
1723		return -ENODEV;
1724	}
1725
1726	hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
1727	if (!hv_page_cache)
1728		return -ENOMEM;
1729
1730	/* sampling not supported */
1731	h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1732
1733	r = create_events_from_catalog(&event_group.attrs,
1734				   &event_desc_group.attrs,
1735				   &event_long_desc_group.attrs);
1736
1737	if (r)
1738		return r;
1739
1740	/* init cpuhotplug */
1741	r = hv_24x7_cpu_hotplug_init();
1742	if (r)
1743		return r;
1744
1745	r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
1746	if (r)
1747		return r;
1748
1749	read_24x7_sys_info();
1750
1751	return 0;
1752}
1753
1754device_initcall(hv_24x7_init);
1755