1// SPDX-License-Identifier: GPL-2.0-or-later 2/* 3 * Hypervisor supplied "24x7" performance counter support 4 * 5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com> 6 * Copyright 2014 IBM Corporation. 7 */ 8 9#define pr_fmt(fmt) "hv-24x7: " fmt 10 11#include <linux/perf_event.h> 12#include <linux/rbtree.h> 13#include <linux/module.h> 14#include <linux/slab.h> 15#include <linux/vmalloc.h> 16 17#include <asm/cputhreads.h> 18#include <asm/firmware.h> 19#include <asm/hvcall.h> 20#include <asm/io.h> 21#include <linux/byteorder/generic.h> 22 23#include <asm/rtas.h> 24#include "hv-24x7.h" 25#include "hv-24x7-catalog.h" 26#include "hv-common.h" 27 28/* Version of the 24x7 hypervisor API that we should use in this machine. */ 29static int interface_version; 30 31/* Whether we have to aggregate result data for some domains. */ 32static bool aggregate_result_elements; 33 34static cpumask_t hv_24x7_cpumask; 35 36static bool domain_is_valid(unsigned domain) 37{ 38 switch (domain) { 39#define DOMAIN(n, v, x, c) \ 40 case HV_PERF_DOMAIN_##n: \ 41 /* fall through */ 42#include "hv-24x7-domains.h" 43#undef DOMAIN 44 return true; 45 default: 46 return false; 47 } 48} 49 50static bool is_physical_domain(unsigned domain) 51{ 52 switch (domain) { 53#define DOMAIN(n, v, x, c) \ 54 case HV_PERF_DOMAIN_##n: \ 55 return c; 56#include "hv-24x7-domains.h" 57#undef DOMAIN 58 default: 59 return false; 60 } 61} 62 63/* 64 * The Processor Module Information system parameter allows transferring 65 * of certain processor module information from the platform to the OS. 66 * Refer PAPR+ document to get parameter token value as '43'. 
67 */ 68 69#define PROCESSOR_MODULE_INFO 43 70 71static u32 phys_sockets; /* Physical sockets */ 72static u32 phys_chipspersocket; /* Physical chips per socket*/ 73static u32 phys_coresperchip; /* Physical cores per chip */ 74 75/* 76 * read_24x7_sys_info() 77 * Retrieve the number of sockets and chips per socket and cores per 78 * chip details through the get-system-parameter rtas call. 79 */ 80void read_24x7_sys_info(void) 81{ 82 const s32 token = rtas_token("ibm,get-system-parameter"); 83 int call_status; 84 85 /* 86 * Making system parameter: chips and sockets and cores per chip 87 * default to 1. 88 */ 89 phys_sockets = 1; 90 phys_chipspersocket = 1; 91 phys_coresperchip = 1; 92 93 do { 94 spin_lock(&rtas_data_buf_lock); 95 call_status = rtas_call(token, 3, 1, NULL, PROCESSOR_MODULE_INFO, 96 __pa(rtas_data_buf), RTAS_DATA_BUF_SIZE); 97 if (call_status == 0) { 98 int ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]); 99 int len = be16_to_cpup((__be16 *)&rtas_data_buf[0]); 100 101 if (len >= 8 && ntypes != 0) { 102 phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]); 103 phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]); 104 phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]); 105 } 106 } 107 spin_unlock(&rtas_data_buf_lock); 108 } while (rtas_busy_delay(call_status)); 109 110 if (call_status != 0) { 111 pr_err("Error calling get-system-parameter %d\n", 112 call_status); 113 } 114} 115 116/* Domains for which more than one result element are returned for each event. 
*/ 117static bool domain_needs_aggregation(unsigned int domain) 118{ 119 return aggregate_result_elements && 120 (domain == HV_PERF_DOMAIN_PHYS_CORE || 121 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE && 122 domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE)); 123} 124 125static const char *domain_name(unsigned domain) 126{ 127 if (!domain_is_valid(domain)) 128 return NULL; 129 130 switch (domain) { 131 case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip"; 132 case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core"; 133 case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core"; 134 case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip"; 135 case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node"; 136 case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node"; 137 } 138 139 WARN_ON_ONCE(domain); 140 return NULL; 141} 142 143static bool catalog_entry_domain_is_valid(unsigned domain) 144{ 145 /* POWER8 doesn't support virtual domains. */ 146 if (interface_version == 1) 147 return is_physical_domain(domain); 148 else 149 return domain_is_valid(domain); 150} 151 152/* 153 * TODO: Merging events: 154 * - Think of the hcall as an interface to a 4d array of counters: 155 * - x = domains 156 * - y = indexes in the domain (core, chip, vcpu, node, etc) 157 * - z = offset into the counter space 158 * - w = lpars (guest vms, "logical partitions") 159 * - A single request is: x,y,y_last,z,z_last,w,w_last 160 * - this means we can retrieve a rectangle of counters in y,z for a single x. 
161 * 162 * - Things to consider (ignoring w): 163 * - input cost_per_request = 16 164 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs 165 * - limited number of requests per hcall (must fit into 4K bytes) 166 * - 4k = 16 [buffer header] - 16 [request size] * request_count 167 * - 255 requests per hcall 168 * - sometimes it will be more efficient to read extra data and discard 169 */ 170 171/* 172 * Example usage: 173 * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/' 174 */ 175 176/* u3 0-6, one of HV_24X7_PERF_DOMAIN */ 177EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3); 178/* u16 */ 179EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31); 180EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31); 181EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31); 182/* u32, see "data_offset" */ 183EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63); 184/* u16 */ 185EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15); 186 187EVENT_DEFINE_RANGE(reserved1, config, 4, 15); 188EVENT_DEFINE_RANGE(reserved2, config1, 16, 63); 189EVENT_DEFINE_RANGE(reserved3, config2, 0, 63); 190 191static struct attribute *format_attrs[] = { 192 &format_attr_domain.attr, 193 &format_attr_offset.attr, 194 &format_attr_core.attr, 195 &format_attr_chip.attr, 196 &format_attr_vcpu.attr, 197 &format_attr_lpar.attr, 198 NULL, 199}; 200 201static struct attribute_group format_group = { 202 .name = "format", 203 .attrs = format_attrs, 204}; 205 206static struct attribute_group event_group = { 207 .name = "events", 208 /* .attrs is set in init */ 209}; 210 211static struct attribute_group event_desc_group = { 212 .name = "event_descs", 213 /* .attrs is set in init */ 214}; 215 216static struct attribute_group event_long_desc_group = { 217 .name = "event_long_descs", 218 /* .attrs is set in init */ 219}; 220 221static struct kmem_cache *hv_page_cache; 222 223DEFINE_PER_CPU(int, hv_24x7_txn_flags); 224DEFINE_PER_CPU(int, hv_24x7_txn_err); 225 226struct hv_24x7_hw { 227 struct perf_event *events[255]; 228}; 
229 230DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); 231 232/* 233 * request_buffer and result_buffer are not required to be 4k aligned, 234 * but are not allowed to cross any 4k boundary. Aligning them to 4k is 235 * the simplest way to ensure that. 236 */ 237#define H24x7_DATA_BUFFER_SIZE 4096 238DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 239DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); 240 241static unsigned int max_num_requests(int interface_version) 242{ 243 return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer)) 244 / H24x7_REQUEST_SIZE(interface_version); 245} 246 247static char *event_name(struct hv_24x7_event_data *ev, int *len) 248{ 249 *len = be16_to_cpu(ev->event_name_len) - 2; 250 return (char *)ev->remainder; 251} 252 253static char *event_desc(struct hv_24x7_event_data *ev, int *len) 254{ 255 unsigned nl = be16_to_cpu(ev->event_name_len); 256 __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2); 257 258 *len = be16_to_cpu(*desc_len) - 2; 259 return (char *)ev->remainder + nl; 260} 261 262static char *event_long_desc(struct hv_24x7_event_data *ev, int *len) 263{ 264 unsigned nl = be16_to_cpu(ev->event_name_len); 265 __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2); 266 unsigned desc_len = be16_to_cpu(*desc_len_); 267 __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2); 268 269 *len = be16_to_cpu(*long_desc_len) - 2; 270 return (char *)ev->remainder + nl + desc_len; 271} 272 273static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev, 274 void *end) 275{ 276 void *start = ev; 277 278 return (start + offsetof(struct hv_24x7_event_data, remainder)) < end; 279} 280 281/* 282 * Things we don't check: 283 * - padding for desc, name, and long/detailed desc is required to be '\0' 284 * bytes. 285 * 286 * Return NULL if we pass end, 287 * Otherwise return the address of the byte just following the event. 
288 */ 289static void *event_end(struct hv_24x7_event_data *ev, void *end) 290{ 291 void *start = ev; 292 __be16 *dl_, *ldl_; 293 unsigned dl, ldl; 294 unsigned nl = be16_to_cpu(ev->event_name_len); 295 296 if (nl < 2) { 297 pr_debug("%s: name length too short: %d", __func__, nl); 298 return NULL; 299 } 300 301 if (start + nl > end) { 302 pr_debug("%s: start=%p + nl=%u > end=%p", 303 __func__, start, nl, end); 304 return NULL; 305 } 306 307 dl_ = (__be16 *)(ev->remainder + nl - 2); 308 if (!IS_ALIGNED((uintptr_t)dl_, 2)) 309 pr_warn("desc len not aligned %p", dl_); 310 dl = be16_to_cpu(*dl_); 311 if (dl < 2) { 312 pr_debug("%s: desc len too short: %d", __func__, dl); 313 return NULL; 314 } 315 316 if (start + nl + dl > end) { 317 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p", 318 __func__, start, nl, dl, start + nl + dl, end); 319 return NULL; 320 } 321 322 ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); 323 if (!IS_ALIGNED((uintptr_t)ldl_, 2)) 324 pr_warn("long desc len not aligned %p", ldl_); 325 ldl = be16_to_cpu(*ldl_); 326 if (ldl < 2) { 327 pr_debug("%s: long desc len too short (ldl=%u)", 328 __func__, ldl); 329 return NULL; 330 } 331 332 if (start + nl + dl + ldl > end) { 333 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p", 334 __func__, start, nl, dl, ldl, end); 335 return NULL; 336 } 337 338 return start + nl + dl + ldl; 339} 340 341static long h_get_24x7_catalog_page_(unsigned long phys_4096, 342 unsigned long version, unsigned long index) 343{ 344 pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)", 345 phys_4096, version, index); 346 347 WARN_ON(!IS_ALIGNED(phys_4096, 4096)); 348 349 return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE, 350 phys_4096, version, index); 351} 352 353static long h_get_24x7_catalog_page(char page[], u64 version, u32 index) 354{ 355 return h_get_24x7_catalog_page_(virt_to_phys(page), 356 version, index); 357} 358 359/* 360 * Each event we find in the catalog, will have a sysfs entry. 
 * Format the data for this sysfs entry based on the event's domain.
 *
 * Events belonging to the Chip domain can only be monitored in that domain.
 * i.e the domain for these events is a fixed/known value.
 *
 * Events belonging to the Core domain can be monitored either in the physical
 * core or in one of the virtual CPU domains. So the domain value for these
 * events must be specified by the user (i.e is a required parameter). Format
 * the Core events with 'domain=?' so the perf-tool can error check required
 * parameters.
 *
 * NOTE: For the Core domain events, rather than making domain a required
 *	 parameter we could default it to PHYS_CORE and allow users to
 *	 override the domain to one of the VCPU domains.
 *
 *	 However, this can make the interface a little inconsistent.
 *
 *	 If we set domain=2 (PHYS_CHIP) and allow user to override this field
 *	 the user may be tempted to also modify the "offset=x" field, which
 *	 can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
 *	 HPM_INST (offset=0x20) events. With:
 *
 *		perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
 *
 *	we end up monitoring HPM_INST, while the command line has HPM_PCYC.
 *
 *	By not assigning a default value to the domain for the Core events,
 *	we can have simple guidelines:
 *
 *		- Specifying values for parameters with "=?" is required.
 *
 *		- Specifying (i.e overriding) values for other parameters
 *		  is undefined.
394 */ 395static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain) 396{ 397 const char *sindex; 398 const char *lpar; 399 const char *domain_str; 400 char buf[8]; 401 402 switch (domain) { 403 case HV_PERF_DOMAIN_PHYS_CHIP: 404 snprintf(buf, sizeof(buf), "%d", domain); 405 domain_str = buf; 406 lpar = "0x0"; 407 sindex = "chip"; 408 break; 409 case HV_PERF_DOMAIN_PHYS_CORE: 410 domain_str = "?"; 411 lpar = "0x0"; 412 sindex = "core"; 413 break; 414 default: 415 domain_str = "?"; 416 lpar = "?"; 417 sindex = "vcpu"; 418 } 419 420 return kasprintf(GFP_KERNEL, 421 "domain=%s,offset=0x%x,%s=?,lpar=%s", 422 domain_str, 423 be16_to_cpu(event->event_counter_offs) + 424 be16_to_cpu(event->event_group_record_offs), 425 sindex, 426 lpar); 427} 428 429/* Avoid trusting fw to NUL terminate strings */ 430static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp) 431{ 432 return kasprintf(gfp, "%.*s", max_len, maybe_str); 433} 434 435static ssize_t device_show_string(struct device *dev, 436 struct device_attribute *attr, char *buf) 437{ 438 struct dev_ext_attribute *d; 439 440 d = container_of(attr, struct dev_ext_attribute, attr); 441 442 return sprintf(buf, "%s\n", (char *)d->var); 443} 444 445static ssize_t cpumask_show(struct device *dev, 446 struct device_attribute *attr, char *buf) 447{ 448 return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); 449} 450 451static ssize_t sockets_show(struct device *dev, 452 struct device_attribute *attr, char *buf) 453{ 454 return sprintf(buf, "%d\n", phys_sockets); 455} 456 457static ssize_t chipspersocket_show(struct device *dev, 458 struct device_attribute *attr, char *buf) 459{ 460 return sprintf(buf, "%d\n", phys_chipspersocket); 461} 462 463static ssize_t coresperchip_show(struct device *dev, 464 struct device_attribute *attr, char *buf) 465{ 466 return sprintf(buf, "%d\n", phys_coresperchip); 467} 468 469static struct attribute *device_str_attr_create_(char *name, char *str) 470{ 471 struct 
dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL); 472 473 if (!attr) 474 return NULL; 475 476 sysfs_attr_init(&attr->attr.attr); 477 478 attr->var = str; 479 attr->attr.attr.name = name; 480 attr->attr.attr.mode = 0444; 481 attr->attr.show = device_show_string; 482 483 return &attr->attr.attr; 484} 485 486/* 487 * Allocate and initialize strings representing event attributes. 488 * 489 * NOTE: The strings allocated here are never destroyed and continue to 490 * exist till shutdown. This is to allow us to create as many events 491 * from the catalog as possible, even if we encounter errors with some. 492 * In case of changes to error paths in future, these may need to be 493 * freed by the caller. 494 */ 495static struct attribute *device_str_attr_create(char *name, int name_max, 496 int name_nonce, 497 char *str, size_t str_max) 498{ 499 char *n; 500 char *s = memdup_to_str(str, str_max, GFP_KERNEL); 501 struct attribute *a; 502 503 if (!s) 504 return NULL; 505 506 if (!name_nonce) 507 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); 508 else 509 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name, 510 name_nonce); 511 if (!n) 512 goto out_s; 513 514 a = device_str_attr_create_(n, s); 515 if (!a) 516 goto out_n; 517 518 return a; 519out_n: 520 kfree(n); 521out_s: 522 kfree(s); 523 return NULL; 524} 525 526static struct attribute *event_to_attr(unsigned ix, 527 struct hv_24x7_event_data *event, 528 unsigned domain, 529 int nonce) 530{ 531 int event_name_len; 532 char *ev_name, *a_ev_name, *val; 533 struct attribute *attr; 534 535 if (!domain_is_valid(domain)) { 536 pr_warn("catalog event %u has invalid domain %u\n", 537 ix, domain); 538 return NULL; 539 } 540 541 val = event_fmt(event, domain); 542 if (!val) 543 return NULL; 544 545 ev_name = event_name(event, &event_name_len); 546 if (!nonce) 547 a_ev_name = kasprintf(GFP_KERNEL, "%.*s", 548 (int)event_name_len, ev_name); 549 else 550 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d", 551 
(int)event_name_len, ev_name, nonce); 552 553 if (!a_ev_name) 554 goto out_val; 555 556 attr = device_str_attr_create_(a_ev_name, val); 557 if (!attr) 558 goto out_name; 559 560 return attr; 561out_name: 562 kfree(a_ev_name); 563out_val: 564 kfree(val); 565 return NULL; 566} 567 568static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event, 569 int nonce) 570{ 571 int nl, dl; 572 char *name = event_name(event, &nl); 573 char *desc = event_desc(event, &dl); 574 575 /* If there isn't a description, don't create the sysfs file */ 576 if (!dl) 577 return NULL; 578 579 return device_str_attr_create(name, nl, nonce, desc, dl); 580} 581 582static struct attribute * 583event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce) 584{ 585 int nl, dl; 586 char *name = event_name(event, &nl); 587 char *desc = event_long_desc(event, &dl); 588 589 /* If there isn't a description, don't create the sysfs file */ 590 if (!dl) 591 return NULL; 592 593 return device_str_attr_create(name, nl, nonce, desc, dl); 594} 595 596static int event_data_to_attrs(unsigned ix, struct attribute **attrs, 597 struct hv_24x7_event_data *event, int nonce) 598{ 599 *attrs = event_to_attr(ix, event, event->domain, nonce); 600 if (!*attrs) 601 return -1; 602 603 return 0; 604} 605 606/* */ 607struct event_uniq { 608 struct rb_node node; 609 const char *name; 610 int nl; 611 unsigned ct; 612 unsigned domain; 613}; 614 615static int memord(const void *d1, size_t s1, const void *d2, size_t s2) 616{ 617 if (s1 < s2) 618 return 1; 619 if (s1 > s2) 620 return -1; 621 622 return memcmp(d1, d2, s1); 623} 624 625static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2, 626 size_t s2, unsigned d2) 627{ 628 int r = memord(v1, s1, v2, s2); 629 630 if (r) 631 return r; 632 if (d1 > d2) 633 return 1; 634 if (d2 > d1) 635 return -1; 636 return 0; 637} 638 639static int event_uniq_add(struct rb_root *root, const char *name, int nl, 640 unsigned domain) 641{ 642 struct 
rb_node **new = &(root->rb_node), *parent = NULL; 643 struct event_uniq *data; 644 645 /* Figure out where to put new node */ 646 while (*new) { 647 struct event_uniq *it; 648 int result; 649 650 it = rb_entry(*new, struct event_uniq, node); 651 result = ev_uniq_ord(name, nl, domain, it->name, it->nl, 652 it->domain); 653 654 parent = *new; 655 if (result < 0) 656 new = &((*new)->rb_left); 657 else if (result > 0) 658 new = &((*new)->rb_right); 659 else { 660 it->ct++; 661 pr_info("found a duplicate event %.*s, ct=%u\n", nl, 662 name, it->ct); 663 return it->ct; 664 } 665 } 666 667 data = kmalloc(sizeof(*data), GFP_KERNEL); 668 if (!data) 669 return -ENOMEM; 670 671 *data = (struct event_uniq) { 672 .name = name, 673 .nl = nl, 674 .ct = 0, 675 .domain = domain, 676 }; 677 678 /* Add new node and rebalance tree. */ 679 rb_link_node(&data->node, parent, new); 680 rb_insert_color(&data->node, root); 681 682 /* data->ct */ 683 return 0; 684} 685 686static void event_uniq_destroy(struct rb_root *root) 687{ 688 /* 689 * the strings we point to are in the giant block of memory filled by 690 * the catalog, and are freed separately. 691 */ 692 struct event_uniq *pos, *n; 693 694 rbtree_postorder_for_each_entry_safe(pos, n, root, node) 695 kfree(pos); 696} 697 698 699/* 700 * ensure the event structure's sizes are self consistent and don't cause us to 701 * read outside of the event 702 * 703 * On success, return the event length in bytes. 704 * Otherwise, return -1 (and print as appropriate). 
705 */ 706static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, 707 size_t event_idx, 708 size_t event_data_bytes, 709 size_t event_entry_count, 710 size_t offset, void *end) 711{ 712 ssize_t ev_len; 713 void *ev_end, *calc_ev_end; 714 715 if (offset >= event_data_bytes) 716 return -1; 717 718 if (event_idx >= event_entry_count) { 719 pr_devel("catalog event data has %zu bytes of padding after last event\n", 720 event_data_bytes - offset); 721 return -1; 722 } 723 724 if (!event_fixed_portion_is_within(event, end)) { 725 pr_warn("event %zu fixed portion is not within range\n", 726 event_idx); 727 return -1; 728 } 729 730 ev_len = be16_to_cpu(event->length); 731 732 if (ev_len % 16) 733 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n", 734 event_idx, ev_len, event); 735 736 ev_end = (__u8 *)event + ev_len; 737 if (ev_end > end) { 738 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n", 739 event_idx, ev_len, ev_end, end, 740 offset); 741 return -1; 742 } 743 744 calc_ev_end = event_end(event, end); 745 if (!calc_ev_end) { 746 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n", 747 event_idx, event_data_bytes, event, end, 748 offset); 749 return -1; 750 } 751 752 if (calc_ev_end > ev_end) { 753 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", 754 event_idx, event, ev_end, offset, calc_ev_end); 755 return -1; 756 } 757 758 return ev_len; 759} 760 761#define MAX_4K (SIZE_MAX / 4096) 762 763static int create_events_from_catalog(struct attribute ***events_, 764 struct attribute ***event_descs_, 765 struct attribute ***event_long_descs_) 766{ 767 long hret; 768 size_t catalog_len, catalog_page_len, event_entry_count, 769 event_data_len, event_data_offs, 770 event_data_bytes, junk_events, event_idx, event_attr_ct, i, 771 attr_max, event_idx_last, desc_ct, long_desc_ct; 772 ssize_t ct, 
ev_len; 773 uint64_t catalog_version_num; 774 struct attribute **events, **event_descs, **event_long_descs; 775 struct hv_24x7_catalog_page_0 *page_0 = 776 kmem_cache_alloc(hv_page_cache, GFP_KERNEL); 777 void *page = page_0; 778 void *event_data, *end; 779 struct hv_24x7_event_data *event; 780 struct rb_root ev_uniq = RB_ROOT; 781 int ret = 0; 782 783 if (!page) { 784 ret = -ENOMEM; 785 goto e_out; 786 } 787 788 hret = h_get_24x7_catalog_page(page, 0, 0); 789 if (hret) { 790 ret = -EIO; 791 goto e_free; 792 } 793 794 catalog_version_num = be64_to_cpu(page_0->version); 795 catalog_page_len = be32_to_cpu(page_0->length); 796 797 if (MAX_4K < catalog_page_len) { 798 pr_err("invalid page count: %zu\n", catalog_page_len); 799 ret = -EIO; 800 goto e_free; 801 } 802 803 catalog_len = catalog_page_len * 4096; 804 805 event_entry_count = be16_to_cpu(page_0->event_entry_count); 806 event_data_offs = be16_to_cpu(page_0->event_data_offs); 807 event_data_len = be16_to_cpu(page_0->event_data_len); 808 809 pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n", 810 catalog_version_num, catalog_len, 811 event_entry_count, event_data_offs, event_data_len); 812 813 if ((MAX_4K < event_data_len) 814 || (MAX_4K < event_data_offs) 815 || (MAX_4K - event_data_offs < event_data_len)) { 816 pr_err("invalid event data offs %zu and/or len %zu\n", 817 event_data_offs, event_data_len); 818 ret = -EIO; 819 goto e_free; 820 } 821 822 if ((event_data_offs + event_data_len) > catalog_page_len) { 823 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n", 824 event_data_offs, 825 event_data_offs + event_data_len, 826 catalog_page_len); 827 ret = -EIO; 828 goto e_free; 829 } 830 831 if (SIZE_MAX - 1 < event_entry_count) { 832 pr_err("event_entry_count %zu is invalid\n", event_entry_count); 833 ret = -EIO; 834 goto e_free; 835 } 836 837 event_data_bytes = event_data_len * 4096; 838 839 /* 840 * event data can span several pages, events can cross between these 841 * pages. 
Use vmalloc to make this easier. 842 */ 843 event_data = vmalloc(event_data_bytes); 844 if (!event_data) { 845 pr_err("could not allocate event data\n"); 846 ret = -ENOMEM; 847 goto e_free; 848 } 849 850 end = event_data + event_data_bytes; 851 852 /* 853 * using vmalloc_to_phys() like this only works if PAGE_SIZE is 854 * divisible by 4096 855 */ 856 BUILD_BUG_ON(PAGE_SIZE % 4096); 857 858 for (i = 0; i < event_data_len; i++) { 859 hret = h_get_24x7_catalog_page_( 860 vmalloc_to_phys(event_data + i * 4096), 861 catalog_version_num, 862 i + event_data_offs); 863 if (hret) { 864 pr_err("Failed to get event data in page %zu: rc=%ld\n", 865 i + event_data_offs, hret); 866 ret = -EIO; 867 goto e_event_data; 868 } 869 } 870 871 /* 872 * scan the catalog to determine the number of attributes we need, and 873 * verify it at the same time. 874 */ 875 for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0; 876 ; 877 event_idx++, event = (void *)event + ev_len) { 878 size_t offset = (void *)event - (void *)event_data; 879 char *name; 880 int nl; 881 882 ev_len = catalog_event_len_validate(event, event_idx, 883 event_data_bytes, 884 event_entry_count, 885 offset, end); 886 if (ev_len < 0) 887 break; 888 889 name = event_name(event, &nl); 890 891 if (event->event_group_record_len == 0) { 892 pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n", 893 event_idx, nl, name); 894 junk_events++; 895 continue; 896 } 897 898 if (!catalog_entry_domain_is_valid(event->domain)) { 899 pr_info("event %zu (%.*s) has invalid domain %d\n", 900 event_idx, nl, name, event->domain); 901 junk_events++; 902 continue; 903 } 904 905 attr_max++; 906 } 907 908 event_idx_last = event_idx; 909 if (event_idx_last != event_entry_count) 910 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n", 911 event_idx_last, event_entry_count, junk_events); 912 913 events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); 914 if 
(!events) { 915 ret = -ENOMEM; 916 goto e_event_data; 917 } 918 919 event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs), 920 GFP_KERNEL); 921 if (!event_descs) { 922 ret = -ENOMEM; 923 goto e_event_attrs; 924 } 925 926 event_long_descs = kmalloc_array(event_idx + 1, 927 sizeof(*event_long_descs), GFP_KERNEL); 928 if (!event_long_descs) { 929 ret = -ENOMEM; 930 goto e_event_descs; 931 } 932 933 /* Iterate over the catalog filling in the attribute vector */ 934 for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0, 935 event = event_data, event_idx = 0; 936 event_idx < event_idx_last; 937 event_idx++, ev_len = be16_to_cpu(event->length), 938 event = (void *)event + ev_len) { 939 char *name; 940 int nl; 941 int nonce; 942 /* 943 * these are the only "bad" events that are intermixed and that 944 * we can ignore without issue. make sure to skip them here 945 */ 946 if (event->event_group_record_len == 0) 947 continue; 948 if (!catalog_entry_domain_is_valid(event->domain)) 949 continue; 950 951 name = event_name(event, &nl); 952 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain); 953 ct = event_data_to_attrs(event_idx, events + event_attr_ct, 954 event, nonce); 955 if (ct < 0) { 956 pr_warn("event %zu (%.*s) creation failure, skipping\n", 957 event_idx, nl, name); 958 junk_events++; 959 } else { 960 event_attr_ct++; 961 event_descs[desc_ct] = event_to_desc_attr(event, nonce); 962 if (event_descs[desc_ct]) 963 desc_ct++; 964 event_long_descs[long_desc_ct] = 965 event_to_long_desc_attr(event, nonce); 966 if (event_long_descs[long_desc_ct]) 967 long_desc_ct++; 968 } 969 } 970 971 pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n", 972 event_idx, event_attr_ct, junk_events, desc_ct); 973 974 events[event_attr_ct] = NULL; 975 event_descs[desc_ct] = NULL; 976 event_long_descs[long_desc_ct] = NULL; 977 978 event_uniq_destroy(&ev_uniq); 979 vfree(event_data); 980 kmem_cache_free(hv_page_cache, page); 981 
982 *events_ = events; 983 *event_descs_ = event_descs; 984 *event_long_descs_ = event_long_descs; 985 return 0; 986 987e_event_descs: 988 kfree(event_descs); 989e_event_attrs: 990 kfree(events); 991e_event_data: 992 vfree(event_data); 993e_free: 994 kmem_cache_free(hv_page_cache, page); 995e_out: 996 *events_ = NULL; 997 *event_descs_ = NULL; 998 *event_long_descs_ = NULL; 999 return ret; 1000} 1001 1002static ssize_t catalog_read(struct file *filp, struct kobject *kobj, 1003 struct bin_attribute *bin_attr, char *buf, 1004 loff_t offset, size_t count) 1005{ 1006 long hret; 1007 ssize_t ret = 0; 1008 size_t catalog_len = 0, catalog_page_len = 0; 1009 loff_t page_offset = 0; 1010 loff_t offset_in_page; 1011 size_t copy_len; 1012 uint64_t catalog_version_num = 0; 1013 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); 1014 struct hv_24x7_catalog_page_0 *page_0 = page; 1015 1016 if (!page) 1017 return -ENOMEM; 1018 1019 hret = h_get_24x7_catalog_page(page, 0, 0); 1020 if (hret) { 1021 ret = -EIO; 1022 goto e_free; 1023 } 1024 1025 catalog_version_num = be64_to_cpu(page_0->version); 1026 catalog_page_len = be32_to_cpu(page_0->length); 1027 catalog_len = catalog_page_len * 4096; 1028 1029 page_offset = offset / 4096; 1030 offset_in_page = offset % 4096; 1031 1032 if (page_offset >= catalog_page_len) 1033 goto e_free; 1034 1035 if (page_offset != 0) { 1036 hret = h_get_24x7_catalog_page(page, catalog_version_num, 1037 page_offset); 1038 if (hret) { 1039 ret = -EIO; 1040 goto e_free; 1041 } 1042 } 1043 1044 copy_len = 4096 - offset_in_page; 1045 if (copy_len > count) 1046 copy_len = count; 1047 1048 memcpy(buf, page+offset_in_page, copy_len); 1049 ret = copy_len; 1050 1051e_free: 1052 if (hret) 1053 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:" 1054 " rc=%ld\n", 1055 catalog_version_num, page_offset, hret); 1056 kmem_cache_free(hv_page_cache, page); 1057 1058 pr_devel("catalog_read: offset=%lld(%lld) count=%zu " 1059 "catalog_len=%zu(%zu) => %zd\n", 
offset, page_offset, 1060 count, catalog_len, catalog_page_len, ret); 1061 1062 return ret; 1063} 1064 1065static ssize_t domains_show(struct device *dev, struct device_attribute *attr, 1066 char *page) 1067{ 1068 int d, n, count = 0; 1069 const char *str; 1070 1071 for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) { 1072 str = domain_name(d); 1073 if (!str) 1074 continue; 1075 1076 n = sprintf(page, "%d: %s\n", d, str); 1077 if (n < 0) 1078 break; 1079 1080 count += n; 1081 page += n; 1082 } 1083 return count; 1084} 1085 1086#define PAGE_0_ATTR(_name, _fmt, _expr) \ 1087static ssize_t _name##_show(struct device *dev, \ 1088 struct device_attribute *dev_attr, \ 1089 char *buf) \ 1090{ \ 1091 long hret; \ 1092 ssize_t ret = 0; \ 1093 void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \ 1094 struct hv_24x7_catalog_page_0 *page_0 = page; \ 1095 if (!page) \ 1096 return -ENOMEM; \ 1097 hret = h_get_24x7_catalog_page(page, 0, 0); \ 1098 if (hret) { \ 1099 ret = -EIO; \ 1100 goto e_free; \ 1101 } \ 1102 ret = sprintf(buf, _fmt, _expr); \ 1103e_free: \ 1104 kmem_cache_free(hv_page_cache, page); \ 1105 return ret; \ 1106} \ 1107static DEVICE_ATTR_RO(_name) 1108 1109PAGE_0_ATTR(catalog_version, "%lld\n", 1110 (unsigned long long)be64_to_cpu(page_0->version)); 1111PAGE_0_ATTR(catalog_len, "%lld\n", 1112 (unsigned long long)be32_to_cpu(page_0->length) * 4096); 1113static BIN_ATTR_RO(catalog, 0/* real length varies */); 1114static DEVICE_ATTR_RO(domains); 1115static DEVICE_ATTR_RO(sockets); 1116static DEVICE_ATTR_RO(chipspersocket); 1117static DEVICE_ATTR_RO(coresperchip); 1118static DEVICE_ATTR_RO(cpumask); 1119 1120static struct bin_attribute *if_bin_attrs[] = { 1121 &bin_attr_catalog, 1122 NULL, 1123}; 1124 1125static struct attribute *cpumask_attrs[] = { 1126 &dev_attr_cpumask.attr, 1127 NULL, 1128}; 1129 1130static struct attribute_group cpumask_attr_group = { 1131 .attrs = cpumask_attrs, 1132}; 1133 1134static struct attribute *if_attrs[] = { 1135 &dev_attr_catalog_len.attr, 
1136 &dev_attr_catalog_version.attr, 1137 &dev_attr_domains.attr, 1138 &dev_attr_sockets.attr, 1139 &dev_attr_chipspersocket.attr, 1140 &dev_attr_coresperchip.attr, 1141 NULL, 1142}; 1143 1144static struct attribute_group if_group = { 1145 .name = "interface", 1146 .bin_attrs = if_bin_attrs, 1147 .attrs = if_attrs, 1148}; 1149 1150static const struct attribute_group *attr_groups[] = { 1151 &format_group, 1152 &event_group, 1153 &event_desc_group, 1154 &event_long_desc_group, 1155 &if_group, 1156 &cpumask_attr_group, 1157 NULL, 1158}; 1159 1160/* 1161 * Start the process for a new H_GET_24x7_DATA hcall. 1162 */ 1163static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1164 struct hv_24x7_data_result_buffer *result_buffer) 1165{ 1166 1167 memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1168 memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE); 1169 1170 request_buffer->interface_version = interface_version; 1171 /* memset above set request_buffer->num_requests to 0 */ 1172} 1173 1174/* 1175 * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected 1176 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'. 1177 */ 1178static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer, 1179 struct hv_24x7_data_result_buffer *result_buffer) 1180{ 1181 long ret; 1182 1183 /* 1184 * NOTE: Due to variable number of array elements in request and 1185 * result buffer(s), sizeof() is not reliable. Use the actual 1186 * allocated buffer size, H24x7_DATA_BUFFER_SIZE. 
1187 */ 1188 ret = plpar_hcall_norets(H_GET_24X7_DATA, 1189 virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE, 1190 virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE); 1191 1192 if (ret) { 1193 struct hv_24x7_request *req; 1194 1195 req = request_buffer->requests; 1196 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n", 1197 req->performance_domain, req->data_offset, 1198 req->starting_ix, req->starting_lpar_ix, 1199 ret, ret, result_buffer->detailed_rc, 1200 result_buffer->failing_request_ix); 1201 return -EIO; 1202 } 1203 1204 return 0; 1205} 1206 1207/* 1208 * Add the given @event to the next slot in the 24x7 request_buffer. 1209 * 1210 * Note that H_GET_24X7_DATA hcall allows reading several counters' 1211 * values in a single HCALL. We expect the caller to add events to the 1212 * request buffer one by one, make the HCALL and process the results. 1213 */ 1214static int add_event_to_24x7_request(struct perf_event *event, 1215 struct hv_24x7_request_buffer *request_buffer) 1216{ 1217 u16 idx; 1218 int i; 1219 size_t req_size; 1220 struct hv_24x7_request *req; 1221 1222 if (request_buffer->num_requests >= 1223 max_num_requests(request_buffer->interface_version)) { 1224 pr_devel("Too many requests for 24x7 HCALL %d\n", 1225 request_buffer->num_requests); 1226 return -EINVAL; 1227 } 1228 1229 switch (event_get_domain(event)) { 1230 case HV_PERF_DOMAIN_PHYS_CHIP: 1231 idx = event_get_chip(event); 1232 break; 1233 case HV_PERF_DOMAIN_PHYS_CORE: 1234 idx = event_get_core(event); 1235 break; 1236 default: 1237 idx = event_get_vcpu(event); 1238 } 1239 1240 req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version); 1241 1242 i = request_buffer->num_requests++; 1243 req = (void *) request_buffer->requests + i * req_size; 1244 1245 req->performance_domain = event_get_domain(event); 1246 req->data_size = cpu_to_be16(8); 1247 req->data_offset = cpu_to_be32(event_get_offset(event)); 1248 req->starting_lpar_ix = 
cpu_to_be16(event_get_lpar(event)); 1249 req->max_num_lpars = cpu_to_be16(1); 1250 req->starting_ix = cpu_to_be16(idx); 1251 req->max_ix = cpu_to_be16(1); 1252 1253 if (request_buffer->interface_version > 1) { 1254 if (domain_needs_aggregation(req->performance_domain)) 1255 req->max_num_thread_groups = -1; 1256 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) { 1257 req->starting_thread_group_ix = idx % 2; 1258 req->max_num_thread_groups = 1; 1259 } 1260 } 1261 1262 return 0; 1263} 1264 1265/** 1266 * get_count_from_result - get event count from all result elements in result 1267 * 1268 * If the event corresponding to this result needs aggregation of the result 1269 * element values, then this function does that. 1270 * 1271 * @event: Event associated with @res. 1272 * @resb: Result buffer containing @res. 1273 * @res: Result to work on. 1274 * @countp: Output variable containing the event count. 1275 * @next: Optional output variable pointing to the next result in @resb. 1276 */ 1277static int get_count_from_result(struct perf_event *event, 1278 struct hv_24x7_data_result_buffer *resb, 1279 struct hv_24x7_result *res, u64 *countp, 1280 struct hv_24x7_result **next) 1281{ 1282 u16 num_elements = be16_to_cpu(res->num_elements_returned); 1283 u16 data_size = be16_to_cpu(res->result_element_data_size); 1284 unsigned int data_offset; 1285 void *element_data; 1286 int i; 1287 u64 count; 1288 1289 /* 1290 * We can bail out early if the result is empty. 1291 */ 1292 if (!num_elements) { 1293 pr_debug("Result of request %hhu is empty, nothing to do\n", 1294 res->result_ix); 1295 1296 if (next) 1297 *next = (struct hv_24x7_result *) res->elements; 1298 1299 return -ENODATA; 1300 } 1301 1302 /* 1303 * Since we always specify 1 as the maximum for the smallest resource 1304 * we're requesting, there should to be only one element per result. 1305 * Except when an event needs aggregation, in which case there are more. 
1306 */ 1307 if (num_elements != 1 && 1308 !domain_needs_aggregation(event_get_domain(event))) { 1309 pr_err("Error: result of request %hhu has %hu elements\n", 1310 res->result_ix, num_elements); 1311 1312 return -EIO; 1313 } 1314 1315 if (data_size != sizeof(u64)) { 1316 pr_debug("Error: result of request %hhu has data of %hu bytes\n", 1317 res->result_ix, data_size); 1318 1319 return -ENOTSUPP; 1320 } 1321 1322 if (resb->interface_version == 1) 1323 data_offset = offsetof(struct hv_24x7_result_element_v1, 1324 element_data); 1325 else 1326 data_offset = offsetof(struct hv_24x7_result_element_v2, 1327 element_data); 1328 1329 /* Go through the result elements in the result. */ 1330 for (i = count = 0, element_data = res->elements + data_offset; 1331 i < num_elements; 1332 i++, element_data += data_size + data_offset) 1333 count += be64_to_cpu(*((u64 *) element_data)); 1334 1335 *countp = count; 1336 1337 /* The next result is after the last result element. */ 1338 if (next) 1339 *next = element_data - data_offset; 1340 1341 return 0; 1342} 1343 1344static int single_24x7_request(struct perf_event *event, u64 *count) 1345{ 1346 int ret; 1347 struct hv_24x7_request_buffer *request_buffer; 1348 struct hv_24x7_data_result_buffer *result_buffer; 1349 1350 BUILD_BUG_ON(sizeof(*request_buffer) > 4096); 1351 BUILD_BUG_ON(sizeof(*result_buffer) > 4096); 1352 1353 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1354 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1355 1356 init_24x7_request(request_buffer, result_buffer); 1357 1358 ret = add_event_to_24x7_request(event, request_buffer); 1359 if (ret) 1360 goto out; 1361 1362 ret = make_24x7_request(request_buffer, result_buffer); 1363 if (ret) 1364 goto out; 1365 1366 /* process result from hcall */ 1367 ret = get_count_from_result(event, result_buffer, 1368 result_buffer->results, count, NULL); 1369 1370out: 1371 put_cpu_var(hv_24x7_reqb); 1372 put_cpu_var(hv_24x7_resb); 1373 return ret; 1374} 1375 1376 1377static 
int h_24x7_event_init(struct perf_event *event) 1378{ 1379 struct hv_perf_caps caps; 1380 unsigned domain; 1381 unsigned long hret; 1382 u64 ct; 1383 1384 /* Not our event */ 1385 if (event->attr.type != event->pmu->type) 1386 return -ENOENT; 1387 1388 /* Unused areas must be 0 */ 1389 if (event_get_reserved1(event) || 1390 event_get_reserved2(event) || 1391 event_get_reserved3(event)) { 1392 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n", 1393 event->attr.config, 1394 event_get_reserved1(event), 1395 event->attr.config1, 1396 event_get_reserved2(event), 1397 event->attr.config2, 1398 event_get_reserved3(event)); 1399 return -EINVAL; 1400 } 1401 1402 /* no branch sampling */ 1403 if (has_branch_stack(event)) 1404 return -EOPNOTSUPP; 1405 1406 /* offset must be 8 byte aligned */ 1407 if (event_get_offset(event) % 8) { 1408 pr_devel("bad alignment\n"); 1409 return -EINVAL; 1410 } 1411 1412 domain = event_get_domain(event); 1413 if (domain == 0 || domain >= HV_PERF_DOMAIN_MAX) { 1414 pr_devel("invalid domain %d\n", domain); 1415 return -EINVAL; 1416 } 1417 1418 hret = hv_perf_caps_get(&caps); 1419 if (hret) { 1420 pr_devel("could not get capabilities: rc=%ld\n", hret); 1421 return -EIO; 1422 } 1423 1424 /* Physical domains & other lpars require extra capabilities */ 1425 if (!caps.collect_privileged && (is_physical_domain(domain) || 1426 (event_get_lpar(event) != event_get_lpar_max()))) { 1427 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n", 1428 is_physical_domain(domain), 1429 event_get_lpar(event)); 1430 return -EACCES; 1431 } 1432 1433 /* Get the initial value of the counter for this event */ 1434 if (single_24x7_request(event, &ct)) { 1435 pr_devel("test hcall failed\n"); 1436 return -EIO; 1437 } 1438 (void)local64_xchg(&event->hw.prev_count, ct); 1439 1440 return 0; 1441} 1442 1443static u64 h_24x7_get_value(struct perf_event *event) 1444{ 1445 u64 ct; 1446 1447 if (single_24x7_request(event, &ct)) 
1448 /* We checked this in event init, shouldn't fail here... */ 1449 return 0; 1450 1451 return ct; 1452} 1453 1454static void update_event_count(struct perf_event *event, u64 now) 1455{ 1456 s64 prev; 1457 1458 prev = local64_xchg(&event->hw.prev_count, now); 1459 local64_add(now - prev, &event->count); 1460} 1461 1462static void h_24x7_event_read(struct perf_event *event) 1463{ 1464 u64 now; 1465 struct hv_24x7_request_buffer *request_buffer; 1466 struct hv_24x7_hw *h24x7hw; 1467 int txn_flags; 1468 1469 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1470 1471 /* 1472 * If in a READ transaction, add this counter to the list of 1473 * counters to read during the next HCALL (i.e commit_txn()). 1474 * If not in a READ transaction, go ahead and make the HCALL 1475 * to read this counter by itself. 1476 */ 1477 1478 if (txn_flags & PERF_PMU_TXN_READ) { 1479 int i; 1480 int ret; 1481 1482 if (__this_cpu_read(hv_24x7_txn_err)) 1483 return; 1484 1485 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1486 1487 ret = add_event_to_24x7_request(event, request_buffer); 1488 if (ret) { 1489 __this_cpu_write(hv_24x7_txn_err, ret); 1490 } else { 1491 /* 1492 * Associate the event with the HCALL request index, 1493 * so ->commit_txn() can quickly find/update count. 
1494 */ 1495 i = request_buffer->num_requests - 1; 1496 1497 h24x7hw = &get_cpu_var(hv_24x7_hw); 1498 h24x7hw->events[i] = event; 1499 put_cpu_var(h24x7hw); 1500 } 1501 1502 put_cpu_var(hv_24x7_reqb); 1503 } else { 1504 now = h_24x7_get_value(event); 1505 update_event_count(event, now); 1506 } 1507} 1508 1509static void h_24x7_event_start(struct perf_event *event, int flags) 1510{ 1511 if (flags & PERF_EF_RELOAD) 1512 local64_set(&event->hw.prev_count, h_24x7_get_value(event)); 1513} 1514 1515static void h_24x7_event_stop(struct perf_event *event, int flags) 1516{ 1517 h_24x7_event_read(event); 1518} 1519 1520static int h_24x7_event_add(struct perf_event *event, int flags) 1521{ 1522 if (flags & PERF_EF_START) 1523 h_24x7_event_start(event, flags); 1524 1525 return 0; 1526} 1527 1528/* 1529 * 24x7 counters only support READ transactions. They are 1530 * always counting and dont need/support ADD transactions. 1531 * Cache the flags, but otherwise ignore transactions that 1532 * are not PERF_PMU_TXN_READ. 1533 */ 1534static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags) 1535{ 1536 struct hv_24x7_request_buffer *request_buffer; 1537 struct hv_24x7_data_result_buffer *result_buffer; 1538 1539 /* We should not be called if we are already in a txn */ 1540 WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags)); 1541 1542 __this_cpu_write(hv_24x7_txn_flags, flags); 1543 if (flags & ~PERF_PMU_TXN_READ) 1544 return; 1545 1546 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1547 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1548 1549 init_24x7_request(request_buffer, result_buffer); 1550 1551 put_cpu_var(hv_24x7_resb); 1552 put_cpu_var(hv_24x7_reqb); 1553} 1554 1555/* 1556 * Clean up transaction state. 1557 * 1558 * NOTE: Ignore state of request and result buffers for now. 1559 * We will initialize them during the next read/txn. 
1560 */ 1561static void reset_txn(void) 1562{ 1563 __this_cpu_write(hv_24x7_txn_flags, 0); 1564 __this_cpu_write(hv_24x7_txn_err, 0); 1565} 1566 1567/* 1568 * 24x7 counters only support READ transactions. They are always counting 1569 * and dont need/support ADD transactions. Clear ->txn_flags but otherwise 1570 * ignore transactions that are not of type PERF_PMU_TXN_READ. 1571 * 1572 * For READ transactions, submit all pending 24x7 requests (i.e requests 1573 * that were queued by h_24x7_event_read()), to the hypervisor and update 1574 * the event counts. 1575 */ 1576static int h_24x7_event_commit_txn(struct pmu *pmu) 1577{ 1578 struct hv_24x7_request_buffer *request_buffer; 1579 struct hv_24x7_data_result_buffer *result_buffer; 1580 struct hv_24x7_result *res, *next_res; 1581 u64 count; 1582 int i, ret, txn_flags; 1583 struct hv_24x7_hw *h24x7hw; 1584 1585 txn_flags = __this_cpu_read(hv_24x7_txn_flags); 1586 WARN_ON_ONCE(!txn_flags); 1587 1588 ret = 0; 1589 if (txn_flags & ~PERF_PMU_TXN_READ) 1590 goto out; 1591 1592 ret = __this_cpu_read(hv_24x7_txn_err); 1593 if (ret) 1594 goto out; 1595 1596 request_buffer = (void *)get_cpu_var(hv_24x7_reqb); 1597 result_buffer = (void *)get_cpu_var(hv_24x7_resb); 1598 1599 ret = make_24x7_request(request_buffer, result_buffer); 1600 if (ret) 1601 goto put_reqb; 1602 1603 h24x7hw = &get_cpu_var(hv_24x7_hw); 1604 1605 /* Go through results in the result buffer to update event counts. 
*/ 1606 for (i = 0, res = result_buffer->results; 1607 i < result_buffer->num_results; i++, res = next_res) { 1608 struct perf_event *event = h24x7hw->events[res->result_ix]; 1609 1610 ret = get_count_from_result(event, result_buffer, res, &count, 1611 &next_res); 1612 if (ret) 1613 break; 1614 1615 update_event_count(event, count); 1616 } 1617 1618 put_cpu_var(hv_24x7_hw); 1619 1620put_reqb: 1621 put_cpu_var(hv_24x7_resb); 1622 put_cpu_var(hv_24x7_reqb); 1623out: 1624 reset_txn(); 1625 return ret; 1626} 1627 1628/* 1629 * 24x7 counters only support READ transactions. They are always counting 1630 * and dont need/support ADD transactions. However, regardless of type 1631 * of transaction, all we need to do is cleanup, so we don't have to check 1632 * the type of transaction. 1633 */ 1634static void h_24x7_event_cancel_txn(struct pmu *pmu) 1635{ 1636 WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags)); 1637 reset_txn(); 1638} 1639 1640static struct pmu h_24x7_pmu = { 1641 .task_ctx_nr = perf_invalid_context, 1642 1643 .name = "hv_24x7", 1644 .attr_groups = attr_groups, 1645 .event_init = h_24x7_event_init, 1646 .add = h_24x7_event_add, 1647 .del = h_24x7_event_stop, 1648 .start = h_24x7_event_start, 1649 .stop = h_24x7_event_stop, 1650 .read = h_24x7_event_read, 1651 .start_txn = h_24x7_event_start_txn, 1652 .commit_txn = h_24x7_event_commit_txn, 1653 .cancel_txn = h_24x7_event_cancel_txn, 1654 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 1655}; 1656 1657static int ppc_hv_24x7_cpu_online(unsigned int cpu) 1658{ 1659 if (cpumask_empty(&hv_24x7_cpumask)) 1660 cpumask_set_cpu(cpu, &hv_24x7_cpumask); 1661 1662 return 0; 1663} 1664 1665static int ppc_hv_24x7_cpu_offline(unsigned int cpu) 1666{ 1667 int target; 1668 1669 /* Check if exiting cpu is used for collecting 24x7 events */ 1670 if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask)) 1671 return 0; 1672 1673 /* Find a new cpu to collect 24x7 events */ 1674 target = cpumask_last(cpu_active_mask); 1675 1676 if (target < 0 
|| target >= nr_cpu_ids) { 1677 pr_err("hv_24x7: CPU hotplug init failed\n"); 1678 return -1; 1679 } 1680 1681 /* Migrate 24x7 events to the new target */ 1682 cpumask_set_cpu(target, &hv_24x7_cpumask); 1683 perf_pmu_migrate_context(&h_24x7_pmu, cpu, target); 1684 1685 return 0; 1686} 1687 1688static int hv_24x7_cpu_hotplug_init(void) 1689{ 1690 return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, 1691 "perf/powerpc/hv_24x7:online", 1692 ppc_hv_24x7_cpu_online, 1693 ppc_hv_24x7_cpu_offline); 1694} 1695 1696static int hv_24x7_init(void) 1697{ 1698 int r; 1699 unsigned long hret; 1700 struct hv_perf_caps caps; 1701 1702 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 1703 pr_debug("not a virtualized system, not enabling\n"); 1704 return -ENODEV; 1705 } else if (!cur_cpu_spec->oprofile_cpu_type) 1706 return -ENODEV; 1707 1708 /* POWER8 only supports v1, while POWER9 only supports v2. */ 1709 if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) 1710 interface_version = 1; 1711 else { 1712 interface_version = 2; 1713 1714 /* SMT8 in POWER9 needs to aggregate result elements. 
*/ 1715 if (threads_per_core == 8) 1716 aggregate_result_elements = true; 1717 } 1718 1719 hret = hv_perf_caps_get(&caps); 1720 if (hret) { 1721 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n", 1722 hret); 1723 return -ENODEV; 1724 } 1725 1726 hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL); 1727 if (!hv_page_cache) 1728 return -ENOMEM; 1729 1730 /* sampling not supported */ 1731 h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1732 1733 r = create_events_from_catalog(&event_group.attrs, 1734 &event_desc_group.attrs, 1735 &event_long_desc_group.attrs); 1736 1737 if (r) 1738 return r; 1739 1740 /* init cpuhotplug */ 1741 r = hv_24x7_cpu_hotplug_init(); 1742 if (r) 1743 return r; 1744 1745 r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1); 1746 if (r) 1747 return r; 1748 1749 read_24x7_sys_info(); 1750 1751 return 0; 1752} 1753 1754device_initcall(hv_24x7_init); 1755