18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Ring buffer operations. 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2020 Facebook, Inc. 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci#ifndef _GNU_SOURCE 88c2ecf20Sopenharmony_ci#define _GNU_SOURCE 98c2ecf20Sopenharmony_ci#endif 108c2ecf20Sopenharmony_ci#include <stdlib.h> 118c2ecf20Sopenharmony_ci#include <stdio.h> 128c2ecf20Sopenharmony_ci#include <errno.h> 138c2ecf20Sopenharmony_ci#include <unistd.h> 148c2ecf20Sopenharmony_ci#include <linux/err.h> 158c2ecf20Sopenharmony_ci#include <linux/bpf.h> 168c2ecf20Sopenharmony_ci#include <asm/barrier.h> 178c2ecf20Sopenharmony_ci#include <sys/mman.h> 188c2ecf20Sopenharmony_ci#include <sys/epoll.h> 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_ci#include "libbpf.h" 218c2ecf20Sopenharmony_ci#include "libbpf_internal.h" 228c2ecf20Sopenharmony_ci#include "bpf.h" 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_cistruct ring { 258c2ecf20Sopenharmony_ci ring_buffer_sample_fn sample_cb; 268c2ecf20Sopenharmony_ci void *ctx; 278c2ecf20Sopenharmony_ci void *data; 288c2ecf20Sopenharmony_ci unsigned long *consumer_pos; 298c2ecf20Sopenharmony_ci unsigned long *producer_pos; 308c2ecf20Sopenharmony_ci unsigned long mask; 318c2ecf20Sopenharmony_ci int map_fd; 328c2ecf20Sopenharmony_ci}; 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_cistruct ring_buffer { 358c2ecf20Sopenharmony_ci struct epoll_event *events; 368c2ecf20Sopenharmony_ci struct ring *rings; 378c2ecf20Sopenharmony_ci size_t page_size; 388c2ecf20Sopenharmony_ci int epoll_fd; 398c2ecf20Sopenharmony_ci int ring_cnt; 408c2ecf20Sopenharmony_ci}; 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_cistatic void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) 438c2ecf20Sopenharmony_ci{ 448c2ecf20Sopenharmony_ci if (r->consumer_pos) { 458c2ecf20Sopenharmony_ci munmap(r->consumer_pos, rb->page_size); 468c2ecf20Sopenharmony_ci r->consumer_pos = NULL; 478c2ecf20Sopenharmony_ci } 488c2ecf20Sopenharmony_ci if (r->producer_pos) { 498c2ecf20Sopenharmony_ci munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); 508c2ecf20Sopenharmony_ci r->producer_pos = NULL; 518c2ecf20Sopenharmony_ci } 528c2ecf20Sopenharmony_ci} 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci/* Add extra RINGBUF maps to this ring buffer manager */ 558c2ecf20Sopenharmony_ciint ring_buffer__add(struct ring_buffer *rb, int map_fd, 568c2ecf20Sopenharmony_ci ring_buffer_sample_fn sample_cb, void *ctx) 578c2ecf20Sopenharmony_ci{ 588c2ecf20Sopenharmony_ci struct bpf_map_info info; 598c2ecf20Sopenharmony_ci __u32 len = sizeof(info); 608c2ecf20Sopenharmony_ci struct epoll_event *e; 618c2ecf20Sopenharmony_ci struct ring *r; 628c2ecf20Sopenharmony_ci __u64 mmap_sz; 638c2ecf20Sopenharmony_ci void *tmp; 648c2ecf20Sopenharmony_ci int err; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci memset(&info, 0, sizeof(info)); 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci err = bpf_obj_get_info_by_fd(map_fd, &info, &len); 698c2ecf20Sopenharmony_ci if (err) { 708c2ecf20Sopenharmony_ci err = -errno; 718c2ecf20Sopenharmony_ci pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", 728c2ecf20Sopenharmony_ci map_fd, err); 738c2ecf20Sopenharmony_ci return err; 748c2ecf20Sopenharmony_ci } 758c2ecf20Sopenharmony_ci 768c2ecf20Sopenharmony_ci if (info.type != BPF_MAP_TYPE_RINGBUF) { 778c2ecf20Sopenharmony_ci pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", 788c2ecf20Sopenharmony_ci map_fd); 798c2ecf20Sopenharmony_ci return -EINVAL; 808c2ecf20Sopenharmony_ci } 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci tmp = libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings)); 838c2ecf20Sopenharmony_ci if (!tmp) 848c2ecf20Sopenharmony_ci return -ENOMEM; 858c2ecf20Sopenharmony_ci rb->rings = tmp; 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_ci tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); 888c2ecf20Sopenharmony_ci if (!tmp) 898c2ecf20Sopenharmony_ci return -ENOMEM; 908c2ecf20Sopenharmony_ci rb->events = tmp; 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci r = &rb->rings[rb->ring_cnt]; 938c2ecf20Sopenharmony_ci memset(r, 0, sizeof(*r)); 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci r->map_fd = map_fd; 968c2ecf20Sopenharmony_ci r->sample_cb = sample_cb; 978c2ecf20Sopenharmony_ci r->ctx = ctx; 988c2ecf20Sopenharmony_ci r->mask = info.max_entries - 1; 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci /* Map writable consumer page */ 1018c2ecf20Sopenharmony_ci tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); 1028c2ecf20Sopenharmony_ci if (tmp == MAP_FAILED) { 1038c2ecf20Sopenharmony_ci err = -errno; 1048c2ecf20Sopenharmony_ci pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", 1058c2ecf20Sopenharmony_ci map_fd, err); 1068c2ecf20Sopenharmony_ci return err; 1078c2ecf20Sopenharmony_ci } 1088c2ecf20Sopenharmony_ci r->consumer_pos = tmp; 1098c2ecf20Sopenharmony_ci 1108c2ecf20Sopenharmony_ci /* Map read-only producer page and data pages. We map twice as big 1118c2ecf20Sopenharmony_ci * data size to allow simple reading of samples that wrap around the 1128c2ecf20Sopenharmony_ci * end of a ring buffer. See kernel implementation for details. 1138c2ecf20Sopenharmony_ci * */ 1148c2ecf20Sopenharmony_ci mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; 1158c2ecf20Sopenharmony_ci if (mmap_sz != (__u64)(size_t)mmap_sz) { 1168c2ecf20Sopenharmony_ci pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); 1178c2ecf20Sopenharmony_ci return -E2BIG; 1188c2ecf20Sopenharmony_ci } 1198c2ecf20Sopenharmony_ci tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); 1208c2ecf20Sopenharmony_ci if (tmp == MAP_FAILED) { 1218c2ecf20Sopenharmony_ci err = -errno; 1228c2ecf20Sopenharmony_ci ringbuf_unmap_ring(rb, r); 1238c2ecf20Sopenharmony_ci pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", 1248c2ecf20Sopenharmony_ci map_fd, err); 1258c2ecf20Sopenharmony_ci return err; 1268c2ecf20Sopenharmony_ci } 1278c2ecf20Sopenharmony_ci r->producer_pos = tmp; 1288c2ecf20Sopenharmony_ci r->data = tmp + rb->page_size; 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ci e = &rb->events[rb->ring_cnt]; 1318c2ecf20Sopenharmony_ci memset(e, 0, sizeof(*e)); 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci e->events = EPOLLIN; 1348c2ecf20Sopenharmony_ci e->data.fd = rb->ring_cnt; 1358c2ecf20Sopenharmony_ci if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { 1368c2ecf20Sopenharmony_ci err = -errno; 1378c2ecf20Sopenharmony_ci ringbuf_unmap_ring(rb, r); 1388c2ecf20Sopenharmony_ci pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", 1398c2ecf20Sopenharmony_ci map_fd, err); 1408c2ecf20Sopenharmony_ci return err; 1418c2ecf20Sopenharmony_ci } 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci rb->ring_cnt++; 1448c2ecf20Sopenharmony_ci return 0; 1458c2ecf20Sopenharmony_ci} 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_civoid ring_buffer__free(struct ring_buffer *rb) 1488c2ecf20Sopenharmony_ci{ 1498c2ecf20Sopenharmony_ci int i; 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci if (!rb) 1528c2ecf20Sopenharmony_ci return; 1538c2ecf20Sopenharmony_ci 1548c2ecf20Sopenharmony_ci for (i = 0; i < rb->ring_cnt; ++i) 1558c2ecf20Sopenharmony_ci ringbuf_unmap_ring(rb, &rb->rings[i]); 1568c2ecf20Sopenharmony_ci if (rb->epoll_fd >= 0) 1578c2ecf20Sopenharmony_ci close(rb->epoll_fd); 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci free(rb->events); 1608c2ecf20Sopenharmony_ci free(rb->rings); 1618c2ecf20Sopenharmony_ci free(rb); 1628c2ecf20Sopenharmony_ci} 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_cistruct ring_buffer * 1658c2ecf20Sopenharmony_ciring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, 1668c2ecf20Sopenharmony_ci const struct ring_buffer_opts *opts) 1678c2ecf20Sopenharmony_ci{ 1688c2ecf20Sopenharmony_ci struct ring_buffer *rb; 1698c2ecf20Sopenharmony_ci int err; 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci if (!OPTS_VALID(opts, ring_buffer_opts)) 1728c2ecf20Sopenharmony_ci return NULL; 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_ci rb = calloc(1, sizeof(*rb)); 1758c2ecf20Sopenharmony_ci if (!rb) 1768c2ecf20Sopenharmony_ci return NULL; 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci rb->page_size = getpagesize(); 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_ci rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); 1818c2ecf20Sopenharmony_ci if (rb->epoll_fd < 0) { 1828c2ecf20Sopenharmony_ci err = -errno; 1838c2ecf20Sopenharmony_ci pr_warn("ringbuf: failed to create epoll instance: %d\n", err); 1848c2ecf20Sopenharmony_ci goto err_out; 1858c2ecf20Sopenharmony_ci } 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci err = ring_buffer__add(rb, map_fd, sample_cb, ctx); 1888c2ecf20Sopenharmony_ci if (err) 1898c2ecf20Sopenharmony_ci goto err_out; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci return rb; 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_cierr_out: 1948c2ecf20Sopenharmony_ci ring_buffer__free(rb); 1958c2ecf20Sopenharmony_ci return NULL; 1968c2ecf20Sopenharmony_ci} 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_cistatic inline int roundup_len(__u32 len) 1998c2ecf20Sopenharmony_ci{ 2008c2ecf20Sopenharmony_ci /* clear out top 2 bits (discard and busy, if set) */ 2018c2ecf20Sopenharmony_ci len <<= 2; 2028c2ecf20Sopenharmony_ci len >>= 2; 2038c2ecf20Sopenharmony_ci /* add length prefix */ 2048c2ecf20Sopenharmony_ci len += BPF_RINGBUF_HDR_SZ; 2058c2ecf20Sopenharmony_ci /* round up to 8 byte alignment */ 2068c2ecf20Sopenharmony_ci return (len + 7) / 8 * 8; 2078c2ecf20Sopenharmony_ci} 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_cistatic int64_t ringbuf_process_ring(struct ring* r) 2108c2ecf20Sopenharmony_ci{ 2118c2ecf20Sopenharmony_ci int *len_ptr, len, err; 2128c2ecf20Sopenharmony_ci /* 64-bit to avoid overflow in case of extreme application behavior */ 2138c2ecf20Sopenharmony_ci int64_t cnt = 0; 2148c2ecf20Sopenharmony_ci unsigned long cons_pos, prod_pos; 2158c2ecf20Sopenharmony_ci bool got_new_data; 2168c2ecf20Sopenharmony_ci void *sample; 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_ci cons_pos = smp_load_acquire(r->consumer_pos); 2198c2ecf20Sopenharmony_ci do { 2208c2ecf20Sopenharmony_ci got_new_data = false; 2218c2ecf20Sopenharmony_ci prod_pos = smp_load_acquire(r->producer_pos); 2228c2ecf20Sopenharmony_ci while (cons_pos < prod_pos) { 2238c2ecf20Sopenharmony_ci len_ptr = r->data + (cons_pos & r->mask); 2248c2ecf20Sopenharmony_ci len = smp_load_acquire(len_ptr); 2258c2ecf20Sopenharmony_ci 2268c2ecf20Sopenharmony_ci /* sample not committed yet, bail out for now */ 2278c2ecf20Sopenharmony_ci if (len & BPF_RINGBUF_BUSY_BIT) 2288c2ecf20Sopenharmony_ci goto done; 2298c2ecf20Sopenharmony_ci 2308c2ecf20Sopenharmony_ci got_new_data = true; 2318c2ecf20Sopenharmony_ci cons_pos += roundup_len(len); 2328c2ecf20Sopenharmony_ci 2338c2ecf20Sopenharmony_ci if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { 2348c2ecf20Sopenharmony_ci sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; 2358c2ecf20Sopenharmony_ci err = r->sample_cb(r->ctx, sample, len); 2368c2ecf20Sopenharmony_ci if (err < 0) { 2378c2ecf20Sopenharmony_ci /* update consumer pos and bail out */ 2388c2ecf20Sopenharmony_ci smp_store_release(r->consumer_pos, 2398c2ecf20Sopenharmony_ci cons_pos); 2408c2ecf20Sopenharmony_ci return err; 2418c2ecf20Sopenharmony_ci } 2428c2ecf20Sopenharmony_ci cnt++; 2438c2ecf20Sopenharmony_ci } 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ci smp_store_release(r->consumer_pos, cons_pos); 2468c2ecf20Sopenharmony_ci } 2478c2ecf20Sopenharmony_ci } while (got_new_data); 2488c2ecf20Sopenharmony_cidone: 2498c2ecf20Sopenharmony_ci return cnt; 2508c2ecf20Sopenharmony_ci} 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_ci/* Consume available ring buffer(s) data without event polling. 2538c2ecf20Sopenharmony_ci * Returns number of records consumed across all registered ring buffers (or 2548c2ecf20Sopenharmony_ci * INT_MAX, whichever is less), or negative number if any of the callbacks 2558c2ecf20Sopenharmony_ci * return error. 2568c2ecf20Sopenharmony_ci */ 2578c2ecf20Sopenharmony_ciint ring_buffer__consume(struct ring_buffer *rb) 2588c2ecf20Sopenharmony_ci{ 2598c2ecf20Sopenharmony_ci int64_t err, res = 0; 2608c2ecf20Sopenharmony_ci int i; 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_ci for (i = 0; i < rb->ring_cnt; i++) { 2638c2ecf20Sopenharmony_ci struct ring *ring = &rb->rings[i]; 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci err = ringbuf_process_ring(ring); 2668c2ecf20Sopenharmony_ci if (err < 0) 2678c2ecf20Sopenharmony_ci return err; 2688c2ecf20Sopenharmony_ci res += err; 2698c2ecf20Sopenharmony_ci } 2708c2ecf20Sopenharmony_ci if (res > INT_MAX) 2718c2ecf20Sopenharmony_ci return INT_MAX; 2728c2ecf20Sopenharmony_ci return res; 2738c2ecf20Sopenharmony_ci} 2748c2ecf20Sopenharmony_ci 2758c2ecf20Sopenharmony_ci/* Poll for available data and consume records, if any are available. 2768c2ecf20Sopenharmony_ci * Returns number of records consumed (or INT_MAX, whichever is less), or 2778c2ecf20Sopenharmony_ci * negative number, if any of the registered callbacks returned error. 2788c2ecf20Sopenharmony_ci */ 2798c2ecf20Sopenharmony_ciint ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) 2808c2ecf20Sopenharmony_ci{ 2818c2ecf20Sopenharmony_ci int i, cnt; 2828c2ecf20Sopenharmony_ci int64_t err, res = 0; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); 2858c2ecf20Sopenharmony_ci if (cnt < 0) 2868c2ecf20Sopenharmony_ci return -errno; 2878c2ecf20Sopenharmony_ci 2888c2ecf20Sopenharmony_ci for (i = 0; i < cnt; i++) { 2898c2ecf20Sopenharmony_ci __u32 ring_id = rb->events[i].data.fd; 2908c2ecf20Sopenharmony_ci struct ring *ring = &rb->rings[ring_id]; 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_ci err = ringbuf_process_ring(ring); 2938c2ecf20Sopenharmony_ci if (err < 0) 2948c2ecf20Sopenharmony_ci return err; 2958c2ecf20Sopenharmony_ci res += err; 2968c2ecf20Sopenharmony_ci } 2978c2ecf20Sopenharmony_ci if (res > INT_MAX) 2988c2ecf20Sopenharmony_ci return INT_MAX; 2998c2ecf20Sopenharmony_ci return res; 3008c2ecf20Sopenharmony_ci} 301