xref: /kernel/linux/linux-5.10/tools/lib/bpf/ringbuf.c (revision 8c2ecf20)
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
 * Ring buffer operations.
 *
 * Copyright (C) 2020 Facebook, Inc.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <unistd.h>
#include <linux/err.h>
#include <linux/bpf.h>
#include <asm/barrier.h>
#include <sys/mman.h>
#include <sys/epoll.h>

#include "libbpf.h"
#include "libbpf_internal.h"
#include "bpf.h"
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_cistruct ring {
258c2ecf20Sopenharmony_ci	ring_buffer_sample_fn sample_cb;
268c2ecf20Sopenharmony_ci	void *ctx;
278c2ecf20Sopenharmony_ci	void *data;
288c2ecf20Sopenharmony_ci	unsigned long *consumer_pos;
298c2ecf20Sopenharmony_ci	unsigned long *producer_pos;
308c2ecf20Sopenharmony_ci	unsigned long mask;
318c2ecf20Sopenharmony_ci	int map_fd;
328c2ecf20Sopenharmony_ci};
338c2ecf20Sopenharmony_ci
/* Manager that watches any number of rings through one epoll instance. */
struct ring_buffer {
	/* one slot per ring; also the output array for epoll_wait() */
	struct epoll_event *events;
	/* ring_cnt entries; the index is stored in the epoll event data */
	struct ring *rings;
	/* value of getpagesize() taken when the manager was created */
	size_t page_size;
	/* epoll instance every ring's map fd is registered with */
	int epoll_fd;
	/* number of rings successfully added so far */
	int ring_cnt;
};
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_cistatic void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
438c2ecf20Sopenharmony_ci{
448c2ecf20Sopenharmony_ci	if (r->consumer_pos) {
458c2ecf20Sopenharmony_ci		munmap(r->consumer_pos, rb->page_size);
468c2ecf20Sopenharmony_ci		r->consumer_pos = NULL;
478c2ecf20Sopenharmony_ci	}
488c2ecf20Sopenharmony_ci	if (r->producer_pos) {
498c2ecf20Sopenharmony_ci		munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1));
508c2ecf20Sopenharmony_ci		r->producer_pos = NULL;
518c2ecf20Sopenharmony_ci	}
528c2ecf20Sopenharmony_ci}
538c2ecf20Sopenharmony_ci
548c2ecf20Sopenharmony_ci/* Add extra RINGBUF maps to this ring buffer manager */
558c2ecf20Sopenharmony_ciint ring_buffer__add(struct ring_buffer *rb, int map_fd,
568c2ecf20Sopenharmony_ci		     ring_buffer_sample_fn sample_cb, void *ctx)
578c2ecf20Sopenharmony_ci{
588c2ecf20Sopenharmony_ci	struct bpf_map_info info;
598c2ecf20Sopenharmony_ci	__u32 len = sizeof(info);
608c2ecf20Sopenharmony_ci	struct epoll_event *e;
618c2ecf20Sopenharmony_ci	struct ring *r;
628c2ecf20Sopenharmony_ci	__u64 mmap_sz;
638c2ecf20Sopenharmony_ci	void *tmp;
648c2ecf20Sopenharmony_ci	int err;
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	memset(&info, 0, sizeof(info));
678c2ecf20Sopenharmony_ci
688c2ecf20Sopenharmony_ci	err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
698c2ecf20Sopenharmony_ci	if (err) {
708c2ecf20Sopenharmony_ci		err = -errno;
718c2ecf20Sopenharmony_ci		pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
728c2ecf20Sopenharmony_ci			map_fd, err);
738c2ecf20Sopenharmony_ci		return err;
748c2ecf20Sopenharmony_ci	}
758c2ecf20Sopenharmony_ci
768c2ecf20Sopenharmony_ci	if (info.type != BPF_MAP_TYPE_RINGBUF) {
778c2ecf20Sopenharmony_ci		pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n",
788c2ecf20Sopenharmony_ci			map_fd);
798c2ecf20Sopenharmony_ci		return -EINVAL;
808c2ecf20Sopenharmony_ci	}
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_ci	tmp = libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings));
838c2ecf20Sopenharmony_ci	if (!tmp)
848c2ecf20Sopenharmony_ci		return -ENOMEM;
858c2ecf20Sopenharmony_ci	rb->rings = tmp;
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_ci	tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events));
888c2ecf20Sopenharmony_ci	if (!tmp)
898c2ecf20Sopenharmony_ci		return -ENOMEM;
908c2ecf20Sopenharmony_ci	rb->events = tmp;
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci	r = &rb->rings[rb->ring_cnt];
938c2ecf20Sopenharmony_ci	memset(r, 0, sizeof(*r));
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_ci	r->map_fd = map_fd;
968c2ecf20Sopenharmony_ci	r->sample_cb = sample_cb;
978c2ecf20Sopenharmony_ci	r->ctx = ctx;
988c2ecf20Sopenharmony_ci	r->mask = info.max_entries - 1;
998c2ecf20Sopenharmony_ci
1008c2ecf20Sopenharmony_ci	/* Map writable consumer page */
1018c2ecf20Sopenharmony_ci	tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
1028c2ecf20Sopenharmony_ci	if (tmp == MAP_FAILED) {
1038c2ecf20Sopenharmony_ci		err = -errno;
1048c2ecf20Sopenharmony_ci		pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
1058c2ecf20Sopenharmony_ci			map_fd, err);
1068c2ecf20Sopenharmony_ci		return err;
1078c2ecf20Sopenharmony_ci	}
1088c2ecf20Sopenharmony_ci	r->consumer_pos = tmp;
1098c2ecf20Sopenharmony_ci
1108c2ecf20Sopenharmony_ci	/* Map read-only producer page and data pages. We map twice as big
1118c2ecf20Sopenharmony_ci	 * data size to allow simple reading of samples that wrap around the
1128c2ecf20Sopenharmony_ci	 * end of a ring buffer. See kernel implementation for details.
1138c2ecf20Sopenharmony_ci	 * */
1148c2ecf20Sopenharmony_ci	mmap_sz = rb->page_size + 2 * (__u64)info.max_entries;
1158c2ecf20Sopenharmony_ci	if (mmap_sz != (__u64)(size_t)mmap_sz) {
1168c2ecf20Sopenharmony_ci		pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries);
1178c2ecf20Sopenharmony_ci		return -E2BIG;
1188c2ecf20Sopenharmony_ci	}
1198c2ecf20Sopenharmony_ci	tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size);
1208c2ecf20Sopenharmony_ci	if (tmp == MAP_FAILED) {
1218c2ecf20Sopenharmony_ci		err = -errno;
1228c2ecf20Sopenharmony_ci		ringbuf_unmap_ring(rb, r);
1238c2ecf20Sopenharmony_ci		pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
1248c2ecf20Sopenharmony_ci			map_fd, err);
1258c2ecf20Sopenharmony_ci		return err;
1268c2ecf20Sopenharmony_ci	}
1278c2ecf20Sopenharmony_ci	r->producer_pos = tmp;
1288c2ecf20Sopenharmony_ci	r->data = tmp + rb->page_size;
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci	e = &rb->events[rb->ring_cnt];
1318c2ecf20Sopenharmony_ci	memset(e, 0, sizeof(*e));
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	e->events = EPOLLIN;
1348c2ecf20Sopenharmony_ci	e->data.fd = rb->ring_cnt;
1358c2ecf20Sopenharmony_ci	if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) {
1368c2ecf20Sopenharmony_ci		err = -errno;
1378c2ecf20Sopenharmony_ci		ringbuf_unmap_ring(rb, r);
1388c2ecf20Sopenharmony_ci		pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
1398c2ecf20Sopenharmony_ci			map_fd, err);
1408c2ecf20Sopenharmony_ci		return err;
1418c2ecf20Sopenharmony_ci	}
1428c2ecf20Sopenharmony_ci
1438c2ecf20Sopenharmony_ci	rb->ring_cnt++;
1448c2ecf20Sopenharmony_ci	return 0;
1458c2ecf20Sopenharmony_ci}
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_civoid ring_buffer__free(struct ring_buffer *rb)
1488c2ecf20Sopenharmony_ci{
1498c2ecf20Sopenharmony_ci	int i;
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci	if (!rb)
1528c2ecf20Sopenharmony_ci		return;
1538c2ecf20Sopenharmony_ci
1548c2ecf20Sopenharmony_ci	for (i = 0; i < rb->ring_cnt; ++i)
1558c2ecf20Sopenharmony_ci		ringbuf_unmap_ring(rb, &rb->rings[i]);
1568c2ecf20Sopenharmony_ci	if (rb->epoll_fd >= 0)
1578c2ecf20Sopenharmony_ci		close(rb->epoll_fd);
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_ci	free(rb->events);
1608c2ecf20Sopenharmony_ci	free(rb->rings);
1618c2ecf20Sopenharmony_ci	free(rb);
1628c2ecf20Sopenharmony_ci}
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_cistruct ring_buffer *
1658c2ecf20Sopenharmony_ciring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
1668c2ecf20Sopenharmony_ci		 const struct ring_buffer_opts *opts)
1678c2ecf20Sopenharmony_ci{
1688c2ecf20Sopenharmony_ci	struct ring_buffer *rb;
1698c2ecf20Sopenharmony_ci	int err;
1708c2ecf20Sopenharmony_ci
1718c2ecf20Sopenharmony_ci	if (!OPTS_VALID(opts, ring_buffer_opts))
1728c2ecf20Sopenharmony_ci		return NULL;
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci	rb = calloc(1, sizeof(*rb));
1758c2ecf20Sopenharmony_ci	if (!rb)
1768c2ecf20Sopenharmony_ci		return NULL;
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	rb->page_size = getpagesize();
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci	rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
1818c2ecf20Sopenharmony_ci	if (rb->epoll_fd < 0) {
1828c2ecf20Sopenharmony_ci		err = -errno;
1838c2ecf20Sopenharmony_ci		pr_warn("ringbuf: failed to create epoll instance: %d\n", err);
1848c2ecf20Sopenharmony_ci		goto err_out;
1858c2ecf20Sopenharmony_ci	}
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci	err = ring_buffer__add(rb, map_fd, sample_cb, ctx);
1888c2ecf20Sopenharmony_ci	if (err)
1898c2ecf20Sopenharmony_ci		goto err_out;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	return rb;
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_cierr_out:
1948c2ecf20Sopenharmony_ci	ring_buffer__free(rb);
1958c2ecf20Sopenharmony_ci	return NULL;
1968c2ecf20Sopenharmony_ci}
1978c2ecf20Sopenharmony_ci
/*
 * Turn a raw sample header length word into the number of bytes the record
 * occupies in the ring: flag bits removed, header size added, result rounded
 * up to a multiple of 8.
 */
static inline int roundup_len(__u32 len)
{
	/* clear out top 2 bits (discard and busy, if set) */
	len <<= 2;
	len >>= 2;
	/* add length prefix */
	len += BPF_RINGBUF_HDR_SZ;
	/* round up to 8 byte alignment */
	return (len + 7) / 8 * 8;
}
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_cistatic int64_t ringbuf_process_ring(struct ring* r)
2108c2ecf20Sopenharmony_ci{
2118c2ecf20Sopenharmony_ci	int *len_ptr, len, err;
2128c2ecf20Sopenharmony_ci	/* 64-bit to avoid overflow in case of extreme application behavior */
2138c2ecf20Sopenharmony_ci	int64_t cnt = 0;
2148c2ecf20Sopenharmony_ci	unsigned long cons_pos, prod_pos;
2158c2ecf20Sopenharmony_ci	bool got_new_data;
2168c2ecf20Sopenharmony_ci	void *sample;
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	cons_pos = smp_load_acquire(r->consumer_pos);
2198c2ecf20Sopenharmony_ci	do {
2208c2ecf20Sopenharmony_ci		got_new_data = false;
2218c2ecf20Sopenharmony_ci		prod_pos = smp_load_acquire(r->producer_pos);
2228c2ecf20Sopenharmony_ci		while (cons_pos < prod_pos) {
2238c2ecf20Sopenharmony_ci			len_ptr = r->data + (cons_pos & r->mask);
2248c2ecf20Sopenharmony_ci			len = smp_load_acquire(len_ptr);
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_ci			/* sample not committed yet, bail out for now */
2278c2ecf20Sopenharmony_ci			if (len & BPF_RINGBUF_BUSY_BIT)
2288c2ecf20Sopenharmony_ci				goto done;
2298c2ecf20Sopenharmony_ci
2308c2ecf20Sopenharmony_ci			got_new_data = true;
2318c2ecf20Sopenharmony_ci			cons_pos += roundup_len(len);
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_ci			if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
2348c2ecf20Sopenharmony_ci				sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
2358c2ecf20Sopenharmony_ci				err = r->sample_cb(r->ctx, sample, len);
2368c2ecf20Sopenharmony_ci				if (err < 0) {
2378c2ecf20Sopenharmony_ci					/* update consumer pos and bail out */
2388c2ecf20Sopenharmony_ci					smp_store_release(r->consumer_pos,
2398c2ecf20Sopenharmony_ci							  cons_pos);
2408c2ecf20Sopenharmony_ci					return err;
2418c2ecf20Sopenharmony_ci				}
2428c2ecf20Sopenharmony_ci				cnt++;
2438c2ecf20Sopenharmony_ci			}
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci			smp_store_release(r->consumer_pos, cons_pos);
2468c2ecf20Sopenharmony_ci		}
2478c2ecf20Sopenharmony_ci	} while (got_new_data);
2488c2ecf20Sopenharmony_cidone:
2498c2ecf20Sopenharmony_ci	return cnt;
2508c2ecf20Sopenharmony_ci}
2518c2ecf20Sopenharmony_ci
2528c2ecf20Sopenharmony_ci/* Consume available ring buffer(s) data without event polling.
2538c2ecf20Sopenharmony_ci * Returns number of records consumed across all registered ring buffers (or
2548c2ecf20Sopenharmony_ci * INT_MAX, whichever is less), or negative number if any of the callbacks
2558c2ecf20Sopenharmony_ci * return error.
2568c2ecf20Sopenharmony_ci */
2578c2ecf20Sopenharmony_ciint ring_buffer__consume(struct ring_buffer *rb)
2588c2ecf20Sopenharmony_ci{
2598c2ecf20Sopenharmony_ci	int64_t err, res = 0;
2608c2ecf20Sopenharmony_ci	int i;
2618c2ecf20Sopenharmony_ci
2628c2ecf20Sopenharmony_ci	for (i = 0; i < rb->ring_cnt; i++) {
2638c2ecf20Sopenharmony_ci		struct ring *ring = &rb->rings[i];
2648c2ecf20Sopenharmony_ci
2658c2ecf20Sopenharmony_ci		err = ringbuf_process_ring(ring);
2668c2ecf20Sopenharmony_ci		if (err < 0)
2678c2ecf20Sopenharmony_ci			return err;
2688c2ecf20Sopenharmony_ci		res += err;
2698c2ecf20Sopenharmony_ci	}
2708c2ecf20Sopenharmony_ci	if (res > INT_MAX)
2718c2ecf20Sopenharmony_ci		return INT_MAX;
2728c2ecf20Sopenharmony_ci	return res;
2738c2ecf20Sopenharmony_ci}
2748c2ecf20Sopenharmony_ci
2758c2ecf20Sopenharmony_ci/* Poll for available data and consume records, if any are available.
2768c2ecf20Sopenharmony_ci * Returns number of records consumed (or INT_MAX, whichever is less), or
2778c2ecf20Sopenharmony_ci * negative number, if any of the registered callbacks returned error.
2788c2ecf20Sopenharmony_ci */
2798c2ecf20Sopenharmony_ciint ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
2808c2ecf20Sopenharmony_ci{
2818c2ecf20Sopenharmony_ci	int i, cnt;
2828c2ecf20Sopenharmony_ci	int64_t err, res = 0;
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
2858c2ecf20Sopenharmony_ci	if (cnt < 0)
2868c2ecf20Sopenharmony_ci		return -errno;
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ci	for (i = 0; i < cnt; i++) {
2898c2ecf20Sopenharmony_ci		__u32 ring_id = rb->events[i].data.fd;
2908c2ecf20Sopenharmony_ci		struct ring *ring = &rb->rings[ring_id];
2918c2ecf20Sopenharmony_ci
2928c2ecf20Sopenharmony_ci		err = ringbuf_process_ring(ring);
2938c2ecf20Sopenharmony_ci		if (err < 0)
2948c2ecf20Sopenharmony_ci			return err;
2958c2ecf20Sopenharmony_ci		res += err;
2968c2ecf20Sopenharmony_ci	}
2978c2ecf20Sopenharmony_ci	if (res > INT_MAX)
2988c2ecf20Sopenharmony_ci		return INT_MAX;
2998c2ecf20Sopenharmony_ci	return res;
3008c2ecf20Sopenharmony_ci}
301