162306a36Sopenharmony_ci// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Ring buffer operations.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2020 Facebook, Inc.
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci#ifndef _GNU_SOURCE
862306a36Sopenharmony_ci#define _GNU_SOURCE
962306a36Sopenharmony_ci#endif
1062306a36Sopenharmony_ci#include <stdlib.h>
1162306a36Sopenharmony_ci#include <stdio.h>
1262306a36Sopenharmony_ci#include <errno.h>
1362306a36Sopenharmony_ci#include <unistd.h>
1462306a36Sopenharmony_ci#include <linux/err.h>
1562306a36Sopenharmony_ci#include <linux/bpf.h>
1662306a36Sopenharmony_ci#include <asm/barrier.h>
1762306a36Sopenharmony_ci#include <sys/mman.h>
1862306a36Sopenharmony_ci#include <sys/epoll.h>
1962306a36Sopenharmony_ci#include <time.h>
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#include "libbpf.h"
2262306a36Sopenharmony_ci#include "libbpf_internal.h"
2362306a36Sopenharmony_ci#include "bpf.h"
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_cistruct ring {
2662306a36Sopenharmony_ci	ring_buffer_sample_fn sample_cb;
2762306a36Sopenharmony_ci	void *ctx;
2862306a36Sopenharmony_ci	void *data;
2962306a36Sopenharmony_ci	unsigned long *consumer_pos;
3062306a36Sopenharmony_ci	unsigned long *producer_pos;
3162306a36Sopenharmony_ci	unsigned long mask;
3262306a36Sopenharmony_ci	int map_fd;
3362306a36Sopenharmony_ci};
3462306a36Sopenharmony_ci
/* Manager for a set of RINGBUF maps that are polled through one epoll fd. */
struct ring_buffer {
	/* epoll event array, one slot per registered ring */
	struct epoll_event *events;
	/* array of ring_cnt registered rings */
	struct ring *rings;
	/* system page size, cached when the manager is created */
	size_t page_size;
	/* epoll instance all ring map fds are added to */
	int epoll_fd;
	/* number of fully registered rings */
	int ring_cnt;
};
4262306a36Sopenharmony_ci
/* Producer-side state for a single BPF_MAP_TYPE_USER_RINGBUF map. */
struct user_ring_buffer {
	/* epoll event used both for registration and for epoll_wait() */
	struct epoll_event event;
	/* read-only mapping of the consumer position page */
	unsigned long *consumer_pos;
	/* read-write mapping of the producer position page plus data pages */
	unsigned long *producer_pos;
	/* start of the data area, one page past producer_pos */
	void *data;
	/* max_entries - 1; wraps a position into the data area */
	unsigned long mask;
	/* system page size, cached when the buffer is created */
	size_t page_size;
	/* fd of the user ring buffer map */
	int map_fd;
	/* epoll instance used to wait for free space */
	int epoll_fd;
};
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_ci/* 8-byte ring buffer header structure */
5562306a36Sopenharmony_cistruct ringbuf_hdr {
5662306a36Sopenharmony_ci	__u32 len;
5762306a36Sopenharmony_ci	__u32 pad;
5862306a36Sopenharmony_ci};
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_cistatic void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
6162306a36Sopenharmony_ci{
6262306a36Sopenharmony_ci	if (r->consumer_pos) {
6362306a36Sopenharmony_ci		munmap(r->consumer_pos, rb->page_size);
6462306a36Sopenharmony_ci		r->consumer_pos = NULL;
6562306a36Sopenharmony_ci	}
6662306a36Sopenharmony_ci	if (r->producer_pos) {
6762306a36Sopenharmony_ci		munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1));
6862306a36Sopenharmony_ci		r->producer_pos = NULL;
6962306a36Sopenharmony_ci	}
7062306a36Sopenharmony_ci}
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci/* Add extra RINGBUF maps to this ring buffer manager */
7362306a36Sopenharmony_ciint ring_buffer__add(struct ring_buffer *rb, int map_fd,
7462306a36Sopenharmony_ci		     ring_buffer_sample_fn sample_cb, void *ctx)
7562306a36Sopenharmony_ci{
7662306a36Sopenharmony_ci	struct bpf_map_info info;
7762306a36Sopenharmony_ci	__u32 len = sizeof(info);
7862306a36Sopenharmony_ci	struct epoll_event *e;
7962306a36Sopenharmony_ci	struct ring *r;
8062306a36Sopenharmony_ci	__u64 mmap_sz;
8162306a36Sopenharmony_ci	void *tmp;
8262306a36Sopenharmony_ci	int err;
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci	memset(&info, 0, sizeof(info));
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci	err = bpf_map_get_info_by_fd(map_fd, &info, &len);
8762306a36Sopenharmony_ci	if (err) {
8862306a36Sopenharmony_ci		err = -errno;
8962306a36Sopenharmony_ci		pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
9062306a36Sopenharmony_ci			map_fd, err);
9162306a36Sopenharmony_ci		return libbpf_err(err);
9262306a36Sopenharmony_ci	}
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	if (info.type != BPF_MAP_TYPE_RINGBUF) {
9562306a36Sopenharmony_ci		pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n",
9662306a36Sopenharmony_ci			map_fd);
9762306a36Sopenharmony_ci		return libbpf_err(-EINVAL);
9862306a36Sopenharmony_ci	}
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	tmp = libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings));
10162306a36Sopenharmony_ci	if (!tmp)
10262306a36Sopenharmony_ci		return libbpf_err(-ENOMEM);
10362306a36Sopenharmony_ci	rb->rings = tmp;
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_ci	tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events));
10662306a36Sopenharmony_ci	if (!tmp)
10762306a36Sopenharmony_ci		return libbpf_err(-ENOMEM);
10862306a36Sopenharmony_ci	rb->events = tmp;
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	r = &rb->rings[rb->ring_cnt];
11162306a36Sopenharmony_ci	memset(r, 0, sizeof(*r));
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci	r->map_fd = map_fd;
11462306a36Sopenharmony_ci	r->sample_cb = sample_cb;
11562306a36Sopenharmony_ci	r->ctx = ctx;
11662306a36Sopenharmony_ci	r->mask = info.max_entries - 1;
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	/* Map writable consumer page */
11962306a36Sopenharmony_ci	tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
12062306a36Sopenharmony_ci	if (tmp == MAP_FAILED) {
12162306a36Sopenharmony_ci		err = -errno;
12262306a36Sopenharmony_ci		pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
12362306a36Sopenharmony_ci			map_fd, err);
12462306a36Sopenharmony_ci		return libbpf_err(err);
12562306a36Sopenharmony_ci	}
12662306a36Sopenharmony_ci	r->consumer_pos = tmp;
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	/* Map read-only producer page and data pages. We map twice as big
12962306a36Sopenharmony_ci	 * data size to allow simple reading of samples that wrap around the
13062306a36Sopenharmony_ci	 * end of a ring buffer. See kernel implementation for details.
13162306a36Sopenharmony_ci	 */
13262306a36Sopenharmony_ci	mmap_sz = rb->page_size + 2 * (__u64)info.max_entries;
13362306a36Sopenharmony_ci	if (mmap_sz != (__u64)(size_t)mmap_sz) {
13462306a36Sopenharmony_ci		pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries);
13562306a36Sopenharmony_ci		return libbpf_err(-E2BIG);
13662306a36Sopenharmony_ci	}
13762306a36Sopenharmony_ci	tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size);
13862306a36Sopenharmony_ci	if (tmp == MAP_FAILED) {
13962306a36Sopenharmony_ci		err = -errno;
14062306a36Sopenharmony_ci		ringbuf_unmap_ring(rb, r);
14162306a36Sopenharmony_ci		pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
14262306a36Sopenharmony_ci			map_fd, err);
14362306a36Sopenharmony_ci		return libbpf_err(err);
14462306a36Sopenharmony_ci	}
14562306a36Sopenharmony_ci	r->producer_pos = tmp;
14662306a36Sopenharmony_ci	r->data = tmp + rb->page_size;
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_ci	e = &rb->events[rb->ring_cnt];
14962306a36Sopenharmony_ci	memset(e, 0, sizeof(*e));
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	e->events = EPOLLIN;
15262306a36Sopenharmony_ci	e->data.fd = rb->ring_cnt;
15362306a36Sopenharmony_ci	if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) {
15462306a36Sopenharmony_ci		err = -errno;
15562306a36Sopenharmony_ci		ringbuf_unmap_ring(rb, r);
15662306a36Sopenharmony_ci		pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
15762306a36Sopenharmony_ci			map_fd, err);
15862306a36Sopenharmony_ci		return libbpf_err(err);
15962306a36Sopenharmony_ci	}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	rb->ring_cnt++;
16262306a36Sopenharmony_ci	return 0;
16362306a36Sopenharmony_ci}
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_civoid ring_buffer__free(struct ring_buffer *rb)
16662306a36Sopenharmony_ci{
16762306a36Sopenharmony_ci	int i;
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci	if (!rb)
17062306a36Sopenharmony_ci		return;
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci	for (i = 0; i < rb->ring_cnt; ++i)
17362306a36Sopenharmony_ci		ringbuf_unmap_ring(rb, &rb->rings[i]);
17462306a36Sopenharmony_ci	if (rb->epoll_fd >= 0)
17562306a36Sopenharmony_ci		close(rb->epoll_fd);
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ci	free(rb->events);
17862306a36Sopenharmony_ci	free(rb->rings);
17962306a36Sopenharmony_ci	free(rb);
18062306a36Sopenharmony_ci}
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_cistruct ring_buffer *
18362306a36Sopenharmony_ciring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
18462306a36Sopenharmony_ci		 const struct ring_buffer_opts *opts)
18562306a36Sopenharmony_ci{
18662306a36Sopenharmony_ci	struct ring_buffer *rb;
18762306a36Sopenharmony_ci	int err;
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	if (!OPTS_VALID(opts, ring_buffer_opts))
19062306a36Sopenharmony_ci		return errno = EINVAL, NULL;
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	rb = calloc(1, sizeof(*rb));
19362306a36Sopenharmony_ci	if (!rb)
19462306a36Sopenharmony_ci		return errno = ENOMEM, NULL;
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci	rb->page_size = getpagesize();
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
19962306a36Sopenharmony_ci	if (rb->epoll_fd < 0) {
20062306a36Sopenharmony_ci		err = -errno;
20162306a36Sopenharmony_ci		pr_warn("ringbuf: failed to create epoll instance: %d\n", err);
20262306a36Sopenharmony_ci		goto err_out;
20362306a36Sopenharmony_ci	}
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	err = ring_buffer__add(rb, map_fd, sample_cb, ctx);
20662306a36Sopenharmony_ci	if (err)
20762306a36Sopenharmony_ci		goto err_out;
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci	return rb;
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_cierr_out:
21262306a36Sopenharmony_ci	ring_buffer__free(rb);
21362306a36Sopenharmony_ci	return errno = -err, NULL;
21462306a36Sopenharmony_ci}
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_cistatic inline int roundup_len(__u32 len)
21762306a36Sopenharmony_ci{
21862306a36Sopenharmony_ci	/* clear out top 2 bits (discard and busy, if set) */
21962306a36Sopenharmony_ci	len <<= 2;
22062306a36Sopenharmony_ci	len >>= 2;
22162306a36Sopenharmony_ci	/* add length prefix */
22262306a36Sopenharmony_ci	len += BPF_RINGBUF_HDR_SZ;
22362306a36Sopenharmony_ci	/* round up to 8 byte alignment */
22462306a36Sopenharmony_ci	return (len + 7) / 8 * 8;
22562306a36Sopenharmony_ci}
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_cistatic int64_t ringbuf_process_ring(struct ring *r)
22862306a36Sopenharmony_ci{
22962306a36Sopenharmony_ci	int *len_ptr, len, err;
23062306a36Sopenharmony_ci	/* 64-bit to avoid overflow in case of extreme application behavior */
23162306a36Sopenharmony_ci	int64_t cnt = 0;
23262306a36Sopenharmony_ci	unsigned long cons_pos, prod_pos;
23362306a36Sopenharmony_ci	bool got_new_data;
23462306a36Sopenharmony_ci	void *sample;
23562306a36Sopenharmony_ci
23662306a36Sopenharmony_ci	cons_pos = smp_load_acquire(r->consumer_pos);
23762306a36Sopenharmony_ci	do {
23862306a36Sopenharmony_ci		got_new_data = false;
23962306a36Sopenharmony_ci		prod_pos = smp_load_acquire(r->producer_pos);
24062306a36Sopenharmony_ci		while (cons_pos < prod_pos) {
24162306a36Sopenharmony_ci			len_ptr = r->data + (cons_pos & r->mask);
24262306a36Sopenharmony_ci			len = smp_load_acquire(len_ptr);
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci			/* sample not committed yet, bail out for now */
24562306a36Sopenharmony_ci			if (len & BPF_RINGBUF_BUSY_BIT)
24662306a36Sopenharmony_ci				goto done;
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci			got_new_data = true;
24962306a36Sopenharmony_ci			cons_pos += roundup_len(len);
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci			if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
25262306a36Sopenharmony_ci				sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
25362306a36Sopenharmony_ci				err = r->sample_cb(r->ctx, sample, len);
25462306a36Sopenharmony_ci				if (err < 0) {
25562306a36Sopenharmony_ci					/* update consumer pos and bail out */
25662306a36Sopenharmony_ci					smp_store_release(r->consumer_pos,
25762306a36Sopenharmony_ci							  cons_pos);
25862306a36Sopenharmony_ci					return err;
25962306a36Sopenharmony_ci				}
26062306a36Sopenharmony_ci				cnt++;
26162306a36Sopenharmony_ci			}
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ci			smp_store_release(r->consumer_pos, cons_pos);
26462306a36Sopenharmony_ci		}
26562306a36Sopenharmony_ci	} while (got_new_data);
26662306a36Sopenharmony_cidone:
26762306a36Sopenharmony_ci	return cnt;
26862306a36Sopenharmony_ci}
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci/* Consume available ring buffer(s) data without event polling.
27162306a36Sopenharmony_ci * Returns number of records consumed across all registered ring buffers (or
27262306a36Sopenharmony_ci * INT_MAX, whichever is less), or negative number if any of the callbacks
27362306a36Sopenharmony_ci * return error.
27462306a36Sopenharmony_ci */
27562306a36Sopenharmony_ciint ring_buffer__consume(struct ring_buffer *rb)
27662306a36Sopenharmony_ci{
27762306a36Sopenharmony_ci	int64_t err, res = 0;
27862306a36Sopenharmony_ci	int i;
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	for (i = 0; i < rb->ring_cnt; i++) {
28162306a36Sopenharmony_ci		struct ring *ring = &rb->rings[i];
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci		err = ringbuf_process_ring(ring);
28462306a36Sopenharmony_ci		if (err < 0)
28562306a36Sopenharmony_ci			return libbpf_err(err);
28662306a36Sopenharmony_ci		res += err;
28762306a36Sopenharmony_ci	}
28862306a36Sopenharmony_ci	if (res > INT_MAX)
28962306a36Sopenharmony_ci		return INT_MAX;
29062306a36Sopenharmony_ci	return res;
29162306a36Sopenharmony_ci}
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci/* Poll for available data and consume records, if any are available.
29462306a36Sopenharmony_ci * Returns number of records consumed (or INT_MAX, whichever is less), or
29562306a36Sopenharmony_ci * negative number, if any of the registered callbacks returned error.
29662306a36Sopenharmony_ci */
29762306a36Sopenharmony_ciint ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
29862306a36Sopenharmony_ci{
29962306a36Sopenharmony_ci	int i, cnt;
30062306a36Sopenharmony_ci	int64_t err, res = 0;
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
30362306a36Sopenharmony_ci	if (cnt < 0)
30462306a36Sopenharmony_ci		return libbpf_err(-errno);
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	for (i = 0; i < cnt; i++) {
30762306a36Sopenharmony_ci		__u32 ring_id = rb->events[i].data.fd;
30862306a36Sopenharmony_ci		struct ring *ring = &rb->rings[ring_id];
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci		err = ringbuf_process_ring(ring);
31162306a36Sopenharmony_ci		if (err < 0)
31262306a36Sopenharmony_ci			return libbpf_err(err);
31362306a36Sopenharmony_ci		res += err;
31462306a36Sopenharmony_ci	}
31562306a36Sopenharmony_ci	if (res > INT_MAX)
31662306a36Sopenharmony_ci		return INT_MAX;
31762306a36Sopenharmony_ci	return res;
31862306a36Sopenharmony_ci}
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci/* Get an fd that can be used to sleep until data is available in the ring(s) */
32162306a36Sopenharmony_ciint ring_buffer__epoll_fd(const struct ring_buffer *rb)
32262306a36Sopenharmony_ci{
32362306a36Sopenharmony_ci	return rb->epoll_fd;
32462306a36Sopenharmony_ci}
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_cistatic void user_ringbuf_unmap_ring(struct user_ring_buffer *rb)
32762306a36Sopenharmony_ci{
32862306a36Sopenharmony_ci	if (rb->consumer_pos) {
32962306a36Sopenharmony_ci		munmap(rb->consumer_pos, rb->page_size);
33062306a36Sopenharmony_ci		rb->consumer_pos = NULL;
33162306a36Sopenharmony_ci	}
33262306a36Sopenharmony_ci	if (rb->producer_pos) {
33362306a36Sopenharmony_ci		munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1));
33462306a36Sopenharmony_ci		rb->producer_pos = NULL;
33562306a36Sopenharmony_ci	}
33662306a36Sopenharmony_ci}
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_civoid user_ring_buffer__free(struct user_ring_buffer *rb)
33962306a36Sopenharmony_ci{
34062306a36Sopenharmony_ci	if (!rb)
34162306a36Sopenharmony_ci		return;
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_ci	user_ringbuf_unmap_ring(rb);
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci	if (rb->epoll_fd >= 0)
34662306a36Sopenharmony_ci		close(rb->epoll_fd);
34762306a36Sopenharmony_ci
34862306a36Sopenharmony_ci	free(rb);
34962306a36Sopenharmony_ci}
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_cistatic int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
35262306a36Sopenharmony_ci{
35362306a36Sopenharmony_ci	struct bpf_map_info info;
35462306a36Sopenharmony_ci	__u32 len = sizeof(info);
35562306a36Sopenharmony_ci	__u64 mmap_sz;
35662306a36Sopenharmony_ci	void *tmp;
35762306a36Sopenharmony_ci	struct epoll_event *rb_epoll;
35862306a36Sopenharmony_ci	int err;
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ci	memset(&info, 0, sizeof(info));
36162306a36Sopenharmony_ci
36262306a36Sopenharmony_ci	err = bpf_map_get_info_by_fd(map_fd, &info, &len);
36362306a36Sopenharmony_ci	if (err) {
36462306a36Sopenharmony_ci		err = -errno;
36562306a36Sopenharmony_ci		pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err);
36662306a36Sopenharmony_ci		return err;
36762306a36Sopenharmony_ci	}
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	if (info.type != BPF_MAP_TYPE_USER_RINGBUF) {
37062306a36Sopenharmony_ci		pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd);
37162306a36Sopenharmony_ci		return -EINVAL;
37262306a36Sopenharmony_ci	}
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci	rb->map_fd = map_fd;
37562306a36Sopenharmony_ci	rb->mask = info.max_entries - 1;
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci	/* Map read-only consumer page */
37862306a36Sopenharmony_ci	tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0);
37962306a36Sopenharmony_ci	if (tmp == MAP_FAILED) {
38062306a36Sopenharmony_ci		err = -errno;
38162306a36Sopenharmony_ci		pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
38262306a36Sopenharmony_ci			map_fd, err);
38362306a36Sopenharmony_ci		return err;
38462306a36Sopenharmony_ci	}
38562306a36Sopenharmony_ci	rb->consumer_pos = tmp;
38662306a36Sopenharmony_ci
38762306a36Sopenharmony_ci	/* Map read-write the producer page and data pages. We map the data
38862306a36Sopenharmony_ci	 * region as twice the total size of the ring buffer to allow the
38962306a36Sopenharmony_ci	 * simple reading and writing of samples that wrap around the end of
39062306a36Sopenharmony_ci	 * the buffer.  See the kernel implementation for details.
39162306a36Sopenharmony_ci	 */
39262306a36Sopenharmony_ci	mmap_sz = rb->page_size + 2 * (__u64)info.max_entries;
39362306a36Sopenharmony_ci	if (mmap_sz != (__u64)(size_t)mmap_sz) {
39462306a36Sopenharmony_ci		pr_warn("user ringbuf: ring buf size (%u) is too big\n", info.max_entries);
39562306a36Sopenharmony_ci		return -E2BIG;
39662306a36Sopenharmony_ci	}
39762306a36Sopenharmony_ci	tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
39862306a36Sopenharmony_ci		   map_fd, rb->page_size);
39962306a36Sopenharmony_ci	if (tmp == MAP_FAILED) {
40062306a36Sopenharmony_ci		err = -errno;
40162306a36Sopenharmony_ci		pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n",
40262306a36Sopenharmony_ci			map_fd, err);
40362306a36Sopenharmony_ci		return err;
40462306a36Sopenharmony_ci	}
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci	rb->producer_pos = tmp;
40762306a36Sopenharmony_ci	rb->data = tmp + rb->page_size;
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	rb_epoll = &rb->event;
41062306a36Sopenharmony_ci	rb_epoll->events = EPOLLOUT;
41162306a36Sopenharmony_ci	if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) {
41262306a36Sopenharmony_ci		err = -errno;
41362306a36Sopenharmony_ci		pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err);
41462306a36Sopenharmony_ci		return err;
41562306a36Sopenharmony_ci	}
41662306a36Sopenharmony_ci
41762306a36Sopenharmony_ci	return 0;
41862306a36Sopenharmony_ci}
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_cistruct user_ring_buffer *
42162306a36Sopenharmony_ciuser_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts)
42262306a36Sopenharmony_ci{
42362306a36Sopenharmony_ci	struct user_ring_buffer *rb;
42462306a36Sopenharmony_ci	int err;
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	if (!OPTS_VALID(opts, user_ring_buffer_opts))
42762306a36Sopenharmony_ci		return errno = EINVAL, NULL;
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	rb = calloc(1, sizeof(*rb));
43062306a36Sopenharmony_ci	if (!rb)
43162306a36Sopenharmony_ci		return errno = ENOMEM, NULL;
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci	rb->page_size = getpagesize();
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci	rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
43662306a36Sopenharmony_ci	if (rb->epoll_fd < 0) {
43762306a36Sopenharmony_ci		err = -errno;
43862306a36Sopenharmony_ci		pr_warn("user ringbuf: failed to create epoll instance: %d\n", err);
43962306a36Sopenharmony_ci		goto err_out;
44062306a36Sopenharmony_ci	}
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_ci	err = user_ringbuf_map(rb, map_fd);
44362306a36Sopenharmony_ci	if (err)
44462306a36Sopenharmony_ci		goto err_out;
44562306a36Sopenharmony_ci
44662306a36Sopenharmony_ci	return rb;
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_cierr_out:
44962306a36Sopenharmony_ci	user_ring_buffer__free(rb);
45062306a36Sopenharmony_ci	return errno = -err, NULL;
45162306a36Sopenharmony_ci}
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_cistatic void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard)
45462306a36Sopenharmony_ci{
45562306a36Sopenharmony_ci	__u32 new_len;
45662306a36Sopenharmony_ci	struct ringbuf_hdr *hdr;
45762306a36Sopenharmony_ci	uintptr_t hdr_offset;
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci	hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ;
46062306a36Sopenharmony_ci	hdr = rb->data + (hdr_offset & rb->mask);
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci	new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT;
46362306a36Sopenharmony_ci	if (discard)
46462306a36Sopenharmony_ci		new_len |= BPF_RINGBUF_DISCARD_BIT;
46562306a36Sopenharmony_ci
46662306a36Sopenharmony_ci	/* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in
46762306a36Sopenharmony_ci	 * the kernel.
46862306a36Sopenharmony_ci	 */
46962306a36Sopenharmony_ci	__atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL);
47062306a36Sopenharmony_ci}
47162306a36Sopenharmony_ci
/* Commit a reserved @sample with the discard bit set. */
void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample)
{
	const bool discard = true;

	user_ringbuf_commit(rb, sample, discard);
}
47662306a36Sopenharmony_ci
/* Commit a reserved @sample without the discard bit, i.e. publish it. */
void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample)
{
	const bool discard = false;

	user_ringbuf_commit(rb, sample, discard);
}
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_civoid *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size)
48362306a36Sopenharmony_ci{
48462306a36Sopenharmony_ci	__u32 avail_size, total_size, max_size;
48562306a36Sopenharmony_ci	/* 64-bit to avoid overflow in case of extreme application behavior */
48662306a36Sopenharmony_ci	__u64 cons_pos, prod_pos;
48762306a36Sopenharmony_ci	struct ringbuf_hdr *hdr;
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci	/* The top two bits are used as special flags */
49062306a36Sopenharmony_ci	if (size & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT))
49162306a36Sopenharmony_ci		return errno = E2BIG, NULL;
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in
49462306a36Sopenharmony_ci	 * the kernel.
49562306a36Sopenharmony_ci	 */
49662306a36Sopenharmony_ci	cons_pos = smp_load_acquire(rb->consumer_pos);
49762306a36Sopenharmony_ci	/* Synchronizes with smp_store_release() in user_ringbuf_commit() */
49862306a36Sopenharmony_ci	prod_pos = smp_load_acquire(rb->producer_pos);
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	max_size = rb->mask + 1;
50162306a36Sopenharmony_ci	avail_size = max_size - (prod_pos - cons_pos);
50262306a36Sopenharmony_ci	/* Round up total size to a multiple of 8. */
50362306a36Sopenharmony_ci	total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8;
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	if (total_size > max_size)
50662306a36Sopenharmony_ci		return errno = E2BIG, NULL;
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	if (avail_size < total_size)
50962306a36Sopenharmony_ci		return errno = ENOSPC, NULL;
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci	hdr = rb->data + (prod_pos & rb->mask);
51262306a36Sopenharmony_ci	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
51362306a36Sopenharmony_ci	hdr->pad = 0;
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci	/* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in
51662306a36Sopenharmony_ci	 * the kernel.
51762306a36Sopenharmony_ci	 */
51862306a36Sopenharmony_ci	smp_store_release(rb->producer_pos, prod_pos + total_size);
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci	return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask);
52162306a36Sopenharmony_ci}
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_cistatic __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end)
52462306a36Sopenharmony_ci{
52562306a36Sopenharmony_ci	__u64 start_ns, end_ns, ns_per_s = 1000000000;
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci	start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec;
52862306a36Sopenharmony_ci	end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec;
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci	return end_ns - start_ns;
53162306a36Sopenharmony_ci}
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_civoid *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms)
53462306a36Sopenharmony_ci{
53562306a36Sopenharmony_ci	void *sample;
53662306a36Sopenharmony_ci	int err, ms_remaining = timeout_ms;
53762306a36Sopenharmony_ci	struct timespec start;
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_ci	if (timeout_ms < 0 && timeout_ms != -1)
54062306a36Sopenharmony_ci		return errno = EINVAL, NULL;
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci	if (timeout_ms != -1) {
54362306a36Sopenharmony_ci		err = clock_gettime(CLOCK_MONOTONIC, &start);
54462306a36Sopenharmony_ci		if (err)
54562306a36Sopenharmony_ci			return NULL;
54662306a36Sopenharmony_ci	}
54762306a36Sopenharmony_ci
54862306a36Sopenharmony_ci	do {
54962306a36Sopenharmony_ci		int cnt, ms_elapsed;
55062306a36Sopenharmony_ci		struct timespec curr;
55162306a36Sopenharmony_ci		__u64 ns_per_ms = 1000000;
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci		sample = user_ring_buffer__reserve(rb, size);
55462306a36Sopenharmony_ci		if (sample)
55562306a36Sopenharmony_ci			return sample;
55662306a36Sopenharmony_ci		else if (errno != ENOSPC)
55762306a36Sopenharmony_ci			return NULL;
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci		/* The kernel guarantees at least one event notification
56062306a36Sopenharmony_ci		 * delivery whenever at least one sample is drained from the
56162306a36Sopenharmony_ci		 * ring buffer in an invocation to bpf_ringbuf_drain(). Other
56262306a36Sopenharmony_ci		 * additional events may be delivered at any time, but only one
56362306a36Sopenharmony_ci		 * event is guaranteed per bpf_ringbuf_drain() invocation,
56462306a36Sopenharmony_ci		 * provided that a sample is drained, and the BPF program did
56562306a36Sopenharmony_ci		 * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If
56662306a36Sopenharmony_ci		 * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a
56762306a36Sopenharmony_ci		 * wakeup event will be delivered even if no samples are
56862306a36Sopenharmony_ci		 * drained.
56962306a36Sopenharmony_ci		 */
57062306a36Sopenharmony_ci		cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining);
57162306a36Sopenharmony_ci		if (cnt < 0)
57262306a36Sopenharmony_ci			return NULL;
57362306a36Sopenharmony_ci
57462306a36Sopenharmony_ci		if (timeout_ms == -1)
57562306a36Sopenharmony_ci			continue;
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_ci		err = clock_gettime(CLOCK_MONOTONIC, &curr);
57862306a36Sopenharmony_ci		if (err)
57962306a36Sopenharmony_ci			return NULL;
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci		ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms;
58262306a36Sopenharmony_ci		ms_remaining = timeout_ms - ms_elapsed;
58362306a36Sopenharmony_ci	} while (ms_remaining > 0);
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	/* Try one more time to reserve a sample after the specified timeout has elapsed. */
58662306a36Sopenharmony_ci	return user_ring_buffer__reserve(rb, size);
58762306a36Sopenharmony_ci}
588