xref: /kernel/linux/linux-5.10/tools/lib/bpf/xsk.c (revision 8c2ecf20)
1// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2
3/*
4 * AF_XDP user-space access library.
5 *
6 * Copyright(c) 2018 - 2019 Intel Corporation.
7 *
8 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9 */
10
11#include <errno.h>
12#include <stdlib.h>
13#include <string.h>
14#include <unistd.h>
15#include <arpa/inet.h>
16#include <asm/barrier.h>
17#include <linux/compiler.h>
18#include <linux/ethtool.h>
19#include <linux/filter.h>
20#include <linux/if_ether.h>
21#include <linux/if_packet.h>
22#include <linux/if_xdp.h>
23#include <linux/kernel.h>
24#include <linux/list.h>
25#include <linux/sockios.h>
26#include <net/if.h>
27#include <sys/ioctl.h>
28#include <sys/mman.h>
29#include <sys/socket.h>
30#include <sys/types.h>
31
32#include "bpf.h"
33#include "libbpf.h"
34#include "libbpf_internal.h"
35#include "xsk.h"
36
37#ifndef SOL_XDP
38 #define SOL_XDP 283
39#endif
40
41#ifndef AF_XDP
42 #define AF_XDP 44
43#endif
44
45#ifndef PF_XDP
46 #define PF_XDP AF_XDP
47#endif
48
49struct xsk_umem {
50	struct xsk_ring_prod *fill_save;
51	struct xsk_ring_cons *comp_save;
52	char *umem_area;
53	struct xsk_umem_config config;
54	int fd;
55	int refcount;
56	struct list_head ctx_list;
57	bool rx_ring_setup_done;
58	bool tx_ring_setup_done;
59};
60
61struct xsk_ctx {
62	struct xsk_ring_prod *fill;
63	struct xsk_ring_cons *comp;
64	__u32 queue_id;
65	struct xsk_umem *umem;
66	int refcount;
67	int ifindex;
68	struct list_head list;
69	int prog_fd;
70	int xsks_map_fd;
71	char ifname[IFNAMSIZ];
72};
73
74struct xsk_socket {
75	struct xsk_ring_cons *rx;
76	struct xsk_ring_prod *tx;
77	__u64 outstanding_tx;
78	struct xsk_ctx *ctx;
79	struct xsk_socket_config config;
80	int fd;
81};
82
83struct xsk_nl_info {
84	bool xdp_prog_attached;
85	int ifindex;
86	int fd;
87};
88
89/* Up until and including Linux 5.3 */
90struct xdp_ring_offset_v1 {
91	__u64 producer;
92	__u64 consumer;
93	__u64 desc;
94};
95
96/* Up until and including Linux 5.3 */
97struct xdp_mmap_offsets_v1 {
98	struct xdp_ring_offset_v1 rx;
99	struct xdp_ring_offset_v1 tx;
100	struct xdp_ring_offset_v1 fr;
101	struct xdp_ring_offset_v1 cr;
102};
103
104int xsk_umem__fd(const struct xsk_umem *umem)
105{
106	return umem ? umem->fd : -EINVAL;
107}
108
109int xsk_socket__fd(const struct xsk_socket *xsk)
110{
111	return xsk ? xsk->fd : -EINVAL;
112}
113
114static bool xsk_page_aligned(void *buffer)
115{
116	unsigned long addr = (unsigned long)buffer;
117
118	return !(addr & (getpagesize() - 1));
119}
120
121static void xsk_set_umem_config(struct xsk_umem_config *cfg,
122				const struct xsk_umem_config *usr_cfg)
123{
124	if (!usr_cfg) {
125		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
126		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
127		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
128		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
129		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
130		return;
131	}
132
133	cfg->fill_size = usr_cfg->fill_size;
134	cfg->comp_size = usr_cfg->comp_size;
135	cfg->frame_size = usr_cfg->frame_size;
136	cfg->frame_headroom = usr_cfg->frame_headroom;
137	cfg->flags = usr_cfg->flags;
138}
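/* Illustrative sketch (not part of the original file): how a caller might
 * override the defaults that xsk_set_umem_config() fills in above. The
 * concrete values below are assumptions chosen for the example, not
 * recommendations; ring sizes must remain powers of two.
 *
 *	struct xsk_umem_config cfg = {
 *		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
 *		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
 *		.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
 *		.flags = 0,
 *	};
 *	// Passing &cfg instead of NULL to xsk_umem__create() makes the
 *	// library copy these values verbatim.
 */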
139
140static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
141				     const struct xsk_socket_config *usr_cfg)
142{
143	if (!usr_cfg) {
144		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
145		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
146		cfg->libbpf_flags = 0;
147		cfg->xdp_flags = 0;
148		cfg->bind_flags = 0;
149		return 0;
150	}
151
152	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
153		return -EINVAL;
154
155	cfg->rx_size = usr_cfg->rx_size;
156	cfg->tx_size = usr_cfg->tx_size;
157	cfg->libbpf_flags = usr_cfg->libbpf_flags;
158	cfg->xdp_flags = usr_cfg->xdp_flags;
159	cfg->bind_flags = usr_cfg->bind_flags;
160
161	return 0;
162}
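/* Illustrative sketch (assumption, not from this file): a caller-side socket
 * config accepted by xsk_set_xdp_socket_config() above. XDP_FLAGS_DRV_MODE
 * comes from <linux/if_link.h> and XDP_USE_NEED_WAKEUP from <linux/if_xdp.h>;
 * both are optional and only shown as examples of what can be passed through.
 * Setting libbpf_flags to XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD instead would
 * tell this library not to load its built-in XDP program.
 *
 *	struct xsk_socket_config scfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.libbpf_flags = 0,
 *		.xdp_flags = XDP_FLAGS_DRV_MODE,
 *		.bind_flags = XDP_USE_NEED_WAKEUP,
 *	};
 */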
163
164static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
165{
166	struct xdp_mmap_offsets_v1 off_v1;
167
168	/* getsockopt on a kernel <= 5.3 has no flags fields.
169	 * Copy over the offsets to the correct places in the >=5.4 format
170	 * and put the flags where they would have been on that kernel.
171	 */
172	memcpy(&off_v1, off, sizeof(off_v1));
173
174	off->rx.producer = off_v1.rx.producer;
175	off->rx.consumer = off_v1.rx.consumer;
176	off->rx.desc = off_v1.rx.desc;
177	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
178
179	off->tx.producer = off_v1.tx.producer;
180	off->tx.consumer = off_v1.tx.consumer;
181	off->tx.desc = off_v1.tx.desc;
182	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
183
184	off->fr.producer = off_v1.fr.producer;
185	off->fr.consumer = off_v1.fr.consumer;
186	off->fr.desc = off_v1.fr.desc;
187	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
188
189	off->cr.producer = off_v1.cr.producer;
190	off->cr.consumer = off_v1.cr.consumer;
191	off->cr.desc = off_v1.cr.desc;
192	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
193}
194
195static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
196{
197	socklen_t optlen;
198	int err;
199
200	optlen = sizeof(*off);
201	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
202	if (err)
203		return err;
204
205	if (optlen == sizeof(*off))
206		return 0;
207
208	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
209		xsk_mmap_offsets_v1(off);
210		return 0;
211	}
212
213	return -EINVAL;
214}
215
216static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
217				 struct xsk_ring_prod *fill,
218				 struct xsk_ring_cons *comp)
219{
220	struct xdp_mmap_offsets off;
221	void *map;
222	int err;
223
224	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
225			 &umem->config.fill_size,
226			 sizeof(umem->config.fill_size));
227	if (err)
228		return -errno;
229
230	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
231			 &umem->config.comp_size,
232			 sizeof(umem->config.comp_size));
233	if (err)
234		return -errno;
235
236	err = xsk_get_mmap_offsets(fd, &off);
237	if (err)
238		return -errno;
239
240	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
241		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
242		   XDP_UMEM_PGOFF_FILL_RING);
243	if (map == MAP_FAILED)
244		return -errno;
245
246	fill->mask = umem->config.fill_size - 1;
247	fill->size = umem->config.fill_size;
248	fill->producer = map + off.fr.producer;
249	fill->consumer = map + off.fr.consumer;
250	fill->flags = map + off.fr.flags;
251	fill->ring = map + off.fr.desc;
252	fill->cached_cons = umem->config.fill_size;
253
254	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
255		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
256		   XDP_UMEM_PGOFF_COMPLETION_RING);
257	if (map == MAP_FAILED) {
258		err = -errno;
259		goto out_mmap;
260	}
261
262	comp->mask = umem->config.comp_size - 1;
263	comp->size = umem->config.comp_size;
264	comp->producer = map + off.cr.producer;
265	comp->consumer = map + off.cr.consumer;
266	comp->flags = map + off.cr.flags;
267	comp->ring = map + off.cr.desc;
268
269	return 0;
270
271out_mmap:
272	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
273	return err;
274}
275
276int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
277			    __u64 size, struct xsk_ring_prod *fill,
278			    struct xsk_ring_cons *comp,
279			    const struct xsk_umem_config *usr_config)
280{
281	struct xdp_umem_reg mr;
282	struct xsk_umem *umem;
283	int err;
284
285	if (!umem_area || !umem_ptr || !fill || !comp)
286		return -EFAULT;
287	if (!size && !xsk_page_aligned(umem_area))
288		return -EINVAL;
289
290	umem = calloc(1, sizeof(*umem));
291	if (!umem)
292		return -ENOMEM;
293
294	umem->fd = socket(AF_XDP, SOCK_RAW, 0);
295	if (umem->fd < 0) {
296		err = -errno;
297		goto out_umem_alloc;
298	}
299
300	umem->umem_area = umem_area;
301	INIT_LIST_HEAD(&umem->ctx_list);
302	xsk_set_umem_config(&umem->config, usr_config);
303
304	memset(&mr, 0, sizeof(mr));
305	mr.addr = (uintptr_t)umem_area;
306	mr.len = size;
307	mr.chunk_size = umem->config.frame_size;
308	mr.headroom = umem->config.frame_headroom;
309	mr.flags = umem->config.flags;
310
311	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
312	if (err) {
313		err = -errno;
314		goto out_socket;
315	}
316
317	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
318	if (err)
319		goto out_socket;
320
321	umem->fill_save = fill;
322	umem->comp_save = comp;
323	*umem_ptr = umem;
324	return 0;
325
326out_socket:
327	close(umem->fd);
328out_umem_alloc:
329	free(umem);
330	return err;
331}
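/* Usage sketch (hedged example, not part of the original file): registering a
 * page-aligned buffer as a UMEM with the default configuration. NUM_FRAMES is
 * a hypothetical constant chosen by the caller; mmap() is used only because
 * it returns page-aligned memory, which satisfies the check above.
 *
 *	#define NUM_FRAMES 4096
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	__u64 size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	void *bufs = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (bufs == MAP_FAILED)
 *		return -errno;
 *	err = xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL);
 *	if (err)
 *		return err;	// negative errno, as returned above
 */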
332
333struct xsk_umem_config_v1 {
334	__u32 fill_size;
335	__u32 comp_size;
336	__u32 frame_size;
337	__u32 frame_headroom;
338};
339
340int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
341			    __u64 size, struct xsk_ring_prod *fill,
342			    struct xsk_ring_cons *comp,
343			    const struct xsk_umem_config *usr_config)
344{
345	struct xsk_umem_config config;
346
347	memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
348	config.flags = 0;
349
350	return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
351					&config);
352}
353COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2)
354DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4)
355
356static int xsk_load_xdp_prog(struct xsk_socket *xsk)
357{
358	static const int log_buf_size = 16 * 1024;
359	struct xsk_ctx *ctx = xsk->ctx;
360	char log_buf[log_buf_size];
361	int err, prog_fd;
362
363	/* This is the C-program:
364	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
365	 * {
366	 *     int ret, index = ctx->rx_queue_index;
367	 *
368	 *     // A set entry here means that the corresponding queue_id
369	 *     // has an active AF_XDP socket bound to it.
370	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
371	 *     if (ret > 0)
372	 *         return ret;
373	 *
374	 *     // Fallback for pre-5.3 kernels, not supporting default
375	 *     // action in the flags parameter.
376	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
377	 *         return bpf_redirect_map(&xsks_map, index, 0);
378	 *     return XDP_PASS;
379	 * }
380	 */
381	struct bpf_insn prog[] = {
382		/* r2 = *(u32 *)(r1 + 16) */
383		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
384		/* *(u32 *)(r10 - 4) = r2 */
385		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
386		/* r1 = xskmap[] */
387		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
388		/* r3 = XDP_PASS */
389		BPF_MOV64_IMM(BPF_REG_3, 2),
390		/* call bpf_redirect_map */
391		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
392		/* if w0 > 0 goto pc+13 */
393		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
394		/* r2 = r10 */
395		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
396		/* r2 += -4 */
397		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
398		/* r1 = xskmap[] */
399		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
400		/* call bpf_map_lookup_elem */
401		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
402		/* r1 = r0 */
403		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
404		/* r0 = XDP_PASS */
405		BPF_MOV64_IMM(BPF_REG_0, 2),
406		/* if r1 == 0 goto pc+5 */
407		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
408		/* r2 = *(u32 *)(r10 - 4) */
409		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
410		/* r1 = xskmap[] */
411		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
412		/* r3 = 0 */
413		BPF_MOV64_IMM(BPF_REG_3, 0),
414		/* call bpf_redirect_map */
415		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
416		/* The jumps are to this instruction */
417		BPF_EXIT_INSN(),
418	};
419	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
420
421	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt,
422				   "LGPL-2.1 or BSD-2-Clause", 0, log_buf,
423				   log_buf_size);
424	if (prog_fd < 0) {
425		pr_warn("BPF log buffer:\n%s", log_buf);
426		return prog_fd;
427	}
428
429	err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
430				  xsk->config.xdp_flags);
431	if (err) {
432		close(prog_fd);
433		return err;
434	}
435
436	ctx->prog_fd = prog_fd;
437	return 0;
438}
439
440static int xsk_get_max_queues(struct xsk_socket *xsk)
441{
442	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
443	struct xsk_ctx *ctx = xsk->ctx;
444	struct ifreq ifr = {};
445	int fd, err, ret;
446
447	fd = socket(AF_INET, SOCK_DGRAM, 0);
448	if (fd < 0)
449		return -errno;
450
451	ifr.ifr_data = (void *)&channels;
452	memcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ - 1);
453	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
454	err = ioctl(fd, SIOCETHTOOL, &ifr);
455	if (err && errno != EOPNOTSUPP) {
456		ret = -errno;
457		goto out;
458	}
459
460	if (err) {
461		/* If the device says it has no channels, then all traffic
462		 * is sent to a single stream, so max queues = 1.
463		 */
464		ret = 1;
465	} else {
466		/* Take the max of rx, tx, combined. Drivers return
467		 * the number of channels in different ways.
468		 */
469		ret = max(channels.max_rx, channels.max_tx);
470		ret = max(ret, (int)channels.max_combined);
471	}
472
473out:
474	close(fd);
475	return ret;
476}
477
478static int xsk_create_bpf_maps(struct xsk_socket *xsk)
479{
480	struct xsk_ctx *ctx = xsk->ctx;
481	int max_queues;
482	int fd;
483
484	max_queues = xsk_get_max_queues(xsk);
485	if (max_queues < 0)
486		return max_queues;
487
488	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
489				 sizeof(int), sizeof(int), max_queues, 0);
490	if (fd < 0)
491		return fd;
492
493	ctx->xsks_map_fd = fd;
494
495	return 0;
496}
497
498static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
499{
500	struct xsk_ctx *ctx = xsk->ctx;
501
502	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
503	close(ctx->xsks_map_fd);
504}
505
506static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
507{
508	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
509	__u32 map_len = sizeof(struct bpf_map_info);
510	struct bpf_prog_info prog_info = {};
511	struct xsk_ctx *ctx = xsk->ctx;
512	struct bpf_map_info map_info;
513	int fd, err;
514
515	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
516	if (err)
517		return err;
518
519	num_maps = prog_info.nr_map_ids;
520
521	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
522	if (!map_ids)
523		return -ENOMEM;
524
525	memset(&prog_info, 0, prog_len);
526	prog_info.nr_map_ids = num_maps;
527	prog_info.map_ids = (__u64)(unsigned long)map_ids;
528
529	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
530	if (err)
531		goto out_map_ids;
532
533	ctx->xsks_map_fd = -1;
534
535	for (i = 0; i < prog_info.nr_map_ids; i++) {
536		fd = bpf_map_get_fd_by_id(map_ids[i]);
537		if (fd < 0)
538			continue;
539
540		memset(&map_info, 0, map_len);
541		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
542		if (err) {
543			close(fd);
544			continue;
545		}
546
547		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
548			ctx->xsks_map_fd = fd;
549			break;
550		}
551
552		close(fd);
553	}
554
555	err = 0;
556	if (ctx->xsks_map_fd == -1)
557		err = -ENOENT;
558
559out_map_ids:
560	free(map_ids);
561	return err;
562}
563
564static int xsk_set_bpf_maps(struct xsk_socket *xsk)
565{
566	struct xsk_ctx *ctx = xsk->ctx;
567
568	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
569				   &xsk->fd, 0);
570}
571
572static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
573{
574	struct xsk_ctx *ctx = xsk->ctx;
575	__u32 prog_id = 0;
576	int err;
577
578	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
579				  xsk->config.xdp_flags);
580	if (err)
581		return err;
582
583	if (!prog_id) {
584		err = xsk_create_bpf_maps(xsk);
585		if (err)
586			return err;
587
588		err = xsk_load_xdp_prog(xsk);
589		if (err) {
590			xsk_delete_bpf_maps(xsk);
591			return err;
592		}
593	} else {
594		ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
595		if (ctx->prog_fd < 0)
596			return -errno;
597		err = xsk_lookup_bpf_maps(xsk);
598		if (err) {
599			close(ctx->prog_fd);
600			return err;
601		}
602	}
603
604	if (xsk->rx)
605		err = xsk_set_bpf_maps(xsk);
606	if (err) {
607		xsk_delete_bpf_maps(xsk);
608		close(ctx->prog_fd);
609		return err;
610	}
611
612	return 0;
613}
614
615static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
616				   __u32 queue_id)
617{
618	struct xsk_ctx *ctx;
619
620	if (list_empty(&umem->ctx_list))
621		return NULL;
622
623	list_for_each_entry(ctx, &umem->ctx_list, list) {
624		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
625			ctx->refcount++;
626			return ctx;
627		}
628	}
629
630	return NULL;
631}
632
633static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
634{
635	struct xsk_umem *umem = ctx->umem;
636	struct xdp_mmap_offsets off;
637	int err;
638
639	if (--ctx->refcount)
640		return;
641
642	if (!unmap)
643		goto out_free;
644
645	err = xsk_get_mmap_offsets(umem->fd, &off);
646	if (err)
647		goto out_free;
648
649	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
650	       sizeof(__u64));
651	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
652	       sizeof(__u64));
653
654out_free:
655	list_del(&ctx->list);
656	free(ctx);
657}
658
659static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
660				      struct xsk_umem *umem, int ifindex,
661				      const char *ifname, __u32 queue_id,
662				      struct xsk_ring_prod *fill,
663				      struct xsk_ring_cons *comp)
664{
665	struct xsk_ctx *ctx;
666	int err;
667
668	ctx = calloc(1, sizeof(*ctx));
669	if (!ctx)
670		return NULL;
671
672	if (!umem->fill_save) {
673		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
674		if (err) {
675			free(ctx);
676			return NULL;
677		}
678	} else if (umem->fill_save != fill || umem->comp_save != comp) {
679		/* Copy over rings to new structs. */
680		memcpy(fill, umem->fill_save, sizeof(*fill));
681		memcpy(comp, umem->comp_save, sizeof(*comp));
682	}
683
684	ctx->ifindex = ifindex;
685	ctx->refcount = 1;
686	ctx->umem = umem;
687	ctx->queue_id = queue_id;
688	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
689	ctx->ifname[IFNAMSIZ - 1] = '\0';
690
691	ctx->fill = fill;
692	ctx->comp = comp;
693	list_add(&ctx->list, &umem->ctx_list);
694	return ctx;
695}
696
697int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
698			      const char *ifname,
699			      __u32 queue_id, struct xsk_umem *umem,
700			      struct xsk_ring_cons *rx,
701			      struct xsk_ring_prod *tx,
702			      struct xsk_ring_prod *fill,
703			      struct xsk_ring_cons *comp,
704			      const struct xsk_socket_config *usr_config)
705{
706	bool unmap, rx_setup_done = false, tx_setup_done = false;
707	void *rx_map = NULL, *tx_map = NULL;
708	struct sockaddr_xdp sxdp = {};
709	struct xdp_mmap_offsets off;
710	struct xsk_socket *xsk;
711	struct xsk_ctx *ctx;
712	int err, ifindex;
713
714	if (!umem || !xsk_ptr || !(rx || tx))
715		return -EFAULT;
716
717	unmap = umem->fill_save != fill;
718
719	xsk = calloc(1, sizeof(*xsk));
720	if (!xsk)
721		return -ENOMEM;
722
723	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
724	if (err)
725		goto out_xsk_alloc;
726
727	xsk->outstanding_tx = 0;
728	ifindex = if_nametoindex(ifname);
729	if (!ifindex) {
730		err = -errno;
731		goto out_xsk_alloc;
732	}
733
734	if (umem->refcount++ > 0) {
735		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
736		if (xsk->fd < 0) {
737			err = -errno;
738			goto out_xsk_alloc;
739		}
740	} else {
741		xsk->fd = umem->fd;
742		rx_setup_done = umem->rx_ring_setup_done;
743		tx_setup_done = umem->tx_ring_setup_done;
744	}
745
746	ctx = xsk_get_ctx(umem, ifindex, queue_id);
747	if (!ctx) {
748		if (!fill || !comp) {
749			err = -EFAULT;
750			goto out_socket;
751		}
752
753		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
754				     fill, comp);
755		if (!ctx) {
756			err = -ENOMEM;
757			goto out_socket;
758		}
759	}
760	xsk->ctx = ctx;
761
762	if (rx && !rx_setup_done) {
763		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
764				 &xsk->config.rx_size,
765				 sizeof(xsk->config.rx_size));
766		if (err) {
767			err = -errno;
768			goto out_put_ctx;
769		}
770		if (xsk->fd == umem->fd)
771			umem->rx_ring_setup_done = true;
772	}
773	if (tx && !tx_setup_done) {
774		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
775				 &xsk->config.tx_size,
776				 sizeof(xsk->config.tx_size));
777		if (err) {
778			err = -errno;
779			goto out_put_ctx;
780		}
781		if (xsk->fd == umem->fd)
782			umem->tx_ring_setup_done = true;
783	}
784
785	err = xsk_get_mmap_offsets(xsk->fd, &off);
786	if (err) {
787		err = -errno;
788		goto out_put_ctx;
789	}
790
791	if (rx) {
792		rx_map = mmap(NULL, off.rx.desc +
793			      xsk->config.rx_size * sizeof(struct xdp_desc),
794			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
795			      xsk->fd, XDP_PGOFF_RX_RING);
796		if (rx_map == MAP_FAILED) {
797			err = -errno;
798			goto out_put_ctx;
799		}
800
801		rx->mask = xsk->config.rx_size - 1;
802		rx->size = xsk->config.rx_size;
803		rx->producer = rx_map + off.rx.producer;
804		rx->consumer = rx_map + off.rx.consumer;
805		rx->flags = rx_map + off.rx.flags;
806		rx->ring = rx_map + off.rx.desc;
807		rx->cached_prod = *rx->producer;
808		rx->cached_cons = *rx->consumer;
809	}
810	xsk->rx = rx;
811
812	if (tx) {
813		tx_map = mmap(NULL, off.tx.desc +
814			      xsk->config.tx_size * sizeof(struct xdp_desc),
815			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
816			      xsk->fd, XDP_PGOFF_TX_RING);
817		if (tx_map == MAP_FAILED) {
818			err = -errno;
819			goto out_mmap_rx;
820		}
821
822		tx->mask = xsk->config.tx_size - 1;
823		tx->size = xsk->config.tx_size;
824		tx->producer = tx_map + off.tx.producer;
825		tx->consumer = tx_map + off.tx.consumer;
826		tx->flags = tx_map + off.tx.flags;
827		tx->ring = tx_map + off.tx.desc;
828		tx->cached_prod = *tx->producer;
829		/* cached_cons is r->size bigger than the real consumer pointer
830		 * See xsk_prod_nb_free
831		 */
832		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
833	}
834	xsk->tx = tx;
835
836	sxdp.sxdp_family = PF_XDP;
837	sxdp.sxdp_ifindex = ctx->ifindex;
838	sxdp.sxdp_queue_id = ctx->queue_id;
839	if (umem->refcount > 1) {
840		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
841		sxdp.sxdp_shared_umem_fd = umem->fd;
842	} else {
843		sxdp.sxdp_flags = xsk->config.bind_flags;
844	}
845
846	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
847	if (err) {
848		err = -errno;
849		goto out_mmap_tx;
850	}
851
852	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
853		err = xsk_setup_xdp_prog(xsk);
854		if (err)
855			goto out_mmap_tx;
856	}
857
858	*xsk_ptr = xsk;
859	umem->fill_save = NULL;
860	umem->comp_save = NULL;
861	return 0;
862
863out_mmap_tx:
864	if (tx)
865		munmap(tx_map, off.tx.desc +
866		       xsk->config.tx_size * sizeof(struct xdp_desc));
867out_mmap_rx:
868	if (rx)
869		munmap(rx_map, off.rx.desc +
870		       xsk->config.rx_size * sizeof(struct xdp_desc));
871out_put_ctx:
872	xsk_put_ctx(ctx, unmap);
873out_socket:
874	if (--umem->refcount)
875		close(xsk->fd);
876out_xsk_alloc:
877	free(xsk);
878	return err;
879}
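/* Usage sketch (hedged example, not from this file): two sockets sharing one
 * UMEM on different queues of the same device. "eth0" and the queue ids are
 * assumptions. The second call must supply its own fill/completion rings,
 * since no existing ctx matches its (ifindex, queue_id) pair.
 *
 *	struct xsk_socket *xsk0, *xsk1;
 *	struct xsk_ring_cons rx0, rx1, comp1;
 *	struct xsk_ring_prod tx0, tx1, fill1;
 *
 *	err = xsk_socket__create(&xsk0, "eth0", 0, umem, &rx0, &tx0, NULL);
 *	if (err)
 *		return err;
 *	err = xsk_socket__create_shared(&xsk1, "eth0", 1, umem, &rx1, &tx1,
 *					&fill1, &comp1, NULL);
 *	if (err)
 *		return err;
 */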
880
881int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
882		       __u32 queue_id, struct xsk_umem *umem,
883		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
884		       const struct xsk_socket_config *usr_config)
885{
886	if (!umem)
887		return -EFAULT;
888
889	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
890					 rx, tx, umem->fill_save,
891					 umem->comp_save, usr_config);
892}
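/* Usage sketch (hedged, caller-side; the helper names come from xsk.h,
 * everything else is an assumption): create one socket on queue 0 of "eth0"
 * and hand every default-sized frame to the kernel via the fill ring so RX
 * can start. "fill" is the ring that was passed to xsk_umem__create() for
 * this umem.
 *
 *	struct xsk_socket *xsk;
 *	struct xsk_ring_cons rx;
 *	struct xsk_ring_prod tx;
 *	__u32 idx, i;
 *
 *	err = xsk_socket__create(&xsk, "eth0", 0, umem, &rx, &tx, NULL);
 *	if (err)
 *		return err;
 *	if (xsk_ring_prod__reserve(&fill, XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *				   &idx) != XSK_RING_PROD__DEFAULT_NUM_DESCS)
 *		return -ENOSPC;
 *	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
 *		*xsk_ring_prod__fill_addr(&fill, idx++) =
 *			i * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	xsk_ring_prod__submit(&fill, XSK_RING_PROD__DEFAULT_NUM_DESCS);
 */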
893
894int xsk_umem__delete(struct xsk_umem *umem)
895{
896	struct xdp_mmap_offsets off;
897	int err;
898
899	if (!umem)
900		return 0;
901
902	if (umem->refcount)
903		return -EBUSY;
904
905	err = xsk_get_mmap_offsets(umem->fd, &off);
906	if (!err && umem->fill_save && umem->comp_save) {
907		munmap(umem->fill_save->ring - off.fr.desc,
908		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
909		munmap(umem->comp_save->ring - off.cr.desc,
910		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
911	}
912
913	close(umem->fd);
914	free(umem);
915
916	return 0;
917}
918
919void xsk_socket__delete(struct xsk_socket *xsk)
920{
921	size_t desc_sz = sizeof(struct xdp_desc);
922	struct xdp_mmap_offsets off;
923	struct xsk_umem *umem;
924	struct xsk_ctx *ctx;
925	int err;
926
927	if (!xsk)
928		return;
929
930	ctx = xsk->ctx;
931	umem = ctx->umem;
932
933	if (ctx->refcount == 1) {
934		xsk_delete_bpf_maps(xsk);
935		close(ctx->prog_fd);
936	}
937
938	xsk_put_ctx(ctx, true);
939
940	err = xsk_get_mmap_offsets(xsk->fd, &off);
941	if (!err) {
942		if (xsk->rx) {
943			munmap(xsk->rx->ring - off.rx.desc,
944			       off.rx.desc + xsk->config.rx_size * desc_sz);
945		}
946		if (xsk->tx) {
947			munmap(xsk->tx->ring - off.tx.desc,
948			       off.tx.desc + xsk->config.tx_size * desc_sz);
949		}
950	}
951
952	umem->refcount--;
953	/* Do not close an fd that also has an associated umem connected
954	 * to it.
955	 */
956	if (xsk->fd != umem->fd)
957		close(xsk->fd);
958	free(xsk);
959}
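/* Teardown order sketch (hedged example, not from this file): delete every
 * socket first, then the UMEM. xsk_umem__delete() above returns -EBUSY while
 * any socket still holds a reference to the umem.
 *
 *	xsk_socket__delete(xsk1);
 *	xsk_socket__delete(xsk0);
 *	err = xsk_umem__delete(umem);	// 0 once the refcount has dropped to 0
 */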
960