162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* XDP sockets 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * AF_XDP sockets allows a channel between XDP programs and userspace 562306a36Sopenharmony_ci * applications. 662306a36Sopenharmony_ci * Copyright(c) 2018 Intel Corporation. 762306a36Sopenharmony_ci * 862306a36Sopenharmony_ci * Author(s): Björn Töpel <bjorn.topel@intel.com> 962306a36Sopenharmony_ci * Magnus Karlsson <magnus.karlsson@intel.com> 1062306a36Sopenharmony_ci */ 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ 1362306a36Sopenharmony_ci 1462306a36Sopenharmony_ci#include <linux/if_xdp.h> 1562306a36Sopenharmony_ci#include <linux/init.h> 1662306a36Sopenharmony_ci#include <linux/sched/mm.h> 1762306a36Sopenharmony_ci#include <linux/sched/signal.h> 1862306a36Sopenharmony_ci#include <linux/sched/task.h> 1962306a36Sopenharmony_ci#include <linux/socket.h> 2062306a36Sopenharmony_ci#include <linux/file.h> 2162306a36Sopenharmony_ci#include <linux/uaccess.h> 2262306a36Sopenharmony_ci#include <linux/net.h> 2362306a36Sopenharmony_ci#include <linux/netdevice.h> 2462306a36Sopenharmony_ci#include <linux/rculist.h> 2562306a36Sopenharmony_ci#include <linux/vmalloc.h> 2662306a36Sopenharmony_ci#include <net/xdp_sock_drv.h> 2762306a36Sopenharmony_ci#include <net/busy_poll.h> 2862306a36Sopenharmony_ci#include <net/netdev_rx_queue.h> 2962306a36Sopenharmony_ci#include <net/xdp.h> 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci#include "xsk_queue.h" 3262306a36Sopenharmony_ci#include "xdp_umem.h" 3362306a36Sopenharmony_ci#include "xsk.h" 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci#define TX_BATCH_SIZE 32 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_cistatic DEFINE_PER_CPU(struct list_head, xskmap_flush_list); 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_civoid xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) 4062306a36Sopenharmony_ci{ 4162306a36Sopenharmony_ci if (pool->cached_need_wakeup & XDP_WAKEUP_RX) 4262306a36Sopenharmony_ci return; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; 4562306a36Sopenharmony_ci pool->cached_need_wakeup |= XDP_WAKEUP_RX; 4662306a36Sopenharmony_ci} 4762306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_set_rx_need_wakeup); 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_civoid xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) 5062306a36Sopenharmony_ci{ 5162306a36Sopenharmony_ci struct xdp_sock *xs; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci if (pool->cached_need_wakeup & XDP_WAKEUP_TX) 5462306a36Sopenharmony_ci return; 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci rcu_read_lock(); 5762306a36Sopenharmony_ci list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 5862306a36Sopenharmony_ci xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 5962306a36Sopenharmony_ci } 6062306a36Sopenharmony_ci rcu_read_unlock(); 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci pool->cached_need_wakeup |= XDP_WAKEUP_TX; 6362306a36Sopenharmony_ci} 6462306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_set_tx_need_wakeup); 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_civoid xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) 6762306a36Sopenharmony_ci{ 6862306a36Sopenharmony_ci if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) 6962306a36Sopenharmony_ci return; 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; 7262306a36Sopenharmony_ci pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; 7362306a36Sopenharmony_ci} 7462306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_clear_rx_need_wakeup); 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_civoid xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci struct xdp_sock *xs; 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) 8162306a36Sopenharmony_ci return; 8262306a36Sopenharmony_ci 8362306a36Sopenharmony_ci rcu_read_lock(); 8462306a36Sopenharmony_ci list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 8562306a36Sopenharmony_ci xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 8662306a36Sopenharmony_ci } 8762306a36Sopenharmony_ci rcu_read_unlock(); 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; 9062306a36Sopenharmony_ci} 9162306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_clear_tx_need_wakeup); 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_cibool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) 9462306a36Sopenharmony_ci{ 9562306a36Sopenharmony_ci return pool->uses_need_wakeup; 9662306a36Sopenharmony_ci} 9762306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_uses_need_wakeup); 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_cistruct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 10062306a36Sopenharmony_ci u16 queue_id) 10162306a36Sopenharmony_ci{ 10262306a36Sopenharmony_ci if (queue_id < dev->real_num_rx_queues) 10362306a36Sopenharmony_ci return dev->_rx[queue_id].pool; 10462306a36Sopenharmony_ci if (queue_id < dev->real_num_tx_queues) 10562306a36Sopenharmony_ci return dev->_tx[queue_id].pool; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci return NULL; 10862306a36Sopenharmony_ci} 10962306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_get_pool_from_qid); 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_civoid xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 11262306a36Sopenharmony_ci{ 11362306a36Sopenharmony_ci if (queue_id < dev->num_rx_queues) 11462306a36Sopenharmony_ci dev->_rx[queue_id].pool = NULL; 11562306a36Sopenharmony_ci if (queue_id < dev->num_tx_queues) 11662306a36Sopenharmony_ci dev->_tx[queue_id].pool = NULL; 11762306a36Sopenharmony_ci} 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci/* The buffer pool is stored both in the _rx struct and the _tx struct as we do 12062306a36Sopenharmony_ci * not know if the device has more tx queues than rx, or the opposite. 12162306a36Sopenharmony_ci * This might also change during run time. 12262306a36Sopenharmony_ci */ 12362306a36Sopenharmony_ciint xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 12462306a36Sopenharmony_ci u16 queue_id) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci if (queue_id >= max_t(unsigned int, 12762306a36Sopenharmony_ci dev->real_num_rx_queues, 12862306a36Sopenharmony_ci dev->real_num_tx_queues)) 12962306a36Sopenharmony_ci return -EINVAL; 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci if (queue_id < dev->real_num_rx_queues) 13262306a36Sopenharmony_ci dev->_rx[queue_id].pool = pool; 13362306a36Sopenharmony_ci if (queue_id < dev->real_num_tx_queues) 13462306a36Sopenharmony_ci dev->_tx[queue_id].pool = pool; 13562306a36Sopenharmony_ci 13662306a36Sopenharmony_ci return 0; 13762306a36Sopenharmony_ci} 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_cistatic int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, 14062306a36Sopenharmony_ci u32 flags) 14162306a36Sopenharmony_ci{ 14262306a36Sopenharmony_ci u64 addr; 14362306a36Sopenharmony_ci int err; 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci addr = xp_get_handle(xskb); 14662306a36Sopenharmony_ci err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); 14762306a36Sopenharmony_ci if (err) { 14862306a36Sopenharmony_ci xs->rx_queue_full++; 14962306a36Sopenharmony_ci return err; 15062306a36Sopenharmony_ci } 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci xp_release(xskb); 15362306a36Sopenharmony_ci return 0; 15462306a36Sopenharmony_ci} 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_cistatic int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 15762306a36Sopenharmony_ci{ 15862306a36Sopenharmony_ci struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); 15962306a36Sopenharmony_ci u32 frags = xdp_buff_has_frags(xdp); 16062306a36Sopenharmony_ci struct xdp_buff_xsk *pos, *tmp; 16162306a36Sopenharmony_ci struct list_head *xskb_list; 16262306a36Sopenharmony_ci u32 contd = 0; 16362306a36Sopenharmony_ci int err; 16462306a36Sopenharmony_ci 16562306a36Sopenharmony_ci if (frags) 16662306a36Sopenharmony_ci contd = XDP_PKT_CONTD; 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci err = __xsk_rcv_zc(xs, xskb, len, contd); 16962306a36Sopenharmony_ci if (err) 17062306a36Sopenharmony_ci goto err; 17162306a36Sopenharmony_ci if (likely(!frags)) 17262306a36Sopenharmony_ci return 0; 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci xskb_list = &xskb->pool->xskb_list; 17562306a36Sopenharmony_ci list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { 17662306a36Sopenharmony_ci if (list_is_singular(xskb_list)) 17762306a36Sopenharmony_ci contd = 0; 17862306a36Sopenharmony_ci len = pos->xdp.data_end - pos->xdp.data; 17962306a36Sopenharmony_ci err = __xsk_rcv_zc(xs, pos, len, contd); 18062306a36Sopenharmony_ci if (err) 18162306a36Sopenharmony_ci goto err; 18262306a36Sopenharmony_ci list_del(&pos->xskb_list_node); 18362306a36Sopenharmony_ci } 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci return 0; 18662306a36Sopenharmony_cierr: 18762306a36Sopenharmony_ci xsk_buff_free(xdp); 18862306a36Sopenharmony_ci return err; 18962306a36Sopenharmony_ci} 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_cistatic void *xsk_copy_xdp_start(struct xdp_buff *from) 19262306a36Sopenharmony_ci{ 19362306a36Sopenharmony_ci if (unlikely(xdp_data_meta_unsupported(from))) 19462306a36Sopenharmony_ci return from->data; 19562306a36Sopenharmony_ci else 19662306a36Sopenharmony_ci return from->data_meta; 19762306a36Sopenharmony_ci} 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_cistatic u32 xsk_copy_xdp(void *to, void **from, u32 to_len, 20062306a36Sopenharmony_ci u32 *from_len, skb_frag_t **frag, u32 rem) 20162306a36Sopenharmony_ci{ 20262306a36Sopenharmony_ci u32 copied = 0; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci while (1) { 20562306a36Sopenharmony_ci u32 copy_len = min_t(u32, *from_len, to_len); 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci memcpy(to, *from, copy_len); 20862306a36Sopenharmony_ci copied += copy_len; 20962306a36Sopenharmony_ci if (rem == copied) 21062306a36Sopenharmony_ci return copied; 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci if (*from_len == copy_len) { 21362306a36Sopenharmony_ci *from = skb_frag_address(*frag); 21462306a36Sopenharmony_ci *from_len = skb_frag_size((*frag)++); 21562306a36Sopenharmony_ci } else { 21662306a36Sopenharmony_ci *from += copy_len; 21762306a36Sopenharmony_ci *from_len -= copy_len; 21862306a36Sopenharmony_ci } 21962306a36Sopenharmony_ci if (to_len == copy_len) 22062306a36Sopenharmony_ci return copied; 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_ci to_len -= copy_len; 22362306a36Sopenharmony_ci to += copy_len; 22462306a36Sopenharmony_ci } 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_cistatic int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 22862306a36Sopenharmony_ci{ 22962306a36Sopenharmony_ci u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); 23062306a36Sopenharmony_ci void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; 23162306a36Sopenharmony_ci u32 from_len, meta_len, rem, num_desc; 23262306a36Sopenharmony_ci struct xdp_buff_xsk *xskb; 23362306a36Sopenharmony_ci struct xdp_buff *xsk_xdp; 23462306a36Sopenharmony_ci skb_frag_t *frag; 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci from_len = xdp->data_end - copy_from; 23762306a36Sopenharmony_ci meta_len = xdp->data - copy_from; 23862306a36Sopenharmony_ci rem = len + meta_len; 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci if (len <= frame_size && !xdp_buff_has_frags(xdp)) { 24162306a36Sopenharmony_ci int err; 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci xsk_xdp = xsk_buff_alloc(xs->pool); 24462306a36Sopenharmony_ci if (!xsk_xdp) { 24562306a36Sopenharmony_ci xs->rx_dropped++; 24662306a36Sopenharmony_ci return -ENOMEM; 24762306a36Sopenharmony_ci } 24862306a36Sopenharmony_ci memcpy(xsk_xdp->data - meta_len, copy_from, rem); 24962306a36Sopenharmony_ci xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 25062306a36Sopenharmony_ci err = __xsk_rcv_zc(xs, xskb, len, 0); 25162306a36Sopenharmony_ci if (err) { 25262306a36Sopenharmony_ci xsk_buff_free(xsk_xdp); 25362306a36Sopenharmony_ci return err; 25462306a36Sopenharmony_ci } 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci return 0; 25762306a36Sopenharmony_ci } 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci num_desc = (len - 1) / frame_size + 1; 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_ci if (!xsk_buff_can_alloc(xs->pool, num_desc)) { 26262306a36Sopenharmony_ci xs->rx_dropped++; 26362306a36Sopenharmony_ci return -ENOMEM; 26462306a36Sopenharmony_ci } 26562306a36Sopenharmony_ci if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { 26662306a36Sopenharmony_ci xs->rx_queue_full++; 26762306a36Sopenharmony_ci return -ENOBUFS; 26862306a36Sopenharmony_ci } 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci if (xdp_buff_has_frags(xdp)) { 27162306a36Sopenharmony_ci struct skb_shared_info *sinfo; 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci sinfo = xdp_get_shared_info_from_buff(xdp); 27462306a36Sopenharmony_ci frag = &sinfo->frags[0]; 27562306a36Sopenharmony_ci } 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_ci do { 27862306a36Sopenharmony_ci u32 to_len = frame_size + meta_len; 27962306a36Sopenharmony_ci u32 copied; 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci xsk_xdp = xsk_buff_alloc(xs->pool); 28262306a36Sopenharmony_ci copy_to = xsk_xdp->data - meta_len; 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_ci copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); 28562306a36Sopenharmony_ci rem -= copied; 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); 28862306a36Sopenharmony_ci __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); 28962306a36Sopenharmony_ci meta_len = 0; 29062306a36Sopenharmony_ci } while (rem); 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci return 0; 29362306a36Sopenharmony_ci} 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_cistatic bool xsk_tx_writeable(struct xdp_sock *xs) 29662306a36Sopenharmony_ci{ 29762306a36Sopenharmony_ci if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) 29862306a36Sopenharmony_ci return false; 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci return true; 30162306a36Sopenharmony_ci} 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_cistatic bool xsk_is_bound(struct xdp_sock *xs) 30462306a36Sopenharmony_ci{ 30562306a36Sopenharmony_ci if (READ_ONCE(xs->state) == XSK_BOUND) { 30662306a36Sopenharmony_ci /* Matches smp_wmb() in bind(). */ 30762306a36Sopenharmony_ci smp_rmb(); 30862306a36Sopenharmony_ci return true; 30962306a36Sopenharmony_ci } 31062306a36Sopenharmony_ci return false; 31162306a36Sopenharmony_ci} 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_cistatic int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 31462306a36Sopenharmony_ci{ 31562306a36Sopenharmony_ci if (!xsk_is_bound(xs)) 31662306a36Sopenharmony_ci return -ENXIO; 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 31962306a36Sopenharmony_ci return -EINVAL; 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { 32262306a36Sopenharmony_ci xs->rx_dropped++; 32362306a36Sopenharmony_ci return -ENOSPC; 32462306a36Sopenharmony_ci } 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci sk_mark_napi_id_once_xdp(&xs->sk, xdp); 32762306a36Sopenharmony_ci return 0; 32862306a36Sopenharmony_ci} 32962306a36Sopenharmony_ci 33062306a36Sopenharmony_cistatic void xsk_flush(struct xdp_sock *xs) 33162306a36Sopenharmony_ci{ 33262306a36Sopenharmony_ci xskq_prod_submit(xs->rx); 33362306a36Sopenharmony_ci __xskq_cons_release(xs->pool->fq); 33462306a36Sopenharmony_ci sock_def_readable(&xs->sk); 33562306a36Sopenharmony_ci} 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ciint xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 33862306a36Sopenharmony_ci{ 33962306a36Sopenharmony_ci u32 len = xdp_get_buff_len(xdp); 34062306a36Sopenharmony_ci int err; 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_ci spin_lock_bh(&xs->rx_lock); 34362306a36Sopenharmony_ci err = xsk_rcv_check(xs, xdp, len); 34462306a36Sopenharmony_ci if (!err) { 34562306a36Sopenharmony_ci err = __xsk_rcv(xs, xdp, len); 34662306a36Sopenharmony_ci xsk_flush(xs); 34762306a36Sopenharmony_ci } 34862306a36Sopenharmony_ci spin_unlock_bh(&xs->rx_lock); 34962306a36Sopenharmony_ci return err; 35062306a36Sopenharmony_ci} 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_cistatic int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) 35362306a36Sopenharmony_ci{ 35462306a36Sopenharmony_ci u32 len = xdp_get_buff_len(xdp); 35562306a36Sopenharmony_ci int err; 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci err = xsk_rcv_check(xs, xdp, len); 35862306a36Sopenharmony_ci if (err) 35962306a36Sopenharmony_ci return err; 36062306a36Sopenharmony_ci 36162306a36Sopenharmony_ci if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { 36262306a36Sopenharmony_ci len = xdp->data_end - xdp->data; 36362306a36Sopenharmony_ci return xsk_rcv_zc(xs, xdp, len); 36462306a36Sopenharmony_ci } 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci err = __xsk_rcv(xs, xdp, len); 36762306a36Sopenharmony_ci if (!err) 36862306a36Sopenharmony_ci xdp_return_buff(xdp); 36962306a36Sopenharmony_ci return err; 37062306a36Sopenharmony_ci} 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ciint __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) 37362306a36Sopenharmony_ci{ 37462306a36Sopenharmony_ci struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); 37562306a36Sopenharmony_ci int err; 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci err = xsk_rcv(xs, xdp); 37862306a36Sopenharmony_ci if (err) 37962306a36Sopenharmony_ci return err; 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci if (!xs->flush_node.prev) 38262306a36Sopenharmony_ci list_add(&xs->flush_node, flush_list); 38362306a36Sopenharmony_ci 38462306a36Sopenharmony_ci return 0; 38562306a36Sopenharmony_ci} 38662306a36Sopenharmony_ci 38762306a36Sopenharmony_civoid __xsk_map_flush(void) 38862306a36Sopenharmony_ci{ 38962306a36Sopenharmony_ci struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); 39062306a36Sopenharmony_ci struct xdp_sock *xs, *tmp; 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 39362306a36Sopenharmony_ci xsk_flush(xs); 39462306a36Sopenharmony_ci __list_del_clearprev(&xs->flush_node); 39562306a36Sopenharmony_ci } 39662306a36Sopenharmony_ci} 39762306a36Sopenharmony_ci 39862306a36Sopenharmony_civoid xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) 39962306a36Sopenharmony_ci{ 40062306a36Sopenharmony_ci xskq_prod_submit_n(pool->cq, nb_entries); 40162306a36Sopenharmony_ci} 40262306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_tx_completed); 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_civoid xsk_tx_release(struct xsk_buff_pool *pool) 40562306a36Sopenharmony_ci{ 40662306a36Sopenharmony_ci struct xdp_sock *xs; 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci rcu_read_lock(); 40962306a36Sopenharmony_ci list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 41062306a36Sopenharmony_ci __xskq_cons_release(xs->tx); 41162306a36Sopenharmony_ci if (xsk_tx_writeable(xs)) 41262306a36Sopenharmony_ci xs->sk.sk_write_space(&xs->sk); 41362306a36Sopenharmony_ci } 41462306a36Sopenharmony_ci rcu_read_unlock(); 41562306a36Sopenharmony_ci} 41662306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_tx_release); 41762306a36Sopenharmony_ci 41862306a36Sopenharmony_cibool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) 41962306a36Sopenharmony_ci{ 42062306a36Sopenharmony_ci struct xdp_sock *xs; 42162306a36Sopenharmony_ci 42262306a36Sopenharmony_ci rcu_read_lock(); 42362306a36Sopenharmony_ci list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { 42462306a36Sopenharmony_ci if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { 42562306a36Sopenharmony_ci if (xskq_has_descs(xs->tx)) 42662306a36Sopenharmony_ci xskq_cons_release(xs->tx); 42762306a36Sopenharmony_ci continue; 42862306a36Sopenharmony_ci } 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_ci /* This is the backpressure mechanism for the Tx path. 43162306a36Sopenharmony_ci * Reserve space in the completion queue and only proceed 43262306a36Sopenharmony_ci * if there is space in it. This avoids having to implement 43362306a36Sopenharmony_ci * any buffering in the Tx path. 43462306a36Sopenharmony_ci */ 43562306a36Sopenharmony_ci if (xskq_prod_reserve_addr(pool->cq, desc->addr)) 43662306a36Sopenharmony_ci goto out; 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_ci xskq_cons_release(xs->tx); 43962306a36Sopenharmony_ci rcu_read_unlock(); 44062306a36Sopenharmony_ci return true; 44162306a36Sopenharmony_ci } 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ciout: 44462306a36Sopenharmony_ci rcu_read_unlock(); 44562306a36Sopenharmony_ci return false; 44662306a36Sopenharmony_ci} 44762306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_tx_peek_desc); 44862306a36Sopenharmony_ci 44962306a36Sopenharmony_cistatic u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) 45062306a36Sopenharmony_ci{ 45162306a36Sopenharmony_ci struct xdp_desc *descs = pool->tx_descs; 45262306a36Sopenharmony_ci u32 nb_pkts = 0; 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) 45562306a36Sopenharmony_ci nb_pkts++; 45662306a36Sopenharmony_ci 45762306a36Sopenharmony_ci xsk_tx_release(pool); 45862306a36Sopenharmony_ci return nb_pkts; 45962306a36Sopenharmony_ci} 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_ciu32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) 46262306a36Sopenharmony_ci{ 46362306a36Sopenharmony_ci struct xdp_sock *xs; 46462306a36Sopenharmony_ci 46562306a36Sopenharmony_ci rcu_read_lock(); 46662306a36Sopenharmony_ci if (!list_is_singular(&pool->xsk_tx_list)) { 46762306a36Sopenharmony_ci /* Fallback to the non-batched version */ 46862306a36Sopenharmony_ci rcu_read_unlock(); 46962306a36Sopenharmony_ci return xsk_tx_peek_release_fallback(pool, nb_pkts); 47062306a36Sopenharmony_ci } 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); 47362306a36Sopenharmony_ci if (!xs) { 47462306a36Sopenharmony_ci nb_pkts = 0; 47562306a36Sopenharmony_ci goto out; 47662306a36Sopenharmony_ci } 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_ci /* This is the backpressure mechanism for the Tx path. Try to 48162306a36Sopenharmony_ci * reserve space in the completion queue for all packets, but 48262306a36Sopenharmony_ci * if there are fewer slots available, just process that many 48362306a36Sopenharmony_ci * packets. This avoids having to implement any buffering in 48462306a36Sopenharmony_ci * the Tx path. 48562306a36Sopenharmony_ci */ 48662306a36Sopenharmony_ci nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); 48762306a36Sopenharmony_ci if (!nb_pkts) 48862306a36Sopenharmony_ci goto out; 48962306a36Sopenharmony_ci 49062306a36Sopenharmony_ci nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); 49162306a36Sopenharmony_ci if (!nb_pkts) { 49262306a36Sopenharmony_ci xs->tx->queue_empty_descs++; 49362306a36Sopenharmony_ci goto out; 49462306a36Sopenharmony_ci } 49562306a36Sopenharmony_ci 49662306a36Sopenharmony_ci __xskq_cons_release(xs->tx); 49762306a36Sopenharmony_ci xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); 49862306a36Sopenharmony_ci xs->sk.sk_write_space(&xs->sk); 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ciout: 50162306a36Sopenharmony_ci rcu_read_unlock(); 50262306a36Sopenharmony_ci return nb_pkts; 50362306a36Sopenharmony_ci} 50462306a36Sopenharmony_ciEXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_cistatic int xsk_wakeup(struct xdp_sock *xs, u8 flags) 50762306a36Sopenharmony_ci{ 50862306a36Sopenharmony_ci struct net_device *dev = xs->dev; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); 51162306a36Sopenharmony_ci} 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_cistatic int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) 51462306a36Sopenharmony_ci{ 51562306a36Sopenharmony_ci unsigned long flags; 51662306a36Sopenharmony_ci int ret; 51762306a36Sopenharmony_ci 51862306a36Sopenharmony_ci spin_lock_irqsave(&xs->pool->cq_lock, flags); 51962306a36Sopenharmony_ci ret = xskq_prod_reserve_addr(xs->pool->cq, addr); 52062306a36Sopenharmony_ci spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 52162306a36Sopenharmony_ci 52262306a36Sopenharmony_ci return ret; 52362306a36Sopenharmony_ci} 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_cistatic void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) 52662306a36Sopenharmony_ci{ 52762306a36Sopenharmony_ci unsigned long flags; 52862306a36Sopenharmony_ci 52962306a36Sopenharmony_ci spin_lock_irqsave(&xs->pool->cq_lock, flags); 53062306a36Sopenharmony_ci xskq_prod_submit_n(xs->pool->cq, n); 53162306a36Sopenharmony_ci spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 53262306a36Sopenharmony_ci} 53362306a36Sopenharmony_ci 53462306a36Sopenharmony_cistatic void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) 53562306a36Sopenharmony_ci{ 53662306a36Sopenharmony_ci unsigned long flags; 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_ci spin_lock_irqsave(&xs->pool->cq_lock, flags); 53962306a36Sopenharmony_ci xskq_prod_cancel_n(xs->pool->cq, n); 54062306a36Sopenharmony_ci spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 54162306a36Sopenharmony_ci} 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_cistatic u32 xsk_get_num_desc(struct sk_buff *skb) 54462306a36Sopenharmony_ci{ 54562306a36Sopenharmony_ci return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; 54662306a36Sopenharmony_ci} 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_cistatic void xsk_destruct_skb(struct sk_buff *skb) 54962306a36Sopenharmony_ci{ 55062306a36Sopenharmony_ci xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); 55162306a36Sopenharmony_ci sock_wfree(skb); 55262306a36Sopenharmony_ci} 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_cistatic void xsk_set_destructor_arg(struct sk_buff *skb) 55562306a36Sopenharmony_ci{ 55662306a36Sopenharmony_ci long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; 55762306a36Sopenharmony_ci 55862306a36Sopenharmony_ci skb_shinfo(skb)->destructor_arg = (void *)num; 55962306a36Sopenharmony_ci} 56062306a36Sopenharmony_ci 56162306a36Sopenharmony_cistatic void xsk_consume_skb(struct sk_buff *skb) 56262306a36Sopenharmony_ci{ 56362306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(skb->sk); 56462306a36Sopenharmony_ci 56562306a36Sopenharmony_ci skb->destructor = sock_wfree; 56662306a36Sopenharmony_ci xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); 56762306a36Sopenharmony_ci /* Free skb without triggering the perf drop trace */ 56862306a36Sopenharmony_ci consume_skb(skb); 56962306a36Sopenharmony_ci xs->skb = NULL; 57062306a36Sopenharmony_ci} 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_cistatic void xsk_drop_skb(struct sk_buff *skb) 57362306a36Sopenharmony_ci{ 57462306a36Sopenharmony_ci xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); 57562306a36Sopenharmony_ci xsk_consume_skb(skb); 57662306a36Sopenharmony_ci} 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_cistatic struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, 57962306a36Sopenharmony_ci struct xdp_desc *desc) 58062306a36Sopenharmony_ci{ 58162306a36Sopenharmony_ci struct xsk_buff_pool *pool = xs->pool; 58262306a36Sopenharmony_ci u32 hr, len, ts, offset, copy, copied; 58362306a36Sopenharmony_ci struct sk_buff *skb = xs->skb; 58462306a36Sopenharmony_ci struct page *page; 58562306a36Sopenharmony_ci void *buffer; 58662306a36Sopenharmony_ci int err, i; 58762306a36Sopenharmony_ci u64 addr; 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci if (!skb) { 59062306a36Sopenharmony_ci hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); 59162306a36Sopenharmony_ci 59262306a36Sopenharmony_ci skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); 59362306a36Sopenharmony_ci if (unlikely(!skb)) 59462306a36Sopenharmony_ci return ERR_PTR(err); 59562306a36Sopenharmony_ci 59662306a36Sopenharmony_ci skb_reserve(skb, hr); 59762306a36Sopenharmony_ci } 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_ci addr = desc->addr; 60062306a36Sopenharmony_ci len = desc->len; 60162306a36Sopenharmony_ci ts = pool->unaligned ? len : pool->chunk_size; 60262306a36Sopenharmony_ci 60362306a36Sopenharmony_ci buffer = xsk_buff_raw_get_data(pool, addr); 60462306a36Sopenharmony_ci offset = offset_in_page(buffer); 60562306a36Sopenharmony_ci addr = buffer - pool->addrs; 60662306a36Sopenharmony_ci 60762306a36Sopenharmony_ci for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { 60862306a36Sopenharmony_ci if (unlikely(i >= MAX_SKB_FRAGS)) 60962306a36Sopenharmony_ci return ERR_PTR(-EOVERFLOW); 61062306a36Sopenharmony_ci 61162306a36Sopenharmony_ci page = pool->umem->pgs[addr >> PAGE_SHIFT]; 61262306a36Sopenharmony_ci get_page(page); 61362306a36Sopenharmony_ci 61462306a36Sopenharmony_ci copy = min_t(u32, PAGE_SIZE - offset, len - copied); 61562306a36Sopenharmony_ci skb_fill_page_desc(skb, i, page, offset, copy); 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci copied += copy; 61862306a36Sopenharmony_ci addr += copy; 61962306a36Sopenharmony_ci offset = 0; 62062306a36Sopenharmony_ci } 62162306a36Sopenharmony_ci 62262306a36Sopenharmony_ci skb->len += len; 62362306a36Sopenharmony_ci skb->data_len += len; 62462306a36Sopenharmony_ci skb->truesize += ts; 62562306a36Sopenharmony_ci 62662306a36Sopenharmony_ci refcount_add(ts, &xs->sk.sk_wmem_alloc); 62762306a36Sopenharmony_ci 62862306a36Sopenharmony_ci return skb; 62962306a36Sopenharmony_ci} 63062306a36Sopenharmony_ci 63162306a36Sopenharmony_cistatic struct sk_buff *xsk_build_skb(struct xdp_sock *xs, 63262306a36Sopenharmony_ci struct xdp_desc *desc) 63362306a36Sopenharmony_ci{ 63462306a36Sopenharmony_ci struct net_device *dev = xs->dev; 63562306a36Sopenharmony_ci struct sk_buff *skb = xs->skb; 63662306a36Sopenharmony_ci int err; 63762306a36Sopenharmony_ci 63862306a36Sopenharmony_ci if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { 63962306a36Sopenharmony_ci skb = xsk_build_skb_zerocopy(xs, desc); 64062306a36Sopenharmony_ci if (IS_ERR(skb)) { 64162306a36Sopenharmony_ci err = PTR_ERR(skb); 64262306a36Sopenharmony_ci goto free_err; 64362306a36Sopenharmony_ci } 64462306a36Sopenharmony_ci } else { 64562306a36Sopenharmony_ci u32 hr, tr, len; 64662306a36Sopenharmony_ci void *buffer; 64762306a36Sopenharmony_ci 64862306a36Sopenharmony_ci buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); 64962306a36Sopenharmony_ci len = desc->len; 65062306a36Sopenharmony_ci 65162306a36Sopenharmony_ci if (!skb) { 65262306a36Sopenharmony_ci hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); 65362306a36Sopenharmony_ci tr = dev->needed_tailroom; 65462306a36Sopenharmony_ci skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); 65562306a36Sopenharmony_ci if (unlikely(!skb)) 65662306a36Sopenharmony_ci goto free_err; 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci skb_reserve(skb, hr); 65962306a36Sopenharmony_ci skb_put(skb, len); 66062306a36Sopenharmony_ci 66162306a36Sopenharmony_ci err = skb_store_bits(skb, 0, buffer, len); 66262306a36Sopenharmony_ci if (unlikely(err)) { 66362306a36Sopenharmony_ci kfree_skb(skb); 66462306a36Sopenharmony_ci goto free_err; 66562306a36Sopenharmony_ci } 66662306a36Sopenharmony_ci } else { 66762306a36Sopenharmony_ci int nr_frags = skb_shinfo(skb)->nr_frags; 66862306a36Sopenharmony_ci struct page *page; 66962306a36Sopenharmony_ci u8 *vaddr; 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { 67262306a36Sopenharmony_ci err = -EOVERFLOW; 67362306a36Sopenharmony_ci goto free_err; 67462306a36Sopenharmony_ci } 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci page = alloc_page(xs->sk.sk_allocation); 67762306a36Sopenharmony_ci if (unlikely(!page)) { 67862306a36Sopenharmony_ci err = -EAGAIN; 67962306a36Sopenharmony_ci goto free_err; 68062306a36Sopenharmony_ci } 68162306a36Sopenharmony_ci 68262306a36Sopenharmony_ci vaddr = kmap_local_page(page); 68362306a36Sopenharmony_ci memcpy(vaddr, buffer, len); 68462306a36Sopenharmony_ci kunmap_local(vaddr); 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); 68762306a36Sopenharmony_ci refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); 68862306a36Sopenharmony_ci } 68962306a36Sopenharmony_ci } 69062306a36Sopenharmony_ci 69162306a36Sopenharmony_ci skb->dev = dev; 69262306a36Sopenharmony_ci skb->priority = xs->sk.sk_priority; 69362306a36Sopenharmony_ci skb->mark = READ_ONCE(xs->sk.sk_mark); 69462306a36Sopenharmony_ci skb->destructor = xsk_destruct_skb; 69562306a36Sopenharmony_ci xsk_set_destructor_arg(skb); 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci return skb; 69862306a36Sopenharmony_ci 69962306a36Sopenharmony_cifree_err: 70062306a36Sopenharmony_ci if (err == -EOVERFLOW) { 70162306a36Sopenharmony_ci /* Drop the packet */ 70262306a36Sopenharmony_ci xsk_set_destructor_arg(xs->skb); 70362306a36Sopenharmony_ci xsk_drop_skb(xs->skb); 70462306a36Sopenharmony_ci xskq_cons_release(xs->tx); 70562306a36Sopenharmony_ci } else { 70662306a36Sopenharmony_ci /* Let application retry */ 70762306a36Sopenharmony_ci xsk_cq_cancel_locked(xs, 1); 70862306a36Sopenharmony_ci } 70962306a36Sopenharmony_ci 71062306a36Sopenharmony_ci return ERR_PTR(err); 71162306a36Sopenharmony_ci} 71262306a36Sopenharmony_ci 71362306a36Sopenharmony_cistatic int __xsk_generic_xmit(struct sock *sk) 71462306a36Sopenharmony_ci{ 71562306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 71662306a36Sopenharmony_ci u32 max_batch = TX_BATCH_SIZE; 71762306a36Sopenharmony_ci bool sent_frame = false; 71862306a36Sopenharmony_ci struct xdp_desc desc; 71962306a36Sopenharmony_ci struct sk_buff *skb; 72062306a36Sopenharmony_ci int err = 0; 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci mutex_lock(&xs->mutex); 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci /* Since we dropped the RCU read lock, the socket state might have changed. */ 72562306a36Sopenharmony_ci if (unlikely(!xsk_is_bound(xs))) { 72662306a36Sopenharmony_ci err = -ENXIO; 72762306a36Sopenharmony_ci goto out; 72862306a36Sopenharmony_ci } 72962306a36Sopenharmony_ci 73062306a36Sopenharmony_ci if (xs->queue_id >= xs->dev->real_num_tx_queues) 73162306a36Sopenharmony_ci goto out; 73262306a36Sopenharmony_ci 73362306a36Sopenharmony_ci while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { 73462306a36Sopenharmony_ci if (max_batch-- == 0) { 73562306a36Sopenharmony_ci err = -EAGAIN; 73662306a36Sopenharmony_ci goto out; 73762306a36Sopenharmony_ci } 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci /* This is the backpressure mechanism for the Tx path. 74062306a36Sopenharmony_ci * Reserve space in the completion queue and only proceed 74162306a36Sopenharmony_ci * if there is space in it. This avoids having to implement 74262306a36Sopenharmony_ci * any buffering in the Tx path. 74362306a36Sopenharmony_ci */ 74462306a36Sopenharmony_ci if (xsk_cq_reserve_addr_locked(xs, desc.addr)) 74562306a36Sopenharmony_ci goto out; 74662306a36Sopenharmony_ci 74762306a36Sopenharmony_ci skb = xsk_build_skb(xs, &desc); 74862306a36Sopenharmony_ci if (IS_ERR(skb)) { 74962306a36Sopenharmony_ci err = PTR_ERR(skb); 75062306a36Sopenharmony_ci if (err != -EOVERFLOW) 75162306a36Sopenharmony_ci goto out; 75262306a36Sopenharmony_ci err = 0; 75362306a36Sopenharmony_ci continue; 75462306a36Sopenharmony_ci } 75562306a36Sopenharmony_ci 75662306a36Sopenharmony_ci xskq_cons_release(xs->tx); 75762306a36Sopenharmony_ci 75862306a36Sopenharmony_ci if (xp_mb_desc(&desc)) { 75962306a36Sopenharmony_ci xs->skb = skb; 76062306a36Sopenharmony_ci continue; 76162306a36Sopenharmony_ci } 76262306a36Sopenharmony_ci 76362306a36Sopenharmony_ci err = __dev_direct_xmit(skb, xs->queue_id); 76462306a36Sopenharmony_ci if (err == NETDEV_TX_BUSY) { 76562306a36Sopenharmony_ci /* Tell user-space to retry the send */ 76662306a36Sopenharmony_ci xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); 76762306a36Sopenharmony_ci xsk_consume_skb(skb); 76862306a36Sopenharmony_ci err = -EAGAIN; 76962306a36Sopenharmony_ci goto out; 77062306a36Sopenharmony_ci } 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci /* Ignore NET_XMIT_CN as packet might have been sent */ 77362306a36Sopenharmony_ci if (err == NET_XMIT_DROP) { 77462306a36Sopenharmony_ci /* SKB completed but not sent */ 77562306a36Sopenharmony_ci err = -EBUSY; 77662306a36Sopenharmony_ci xs->skb = NULL; 77762306a36Sopenharmony_ci goto out; 77862306a36Sopenharmony_ci } 77962306a36Sopenharmony_ci 78062306a36Sopenharmony_ci sent_frame = true; 78162306a36Sopenharmony_ci xs->skb = NULL; 78262306a36Sopenharmony_ci } 78362306a36Sopenharmony_ci 78462306a36Sopenharmony_ci if (xskq_has_descs(xs->tx)) { 78562306a36Sopenharmony_ci if (xs->skb) 78662306a36Sopenharmony_ci xsk_drop_skb(xs->skb); 78762306a36Sopenharmony_ci xskq_cons_release(xs->tx); 78862306a36Sopenharmony_ci } 78962306a36Sopenharmony_ci 79062306a36Sopenharmony_ciout: 79162306a36Sopenharmony_ci if (sent_frame) 79262306a36Sopenharmony_ci if (xsk_tx_writeable(xs)) 79362306a36Sopenharmony_ci sk->sk_write_space(sk); 79462306a36Sopenharmony_ci 79562306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 79662306a36Sopenharmony_ci return err; 79762306a36Sopenharmony_ci} 79862306a36Sopenharmony_ci 79962306a36Sopenharmony_cistatic int xsk_generic_xmit(struct sock *sk) 80062306a36Sopenharmony_ci{ 80162306a36Sopenharmony_ci int ret; 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_ci /* Drop the RCU lock since the SKB path might sleep. */ 80462306a36Sopenharmony_ci rcu_read_unlock(); 80562306a36Sopenharmony_ci ret = __xsk_generic_xmit(sk); 80662306a36Sopenharmony_ci /* Reaquire RCU lock before going into common code. */ 80762306a36Sopenharmony_ci rcu_read_lock(); 80862306a36Sopenharmony_ci 80962306a36Sopenharmony_ci return ret; 81062306a36Sopenharmony_ci} 81162306a36Sopenharmony_ci 81262306a36Sopenharmony_cistatic bool xsk_no_wakeup(struct sock *sk) 81362306a36Sopenharmony_ci{ 81462306a36Sopenharmony_ci#ifdef CONFIG_NET_RX_BUSY_POLL 81562306a36Sopenharmony_ci /* Prefer busy-polling, skip the wakeup. */ 81662306a36Sopenharmony_ci return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && 81762306a36Sopenharmony_ci READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; 81862306a36Sopenharmony_ci#else 81962306a36Sopenharmony_ci return false; 82062306a36Sopenharmony_ci#endif 82162306a36Sopenharmony_ci} 82262306a36Sopenharmony_ci 82362306a36Sopenharmony_cistatic int xsk_check_common(struct xdp_sock *xs) 82462306a36Sopenharmony_ci{ 82562306a36Sopenharmony_ci if (unlikely(!xsk_is_bound(xs))) 82662306a36Sopenharmony_ci return -ENXIO; 82762306a36Sopenharmony_ci if (unlikely(!(xs->dev->flags & IFF_UP))) 82862306a36Sopenharmony_ci return -ENETDOWN; 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci return 0; 83162306a36Sopenharmony_ci} 83262306a36Sopenharmony_ci 83362306a36Sopenharmony_cistatic int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 83462306a36Sopenharmony_ci{ 83562306a36Sopenharmony_ci bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 83662306a36Sopenharmony_ci struct sock *sk = sock->sk; 83762306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 83862306a36Sopenharmony_ci struct xsk_buff_pool *pool; 83962306a36Sopenharmony_ci int err; 84062306a36Sopenharmony_ci 84162306a36Sopenharmony_ci err = xsk_check_common(xs); 84262306a36Sopenharmony_ci if (err) 84362306a36Sopenharmony_ci return err; 84462306a36Sopenharmony_ci if (unlikely(need_wait)) 84562306a36Sopenharmony_ci return -EOPNOTSUPP; 84662306a36Sopenharmony_ci if (unlikely(!xs->tx)) 84762306a36Sopenharmony_ci return -ENOBUFS; 84862306a36Sopenharmony_ci 84962306a36Sopenharmony_ci if (sk_can_busy_loop(sk)) { 85062306a36Sopenharmony_ci if (xs->zc) 85162306a36Sopenharmony_ci __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool)); 85262306a36Sopenharmony_ci sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 85362306a36Sopenharmony_ci } 85462306a36Sopenharmony_ci 85562306a36Sopenharmony_ci if (xs->zc && xsk_no_wakeup(sk)) 85662306a36Sopenharmony_ci return 0; 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci pool = xs->pool; 85962306a36Sopenharmony_ci if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { 86062306a36Sopenharmony_ci if (xs->zc) 86162306a36Sopenharmony_ci return xsk_wakeup(xs, XDP_WAKEUP_TX); 86262306a36Sopenharmony_ci return xsk_generic_xmit(sk); 86362306a36Sopenharmony_ci } 86462306a36Sopenharmony_ci return 0; 86562306a36Sopenharmony_ci} 86662306a36Sopenharmony_ci 86762306a36Sopenharmony_cistatic int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 86862306a36Sopenharmony_ci{ 86962306a36Sopenharmony_ci int ret; 87062306a36Sopenharmony_ci 87162306a36Sopenharmony_ci rcu_read_lock(); 87262306a36Sopenharmony_ci ret = __xsk_sendmsg(sock, m, total_len); 87362306a36Sopenharmony_ci rcu_read_unlock(); 87462306a36Sopenharmony_ci 87562306a36Sopenharmony_ci return ret; 87662306a36Sopenharmony_ci} 87762306a36Sopenharmony_ci 87862306a36Sopenharmony_cistatic int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 87962306a36Sopenharmony_ci{ 88062306a36Sopenharmony_ci bool need_wait = !(flags & MSG_DONTWAIT); 88162306a36Sopenharmony_ci struct sock *sk = sock->sk; 88262306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 88362306a36Sopenharmony_ci int err; 88462306a36Sopenharmony_ci 88562306a36Sopenharmony_ci err = xsk_check_common(xs); 88662306a36Sopenharmony_ci if (err) 88762306a36Sopenharmony_ci return err; 88862306a36Sopenharmony_ci if (unlikely(!xs->rx)) 88962306a36Sopenharmony_ci return -ENOBUFS; 89062306a36Sopenharmony_ci if (unlikely(need_wait)) 89162306a36Sopenharmony_ci return -EOPNOTSUPP; 89262306a36Sopenharmony_ci 89362306a36Sopenharmony_ci if (sk_can_busy_loop(sk)) 89462306a36Sopenharmony_ci sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 89562306a36Sopenharmony_ci 89662306a36Sopenharmony_ci if (xsk_no_wakeup(sk)) 89762306a36Sopenharmony_ci return 0; 89862306a36Sopenharmony_ci 89962306a36Sopenharmony_ci if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) 90062306a36Sopenharmony_ci return xsk_wakeup(xs, XDP_WAKEUP_RX); 90162306a36Sopenharmony_ci return 0; 90262306a36Sopenharmony_ci} 90362306a36Sopenharmony_ci 90462306a36Sopenharmony_cistatic int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 90562306a36Sopenharmony_ci{ 90662306a36Sopenharmony_ci int ret; 90762306a36Sopenharmony_ci 90862306a36Sopenharmony_ci rcu_read_lock(); 90962306a36Sopenharmony_ci ret = __xsk_recvmsg(sock, m, len, flags); 91062306a36Sopenharmony_ci rcu_read_unlock(); 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ci return ret; 91362306a36Sopenharmony_ci} 91462306a36Sopenharmony_ci 91562306a36Sopenharmony_cistatic __poll_t xsk_poll(struct file *file, struct socket *sock, 91662306a36Sopenharmony_ci struct poll_table_struct *wait) 91762306a36Sopenharmony_ci{ 91862306a36Sopenharmony_ci __poll_t mask = 0; 91962306a36Sopenharmony_ci struct sock *sk = sock->sk; 92062306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 92162306a36Sopenharmony_ci struct xsk_buff_pool *pool; 92262306a36Sopenharmony_ci 92362306a36Sopenharmony_ci sock_poll_wait(file, sock, wait); 92462306a36Sopenharmony_ci 92562306a36Sopenharmony_ci rcu_read_lock(); 92662306a36Sopenharmony_ci if (xsk_check_common(xs)) 92762306a36Sopenharmony_ci goto out; 92862306a36Sopenharmony_ci 92962306a36Sopenharmony_ci pool = xs->pool; 93062306a36Sopenharmony_ci 93162306a36Sopenharmony_ci if (pool->cached_need_wakeup) { 93262306a36Sopenharmony_ci if (xs->zc) 93362306a36Sopenharmony_ci xsk_wakeup(xs, pool->cached_need_wakeup); 93462306a36Sopenharmony_ci else if (xs->tx) 93562306a36Sopenharmony_ci /* Poll needs to drive Tx also in copy mode */ 93662306a36Sopenharmony_ci xsk_generic_xmit(sk); 93762306a36Sopenharmony_ci } 93862306a36Sopenharmony_ci 93962306a36Sopenharmony_ci if (xs->rx && !xskq_prod_is_empty(xs->rx)) 94062306a36Sopenharmony_ci mask |= EPOLLIN | EPOLLRDNORM; 94162306a36Sopenharmony_ci if (xs->tx && xsk_tx_writeable(xs)) 94262306a36Sopenharmony_ci mask |= EPOLLOUT | EPOLLWRNORM; 94362306a36Sopenharmony_ciout: 94462306a36Sopenharmony_ci rcu_read_unlock(); 94562306a36Sopenharmony_ci return mask; 94662306a36Sopenharmony_ci} 94762306a36Sopenharmony_ci 94862306a36Sopenharmony_cistatic int xsk_init_queue(u32 entries, struct xsk_queue **queue, 94962306a36Sopenharmony_ci bool umem_queue) 95062306a36Sopenharmony_ci{ 95162306a36Sopenharmony_ci struct xsk_queue *q; 95262306a36Sopenharmony_ci 95362306a36Sopenharmony_ci if (entries == 0 || *queue || !is_power_of_2(entries)) 95462306a36Sopenharmony_ci return -EINVAL; 95562306a36Sopenharmony_ci 95662306a36Sopenharmony_ci q = xskq_create(entries, umem_queue); 95762306a36Sopenharmony_ci if (!q) 95862306a36Sopenharmony_ci return -ENOMEM; 95962306a36Sopenharmony_ci 96062306a36Sopenharmony_ci /* Make sure queue is ready before it can be seen by others */ 96162306a36Sopenharmony_ci smp_wmb(); 96262306a36Sopenharmony_ci WRITE_ONCE(*queue, q); 96362306a36Sopenharmony_ci return 0; 96462306a36Sopenharmony_ci} 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_cistatic void xsk_unbind_dev(struct xdp_sock *xs) 96762306a36Sopenharmony_ci{ 96862306a36Sopenharmony_ci struct net_device *dev = xs->dev; 96962306a36Sopenharmony_ci 97062306a36Sopenharmony_ci if (xs->state != XSK_BOUND) 97162306a36Sopenharmony_ci return; 97262306a36Sopenharmony_ci WRITE_ONCE(xs->state, XSK_UNBOUND); 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci /* Wait for driver to stop using the xdp socket. */ 97562306a36Sopenharmony_ci xp_del_xsk(xs->pool, xs); 97662306a36Sopenharmony_ci synchronize_net(); 97762306a36Sopenharmony_ci dev_put(dev); 97862306a36Sopenharmony_ci} 97962306a36Sopenharmony_ci 98062306a36Sopenharmony_cistatic struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, 98162306a36Sopenharmony_ci struct xdp_sock __rcu ***map_entry) 98262306a36Sopenharmony_ci{ 98362306a36Sopenharmony_ci struct xsk_map *map = NULL; 98462306a36Sopenharmony_ci struct xsk_map_node *node; 98562306a36Sopenharmony_ci 98662306a36Sopenharmony_ci *map_entry = NULL; 98762306a36Sopenharmony_ci 98862306a36Sopenharmony_ci spin_lock_bh(&xs->map_list_lock); 98962306a36Sopenharmony_ci node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 99062306a36Sopenharmony_ci node); 99162306a36Sopenharmony_ci if (node) { 99262306a36Sopenharmony_ci bpf_map_inc(&node->map->map); 99362306a36Sopenharmony_ci map = node->map; 99462306a36Sopenharmony_ci *map_entry = node->map_entry; 99562306a36Sopenharmony_ci } 99662306a36Sopenharmony_ci spin_unlock_bh(&xs->map_list_lock); 99762306a36Sopenharmony_ci return map; 99862306a36Sopenharmony_ci} 99962306a36Sopenharmony_ci 100062306a36Sopenharmony_cistatic void xsk_delete_from_maps(struct xdp_sock *xs) 100162306a36Sopenharmony_ci{ 100262306a36Sopenharmony_ci /* This function removes the current XDP socket from all the 100362306a36Sopenharmony_ci * maps it resides in. We need to take extra care here, due to 100462306a36Sopenharmony_ci * the two locks involved. Each map has a lock synchronizing 100562306a36Sopenharmony_ci * updates to the entries, and each socket has a lock that 100662306a36Sopenharmony_ci * synchronizes access to the list of maps (map_list). For 100762306a36Sopenharmony_ci * deadlock avoidance the locks need to be taken in the order 100862306a36Sopenharmony_ci * "map lock"->"socket map list lock". We start off by 100962306a36Sopenharmony_ci * accessing the socket map list, and take a reference to the 101062306a36Sopenharmony_ci * map to guarantee existence between the 101162306a36Sopenharmony_ci * xsk_get_map_list_entry() and xsk_map_try_sock_delete() 101262306a36Sopenharmony_ci * calls. Then we ask the map to remove the socket, which 101362306a36Sopenharmony_ci * tries to remove the socket from the map. Note that there 101462306a36Sopenharmony_ci * might be updates to the map between 101562306a36Sopenharmony_ci * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 101662306a36Sopenharmony_ci */ 101762306a36Sopenharmony_ci struct xdp_sock __rcu **map_entry = NULL; 101862306a36Sopenharmony_ci struct xsk_map *map; 101962306a36Sopenharmony_ci 102062306a36Sopenharmony_ci while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 102162306a36Sopenharmony_ci xsk_map_try_sock_delete(map, xs, map_entry); 102262306a36Sopenharmony_ci bpf_map_put(&map->map); 102362306a36Sopenharmony_ci } 102462306a36Sopenharmony_ci} 102562306a36Sopenharmony_ci 102662306a36Sopenharmony_cistatic int xsk_release(struct socket *sock) 102762306a36Sopenharmony_ci{ 102862306a36Sopenharmony_ci struct sock *sk = sock->sk; 102962306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 103062306a36Sopenharmony_ci struct net *net; 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci if (!sk) 103362306a36Sopenharmony_ci return 0; 103462306a36Sopenharmony_ci 103562306a36Sopenharmony_ci net = sock_net(sk); 103662306a36Sopenharmony_ci 103762306a36Sopenharmony_ci if (xs->skb) 103862306a36Sopenharmony_ci xsk_drop_skb(xs->skb); 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci mutex_lock(&net->xdp.lock); 104162306a36Sopenharmony_ci sk_del_node_init_rcu(sk); 104262306a36Sopenharmony_ci mutex_unlock(&net->xdp.lock); 104362306a36Sopenharmony_ci 104462306a36Sopenharmony_ci sock_prot_inuse_add(net, sk->sk_prot, -1); 104562306a36Sopenharmony_ci 104662306a36Sopenharmony_ci xsk_delete_from_maps(xs); 104762306a36Sopenharmony_ci mutex_lock(&xs->mutex); 104862306a36Sopenharmony_ci xsk_unbind_dev(xs); 104962306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 105062306a36Sopenharmony_ci 105162306a36Sopenharmony_ci xskq_destroy(xs->rx); 105262306a36Sopenharmony_ci xskq_destroy(xs->tx); 105362306a36Sopenharmony_ci xskq_destroy(xs->fq_tmp); 105462306a36Sopenharmony_ci xskq_destroy(xs->cq_tmp); 105562306a36Sopenharmony_ci 105662306a36Sopenharmony_ci sock_orphan(sk); 105762306a36Sopenharmony_ci sock->sk = NULL; 105862306a36Sopenharmony_ci 105962306a36Sopenharmony_ci sock_put(sk); 106062306a36Sopenharmony_ci 106162306a36Sopenharmony_ci return 0; 106262306a36Sopenharmony_ci} 106362306a36Sopenharmony_ci 106462306a36Sopenharmony_cistatic struct socket *xsk_lookup_xsk_from_fd(int fd) 106562306a36Sopenharmony_ci{ 106662306a36Sopenharmony_ci struct socket *sock; 106762306a36Sopenharmony_ci int err; 106862306a36Sopenharmony_ci 106962306a36Sopenharmony_ci sock = sockfd_lookup(fd, &err); 107062306a36Sopenharmony_ci if (!sock) 107162306a36Sopenharmony_ci return ERR_PTR(-ENOTSOCK); 107262306a36Sopenharmony_ci 107362306a36Sopenharmony_ci if (sock->sk->sk_family != PF_XDP) { 107462306a36Sopenharmony_ci sockfd_put(sock); 107562306a36Sopenharmony_ci return ERR_PTR(-ENOPROTOOPT); 107662306a36Sopenharmony_ci } 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci return sock; 107962306a36Sopenharmony_ci} 108062306a36Sopenharmony_ci 108162306a36Sopenharmony_cistatic bool xsk_validate_queues(struct xdp_sock *xs) 108262306a36Sopenharmony_ci{ 108362306a36Sopenharmony_ci return xs->fq_tmp && xs->cq_tmp; 108462306a36Sopenharmony_ci} 108562306a36Sopenharmony_ci 108662306a36Sopenharmony_cistatic int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 108762306a36Sopenharmony_ci{ 108862306a36Sopenharmony_ci struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 108962306a36Sopenharmony_ci struct sock *sk = sock->sk; 109062306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 109162306a36Sopenharmony_ci struct net_device *dev; 109262306a36Sopenharmony_ci int bound_dev_if; 109362306a36Sopenharmony_ci u32 flags, qid; 109462306a36Sopenharmony_ci int err = 0; 109562306a36Sopenharmony_ci 109662306a36Sopenharmony_ci if (addr_len < sizeof(struct sockaddr_xdp)) 109762306a36Sopenharmony_ci return -EINVAL; 109862306a36Sopenharmony_ci if (sxdp->sxdp_family != AF_XDP) 109962306a36Sopenharmony_ci return -EINVAL; 110062306a36Sopenharmony_ci 110162306a36Sopenharmony_ci flags = sxdp->sxdp_flags; 110262306a36Sopenharmony_ci if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | 110362306a36Sopenharmony_ci XDP_USE_NEED_WAKEUP | XDP_USE_SG)) 110462306a36Sopenharmony_ci return -EINVAL; 110562306a36Sopenharmony_ci 110662306a36Sopenharmony_ci bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 110762306a36Sopenharmony_ci if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) 110862306a36Sopenharmony_ci return -EINVAL; 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci rtnl_lock(); 111162306a36Sopenharmony_ci mutex_lock(&xs->mutex); 111262306a36Sopenharmony_ci if (xs->state != XSK_READY) { 111362306a36Sopenharmony_ci err = -EBUSY; 111462306a36Sopenharmony_ci goto out_release; 111562306a36Sopenharmony_ci } 111662306a36Sopenharmony_ci 111762306a36Sopenharmony_ci dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 111862306a36Sopenharmony_ci if (!dev) { 111962306a36Sopenharmony_ci err = -ENODEV; 112062306a36Sopenharmony_ci goto out_release; 112162306a36Sopenharmony_ci } 112262306a36Sopenharmony_ci 112362306a36Sopenharmony_ci if (!xs->rx && !xs->tx) { 112462306a36Sopenharmony_ci err = -EINVAL; 112562306a36Sopenharmony_ci goto out_unlock; 112662306a36Sopenharmony_ci } 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_ci qid = sxdp->sxdp_queue_id; 112962306a36Sopenharmony_ci 113062306a36Sopenharmony_ci if (flags & XDP_SHARED_UMEM) { 113162306a36Sopenharmony_ci struct xdp_sock *umem_xs; 113262306a36Sopenharmony_ci struct socket *sock; 113362306a36Sopenharmony_ci 113462306a36Sopenharmony_ci if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || 113562306a36Sopenharmony_ci (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { 113662306a36Sopenharmony_ci /* Cannot specify flags for shared sockets. */ 113762306a36Sopenharmony_ci err = -EINVAL; 113862306a36Sopenharmony_ci goto out_unlock; 113962306a36Sopenharmony_ci } 114062306a36Sopenharmony_ci 114162306a36Sopenharmony_ci if (xs->umem) { 114262306a36Sopenharmony_ci /* We have already our own. */ 114362306a36Sopenharmony_ci err = -EINVAL; 114462306a36Sopenharmony_ci goto out_unlock; 114562306a36Sopenharmony_ci } 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); 114862306a36Sopenharmony_ci if (IS_ERR(sock)) { 114962306a36Sopenharmony_ci err = PTR_ERR(sock); 115062306a36Sopenharmony_ci goto out_unlock; 115162306a36Sopenharmony_ci } 115262306a36Sopenharmony_ci 115362306a36Sopenharmony_ci umem_xs = xdp_sk(sock->sk); 115462306a36Sopenharmony_ci if (!xsk_is_bound(umem_xs)) { 115562306a36Sopenharmony_ci err = -EBADF; 115662306a36Sopenharmony_ci sockfd_put(sock); 115762306a36Sopenharmony_ci goto out_unlock; 115862306a36Sopenharmony_ci } 115962306a36Sopenharmony_ci 116062306a36Sopenharmony_ci if (umem_xs->queue_id != qid || umem_xs->dev != dev) { 116162306a36Sopenharmony_ci /* Share the umem with another socket on another qid 116262306a36Sopenharmony_ci * and/or device. 116362306a36Sopenharmony_ci */ 116462306a36Sopenharmony_ci xs->pool = xp_create_and_assign_umem(xs, 116562306a36Sopenharmony_ci umem_xs->umem); 116662306a36Sopenharmony_ci if (!xs->pool) { 116762306a36Sopenharmony_ci err = -ENOMEM; 116862306a36Sopenharmony_ci sockfd_put(sock); 116962306a36Sopenharmony_ci goto out_unlock; 117062306a36Sopenharmony_ci } 117162306a36Sopenharmony_ci 117262306a36Sopenharmony_ci err = xp_assign_dev_shared(xs->pool, umem_xs, dev, 117362306a36Sopenharmony_ci qid); 117462306a36Sopenharmony_ci if (err) { 117562306a36Sopenharmony_ci xp_destroy(xs->pool); 117662306a36Sopenharmony_ci xs->pool = NULL; 117762306a36Sopenharmony_ci sockfd_put(sock); 117862306a36Sopenharmony_ci goto out_unlock; 117962306a36Sopenharmony_ci } 118062306a36Sopenharmony_ci } else { 118162306a36Sopenharmony_ci /* Share the buffer pool with the other socket. */ 118262306a36Sopenharmony_ci if (xs->fq_tmp || xs->cq_tmp) { 118362306a36Sopenharmony_ci /* Do not allow setting your own fq or cq. */ 118462306a36Sopenharmony_ci err = -EINVAL; 118562306a36Sopenharmony_ci sockfd_put(sock); 118662306a36Sopenharmony_ci goto out_unlock; 118762306a36Sopenharmony_ci } 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_ci xp_get_pool(umem_xs->pool); 119062306a36Sopenharmony_ci xs->pool = umem_xs->pool; 119162306a36Sopenharmony_ci 119262306a36Sopenharmony_ci /* If underlying shared umem was created without Tx 119362306a36Sopenharmony_ci * ring, allocate Tx descs array that Tx batching API 119462306a36Sopenharmony_ci * utilizes 119562306a36Sopenharmony_ci */ 119662306a36Sopenharmony_ci if (xs->tx && !xs->pool->tx_descs) { 119762306a36Sopenharmony_ci err = xp_alloc_tx_descs(xs->pool, xs); 119862306a36Sopenharmony_ci if (err) { 119962306a36Sopenharmony_ci xp_put_pool(xs->pool); 120062306a36Sopenharmony_ci xs->pool = NULL; 120162306a36Sopenharmony_ci sockfd_put(sock); 120262306a36Sopenharmony_ci goto out_unlock; 120362306a36Sopenharmony_ci } 120462306a36Sopenharmony_ci } 120562306a36Sopenharmony_ci } 120662306a36Sopenharmony_ci 120762306a36Sopenharmony_ci xdp_get_umem(umem_xs->umem); 120862306a36Sopenharmony_ci WRITE_ONCE(xs->umem, umem_xs->umem); 120962306a36Sopenharmony_ci sockfd_put(sock); 121062306a36Sopenharmony_ci } else if (!xs->umem || !xsk_validate_queues(xs)) { 121162306a36Sopenharmony_ci err = -EINVAL; 121262306a36Sopenharmony_ci goto out_unlock; 121362306a36Sopenharmony_ci } else { 121462306a36Sopenharmony_ci /* This xsk has its own umem. */ 121562306a36Sopenharmony_ci xs->pool = xp_create_and_assign_umem(xs, xs->umem); 121662306a36Sopenharmony_ci if (!xs->pool) { 121762306a36Sopenharmony_ci err = -ENOMEM; 121862306a36Sopenharmony_ci goto out_unlock; 121962306a36Sopenharmony_ci } 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_ci err = xp_assign_dev(xs->pool, dev, qid, flags); 122262306a36Sopenharmony_ci if (err) { 122362306a36Sopenharmony_ci xp_destroy(xs->pool); 122462306a36Sopenharmony_ci xs->pool = NULL; 122562306a36Sopenharmony_ci goto out_unlock; 122662306a36Sopenharmony_ci } 122762306a36Sopenharmony_ci } 122862306a36Sopenharmony_ci 122962306a36Sopenharmony_ci /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ 123062306a36Sopenharmony_ci xs->fq_tmp = NULL; 123162306a36Sopenharmony_ci xs->cq_tmp = NULL; 123262306a36Sopenharmony_ci 123362306a36Sopenharmony_ci xs->dev = dev; 123462306a36Sopenharmony_ci xs->zc = xs->umem->zc; 123562306a36Sopenharmony_ci xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); 123662306a36Sopenharmony_ci xs->queue_id = qid; 123762306a36Sopenharmony_ci xp_add_xsk(xs->pool, xs); 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ciout_unlock: 124062306a36Sopenharmony_ci if (err) { 124162306a36Sopenharmony_ci dev_put(dev); 124262306a36Sopenharmony_ci } else { 124362306a36Sopenharmony_ci /* Matches smp_rmb() in bind() for shared umem 124462306a36Sopenharmony_ci * sockets, and xsk_is_bound(). 124562306a36Sopenharmony_ci */ 124662306a36Sopenharmony_ci smp_wmb(); 124762306a36Sopenharmony_ci WRITE_ONCE(xs->state, XSK_BOUND); 124862306a36Sopenharmony_ci } 124962306a36Sopenharmony_ciout_release: 125062306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 125162306a36Sopenharmony_ci rtnl_unlock(); 125262306a36Sopenharmony_ci return err; 125362306a36Sopenharmony_ci} 125462306a36Sopenharmony_ci 125562306a36Sopenharmony_cistruct xdp_umem_reg_v1 { 125662306a36Sopenharmony_ci __u64 addr; /* Start of packet data area */ 125762306a36Sopenharmony_ci __u64 len; /* Length of packet data area */ 125862306a36Sopenharmony_ci __u32 chunk_size; 125962306a36Sopenharmony_ci __u32 headroom; 126062306a36Sopenharmony_ci}; 126162306a36Sopenharmony_ci 126262306a36Sopenharmony_cistatic int xsk_setsockopt(struct socket *sock, int level, int optname, 126362306a36Sopenharmony_ci sockptr_t optval, unsigned int optlen) 126462306a36Sopenharmony_ci{ 126562306a36Sopenharmony_ci struct sock *sk = sock->sk; 126662306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 126762306a36Sopenharmony_ci int err; 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ci if (level != SOL_XDP) 127062306a36Sopenharmony_ci return -ENOPROTOOPT; 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_ci switch (optname) { 127362306a36Sopenharmony_ci case XDP_RX_RING: 127462306a36Sopenharmony_ci case XDP_TX_RING: 127562306a36Sopenharmony_ci { 127662306a36Sopenharmony_ci struct xsk_queue **q; 127762306a36Sopenharmony_ci int entries; 127862306a36Sopenharmony_ci 127962306a36Sopenharmony_ci if (optlen < sizeof(entries)) 128062306a36Sopenharmony_ci return -EINVAL; 128162306a36Sopenharmony_ci if (copy_from_sockptr(&entries, optval, sizeof(entries))) 128262306a36Sopenharmony_ci return -EFAULT; 128362306a36Sopenharmony_ci 128462306a36Sopenharmony_ci mutex_lock(&xs->mutex); 128562306a36Sopenharmony_ci if (xs->state != XSK_READY) { 128662306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 128762306a36Sopenharmony_ci return -EBUSY; 128862306a36Sopenharmony_ci } 128962306a36Sopenharmony_ci q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; 129062306a36Sopenharmony_ci err = xsk_init_queue(entries, q, false); 129162306a36Sopenharmony_ci if (!err && optname == XDP_TX_RING) 129262306a36Sopenharmony_ci /* Tx needs to be explicitly woken up the first time */ 129362306a36Sopenharmony_ci xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 129462306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 129562306a36Sopenharmony_ci return err; 129662306a36Sopenharmony_ci } 129762306a36Sopenharmony_ci case XDP_UMEM_REG: 129862306a36Sopenharmony_ci { 129962306a36Sopenharmony_ci size_t mr_size = sizeof(struct xdp_umem_reg); 130062306a36Sopenharmony_ci struct xdp_umem_reg mr = {}; 130162306a36Sopenharmony_ci struct xdp_umem *umem; 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_ci if (optlen < sizeof(struct xdp_umem_reg_v1)) 130462306a36Sopenharmony_ci return -EINVAL; 130562306a36Sopenharmony_ci else if (optlen < sizeof(mr)) 130662306a36Sopenharmony_ci mr_size = sizeof(struct xdp_umem_reg_v1); 130762306a36Sopenharmony_ci 130862306a36Sopenharmony_ci if (copy_from_sockptr(&mr, optval, mr_size)) 130962306a36Sopenharmony_ci return -EFAULT; 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_ci mutex_lock(&xs->mutex); 131262306a36Sopenharmony_ci if (xs->state != XSK_READY || xs->umem) { 131362306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 131462306a36Sopenharmony_ci return -EBUSY; 131562306a36Sopenharmony_ci } 131662306a36Sopenharmony_ci 131762306a36Sopenharmony_ci umem = xdp_umem_create(&mr); 131862306a36Sopenharmony_ci if (IS_ERR(umem)) { 131962306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 132062306a36Sopenharmony_ci return PTR_ERR(umem); 132162306a36Sopenharmony_ci } 132262306a36Sopenharmony_ci 132362306a36Sopenharmony_ci /* Make sure umem is ready before it can be seen by others */ 132462306a36Sopenharmony_ci smp_wmb(); 132562306a36Sopenharmony_ci WRITE_ONCE(xs->umem, umem); 132662306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 132762306a36Sopenharmony_ci return 0; 132862306a36Sopenharmony_ci } 132962306a36Sopenharmony_ci case XDP_UMEM_FILL_RING: 133062306a36Sopenharmony_ci case XDP_UMEM_COMPLETION_RING: 133162306a36Sopenharmony_ci { 133262306a36Sopenharmony_ci struct xsk_queue **q; 133362306a36Sopenharmony_ci int entries; 133462306a36Sopenharmony_ci 133562306a36Sopenharmony_ci if (copy_from_sockptr(&entries, optval, sizeof(entries))) 133662306a36Sopenharmony_ci return -EFAULT; 133762306a36Sopenharmony_ci 133862306a36Sopenharmony_ci mutex_lock(&xs->mutex); 133962306a36Sopenharmony_ci if (xs->state != XSK_READY) { 134062306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 134162306a36Sopenharmony_ci return -EBUSY; 134262306a36Sopenharmony_ci } 134362306a36Sopenharmony_ci 134462306a36Sopenharmony_ci q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : 134562306a36Sopenharmony_ci &xs->cq_tmp; 134662306a36Sopenharmony_ci err = xsk_init_queue(entries, q, true); 134762306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 134862306a36Sopenharmony_ci return err; 134962306a36Sopenharmony_ci } 135062306a36Sopenharmony_ci default: 135162306a36Sopenharmony_ci break; 135262306a36Sopenharmony_ci } 135362306a36Sopenharmony_ci 135462306a36Sopenharmony_ci return -ENOPROTOOPT; 135562306a36Sopenharmony_ci} 135662306a36Sopenharmony_ci 135762306a36Sopenharmony_cistatic void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) 135862306a36Sopenharmony_ci{ 135962306a36Sopenharmony_ci ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 136062306a36Sopenharmony_ci ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 136162306a36Sopenharmony_ci ring->desc = offsetof(struct xdp_rxtx_ring, desc); 136262306a36Sopenharmony_ci} 136362306a36Sopenharmony_ci 136462306a36Sopenharmony_cistatic void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) 136562306a36Sopenharmony_ci{ 136662306a36Sopenharmony_ci ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); 136762306a36Sopenharmony_ci ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 136862306a36Sopenharmony_ci ring->desc = offsetof(struct xdp_umem_ring, desc); 136962306a36Sopenharmony_ci} 137062306a36Sopenharmony_ci 137162306a36Sopenharmony_cistruct xdp_statistics_v1 { 137262306a36Sopenharmony_ci __u64 rx_dropped; 137362306a36Sopenharmony_ci __u64 rx_invalid_descs; 137462306a36Sopenharmony_ci __u64 tx_invalid_descs; 137562306a36Sopenharmony_ci}; 137662306a36Sopenharmony_ci 137762306a36Sopenharmony_cistatic int xsk_getsockopt(struct socket *sock, int level, int optname, 137862306a36Sopenharmony_ci char __user *optval, int __user *optlen) 137962306a36Sopenharmony_ci{ 138062306a36Sopenharmony_ci struct sock *sk = sock->sk; 138162306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 138262306a36Sopenharmony_ci int len; 138362306a36Sopenharmony_ci 138462306a36Sopenharmony_ci if (level != SOL_XDP) 138562306a36Sopenharmony_ci return -ENOPROTOOPT; 138662306a36Sopenharmony_ci 138762306a36Sopenharmony_ci if (get_user(len, optlen)) 138862306a36Sopenharmony_ci return -EFAULT; 138962306a36Sopenharmony_ci if (len < 0) 139062306a36Sopenharmony_ci return -EINVAL; 139162306a36Sopenharmony_ci 139262306a36Sopenharmony_ci switch (optname) { 139362306a36Sopenharmony_ci case XDP_STATISTICS: 139462306a36Sopenharmony_ci { 139562306a36Sopenharmony_ci struct xdp_statistics stats = {}; 139662306a36Sopenharmony_ci bool extra_stats = true; 139762306a36Sopenharmony_ci size_t stats_size; 139862306a36Sopenharmony_ci 139962306a36Sopenharmony_ci if (len < sizeof(struct xdp_statistics_v1)) { 140062306a36Sopenharmony_ci return -EINVAL; 140162306a36Sopenharmony_ci } else if (len < sizeof(stats)) { 140262306a36Sopenharmony_ci extra_stats = false; 140362306a36Sopenharmony_ci stats_size = sizeof(struct xdp_statistics_v1); 140462306a36Sopenharmony_ci } else { 140562306a36Sopenharmony_ci stats_size = sizeof(stats); 140662306a36Sopenharmony_ci } 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci mutex_lock(&xs->mutex); 140962306a36Sopenharmony_ci stats.rx_dropped = xs->rx_dropped; 141062306a36Sopenharmony_ci if (extra_stats) { 141162306a36Sopenharmony_ci stats.rx_ring_full = xs->rx_queue_full; 141262306a36Sopenharmony_ci stats.rx_fill_ring_empty_descs = 141362306a36Sopenharmony_ci xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; 141462306a36Sopenharmony_ci stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); 141562306a36Sopenharmony_ci } else { 141662306a36Sopenharmony_ci stats.rx_dropped += xs->rx_queue_full; 141762306a36Sopenharmony_ci } 141862306a36Sopenharmony_ci stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); 141962306a36Sopenharmony_ci stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); 142062306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 142162306a36Sopenharmony_ci 142262306a36Sopenharmony_ci if (copy_to_user(optval, &stats, stats_size)) 142362306a36Sopenharmony_ci return -EFAULT; 142462306a36Sopenharmony_ci if (put_user(stats_size, optlen)) 142562306a36Sopenharmony_ci return -EFAULT; 142662306a36Sopenharmony_ci 142762306a36Sopenharmony_ci return 0; 142862306a36Sopenharmony_ci } 142962306a36Sopenharmony_ci case XDP_MMAP_OFFSETS: 143062306a36Sopenharmony_ci { 143162306a36Sopenharmony_ci struct xdp_mmap_offsets off; 143262306a36Sopenharmony_ci struct xdp_mmap_offsets_v1 off_v1; 143362306a36Sopenharmony_ci bool flags_supported = true; 143462306a36Sopenharmony_ci void *to_copy; 143562306a36Sopenharmony_ci 143662306a36Sopenharmony_ci if (len < sizeof(off_v1)) 143762306a36Sopenharmony_ci return -EINVAL; 143862306a36Sopenharmony_ci else if (len < sizeof(off)) 143962306a36Sopenharmony_ci flags_supported = false; 144062306a36Sopenharmony_ci 144162306a36Sopenharmony_ci if (flags_supported) { 144262306a36Sopenharmony_ci /* xdp_ring_offset is identical to xdp_ring_offset_v1 144362306a36Sopenharmony_ci * except for the flags field added to the end. 144462306a36Sopenharmony_ci */ 144562306a36Sopenharmony_ci xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 144662306a36Sopenharmony_ci &off.rx); 144762306a36Sopenharmony_ci xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) 144862306a36Sopenharmony_ci &off.tx); 144962306a36Sopenharmony_ci xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 145062306a36Sopenharmony_ci &off.fr); 145162306a36Sopenharmony_ci xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) 145262306a36Sopenharmony_ci &off.cr); 145362306a36Sopenharmony_ci off.rx.flags = offsetof(struct xdp_rxtx_ring, 145462306a36Sopenharmony_ci ptrs.flags); 145562306a36Sopenharmony_ci off.tx.flags = offsetof(struct xdp_rxtx_ring, 145662306a36Sopenharmony_ci ptrs.flags); 145762306a36Sopenharmony_ci off.fr.flags = offsetof(struct xdp_umem_ring, 145862306a36Sopenharmony_ci ptrs.flags); 145962306a36Sopenharmony_ci off.cr.flags = offsetof(struct xdp_umem_ring, 146062306a36Sopenharmony_ci ptrs.flags); 146162306a36Sopenharmony_ci 146262306a36Sopenharmony_ci len = sizeof(off); 146362306a36Sopenharmony_ci to_copy = &off; 146462306a36Sopenharmony_ci } else { 146562306a36Sopenharmony_ci xsk_enter_rxtx_offsets(&off_v1.rx); 146662306a36Sopenharmony_ci xsk_enter_rxtx_offsets(&off_v1.tx); 146762306a36Sopenharmony_ci xsk_enter_umem_offsets(&off_v1.fr); 146862306a36Sopenharmony_ci xsk_enter_umem_offsets(&off_v1.cr); 146962306a36Sopenharmony_ci 147062306a36Sopenharmony_ci len = sizeof(off_v1); 147162306a36Sopenharmony_ci to_copy = &off_v1; 147262306a36Sopenharmony_ci } 147362306a36Sopenharmony_ci 147462306a36Sopenharmony_ci if (copy_to_user(optval, to_copy, len)) 147562306a36Sopenharmony_ci return -EFAULT; 147662306a36Sopenharmony_ci if (put_user(len, optlen)) 147762306a36Sopenharmony_ci return -EFAULT; 147862306a36Sopenharmony_ci 147962306a36Sopenharmony_ci return 0; 148062306a36Sopenharmony_ci } 148162306a36Sopenharmony_ci case XDP_OPTIONS: 148262306a36Sopenharmony_ci { 148362306a36Sopenharmony_ci struct xdp_options opts = {}; 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_ci if (len < sizeof(opts)) 148662306a36Sopenharmony_ci return -EINVAL; 148762306a36Sopenharmony_ci 148862306a36Sopenharmony_ci mutex_lock(&xs->mutex); 148962306a36Sopenharmony_ci if (xs->zc) 149062306a36Sopenharmony_ci opts.flags |= XDP_OPTIONS_ZEROCOPY; 149162306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci len = sizeof(opts); 149462306a36Sopenharmony_ci if (copy_to_user(optval, &opts, len)) 149562306a36Sopenharmony_ci return -EFAULT; 149662306a36Sopenharmony_ci if (put_user(len, optlen)) 149762306a36Sopenharmony_ci return -EFAULT; 149862306a36Sopenharmony_ci 149962306a36Sopenharmony_ci return 0; 150062306a36Sopenharmony_ci } 150162306a36Sopenharmony_ci default: 150262306a36Sopenharmony_ci break; 150362306a36Sopenharmony_ci } 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci return -EOPNOTSUPP; 150662306a36Sopenharmony_ci} 150762306a36Sopenharmony_ci 150862306a36Sopenharmony_cistatic int xsk_mmap(struct file *file, struct socket *sock, 150962306a36Sopenharmony_ci struct vm_area_struct *vma) 151062306a36Sopenharmony_ci{ 151162306a36Sopenharmony_ci loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 151262306a36Sopenharmony_ci unsigned long size = vma->vm_end - vma->vm_start; 151362306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sock->sk); 151462306a36Sopenharmony_ci int state = READ_ONCE(xs->state); 151562306a36Sopenharmony_ci struct xsk_queue *q = NULL; 151662306a36Sopenharmony_ci 151762306a36Sopenharmony_ci if (state != XSK_READY && state != XSK_BOUND) 151862306a36Sopenharmony_ci return -EBUSY; 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_ci if (offset == XDP_PGOFF_RX_RING) { 152162306a36Sopenharmony_ci q = READ_ONCE(xs->rx); 152262306a36Sopenharmony_ci } else if (offset == XDP_PGOFF_TX_RING) { 152362306a36Sopenharmony_ci q = READ_ONCE(xs->tx); 152462306a36Sopenharmony_ci } else { 152562306a36Sopenharmony_ci /* Matches the smp_wmb() in XDP_UMEM_REG */ 152662306a36Sopenharmony_ci smp_rmb(); 152762306a36Sopenharmony_ci if (offset == XDP_UMEM_PGOFF_FILL_RING) 152862306a36Sopenharmony_ci q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : 152962306a36Sopenharmony_ci READ_ONCE(xs->pool->fq); 153062306a36Sopenharmony_ci else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 153162306a36Sopenharmony_ci q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : 153262306a36Sopenharmony_ci READ_ONCE(xs->pool->cq); 153362306a36Sopenharmony_ci } 153462306a36Sopenharmony_ci 153562306a36Sopenharmony_ci if (!q) 153662306a36Sopenharmony_ci return -EINVAL; 153762306a36Sopenharmony_ci 153862306a36Sopenharmony_ci /* Matches the smp_wmb() in xsk_init_queue */ 153962306a36Sopenharmony_ci smp_rmb(); 154062306a36Sopenharmony_ci if (size > q->ring_vmalloc_size) 154162306a36Sopenharmony_ci return -EINVAL; 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci return remap_vmalloc_range(vma, q->ring, 0); 154462306a36Sopenharmony_ci} 154562306a36Sopenharmony_ci 154662306a36Sopenharmony_cistatic int xsk_notifier(struct notifier_block *this, 154762306a36Sopenharmony_ci unsigned long msg, void *ptr) 154862306a36Sopenharmony_ci{ 154962306a36Sopenharmony_ci struct net_device *dev = netdev_notifier_info_to_dev(ptr); 155062306a36Sopenharmony_ci struct net *net = dev_net(dev); 155162306a36Sopenharmony_ci struct sock *sk; 155262306a36Sopenharmony_ci 155362306a36Sopenharmony_ci switch (msg) { 155462306a36Sopenharmony_ci case NETDEV_UNREGISTER: 155562306a36Sopenharmony_ci mutex_lock(&net->xdp.lock); 155662306a36Sopenharmony_ci sk_for_each(sk, &net->xdp.list) { 155762306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_ci mutex_lock(&xs->mutex); 156062306a36Sopenharmony_ci if (xs->dev == dev) { 156162306a36Sopenharmony_ci sk->sk_err = ENETDOWN; 156262306a36Sopenharmony_ci if (!sock_flag(sk, SOCK_DEAD)) 156362306a36Sopenharmony_ci sk_error_report(sk); 156462306a36Sopenharmony_ci 156562306a36Sopenharmony_ci xsk_unbind_dev(xs); 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_ci /* Clear device references. */ 156862306a36Sopenharmony_ci xp_clear_dev(xs->pool); 156962306a36Sopenharmony_ci } 157062306a36Sopenharmony_ci mutex_unlock(&xs->mutex); 157162306a36Sopenharmony_ci } 157262306a36Sopenharmony_ci mutex_unlock(&net->xdp.lock); 157362306a36Sopenharmony_ci break; 157462306a36Sopenharmony_ci } 157562306a36Sopenharmony_ci return NOTIFY_DONE; 157662306a36Sopenharmony_ci} 157762306a36Sopenharmony_ci 157862306a36Sopenharmony_cistatic struct proto xsk_proto = { 157962306a36Sopenharmony_ci .name = "XDP", 158062306a36Sopenharmony_ci .owner = THIS_MODULE, 158162306a36Sopenharmony_ci .obj_size = sizeof(struct xdp_sock), 158262306a36Sopenharmony_ci}; 158362306a36Sopenharmony_ci 158462306a36Sopenharmony_cistatic const struct proto_ops xsk_proto_ops = { 158562306a36Sopenharmony_ci .family = PF_XDP, 158662306a36Sopenharmony_ci .owner = THIS_MODULE, 158762306a36Sopenharmony_ci .release = xsk_release, 158862306a36Sopenharmony_ci .bind = xsk_bind, 158962306a36Sopenharmony_ci .connect = sock_no_connect, 159062306a36Sopenharmony_ci .socketpair = sock_no_socketpair, 159162306a36Sopenharmony_ci .accept = sock_no_accept, 159262306a36Sopenharmony_ci .getname = sock_no_getname, 159362306a36Sopenharmony_ci .poll = xsk_poll, 159462306a36Sopenharmony_ci .ioctl = sock_no_ioctl, 159562306a36Sopenharmony_ci .listen = sock_no_listen, 159662306a36Sopenharmony_ci .shutdown = sock_no_shutdown, 159762306a36Sopenharmony_ci .setsockopt = xsk_setsockopt, 159862306a36Sopenharmony_ci .getsockopt = xsk_getsockopt, 159962306a36Sopenharmony_ci .sendmsg = xsk_sendmsg, 160062306a36Sopenharmony_ci .recvmsg = xsk_recvmsg, 160162306a36Sopenharmony_ci .mmap = xsk_mmap, 160262306a36Sopenharmony_ci}; 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_cistatic void xsk_destruct(struct sock *sk) 160562306a36Sopenharmony_ci{ 160662306a36Sopenharmony_ci struct xdp_sock *xs = xdp_sk(sk); 160762306a36Sopenharmony_ci 160862306a36Sopenharmony_ci if (!sock_flag(sk, SOCK_DEAD)) 160962306a36Sopenharmony_ci return; 161062306a36Sopenharmony_ci 161162306a36Sopenharmony_ci if (!xp_put_pool(xs->pool)) 161262306a36Sopenharmony_ci xdp_put_umem(xs->umem, !xs->pool); 161362306a36Sopenharmony_ci} 161462306a36Sopenharmony_ci 161562306a36Sopenharmony_cistatic int xsk_create(struct net *net, struct socket *sock, int protocol, 161662306a36Sopenharmony_ci int kern) 161762306a36Sopenharmony_ci{ 161862306a36Sopenharmony_ci struct xdp_sock *xs; 161962306a36Sopenharmony_ci struct sock *sk; 162062306a36Sopenharmony_ci 162162306a36Sopenharmony_ci if (!ns_capable(net->user_ns, CAP_NET_RAW)) 162262306a36Sopenharmony_ci return -EPERM; 162362306a36Sopenharmony_ci if (sock->type != SOCK_RAW) 162462306a36Sopenharmony_ci return -ESOCKTNOSUPPORT; 162562306a36Sopenharmony_ci 162662306a36Sopenharmony_ci if (protocol) 162762306a36Sopenharmony_ci return -EPROTONOSUPPORT; 162862306a36Sopenharmony_ci 162962306a36Sopenharmony_ci sock->state = SS_UNCONNECTED; 163062306a36Sopenharmony_ci 163162306a36Sopenharmony_ci sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); 163262306a36Sopenharmony_ci if (!sk) 163362306a36Sopenharmony_ci return -ENOBUFS; 163462306a36Sopenharmony_ci 163562306a36Sopenharmony_ci sock->ops = &xsk_proto_ops; 163662306a36Sopenharmony_ci 163762306a36Sopenharmony_ci sock_init_data(sock, sk); 163862306a36Sopenharmony_ci 163962306a36Sopenharmony_ci sk->sk_family = PF_XDP; 164062306a36Sopenharmony_ci 164162306a36Sopenharmony_ci sk->sk_destruct = xsk_destruct; 164262306a36Sopenharmony_ci 164362306a36Sopenharmony_ci sock_set_flag(sk, SOCK_RCU_FREE); 164462306a36Sopenharmony_ci 164562306a36Sopenharmony_ci xs = xdp_sk(sk); 164662306a36Sopenharmony_ci xs->state = XSK_READY; 164762306a36Sopenharmony_ci mutex_init(&xs->mutex); 164862306a36Sopenharmony_ci spin_lock_init(&xs->rx_lock); 164962306a36Sopenharmony_ci 165062306a36Sopenharmony_ci INIT_LIST_HEAD(&xs->map_list); 165162306a36Sopenharmony_ci spin_lock_init(&xs->map_list_lock); 165262306a36Sopenharmony_ci 165362306a36Sopenharmony_ci mutex_lock(&net->xdp.lock); 165462306a36Sopenharmony_ci sk_add_node_rcu(sk, &net->xdp.list); 165562306a36Sopenharmony_ci mutex_unlock(&net->xdp.lock); 165662306a36Sopenharmony_ci 165762306a36Sopenharmony_ci sock_prot_inuse_add(net, &xsk_proto, 1); 165862306a36Sopenharmony_ci 165962306a36Sopenharmony_ci return 0; 166062306a36Sopenharmony_ci} 166162306a36Sopenharmony_ci 166262306a36Sopenharmony_cistatic const struct net_proto_family xsk_family_ops = { 166362306a36Sopenharmony_ci .family = PF_XDP, 166462306a36Sopenharmony_ci .create = xsk_create, 166562306a36Sopenharmony_ci .owner = THIS_MODULE, 166662306a36Sopenharmony_ci}; 166762306a36Sopenharmony_ci 166862306a36Sopenharmony_cistatic struct notifier_block xsk_netdev_notifier = { 166962306a36Sopenharmony_ci .notifier_call = xsk_notifier, 167062306a36Sopenharmony_ci}; 167162306a36Sopenharmony_ci 167262306a36Sopenharmony_cistatic int __net_init xsk_net_init(struct net *net) 167362306a36Sopenharmony_ci{ 167462306a36Sopenharmony_ci mutex_init(&net->xdp.lock); 167562306a36Sopenharmony_ci INIT_HLIST_HEAD(&net->xdp.list); 167662306a36Sopenharmony_ci return 0; 167762306a36Sopenharmony_ci} 167862306a36Sopenharmony_ci 167962306a36Sopenharmony_cistatic void __net_exit xsk_net_exit(struct net *net) 168062306a36Sopenharmony_ci{ 168162306a36Sopenharmony_ci WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); 168262306a36Sopenharmony_ci} 168362306a36Sopenharmony_ci 168462306a36Sopenharmony_cistatic struct pernet_operations xsk_net_ops = { 168562306a36Sopenharmony_ci .init = xsk_net_init, 168662306a36Sopenharmony_ci .exit = xsk_net_exit, 168762306a36Sopenharmony_ci}; 168862306a36Sopenharmony_ci 168962306a36Sopenharmony_cistatic int __init xsk_init(void) 169062306a36Sopenharmony_ci{ 169162306a36Sopenharmony_ci int err, cpu; 169262306a36Sopenharmony_ci 169362306a36Sopenharmony_ci err = proto_register(&xsk_proto, 0 /* no slab */); 169462306a36Sopenharmony_ci if (err) 169562306a36Sopenharmony_ci goto out; 169662306a36Sopenharmony_ci 169762306a36Sopenharmony_ci err = sock_register(&xsk_family_ops); 169862306a36Sopenharmony_ci if (err) 169962306a36Sopenharmony_ci goto out_proto; 170062306a36Sopenharmony_ci 170162306a36Sopenharmony_ci err = register_pernet_subsys(&xsk_net_ops); 170262306a36Sopenharmony_ci if (err) 170362306a36Sopenharmony_ci goto out_sk; 170462306a36Sopenharmony_ci 170562306a36Sopenharmony_ci err = register_netdevice_notifier(&xsk_netdev_notifier); 170662306a36Sopenharmony_ci if (err) 170762306a36Sopenharmony_ci goto out_pernet; 170862306a36Sopenharmony_ci 170962306a36Sopenharmony_ci for_each_possible_cpu(cpu) 171062306a36Sopenharmony_ci INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); 171162306a36Sopenharmony_ci return 0; 171262306a36Sopenharmony_ci 171362306a36Sopenharmony_ciout_pernet: 171462306a36Sopenharmony_ci unregister_pernet_subsys(&xsk_net_ops); 171562306a36Sopenharmony_ciout_sk: 171662306a36Sopenharmony_ci sock_unregister(PF_XDP); 171762306a36Sopenharmony_ciout_proto: 171862306a36Sopenharmony_ci proto_unregister(&xsk_proto); 171962306a36Sopenharmony_ciout: 172062306a36Sopenharmony_ci return err; 172162306a36Sopenharmony_ci} 172262306a36Sopenharmony_ci 172362306a36Sopenharmony_cifs_initcall(xsk_init); 1724