18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* Copyright (C) 2009 Red Hat, Inc. 38c2ecf20Sopenharmony_ci * Author: Michael S. Tsirkin <mst@redhat.com> 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * virtio-net server in host kernel. 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci#include <linux/compat.h> 98c2ecf20Sopenharmony_ci#include <linux/eventfd.h> 108c2ecf20Sopenharmony_ci#include <linux/vhost.h> 118c2ecf20Sopenharmony_ci#include <linux/virtio_net.h> 128c2ecf20Sopenharmony_ci#include <linux/miscdevice.h> 138c2ecf20Sopenharmony_ci#include <linux/module.h> 148c2ecf20Sopenharmony_ci#include <linux/moduleparam.h> 158c2ecf20Sopenharmony_ci#include <linux/mutex.h> 168c2ecf20Sopenharmony_ci#include <linux/workqueue.h> 178c2ecf20Sopenharmony_ci#include <linux/file.h> 188c2ecf20Sopenharmony_ci#include <linux/slab.h> 198c2ecf20Sopenharmony_ci#include <linux/sched/clock.h> 208c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 218c2ecf20Sopenharmony_ci#include <linux/vmalloc.h> 228c2ecf20Sopenharmony_ci 238c2ecf20Sopenharmony_ci#include <linux/net.h> 248c2ecf20Sopenharmony_ci#include <linux/if_packet.h> 258c2ecf20Sopenharmony_ci#include <linux/if_arp.h> 268c2ecf20Sopenharmony_ci#include <linux/if_tun.h> 278c2ecf20Sopenharmony_ci#include <linux/if_macvlan.h> 288c2ecf20Sopenharmony_ci#include <linux/if_tap.h> 298c2ecf20Sopenharmony_ci#include <linux/if_vlan.h> 308c2ecf20Sopenharmony_ci#include <linux/skb_array.h> 318c2ecf20Sopenharmony_ci#include <linux/skbuff.h> 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci#include <net/sock.h> 348c2ecf20Sopenharmony_ci#include <net/xdp.h> 358c2ecf20Sopenharmony_ci 368c2ecf20Sopenharmony_ci#include "vhost.h" 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_cistatic int experimental_zcopytx = 0; 398c2ecf20Sopenharmony_cimodule_param(experimental_zcopytx, int, 0444); 408c2ecf20Sopenharmony_ciMODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" 418c2ecf20Sopenharmony_ci " 1 -Enable; 0 - Disable"); 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_ci/* Max number of bytes transferred before requeueing the job. 448c2ecf20Sopenharmony_ci * Using this limit prevents one virtqueue from starving others. */ 458c2ecf20Sopenharmony_ci#define VHOST_NET_WEIGHT 0x80000 468c2ecf20Sopenharmony_ci 478c2ecf20Sopenharmony_ci/* Max number of packets transferred before requeueing the job. 488c2ecf20Sopenharmony_ci * Using this limit prevents one virtqueue from starving others with small 498c2ecf20Sopenharmony_ci * pkts. 508c2ecf20Sopenharmony_ci */ 518c2ecf20Sopenharmony_ci#define VHOST_NET_PKT_WEIGHT 256 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci/* MAX number of TX used buffers for outstanding zerocopy */ 548c2ecf20Sopenharmony_ci#define VHOST_MAX_PEND 128 558c2ecf20Sopenharmony_ci#define VHOST_GOODCOPY_LEN 256 568c2ecf20Sopenharmony_ci 578c2ecf20Sopenharmony_ci/* 588c2ecf20Sopenharmony_ci * For transmit, used buffer len is unused; we override it to track buffer 598c2ecf20Sopenharmony_ci * status internally; used for zerocopy tx only. 608c2ecf20Sopenharmony_ci */ 618c2ecf20Sopenharmony_ci/* Lower device DMA failed */ 628c2ecf20Sopenharmony_ci#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) 638c2ecf20Sopenharmony_ci/* Lower device DMA done */ 648c2ecf20Sopenharmony_ci#define VHOST_DMA_DONE_LEN ((__force __virtio32)2) 658c2ecf20Sopenharmony_ci/* Lower device DMA in progress */ 668c2ecf20Sopenharmony_ci#define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) 678c2ecf20Sopenharmony_ci/* Buffer unused */ 688c2ecf20Sopenharmony_ci#define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_ci#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_cienum { 738c2ecf20Sopenharmony_ci VHOST_NET_FEATURES = VHOST_FEATURES | 748c2ecf20Sopenharmony_ci (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | 758c2ecf20Sopenharmony_ci (1ULL << VIRTIO_NET_F_MRG_RXBUF) | 768c2ecf20Sopenharmony_ci (1ULL << VIRTIO_F_ACCESS_PLATFORM) 778c2ecf20Sopenharmony_ci}; 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_cienum { 808c2ecf20Sopenharmony_ci VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) 818c2ecf20Sopenharmony_ci}; 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_cienum { 848c2ecf20Sopenharmony_ci VHOST_NET_VQ_RX = 0, 858c2ecf20Sopenharmony_ci VHOST_NET_VQ_TX = 1, 868c2ecf20Sopenharmony_ci VHOST_NET_VQ_MAX = 2, 878c2ecf20Sopenharmony_ci}; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_cistruct vhost_net_ubuf_ref { 908c2ecf20Sopenharmony_ci /* refcount follows semantics similar to kref: 918c2ecf20Sopenharmony_ci * 0: object is released 928c2ecf20Sopenharmony_ci * 1: no outstanding ubufs 938c2ecf20Sopenharmony_ci * >1: outstanding ubufs 948c2ecf20Sopenharmony_ci */ 958c2ecf20Sopenharmony_ci atomic_t refcount; 968c2ecf20Sopenharmony_ci wait_queue_head_t wait; 978c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq; 988c2ecf20Sopenharmony_ci}; 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci#define VHOST_NET_BATCH 64 1018c2ecf20Sopenharmony_cistruct vhost_net_buf { 1028c2ecf20Sopenharmony_ci void **queue; 1038c2ecf20Sopenharmony_ci int tail; 1048c2ecf20Sopenharmony_ci int head; 1058c2ecf20Sopenharmony_ci}; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_cistruct vhost_net_virtqueue { 1088c2ecf20Sopenharmony_ci struct vhost_virtqueue vq; 1098c2ecf20Sopenharmony_ci size_t vhost_hlen; 1108c2ecf20Sopenharmony_ci size_t sock_hlen; 1118c2ecf20Sopenharmony_ci /* vhost zerocopy support fields below: */ 1128c2ecf20Sopenharmony_ci /* last used idx for outstanding DMA zerocopy buffers */ 1138c2ecf20Sopenharmony_ci int upend_idx; 1148c2ecf20Sopenharmony_ci /* For TX, first used idx for DMA done zerocopy buffers 1158c2ecf20Sopenharmony_ci * For RX, number of batched heads 1168c2ecf20Sopenharmony_ci */ 1178c2ecf20Sopenharmony_ci int done_idx; 1188c2ecf20Sopenharmony_ci /* Number of XDP frames batched */ 1198c2ecf20Sopenharmony_ci int batched_xdp; 1208c2ecf20Sopenharmony_ci /* an array of userspace buffers info */ 1218c2ecf20Sopenharmony_ci struct ubuf_info *ubuf_info; 1228c2ecf20Sopenharmony_ci /* Reference counting for outstanding ubufs. 1238c2ecf20Sopenharmony_ci * Protected by vq mutex. Writers must also take device mutex. */ 1248c2ecf20Sopenharmony_ci struct vhost_net_ubuf_ref *ubufs; 1258c2ecf20Sopenharmony_ci struct ptr_ring *rx_ring; 1268c2ecf20Sopenharmony_ci struct vhost_net_buf rxq; 1278c2ecf20Sopenharmony_ci /* Batched XDP buffs */ 1288c2ecf20Sopenharmony_ci struct xdp_buff *xdp; 1298c2ecf20Sopenharmony_ci}; 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_cistruct vhost_net { 1328c2ecf20Sopenharmony_ci struct vhost_dev dev; 1338c2ecf20Sopenharmony_ci struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; 1348c2ecf20Sopenharmony_ci struct vhost_poll poll[VHOST_NET_VQ_MAX]; 1358c2ecf20Sopenharmony_ci /* Number of TX recently submitted. 1368c2ecf20Sopenharmony_ci * Protected by tx vq lock. */ 1378c2ecf20Sopenharmony_ci unsigned tx_packets; 1388c2ecf20Sopenharmony_ci /* Number of times zerocopy TX recently failed. 1398c2ecf20Sopenharmony_ci * Protected by tx vq lock. */ 1408c2ecf20Sopenharmony_ci unsigned tx_zcopy_err; 1418c2ecf20Sopenharmony_ci /* Flush in progress. Protected by tx vq lock. */ 1428c2ecf20Sopenharmony_ci bool tx_flush; 1438c2ecf20Sopenharmony_ci /* Private page frag */ 1448c2ecf20Sopenharmony_ci struct page_frag page_frag; 1458c2ecf20Sopenharmony_ci /* Refcount bias of page frag */ 1468c2ecf20Sopenharmony_ci int refcnt_bias; 1478c2ecf20Sopenharmony_ci}; 1488c2ecf20Sopenharmony_ci 1498c2ecf20Sopenharmony_cistatic unsigned vhost_net_zcopy_mask __read_mostly; 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_cistatic void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) 1528c2ecf20Sopenharmony_ci{ 1538c2ecf20Sopenharmony_ci if (rxq->tail != rxq->head) 1548c2ecf20Sopenharmony_ci return rxq->queue[rxq->head]; 1558c2ecf20Sopenharmony_ci else 1568c2ecf20Sopenharmony_ci return NULL; 1578c2ecf20Sopenharmony_ci} 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_cistatic int vhost_net_buf_get_size(struct vhost_net_buf *rxq) 1608c2ecf20Sopenharmony_ci{ 1618c2ecf20Sopenharmony_ci return rxq->tail - rxq->head; 1628c2ecf20Sopenharmony_ci} 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_cistatic int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) 1658c2ecf20Sopenharmony_ci{ 1668c2ecf20Sopenharmony_ci return rxq->tail == rxq->head; 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_cistatic void *vhost_net_buf_consume(struct vhost_net_buf *rxq) 1708c2ecf20Sopenharmony_ci{ 1718c2ecf20Sopenharmony_ci void *ret = vhost_net_buf_get_ptr(rxq); 1728c2ecf20Sopenharmony_ci ++rxq->head; 1738c2ecf20Sopenharmony_ci return ret; 1748c2ecf20Sopenharmony_ci} 1758c2ecf20Sopenharmony_ci 1768c2ecf20Sopenharmony_cistatic int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) 1778c2ecf20Sopenharmony_ci{ 1788c2ecf20Sopenharmony_ci struct vhost_net_buf *rxq = &nvq->rxq; 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_ci rxq->head = 0; 1818c2ecf20Sopenharmony_ci rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, 1828c2ecf20Sopenharmony_ci VHOST_NET_BATCH); 1838c2ecf20Sopenharmony_ci return rxq->tail; 1848c2ecf20Sopenharmony_ci} 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_cistatic void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) 1878c2ecf20Sopenharmony_ci{ 1888c2ecf20Sopenharmony_ci struct vhost_net_buf *rxq = &nvq->rxq; 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { 1918c2ecf20Sopenharmony_ci ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, 1928c2ecf20Sopenharmony_ci vhost_net_buf_get_size(rxq), 1938c2ecf20Sopenharmony_ci tun_ptr_free); 1948c2ecf20Sopenharmony_ci rxq->head = rxq->tail = 0; 1958c2ecf20Sopenharmony_ci } 1968c2ecf20Sopenharmony_ci} 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_cistatic int vhost_net_buf_peek_len(void *ptr) 1998c2ecf20Sopenharmony_ci{ 2008c2ecf20Sopenharmony_ci if (tun_is_xdp_frame(ptr)) { 2018c2ecf20Sopenharmony_ci struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_ci return xdpf->len; 2048c2ecf20Sopenharmony_ci } 2058c2ecf20Sopenharmony_ci 2068c2ecf20Sopenharmony_ci return __skb_array_len_with_tag(ptr); 2078c2ecf20Sopenharmony_ci} 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_cistatic int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) 2108c2ecf20Sopenharmony_ci{ 2118c2ecf20Sopenharmony_ci struct vhost_net_buf *rxq = &nvq->rxq; 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci if (!vhost_net_buf_is_empty(rxq)) 2148c2ecf20Sopenharmony_ci goto out; 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci if (!vhost_net_buf_produce(nvq)) 2178c2ecf20Sopenharmony_ci return 0; 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ciout: 2208c2ecf20Sopenharmony_ci return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); 2218c2ecf20Sopenharmony_ci} 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_cistatic void vhost_net_buf_init(struct vhost_net_buf *rxq) 2248c2ecf20Sopenharmony_ci{ 2258c2ecf20Sopenharmony_ci rxq->head = rxq->tail = 0; 2268c2ecf20Sopenharmony_ci} 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_cistatic void vhost_net_enable_zcopy(int vq) 2298c2ecf20Sopenharmony_ci{ 2308c2ecf20Sopenharmony_ci vhost_net_zcopy_mask |= 0x1 << vq; 2318c2ecf20Sopenharmony_ci} 2328c2ecf20Sopenharmony_ci 2338c2ecf20Sopenharmony_cistatic struct vhost_net_ubuf_ref * 2348c2ecf20Sopenharmony_civhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) 2358c2ecf20Sopenharmony_ci{ 2368c2ecf20Sopenharmony_ci struct vhost_net_ubuf_ref *ubufs; 2378c2ecf20Sopenharmony_ci /* No zero copy backend? Nothing to count. */ 2388c2ecf20Sopenharmony_ci if (!zcopy) 2398c2ecf20Sopenharmony_ci return NULL; 2408c2ecf20Sopenharmony_ci ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); 2418c2ecf20Sopenharmony_ci if (!ubufs) 2428c2ecf20Sopenharmony_ci return ERR_PTR(-ENOMEM); 2438c2ecf20Sopenharmony_ci atomic_set(&ubufs->refcount, 1); 2448c2ecf20Sopenharmony_ci init_waitqueue_head(&ubufs->wait); 2458c2ecf20Sopenharmony_ci ubufs->vq = vq; 2468c2ecf20Sopenharmony_ci return ubufs; 2478c2ecf20Sopenharmony_ci} 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_cistatic int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) 2508c2ecf20Sopenharmony_ci{ 2518c2ecf20Sopenharmony_ci int r = atomic_sub_return(1, &ubufs->refcount); 2528c2ecf20Sopenharmony_ci if (unlikely(!r)) 2538c2ecf20Sopenharmony_ci wake_up(&ubufs->wait); 2548c2ecf20Sopenharmony_ci return r; 2558c2ecf20Sopenharmony_ci} 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_cistatic void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) 2588c2ecf20Sopenharmony_ci{ 2598c2ecf20Sopenharmony_ci vhost_net_ubuf_put(ubufs); 2608c2ecf20Sopenharmony_ci wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); 2618c2ecf20Sopenharmony_ci} 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_cistatic void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) 2648c2ecf20Sopenharmony_ci{ 2658c2ecf20Sopenharmony_ci vhost_net_ubuf_put_and_wait(ubufs); 2668c2ecf20Sopenharmony_ci kfree(ubufs); 2678c2ecf20Sopenharmony_ci} 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_cistatic void vhost_net_clear_ubuf_info(struct vhost_net *n) 2708c2ecf20Sopenharmony_ci{ 2718c2ecf20Sopenharmony_ci int i; 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 2748c2ecf20Sopenharmony_ci kfree(n->vqs[i].ubuf_info); 2758c2ecf20Sopenharmony_ci n->vqs[i].ubuf_info = NULL; 2768c2ecf20Sopenharmony_ci } 2778c2ecf20Sopenharmony_ci} 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_cistatic int vhost_net_set_ubuf_info(struct vhost_net *n) 2808c2ecf20Sopenharmony_ci{ 2818c2ecf20Sopenharmony_ci bool zcopy; 2828c2ecf20Sopenharmony_ci int i; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 2858c2ecf20Sopenharmony_ci zcopy = vhost_net_zcopy_mask & (0x1 << i); 2868c2ecf20Sopenharmony_ci if (!zcopy) 2878c2ecf20Sopenharmony_ci continue; 2888c2ecf20Sopenharmony_ci n->vqs[i].ubuf_info = 2898c2ecf20Sopenharmony_ci kmalloc_array(UIO_MAXIOV, 2908c2ecf20Sopenharmony_ci sizeof(*n->vqs[i].ubuf_info), 2918c2ecf20Sopenharmony_ci GFP_KERNEL); 2928c2ecf20Sopenharmony_ci if (!n->vqs[i].ubuf_info) 2938c2ecf20Sopenharmony_ci goto err; 2948c2ecf20Sopenharmony_ci } 2958c2ecf20Sopenharmony_ci return 0; 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_cierr: 2988c2ecf20Sopenharmony_ci vhost_net_clear_ubuf_info(n); 2998c2ecf20Sopenharmony_ci return -ENOMEM; 3008c2ecf20Sopenharmony_ci} 3018c2ecf20Sopenharmony_ci 3028c2ecf20Sopenharmony_cistatic void vhost_net_vq_reset(struct vhost_net *n) 3038c2ecf20Sopenharmony_ci{ 3048c2ecf20Sopenharmony_ci int i; 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci vhost_net_clear_ubuf_info(n); 3078c2ecf20Sopenharmony_ci 3088c2ecf20Sopenharmony_ci for (i = 0; i < VHOST_NET_VQ_MAX; i++) { 3098c2ecf20Sopenharmony_ci n->vqs[i].done_idx = 0; 3108c2ecf20Sopenharmony_ci n->vqs[i].upend_idx = 0; 3118c2ecf20Sopenharmony_ci n->vqs[i].ubufs = NULL; 3128c2ecf20Sopenharmony_ci n->vqs[i].vhost_hlen = 0; 3138c2ecf20Sopenharmony_ci n->vqs[i].sock_hlen = 0; 3148c2ecf20Sopenharmony_ci vhost_net_buf_init(&n->vqs[i].rxq); 3158c2ecf20Sopenharmony_ci } 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_ci} 3188c2ecf20Sopenharmony_ci 3198c2ecf20Sopenharmony_cistatic void vhost_net_tx_packet(struct vhost_net *net) 3208c2ecf20Sopenharmony_ci{ 3218c2ecf20Sopenharmony_ci ++net->tx_packets; 3228c2ecf20Sopenharmony_ci if (net->tx_packets < 1024) 3238c2ecf20Sopenharmony_ci return; 3248c2ecf20Sopenharmony_ci net->tx_packets = 0; 3258c2ecf20Sopenharmony_ci net->tx_zcopy_err = 0; 3268c2ecf20Sopenharmony_ci} 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_cistatic void vhost_net_tx_err(struct vhost_net *net) 3298c2ecf20Sopenharmony_ci{ 3308c2ecf20Sopenharmony_ci ++net->tx_zcopy_err; 3318c2ecf20Sopenharmony_ci} 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_cistatic bool vhost_net_tx_select_zcopy(struct vhost_net *net) 3348c2ecf20Sopenharmony_ci{ 3358c2ecf20Sopenharmony_ci /* TX flush waits for outstanding DMAs to be done. 3368c2ecf20Sopenharmony_ci * Don't start new DMAs. 3378c2ecf20Sopenharmony_ci */ 3388c2ecf20Sopenharmony_ci return !net->tx_flush && 3398c2ecf20Sopenharmony_ci net->tx_packets / 64 >= net->tx_zcopy_err; 3408c2ecf20Sopenharmony_ci} 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_cistatic bool vhost_sock_zcopy(struct socket *sock) 3438c2ecf20Sopenharmony_ci{ 3448c2ecf20Sopenharmony_ci return unlikely(experimental_zcopytx) && 3458c2ecf20Sopenharmony_ci sock_flag(sock->sk, SOCK_ZEROCOPY); 3468c2ecf20Sopenharmony_ci} 3478c2ecf20Sopenharmony_ci 3488c2ecf20Sopenharmony_cistatic bool vhost_sock_xdp(struct socket *sock) 3498c2ecf20Sopenharmony_ci{ 3508c2ecf20Sopenharmony_ci return sock_flag(sock->sk, SOCK_XDP); 3518c2ecf20Sopenharmony_ci} 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci/* In case of DMA done not in order in lower device driver for some reason. 3548c2ecf20Sopenharmony_ci * upend_idx is used to track end of used idx, done_idx is used to track head 3558c2ecf20Sopenharmony_ci * of used idx. Once lower device DMA done contiguously, we will signal KVM 3568c2ecf20Sopenharmony_ci * guest used idx. 3578c2ecf20Sopenharmony_ci */ 3588c2ecf20Sopenharmony_cistatic void vhost_zerocopy_signal_used(struct vhost_net *net, 3598c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq) 3608c2ecf20Sopenharmony_ci{ 3618c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = 3628c2ecf20Sopenharmony_ci container_of(vq, struct vhost_net_virtqueue, vq); 3638c2ecf20Sopenharmony_ci int i, add; 3648c2ecf20Sopenharmony_ci int j = 0; 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { 3678c2ecf20Sopenharmony_ci if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) 3688c2ecf20Sopenharmony_ci vhost_net_tx_err(net); 3698c2ecf20Sopenharmony_ci if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { 3708c2ecf20Sopenharmony_ci vq->heads[i].len = VHOST_DMA_CLEAR_LEN; 3718c2ecf20Sopenharmony_ci ++j; 3728c2ecf20Sopenharmony_ci } else 3738c2ecf20Sopenharmony_ci break; 3748c2ecf20Sopenharmony_ci } 3758c2ecf20Sopenharmony_ci while (j) { 3768c2ecf20Sopenharmony_ci add = min(UIO_MAXIOV - nvq->done_idx, j); 3778c2ecf20Sopenharmony_ci vhost_add_used_and_signal_n(vq->dev, vq, 3788c2ecf20Sopenharmony_ci &vq->heads[nvq->done_idx], add); 3798c2ecf20Sopenharmony_ci nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; 3808c2ecf20Sopenharmony_ci j -= add; 3818c2ecf20Sopenharmony_ci } 3828c2ecf20Sopenharmony_ci} 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_cistatic void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) 3858c2ecf20Sopenharmony_ci{ 3868c2ecf20Sopenharmony_ci struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; 3878c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = ubufs->vq; 3888c2ecf20Sopenharmony_ci int cnt; 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_ci rcu_read_lock_bh(); 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci /* set len to mark this desc buffers done DMA */ 3938c2ecf20Sopenharmony_ci vq->heads[ubuf->desc].len = success ? 3948c2ecf20Sopenharmony_ci VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; 3958c2ecf20Sopenharmony_ci cnt = vhost_net_ubuf_put(ubufs); 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_ci /* 3988c2ecf20Sopenharmony_ci * Trigger polling thread if guest stopped submitting new buffers: 3998c2ecf20Sopenharmony_ci * in this case, the refcount after decrement will eventually reach 1. 4008c2ecf20Sopenharmony_ci * We also trigger polling periodically after each 16 packets 4018c2ecf20Sopenharmony_ci * (the value 16 here is more or less arbitrary, it's tuned to trigger 4028c2ecf20Sopenharmony_ci * less than 10% of times). 4038c2ecf20Sopenharmony_ci */ 4048c2ecf20Sopenharmony_ci if (cnt <= 1 || !(cnt % 16)) 4058c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci rcu_read_unlock_bh(); 4088c2ecf20Sopenharmony_ci} 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_cistatic inline unsigned long busy_clock(void) 4118c2ecf20Sopenharmony_ci{ 4128c2ecf20Sopenharmony_ci return local_clock() >> 10; 4138c2ecf20Sopenharmony_ci} 4148c2ecf20Sopenharmony_ci 4158c2ecf20Sopenharmony_cistatic bool vhost_can_busy_poll(unsigned long endtime) 4168c2ecf20Sopenharmony_ci{ 4178c2ecf20Sopenharmony_ci return likely(!need_resched() && !time_after(busy_clock(), endtime) && 4188c2ecf20Sopenharmony_ci !signal_pending(current)); 4198c2ecf20Sopenharmony_ci} 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_cistatic void vhost_net_disable_vq(struct vhost_net *n, 4228c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq) 4238c2ecf20Sopenharmony_ci{ 4248c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = 4258c2ecf20Sopenharmony_ci container_of(vq, struct vhost_net_virtqueue, vq); 4268c2ecf20Sopenharmony_ci struct vhost_poll *poll = n->poll + (nvq - n->vqs); 4278c2ecf20Sopenharmony_ci if (!vhost_vq_get_backend(vq)) 4288c2ecf20Sopenharmony_ci return; 4298c2ecf20Sopenharmony_ci vhost_poll_stop(poll); 4308c2ecf20Sopenharmony_ci} 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_cistatic int vhost_net_enable_vq(struct vhost_net *n, 4338c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq) 4348c2ecf20Sopenharmony_ci{ 4358c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = 4368c2ecf20Sopenharmony_ci container_of(vq, struct vhost_net_virtqueue, vq); 4378c2ecf20Sopenharmony_ci struct vhost_poll *poll = n->poll + (nvq - n->vqs); 4388c2ecf20Sopenharmony_ci struct socket *sock; 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci sock = vhost_vq_get_backend(vq); 4418c2ecf20Sopenharmony_ci if (!sock) 4428c2ecf20Sopenharmony_ci return 0; 4438c2ecf20Sopenharmony_ci 4448c2ecf20Sopenharmony_ci return vhost_poll_start(poll, sock->file); 4458c2ecf20Sopenharmony_ci} 4468c2ecf20Sopenharmony_ci 4478c2ecf20Sopenharmony_cistatic void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) 4488c2ecf20Sopenharmony_ci{ 4498c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 4508c2ecf20Sopenharmony_ci struct vhost_dev *dev = vq->dev; 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_ci if (!nvq->done_idx) 4538c2ecf20Sopenharmony_ci return; 4548c2ecf20Sopenharmony_ci 4558c2ecf20Sopenharmony_ci vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); 4568c2ecf20Sopenharmony_ci nvq->done_idx = 0; 4578c2ecf20Sopenharmony_ci} 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_cistatic void vhost_tx_batch(struct vhost_net *net, 4608c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq, 4618c2ecf20Sopenharmony_ci struct socket *sock, 4628c2ecf20Sopenharmony_ci struct msghdr *msghdr) 4638c2ecf20Sopenharmony_ci{ 4648c2ecf20Sopenharmony_ci struct tun_msg_ctl ctl = { 4658c2ecf20Sopenharmony_ci .type = TUN_MSG_PTR, 4668c2ecf20Sopenharmony_ci .num = nvq->batched_xdp, 4678c2ecf20Sopenharmony_ci .ptr = nvq->xdp, 4688c2ecf20Sopenharmony_ci }; 4698c2ecf20Sopenharmony_ci int i, err; 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci if (nvq->batched_xdp == 0) 4728c2ecf20Sopenharmony_ci goto signal_used; 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci msghdr->msg_control = &ctl; 4758c2ecf20Sopenharmony_ci msghdr->msg_controllen = sizeof(ctl); 4768c2ecf20Sopenharmony_ci err = sock->ops->sendmsg(sock, msghdr, 0); 4778c2ecf20Sopenharmony_ci if (unlikely(err < 0)) { 4788c2ecf20Sopenharmony_ci vq_err(&nvq->vq, "Fail to batch sending packets\n"); 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci /* free pages owned by XDP; since this is an unlikely error path, 4818c2ecf20Sopenharmony_ci * keep it simple and avoid more complex bulk update for the 4828c2ecf20Sopenharmony_ci * used pages 4838c2ecf20Sopenharmony_ci */ 4848c2ecf20Sopenharmony_ci for (i = 0; i < nvq->batched_xdp; ++i) 4858c2ecf20Sopenharmony_ci put_page(virt_to_head_page(nvq->xdp[i].data)); 4868c2ecf20Sopenharmony_ci nvq->batched_xdp = 0; 4878c2ecf20Sopenharmony_ci nvq->done_idx = 0; 4888c2ecf20Sopenharmony_ci return; 4898c2ecf20Sopenharmony_ci } 4908c2ecf20Sopenharmony_ci 4918c2ecf20Sopenharmony_cisignal_used: 4928c2ecf20Sopenharmony_ci vhost_net_signal_used(nvq); 4938c2ecf20Sopenharmony_ci nvq->batched_xdp = 0; 4948c2ecf20Sopenharmony_ci} 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_cistatic int sock_has_rx_data(struct socket *sock) 4978c2ecf20Sopenharmony_ci{ 4988c2ecf20Sopenharmony_ci if (unlikely(!sock)) 4998c2ecf20Sopenharmony_ci return 0; 5008c2ecf20Sopenharmony_ci 5018c2ecf20Sopenharmony_ci if (sock->ops->peek_len) 5028c2ecf20Sopenharmony_ci return sock->ops->peek_len(sock); 5038c2ecf20Sopenharmony_ci 5048c2ecf20Sopenharmony_ci return skb_queue_empty(&sock->sk->sk_receive_queue); 5058c2ecf20Sopenharmony_ci} 5068c2ecf20Sopenharmony_ci 5078c2ecf20Sopenharmony_cistatic void vhost_net_busy_poll_try_queue(struct vhost_net *net, 5088c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq) 5098c2ecf20Sopenharmony_ci{ 5108c2ecf20Sopenharmony_ci if (!vhost_vq_avail_empty(&net->dev, vq)) { 5118c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 5128c2ecf20Sopenharmony_ci } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { 5138c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 5148c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 5158c2ecf20Sopenharmony_ci } 5168c2ecf20Sopenharmony_ci} 5178c2ecf20Sopenharmony_ci 5188c2ecf20Sopenharmony_cistatic void vhost_net_busy_poll(struct vhost_net *net, 5198c2ecf20Sopenharmony_ci struct vhost_virtqueue *rvq, 5208c2ecf20Sopenharmony_ci struct vhost_virtqueue *tvq, 5218c2ecf20Sopenharmony_ci bool *busyloop_intr, 5228c2ecf20Sopenharmony_ci bool poll_rx) 5238c2ecf20Sopenharmony_ci{ 5248c2ecf20Sopenharmony_ci unsigned long busyloop_timeout; 5258c2ecf20Sopenharmony_ci unsigned long endtime; 5268c2ecf20Sopenharmony_ci struct socket *sock; 5278c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ci /* Try to hold the vq mutex of the paired virtqueue. We can't 5308c2ecf20Sopenharmony_ci * use mutex_lock() here since we could not guarantee a 5318c2ecf20Sopenharmony_ci * consistenet lock ordering. 5328c2ecf20Sopenharmony_ci */ 5338c2ecf20Sopenharmony_ci if (!mutex_trylock(&vq->mutex)) 5348c2ecf20Sopenharmony_ci return; 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 5378c2ecf20Sopenharmony_ci sock = vhost_vq_get_backend(rvq); 5388c2ecf20Sopenharmony_ci 5398c2ecf20Sopenharmony_ci busyloop_timeout = poll_rx ? rvq->busyloop_timeout: 5408c2ecf20Sopenharmony_ci tvq->busyloop_timeout; 5418c2ecf20Sopenharmony_ci 5428c2ecf20Sopenharmony_ci preempt_disable(); 5438c2ecf20Sopenharmony_ci endtime = busy_clock() + busyloop_timeout; 5448c2ecf20Sopenharmony_ci 5458c2ecf20Sopenharmony_ci while (vhost_can_busy_poll(endtime)) { 5468c2ecf20Sopenharmony_ci if (vhost_has_work(&net->dev)) { 5478c2ecf20Sopenharmony_ci *busyloop_intr = true; 5488c2ecf20Sopenharmony_ci break; 5498c2ecf20Sopenharmony_ci } 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci if ((sock_has_rx_data(sock) && 5528c2ecf20Sopenharmony_ci !vhost_vq_avail_empty(&net->dev, rvq)) || 5538c2ecf20Sopenharmony_ci !vhost_vq_avail_empty(&net->dev, tvq)) 5548c2ecf20Sopenharmony_ci break; 5558c2ecf20Sopenharmony_ci 5568c2ecf20Sopenharmony_ci cpu_relax(); 5578c2ecf20Sopenharmony_ci } 5588c2ecf20Sopenharmony_ci 5598c2ecf20Sopenharmony_ci preempt_enable(); 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_ci if (poll_rx || sock_has_rx_data(sock)) 5628c2ecf20Sopenharmony_ci vhost_net_busy_poll_try_queue(net, vq); 5638c2ecf20Sopenharmony_ci else if (!poll_rx) /* On tx here, sock has no rx data. */ 5648c2ecf20Sopenharmony_ci vhost_enable_notify(&net->dev, rvq); 5658c2ecf20Sopenharmony_ci 5668c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 5678c2ecf20Sopenharmony_ci} 5688c2ecf20Sopenharmony_ci 5698c2ecf20Sopenharmony_cistatic int vhost_net_tx_get_vq_desc(struct vhost_net *net, 5708c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *tnvq, 5718c2ecf20Sopenharmony_ci unsigned int *out_num, unsigned int *in_num, 5728c2ecf20Sopenharmony_ci struct msghdr *msghdr, bool *busyloop_intr) 5738c2ecf20Sopenharmony_ci{ 5748c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; 5758c2ecf20Sopenharmony_ci struct vhost_virtqueue *rvq = &rnvq->vq; 5768c2ecf20Sopenharmony_ci struct vhost_virtqueue *tvq = &tnvq->vq; 5778c2ecf20Sopenharmony_ci 5788c2ecf20Sopenharmony_ci int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), 5798c2ecf20Sopenharmony_ci out_num, in_num, NULL, NULL); 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_ci if (r == tvq->num && tvq->busyloop_timeout) { 5828c2ecf20Sopenharmony_ci /* Flush batched packets first */ 5838c2ecf20Sopenharmony_ci if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) 5848c2ecf20Sopenharmony_ci vhost_tx_batch(net, tnvq, 5858c2ecf20Sopenharmony_ci vhost_vq_get_backend(tvq), 5868c2ecf20Sopenharmony_ci msghdr); 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_ci vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), 5918c2ecf20Sopenharmony_ci out_num, in_num, NULL, NULL); 5928c2ecf20Sopenharmony_ci } 5938c2ecf20Sopenharmony_ci 5948c2ecf20Sopenharmony_ci return r; 5958c2ecf20Sopenharmony_ci} 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_cistatic bool vhost_exceeds_maxpend(struct vhost_net *net) 5988c2ecf20Sopenharmony_ci{ 5998c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; 6008c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 6018c2ecf20Sopenharmony_ci 6028c2ecf20Sopenharmony_ci return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > 6038c2ecf20Sopenharmony_ci min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); 6048c2ecf20Sopenharmony_ci} 6058c2ecf20Sopenharmony_ci 6068c2ecf20Sopenharmony_cistatic size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, 6078c2ecf20Sopenharmony_ci size_t hdr_size, int out) 6088c2ecf20Sopenharmony_ci{ 6098c2ecf20Sopenharmony_ci /* Skip header. TODO: support TSO. */ 6108c2ecf20Sopenharmony_ci size_t len = iov_length(vq->iov, out); 6118c2ecf20Sopenharmony_ci 6128c2ecf20Sopenharmony_ci iov_iter_init(iter, WRITE, vq->iov, out, len); 6138c2ecf20Sopenharmony_ci iov_iter_advance(iter, hdr_size); 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_ci return iov_iter_count(iter); 6168c2ecf20Sopenharmony_ci} 6178c2ecf20Sopenharmony_ci 6188c2ecf20Sopenharmony_cistatic int get_tx_bufs(struct vhost_net *net, 6198c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq, 6208c2ecf20Sopenharmony_ci struct msghdr *msg, 6218c2ecf20Sopenharmony_ci unsigned int *out, unsigned int *in, 6228c2ecf20Sopenharmony_ci size_t *len, bool *busyloop_intr) 6238c2ecf20Sopenharmony_ci{ 6248c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 6258c2ecf20Sopenharmony_ci int ret; 6268c2ecf20Sopenharmony_ci 6278c2ecf20Sopenharmony_ci ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); 6288c2ecf20Sopenharmony_ci 6298c2ecf20Sopenharmony_ci if (ret < 0 || ret == vq->num) 6308c2ecf20Sopenharmony_ci return ret; 6318c2ecf20Sopenharmony_ci 6328c2ecf20Sopenharmony_ci if (*in) { 6338c2ecf20Sopenharmony_ci vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", 6348c2ecf20Sopenharmony_ci *out, *in); 6358c2ecf20Sopenharmony_ci return -EFAULT; 6368c2ecf20Sopenharmony_ci } 6378c2ecf20Sopenharmony_ci 6388c2ecf20Sopenharmony_ci /* Sanity check */ 6398c2ecf20Sopenharmony_ci *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); 6408c2ecf20Sopenharmony_ci if (*len == 0) { 6418c2ecf20Sopenharmony_ci vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", 6428c2ecf20Sopenharmony_ci *len, nvq->vhost_hlen); 6438c2ecf20Sopenharmony_ci return -EFAULT; 6448c2ecf20Sopenharmony_ci } 6458c2ecf20Sopenharmony_ci 6468c2ecf20Sopenharmony_ci return ret; 6478c2ecf20Sopenharmony_ci} 6488c2ecf20Sopenharmony_ci 6498c2ecf20Sopenharmony_cistatic bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) 6508c2ecf20Sopenharmony_ci{ 6518c2ecf20Sopenharmony_ci return total_len < VHOST_NET_WEIGHT && 6528c2ecf20Sopenharmony_ci !vhost_vq_avail_empty(vq->dev, vq); 6538c2ecf20Sopenharmony_ci} 6548c2ecf20Sopenharmony_ci 6558c2ecf20Sopenharmony_cistatic bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, 6568c2ecf20Sopenharmony_ci struct page_frag *pfrag, gfp_t gfp) 6578c2ecf20Sopenharmony_ci{ 6588c2ecf20Sopenharmony_ci if (pfrag->page) { 6598c2ecf20Sopenharmony_ci if (pfrag->offset + sz <= pfrag->size) 6608c2ecf20Sopenharmony_ci return true; 6618c2ecf20Sopenharmony_ci __page_frag_cache_drain(pfrag->page, net->refcnt_bias); 6628c2ecf20Sopenharmony_ci } 6638c2ecf20Sopenharmony_ci 6648c2ecf20Sopenharmony_ci pfrag->offset = 0; 6658c2ecf20Sopenharmony_ci net->refcnt_bias = 0; 6668c2ecf20Sopenharmony_ci if (SKB_FRAG_PAGE_ORDER) { 6678c2ecf20Sopenharmony_ci /* Avoid direct reclaim but allow kswapd to wake */ 6688c2ecf20Sopenharmony_ci pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 6698c2ecf20Sopenharmony_ci __GFP_COMP | __GFP_NOWARN | 6708c2ecf20Sopenharmony_ci __GFP_NORETRY, 6718c2ecf20Sopenharmony_ci SKB_FRAG_PAGE_ORDER); 6728c2ecf20Sopenharmony_ci if (likely(pfrag->page)) { 6738c2ecf20Sopenharmony_ci pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 6748c2ecf20Sopenharmony_ci goto done; 6758c2ecf20Sopenharmony_ci } 6768c2ecf20Sopenharmony_ci } 6778c2ecf20Sopenharmony_ci pfrag->page = alloc_page(gfp); 6788c2ecf20Sopenharmony_ci if (likely(pfrag->page)) { 6798c2ecf20Sopenharmony_ci pfrag->size = PAGE_SIZE; 6808c2ecf20Sopenharmony_ci goto done; 6818c2ecf20Sopenharmony_ci } 6828c2ecf20Sopenharmony_ci return false; 6838c2ecf20Sopenharmony_ci 6848c2ecf20Sopenharmony_cidone: 6858c2ecf20Sopenharmony_ci net->refcnt_bias = USHRT_MAX; 6868c2ecf20Sopenharmony_ci page_ref_add(pfrag->page, USHRT_MAX - 1); 6878c2ecf20Sopenharmony_ci return true; 6888c2ecf20Sopenharmony_ci} 6898c2ecf20Sopenharmony_ci 6908c2ecf20Sopenharmony_ci#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) 6918c2ecf20Sopenharmony_ci 6928c2ecf20Sopenharmony_cistatic int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, 6938c2ecf20Sopenharmony_ci struct iov_iter *from) 6948c2ecf20Sopenharmony_ci{ 6958c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 6968c2ecf20Sopenharmony_ci struct vhost_net *net = container_of(vq->dev, struct vhost_net, 6978c2ecf20Sopenharmony_ci dev); 6988c2ecf20Sopenharmony_ci struct socket *sock = vhost_vq_get_backend(vq); 6998c2ecf20Sopenharmony_ci struct page_frag *alloc_frag = &net->page_frag; 7008c2ecf20Sopenharmony_ci struct virtio_net_hdr *gso; 7018c2ecf20Sopenharmony_ci struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; 7028c2ecf20Sopenharmony_ci struct tun_xdp_hdr *hdr; 7038c2ecf20Sopenharmony_ci size_t len = iov_iter_count(from); 7048c2ecf20Sopenharmony_ci int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; 7058c2ecf20Sopenharmony_ci int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 7068c2ecf20Sopenharmony_ci int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); 7078c2ecf20Sopenharmony_ci int sock_hlen = nvq->sock_hlen; 7088c2ecf20Sopenharmony_ci void *buf; 7098c2ecf20Sopenharmony_ci int copied; 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_ci if (unlikely(len < nvq->sock_hlen)) 7128c2ecf20Sopenharmony_ci return -EFAULT; 7138c2ecf20Sopenharmony_ci 7148c2ecf20Sopenharmony_ci if (SKB_DATA_ALIGN(len + pad) + 7158c2ecf20Sopenharmony_ci SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) 7168c2ecf20Sopenharmony_ci return -ENOSPC; 7178c2ecf20Sopenharmony_ci 7188c2ecf20Sopenharmony_ci buflen += SKB_DATA_ALIGN(len + pad); 7198c2ecf20Sopenharmony_ci alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); 7208c2ecf20Sopenharmony_ci if (unlikely(!vhost_net_page_frag_refill(net, buflen, 7218c2ecf20Sopenharmony_ci alloc_frag, GFP_KERNEL))) 7228c2ecf20Sopenharmony_ci return -ENOMEM; 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_ci buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; 7258c2ecf20Sopenharmony_ci copied = copy_page_from_iter(alloc_frag->page, 7268c2ecf20Sopenharmony_ci alloc_frag->offset + 7278c2ecf20Sopenharmony_ci offsetof(struct tun_xdp_hdr, gso), 7288c2ecf20Sopenharmony_ci sock_hlen, from); 7298c2ecf20Sopenharmony_ci if (copied != sock_hlen) 7308c2ecf20Sopenharmony_ci return -EFAULT; 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci hdr = buf; 7338c2ecf20Sopenharmony_ci gso = &hdr->gso; 7348c2ecf20Sopenharmony_ci 7358c2ecf20Sopenharmony_ci if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && 7368c2ecf20Sopenharmony_ci vhost16_to_cpu(vq, gso->csum_start) + 7378c2ecf20Sopenharmony_ci vhost16_to_cpu(vq, gso->csum_offset) + 2 > 7388c2ecf20Sopenharmony_ci vhost16_to_cpu(vq, gso->hdr_len)) { 7398c2ecf20Sopenharmony_ci gso->hdr_len = cpu_to_vhost16(vq, 7408c2ecf20Sopenharmony_ci vhost16_to_cpu(vq, gso->csum_start) + 7418c2ecf20Sopenharmony_ci vhost16_to_cpu(vq, gso->csum_offset) + 2); 7428c2ecf20Sopenharmony_ci 7438c2ecf20Sopenharmony_ci if (vhost16_to_cpu(vq, gso->hdr_len) > len) 7448c2ecf20Sopenharmony_ci return -EINVAL; 7458c2ecf20Sopenharmony_ci } 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci len -= sock_hlen; 7488c2ecf20Sopenharmony_ci copied = copy_page_from_iter(alloc_frag->page, 7498c2ecf20Sopenharmony_ci alloc_frag->offset + pad, 7508c2ecf20Sopenharmony_ci len, from); 7518c2ecf20Sopenharmony_ci if (copied != len) 7528c2ecf20Sopenharmony_ci return -EFAULT; 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci xdp->data_hard_start = buf; 7558c2ecf20Sopenharmony_ci xdp->data = buf + pad; 7568c2ecf20Sopenharmony_ci xdp->data_end = xdp->data + len; 7578c2ecf20Sopenharmony_ci hdr->buflen = buflen; 7588c2ecf20Sopenharmony_ci xdp->frame_sz = buflen; 7598c2ecf20Sopenharmony_ci 7608c2ecf20Sopenharmony_ci --net->refcnt_bias; 7618c2ecf20Sopenharmony_ci alloc_frag->offset += buflen; 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_ci ++nvq->batched_xdp; 7648c2ecf20Sopenharmony_ci 7658c2ecf20Sopenharmony_ci return 0; 7668c2ecf20Sopenharmony_ci} 7678c2ecf20Sopenharmony_ci 7688c2ecf20Sopenharmony_cistatic void handle_tx_copy(struct vhost_net *net, struct socket *sock) 7698c2ecf20Sopenharmony_ci{ 7708c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; 7718c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 7728c2ecf20Sopenharmony_ci unsigned out, in; 7738c2ecf20Sopenharmony_ci int head; 7748c2ecf20Sopenharmony_ci struct msghdr msg = { 7758c2ecf20Sopenharmony_ci .msg_name = NULL, 7768c2ecf20Sopenharmony_ci .msg_namelen = 0, 7778c2ecf20Sopenharmony_ci .msg_control = NULL, 7788c2ecf20Sopenharmony_ci .msg_controllen = 0, 7798c2ecf20Sopenharmony_ci .msg_flags = MSG_DONTWAIT, 7808c2ecf20Sopenharmony_ci }; 7818c2ecf20Sopenharmony_ci size_t len, total_len = 0; 7828c2ecf20Sopenharmony_ci int err; 7838c2ecf20Sopenharmony_ci int sent_pkts = 0; 7848c2ecf20Sopenharmony_ci bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci do { 7878c2ecf20Sopenharmony_ci bool busyloop_intr = false; 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_ci if (nvq->done_idx == VHOST_NET_BATCH) 7908c2ecf20Sopenharmony_ci vhost_tx_batch(net, nvq, sock, &msg); 7918c2ecf20Sopenharmony_ci 7928c2ecf20Sopenharmony_ci head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, 7938c2ecf20Sopenharmony_ci &busyloop_intr); 7948c2ecf20Sopenharmony_ci /* On error, stop handling until the next kick. */ 7958c2ecf20Sopenharmony_ci if (unlikely(head < 0)) 7968c2ecf20Sopenharmony_ci break; 7978c2ecf20Sopenharmony_ci /* Nothing new? Wait for eventfd to tell us they refilled. */ 7988c2ecf20Sopenharmony_ci if (head == vq->num) { 7998c2ecf20Sopenharmony_ci if (unlikely(busyloop_intr)) { 8008c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 8018c2ecf20Sopenharmony_ci } else if (unlikely(vhost_enable_notify(&net->dev, 8028c2ecf20Sopenharmony_ci vq))) { 8038c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 8048c2ecf20Sopenharmony_ci continue; 8058c2ecf20Sopenharmony_ci } 8068c2ecf20Sopenharmony_ci break; 8078c2ecf20Sopenharmony_ci } 8088c2ecf20Sopenharmony_ci 8098c2ecf20Sopenharmony_ci total_len += len; 8108c2ecf20Sopenharmony_ci 8118c2ecf20Sopenharmony_ci /* For simplicity, TX batching is only enabled if 8128c2ecf20Sopenharmony_ci * sndbuf is unlimited. 8138c2ecf20Sopenharmony_ci */ 8148c2ecf20Sopenharmony_ci if (sock_can_batch) { 8158c2ecf20Sopenharmony_ci err = vhost_net_build_xdp(nvq, &msg.msg_iter); 8168c2ecf20Sopenharmony_ci if (!err) { 8178c2ecf20Sopenharmony_ci goto done; 8188c2ecf20Sopenharmony_ci } else if (unlikely(err != -ENOSPC)) { 8198c2ecf20Sopenharmony_ci vhost_tx_batch(net, nvq, sock, &msg); 8208c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, 1); 8218c2ecf20Sopenharmony_ci vhost_net_enable_vq(net, vq); 8228c2ecf20Sopenharmony_ci break; 8238c2ecf20Sopenharmony_ci } 8248c2ecf20Sopenharmony_ci 8258c2ecf20Sopenharmony_ci /* We can't build XDP buff, go for single 8268c2ecf20Sopenharmony_ci * packet path but let's flush batched 8278c2ecf20Sopenharmony_ci * packets. 8288c2ecf20Sopenharmony_ci */ 8298c2ecf20Sopenharmony_ci vhost_tx_batch(net, nvq, sock, &msg); 8308c2ecf20Sopenharmony_ci msg.msg_control = NULL; 8318c2ecf20Sopenharmony_ci } else { 8328c2ecf20Sopenharmony_ci if (tx_can_batch(vq, total_len)) 8338c2ecf20Sopenharmony_ci msg.msg_flags |= MSG_MORE; 8348c2ecf20Sopenharmony_ci else 8358c2ecf20Sopenharmony_ci msg.msg_flags &= ~MSG_MORE; 8368c2ecf20Sopenharmony_ci } 8378c2ecf20Sopenharmony_ci 8388c2ecf20Sopenharmony_ci /* TODO: Check specific error and bomb out unless ENOBUFS? */ 8398c2ecf20Sopenharmony_ci err = sock->ops->sendmsg(sock, &msg, len); 8408c2ecf20Sopenharmony_ci if (unlikely(err < 0)) { 8418c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, 1); 8428c2ecf20Sopenharmony_ci vhost_net_enable_vq(net, vq); 8438c2ecf20Sopenharmony_ci break; 8448c2ecf20Sopenharmony_ci } 8458c2ecf20Sopenharmony_ci if (err != len) 8468c2ecf20Sopenharmony_ci pr_debug("Truncated TX packet: len %d != %zd\n", 8478c2ecf20Sopenharmony_ci err, len); 8488c2ecf20Sopenharmony_cidone: 8498c2ecf20Sopenharmony_ci vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); 8508c2ecf20Sopenharmony_ci vq->heads[nvq->done_idx].len = 0; 8518c2ecf20Sopenharmony_ci ++nvq->done_idx; 8528c2ecf20Sopenharmony_ci } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); 8538c2ecf20Sopenharmony_ci 8548c2ecf20Sopenharmony_ci vhost_tx_batch(net, nvq, sock, &msg); 8558c2ecf20Sopenharmony_ci} 8568c2ecf20Sopenharmony_ci 8578c2ecf20Sopenharmony_cistatic void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) 8588c2ecf20Sopenharmony_ci{ 8598c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; 8608c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 8618c2ecf20Sopenharmony_ci unsigned out, in; 8628c2ecf20Sopenharmony_ci int head; 8638c2ecf20Sopenharmony_ci struct msghdr msg = { 8648c2ecf20Sopenharmony_ci .msg_name = NULL, 8658c2ecf20Sopenharmony_ci .msg_namelen = 0, 8668c2ecf20Sopenharmony_ci .msg_control = NULL, 8678c2ecf20Sopenharmony_ci .msg_controllen = 0, 8688c2ecf20Sopenharmony_ci .msg_flags = MSG_DONTWAIT, 8698c2ecf20Sopenharmony_ci }; 8708c2ecf20Sopenharmony_ci struct tun_msg_ctl ctl; 8718c2ecf20Sopenharmony_ci size_t len, total_len = 0; 8728c2ecf20Sopenharmony_ci int err; 8738c2ecf20Sopenharmony_ci struct vhost_net_ubuf_ref *ubufs; 8748c2ecf20Sopenharmony_ci struct ubuf_info *ubuf; 8758c2ecf20Sopenharmony_ci bool zcopy_used; 8768c2ecf20Sopenharmony_ci int sent_pkts = 0; 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci do { 8798c2ecf20Sopenharmony_ci bool busyloop_intr; 8808c2ecf20Sopenharmony_ci 8818c2ecf20Sopenharmony_ci /* Release DMAs done buffers first */ 8828c2ecf20Sopenharmony_ci vhost_zerocopy_signal_used(net, vq); 8838c2ecf20Sopenharmony_ci 8848c2ecf20Sopenharmony_ci busyloop_intr = false; 8858c2ecf20Sopenharmony_ci head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, 8868c2ecf20Sopenharmony_ci &busyloop_intr); 8878c2ecf20Sopenharmony_ci /* On error, stop handling until the next kick. */ 8888c2ecf20Sopenharmony_ci if (unlikely(head < 0)) 8898c2ecf20Sopenharmony_ci break; 8908c2ecf20Sopenharmony_ci /* Nothing new? Wait for eventfd to tell us they refilled. */ 8918c2ecf20Sopenharmony_ci if (head == vq->num) { 8928c2ecf20Sopenharmony_ci if (unlikely(busyloop_intr)) { 8938c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 8948c2ecf20Sopenharmony_ci } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { 8958c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 8968c2ecf20Sopenharmony_ci continue; 8978c2ecf20Sopenharmony_ci } 8988c2ecf20Sopenharmony_ci break; 8998c2ecf20Sopenharmony_ci } 9008c2ecf20Sopenharmony_ci 9018c2ecf20Sopenharmony_ci zcopy_used = len >= VHOST_GOODCOPY_LEN 9028c2ecf20Sopenharmony_ci && !vhost_exceeds_maxpend(net) 9038c2ecf20Sopenharmony_ci && vhost_net_tx_select_zcopy(net); 9048c2ecf20Sopenharmony_ci 9058c2ecf20Sopenharmony_ci /* use msg_control to pass vhost zerocopy ubuf info to skb */ 9068c2ecf20Sopenharmony_ci if (zcopy_used) { 9078c2ecf20Sopenharmony_ci ubuf = nvq->ubuf_info + nvq->upend_idx; 9088c2ecf20Sopenharmony_ci vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); 9098c2ecf20Sopenharmony_ci vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; 9108c2ecf20Sopenharmony_ci ubuf->callback = vhost_zerocopy_callback; 9118c2ecf20Sopenharmony_ci ubuf->ctx = nvq->ubufs; 9128c2ecf20Sopenharmony_ci ubuf->desc = nvq->upend_idx; 9138c2ecf20Sopenharmony_ci refcount_set(&ubuf->refcnt, 1); 9148c2ecf20Sopenharmony_ci msg.msg_control = &ctl; 9158c2ecf20Sopenharmony_ci ctl.type = TUN_MSG_UBUF; 9168c2ecf20Sopenharmony_ci ctl.ptr = ubuf; 9178c2ecf20Sopenharmony_ci msg.msg_controllen = sizeof(ctl); 9188c2ecf20Sopenharmony_ci ubufs = nvq->ubufs; 9198c2ecf20Sopenharmony_ci atomic_inc(&ubufs->refcount); 9208c2ecf20Sopenharmony_ci nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; 9218c2ecf20Sopenharmony_ci } else { 9228c2ecf20Sopenharmony_ci msg.msg_control = NULL; 9238c2ecf20Sopenharmony_ci ubufs = NULL; 9248c2ecf20Sopenharmony_ci } 9258c2ecf20Sopenharmony_ci total_len += len; 9268c2ecf20Sopenharmony_ci if (tx_can_batch(vq, total_len) && 9278c2ecf20Sopenharmony_ci likely(!vhost_exceeds_maxpend(net))) { 9288c2ecf20Sopenharmony_ci msg.msg_flags |= MSG_MORE; 9298c2ecf20Sopenharmony_ci } else { 9308c2ecf20Sopenharmony_ci msg.msg_flags &= ~MSG_MORE; 9318c2ecf20Sopenharmony_ci } 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_ci /* TODO: Check specific error and bomb out unless ENOBUFS? */ 9348c2ecf20Sopenharmony_ci err = sock->ops->sendmsg(sock, &msg, len); 9358c2ecf20Sopenharmony_ci if (unlikely(err < 0)) { 9368c2ecf20Sopenharmony_ci if (zcopy_used) { 9378c2ecf20Sopenharmony_ci if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) 9388c2ecf20Sopenharmony_ci vhost_net_ubuf_put(ubufs); 9398c2ecf20Sopenharmony_ci nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) 9408c2ecf20Sopenharmony_ci % UIO_MAXIOV; 9418c2ecf20Sopenharmony_ci } 9428c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, 1); 9438c2ecf20Sopenharmony_ci vhost_net_enable_vq(net, vq); 9448c2ecf20Sopenharmony_ci break; 9458c2ecf20Sopenharmony_ci } 9468c2ecf20Sopenharmony_ci if (err != len) 9478c2ecf20Sopenharmony_ci pr_debug("Truncated TX packet: " 9488c2ecf20Sopenharmony_ci " len %d != %zd\n", err, len); 9498c2ecf20Sopenharmony_ci if (!zcopy_used) 9508c2ecf20Sopenharmony_ci vhost_add_used_and_signal(&net->dev, vq, head, 0); 9518c2ecf20Sopenharmony_ci else 9528c2ecf20Sopenharmony_ci vhost_zerocopy_signal_used(net, vq); 9538c2ecf20Sopenharmony_ci vhost_net_tx_packet(net); 9548c2ecf20Sopenharmony_ci } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); 9558c2ecf20Sopenharmony_ci} 9568c2ecf20Sopenharmony_ci 9578c2ecf20Sopenharmony_ci/* Expects to be always run from workqueue - which acts as 9588c2ecf20Sopenharmony_ci * read-size critical section for our kind of RCU. */ 9598c2ecf20Sopenharmony_cistatic void handle_tx(struct vhost_net *net) 9608c2ecf20Sopenharmony_ci{ 9618c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; 9628c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 9638c2ecf20Sopenharmony_ci struct socket *sock; 9648c2ecf20Sopenharmony_ci 9658c2ecf20Sopenharmony_ci mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); 9668c2ecf20Sopenharmony_ci sock = vhost_vq_get_backend(vq); 9678c2ecf20Sopenharmony_ci if (!sock) 9688c2ecf20Sopenharmony_ci goto out; 9698c2ecf20Sopenharmony_ci 9708c2ecf20Sopenharmony_ci if (!vq_meta_prefetch(vq)) 9718c2ecf20Sopenharmony_ci goto out; 9728c2ecf20Sopenharmony_ci 9738c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 9748c2ecf20Sopenharmony_ci vhost_net_disable_vq(net, vq); 9758c2ecf20Sopenharmony_ci 9768c2ecf20Sopenharmony_ci if (vhost_sock_zcopy(sock)) 9778c2ecf20Sopenharmony_ci handle_tx_zerocopy(net, sock); 9788c2ecf20Sopenharmony_ci else 9798c2ecf20Sopenharmony_ci handle_tx_copy(net, sock); 9808c2ecf20Sopenharmony_ci 9818c2ecf20Sopenharmony_ciout: 9828c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 9838c2ecf20Sopenharmony_ci} 9848c2ecf20Sopenharmony_ci 9858c2ecf20Sopenharmony_cistatic int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) 9868c2ecf20Sopenharmony_ci{ 9878c2ecf20Sopenharmony_ci struct sk_buff *head; 9888c2ecf20Sopenharmony_ci int len = 0; 9898c2ecf20Sopenharmony_ci unsigned long flags; 9908c2ecf20Sopenharmony_ci 9918c2ecf20Sopenharmony_ci if (rvq->rx_ring) 9928c2ecf20Sopenharmony_ci return vhost_net_buf_peek(rvq); 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_ci spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); 9958c2ecf20Sopenharmony_ci head = skb_peek(&sk->sk_receive_queue); 9968c2ecf20Sopenharmony_ci if (likely(head)) { 9978c2ecf20Sopenharmony_ci len = head->len; 9988c2ecf20Sopenharmony_ci if (skb_vlan_tag_present(head)) 9998c2ecf20Sopenharmony_ci len += VLAN_HLEN; 10008c2ecf20Sopenharmony_ci } 10018c2ecf20Sopenharmony_ci 10028c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); 10038c2ecf20Sopenharmony_ci return len; 10048c2ecf20Sopenharmony_ci} 10058c2ecf20Sopenharmony_ci 10068c2ecf20Sopenharmony_cistatic int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, 10078c2ecf20Sopenharmony_ci bool *busyloop_intr) 10088c2ecf20Sopenharmony_ci{ 10098c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; 10108c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; 10118c2ecf20Sopenharmony_ci struct vhost_virtqueue *rvq = &rnvq->vq; 10128c2ecf20Sopenharmony_ci struct vhost_virtqueue *tvq = &tnvq->vq; 10138c2ecf20Sopenharmony_ci int len = peek_head_len(rnvq, sk); 10148c2ecf20Sopenharmony_ci 10158c2ecf20Sopenharmony_ci if (!len && rvq->busyloop_timeout) { 10168c2ecf20Sopenharmony_ci /* Flush batched heads first */ 10178c2ecf20Sopenharmony_ci vhost_net_signal_used(rnvq); 10188c2ecf20Sopenharmony_ci /* Both tx vq and rx socket were polled here */ 10198c2ecf20Sopenharmony_ci vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); 10208c2ecf20Sopenharmony_ci 10218c2ecf20Sopenharmony_ci len = peek_head_len(rnvq, sk); 10228c2ecf20Sopenharmony_ci } 10238c2ecf20Sopenharmony_ci 10248c2ecf20Sopenharmony_ci return len; 10258c2ecf20Sopenharmony_ci} 10268c2ecf20Sopenharmony_ci 10278c2ecf20Sopenharmony_ci/* This is a multi-buffer version of vhost_get_desc, that works if 10288c2ecf20Sopenharmony_ci * vq has read descriptors only. 10298c2ecf20Sopenharmony_ci * @vq - the relevant virtqueue 10308c2ecf20Sopenharmony_ci * @datalen - data length we'll be reading 10318c2ecf20Sopenharmony_ci * @iovcount - returned count of io vectors we fill 10328c2ecf20Sopenharmony_ci * @log - vhost log 10338c2ecf20Sopenharmony_ci * @log_num - log offset 10348c2ecf20Sopenharmony_ci * @quota - headcount quota, 1 for big buffer 10358c2ecf20Sopenharmony_ci * returns number of buffer heads allocated, negative on error 10368c2ecf20Sopenharmony_ci */ 10378c2ecf20Sopenharmony_cistatic int get_rx_bufs(struct vhost_virtqueue *vq, 10388c2ecf20Sopenharmony_ci struct vring_used_elem *heads, 10398c2ecf20Sopenharmony_ci int datalen, 10408c2ecf20Sopenharmony_ci unsigned *iovcount, 10418c2ecf20Sopenharmony_ci struct vhost_log *log, 10428c2ecf20Sopenharmony_ci unsigned *log_num, 10438c2ecf20Sopenharmony_ci unsigned int quota) 10448c2ecf20Sopenharmony_ci{ 10458c2ecf20Sopenharmony_ci unsigned int out, in; 10468c2ecf20Sopenharmony_ci int seg = 0; 10478c2ecf20Sopenharmony_ci int headcount = 0; 10488c2ecf20Sopenharmony_ci unsigned d; 10498c2ecf20Sopenharmony_ci int r, nlogs = 0; 10508c2ecf20Sopenharmony_ci /* len is always initialized before use since we are always called with 10518c2ecf20Sopenharmony_ci * datalen > 0. 10528c2ecf20Sopenharmony_ci */ 10538c2ecf20Sopenharmony_ci u32 len; 10548c2ecf20Sopenharmony_ci 10558c2ecf20Sopenharmony_ci while (datalen > 0 && headcount < quota) { 10568c2ecf20Sopenharmony_ci if (unlikely(seg >= UIO_MAXIOV)) { 10578c2ecf20Sopenharmony_ci r = -ENOBUFS; 10588c2ecf20Sopenharmony_ci goto err; 10598c2ecf20Sopenharmony_ci } 10608c2ecf20Sopenharmony_ci r = vhost_get_vq_desc(vq, vq->iov + seg, 10618c2ecf20Sopenharmony_ci ARRAY_SIZE(vq->iov) - seg, &out, 10628c2ecf20Sopenharmony_ci &in, log, log_num); 10638c2ecf20Sopenharmony_ci if (unlikely(r < 0)) 10648c2ecf20Sopenharmony_ci goto err; 10658c2ecf20Sopenharmony_ci 10668c2ecf20Sopenharmony_ci d = r; 10678c2ecf20Sopenharmony_ci if (d == vq->num) { 10688c2ecf20Sopenharmony_ci r = 0; 10698c2ecf20Sopenharmony_ci goto err; 10708c2ecf20Sopenharmony_ci } 10718c2ecf20Sopenharmony_ci if (unlikely(out || in <= 0)) { 10728c2ecf20Sopenharmony_ci vq_err(vq, "unexpected descriptor format for RX: " 10738c2ecf20Sopenharmony_ci "out %d, in %d\n", out, in); 10748c2ecf20Sopenharmony_ci r = -EINVAL; 10758c2ecf20Sopenharmony_ci goto err; 10768c2ecf20Sopenharmony_ci } 10778c2ecf20Sopenharmony_ci if (unlikely(log)) { 10788c2ecf20Sopenharmony_ci nlogs += *log_num; 10798c2ecf20Sopenharmony_ci log += *log_num; 10808c2ecf20Sopenharmony_ci } 10818c2ecf20Sopenharmony_ci heads[headcount].id = cpu_to_vhost32(vq, d); 10828c2ecf20Sopenharmony_ci len = iov_length(vq->iov + seg, in); 10838c2ecf20Sopenharmony_ci heads[headcount].len = cpu_to_vhost32(vq, len); 10848c2ecf20Sopenharmony_ci datalen -= len; 10858c2ecf20Sopenharmony_ci ++headcount; 10868c2ecf20Sopenharmony_ci seg += in; 10878c2ecf20Sopenharmony_ci } 10888c2ecf20Sopenharmony_ci heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); 10898c2ecf20Sopenharmony_ci *iovcount = seg; 10908c2ecf20Sopenharmony_ci if (unlikely(log)) 10918c2ecf20Sopenharmony_ci *log_num = nlogs; 10928c2ecf20Sopenharmony_ci 10938c2ecf20Sopenharmony_ci /* Detect overrun */ 10948c2ecf20Sopenharmony_ci if (unlikely(datalen > 0)) { 10958c2ecf20Sopenharmony_ci r = UIO_MAXIOV + 1; 10968c2ecf20Sopenharmony_ci goto err; 10978c2ecf20Sopenharmony_ci } 10988c2ecf20Sopenharmony_ci return headcount; 10998c2ecf20Sopenharmony_cierr: 11008c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, headcount); 11018c2ecf20Sopenharmony_ci return r; 11028c2ecf20Sopenharmony_ci} 11038c2ecf20Sopenharmony_ci 11048c2ecf20Sopenharmony_ci/* Expects to be always run from workqueue - which acts as 11058c2ecf20Sopenharmony_ci * read-size critical section for our kind of RCU. */ 11068c2ecf20Sopenharmony_cistatic void handle_rx(struct vhost_net *net) 11078c2ecf20Sopenharmony_ci{ 11088c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; 11098c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = &nvq->vq; 11108c2ecf20Sopenharmony_ci unsigned in, log; 11118c2ecf20Sopenharmony_ci struct vhost_log *vq_log; 11128c2ecf20Sopenharmony_ci struct msghdr msg = { 11138c2ecf20Sopenharmony_ci .msg_name = NULL, 11148c2ecf20Sopenharmony_ci .msg_namelen = 0, 11158c2ecf20Sopenharmony_ci .msg_control = NULL, /* FIXME: get and handle RX aux data. */ 11168c2ecf20Sopenharmony_ci .msg_controllen = 0, 11178c2ecf20Sopenharmony_ci .msg_flags = MSG_DONTWAIT, 11188c2ecf20Sopenharmony_ci }; 11198c2ecf20Sopenharmony_ci struct virtio_net_hdr hdr = { 11208c2ecf20Sopenharmony_ci .flags = 0, 11218c2ecf20Sopenharmony_ci .gso_type = VIRTIO_NET_HDR_GSO_NONE 11228c2ecf20Sopenharmony_ci }; 11238c2ecf20Sopenharmony_ci size_t total_len = 0; 11248c2ecf20Sopenharmony_ci int err, mergeable; 11258c2ecf20Sopenharmony_ci s16 headcount; 11268c2ecf20Sopenharmony_ci size_t vhost_hlen, sock_hlen; 11278c2ecf20Sopenharmony_ci size_t vhost_len, sock_len; 11288c2ecf20Sopenharmony_ci bool busyloop_intr = false; 11298c2ecf20Sopenharmony_ci struct socket *sock; 11308c2ecf20Sopenharmony_ci struct iov_iter fixup; 11318c2ecf20Sopenharmony_ci __virtio16 num_buffers; 11328c2ecf20Sopenharmony_ci int recv_pkts = 0; 11338c2ecf20Sopenharmony_ci 11348c2ecf20Sopenharmony_ci mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); 11358c2ecf20Sopenharmony_ci sock = vhost_vq_get_backend(vq); 11368c2ecf20Sopenharmony_ci if (!sock) 11378c2ecf20Sopenharmony_ci goto out; 11388c2ecf20Sopenharmony_ci 11398c2ecf20Sopenharmony_ci if (!vq_meta_prefetch(vq)) 11408c2ecf20Sopenharmony_ci goto out; 11418c2ecf20Sopenharmony_ci 11428c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 11438c2ecf20Sopenharmony_ci vhost_net_disable_vq(net, vq); 11448c2ecf20Sopenharmony_ci 11458c2ecf20Sopenharmony_ci vhost_hlen = nvq->vhost_hlen; 11468c2ecf20Sopenharmony_ci sock_hlen = nvq->sock_hlen; 11478c2ecf20Sopenharmony_ci 11488c2ecf20Sopenharmony_ci vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? 11498c2ecf20Sopenharmony_ci vq->log : NULL; 11508c2ecf20Sopenharmony_ci mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); 11518c2ecf20Sopenharmony_ci 11528c2ecf20Sopenharmony_ci do { 11538c2ecf20Sopenharmony_ci sock_len = vhost_net_rx_peek_head_len(net, sock->sk, 11548c2ecf20Sopenharmony_ci &busyloop_intr); 11558c2ecf20Sopenharmony_ci if (!sock_len) 11568c2ecf20Sopenharmony_ci break; 11578c2ecf20Sopenharmony_ci sock_len += sock_hlen; 11588c2ecf20Sopenharmony_ci vhost_len = sock_len + vhost_hlen; 11598c2ecf20Sopenharmony_ci headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, 11608c2ecf20Sopenharmony_ci vhost_len, &in, vq_log, &log, 11618c2ecf20Sopenharmony_ci likely(mergeable) ? UIO_MAXIOV : 1); 11628c2ecf20Sopenharmony_ci /* On error, stop handling until the next kick. */ 11638c2ecf20Sopenharmony_ci if (unlikely(headcount < 0)) 11648c2ecf20Sopenharmony_ci goto out; 11658c2ecf20Sopenharmony_ci /* OK, now we need to know about added descriptors. */ 11668c2ecf20Sopenharmony_ci if (!headcount) { 11678c2ecf20Sopenharmony_ci if (unlikely(busyloop_intr)) { 11688c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 11698c2ecf20Sopenharmony_ci } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { 11708c2ecf20Sopenharmony_ci /* They have slipped one in as we were 11718c2ecf20Sopenharmony_ci * doing that: check again. */ 11728c2ecf20Sopenharmony_ci vhost_disable_notify(&net->dev, vq); 11738c2ecf20Sopenharmony_ci continue; 11748c2ecf20Sopenharmony_ci } 11758c2ecf20Sopenharmony_ci /* Nothing new? Wait for eventfd to tell us 11768c2ecf20Sopenharmony_ci * they refilled. */ 11778c2ecf20Sopenharmony_ci goto out; 11788c2ecf20Sopenharmony_ci } 11798c2ecf20Sopenharmony_ci busyloop_intr = false; 11808c2ecf20Sopenharmony_ci if (nvq->rx_ring) 11818c2ecf20Sopenharmony_ci msg.msg_control = vhost_net_buf_consume(&nvq->rxq); 11828c2ecf20Sopenharmony_ci /* On overrun, truncate and discard */ 11838c2ecf20Sopenharmony_ci if (unlikely(headcount > UIO_MAXIOV)) { 11848c2ecf20Sopenharmony_ci iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); 11858c2ecf20Sopenharmony_ci err = sock->ops->recvmsg(sock, &msg, 11868c2ecf20Sopenharmony_ci 1, MSG_DONTWAIT | MSG_TRUNC); 11878c2ecf20Sopenharmony_ci pr_debug("Discarded rx packet: len %zd\n", sock_len); 11888c2ecf20Sopenharmony_ci continue; 11898c2ecf20Sopenharmony_ci } 11908c2ecf20Sopenharmony_ci /* We don't need to be notified again. */ 11918c2ecf20Sopenharmony_ci iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); 11928c2ecf20Sopenharmony_ci fixup = msg.msg_iter; 11938c2ecf20Sopenharmony_ci if (unlikely((vhost_hlen))) { 11948c2ecf20Sopenharmony_ci /* We will supply the header ourselves 11958c2ecf20Sopenharmony_ci * TODO: support TSO. 11968c2ecf20Sopenharmony_ci */ 11978c2ecf20Sopenharmony_ci iov_iter_advance(&msg.msg_iter, vhost_hlen); 11988c2ecf20Sopenharmony_ci } 11998c2ecf20Sopenharmony_ci err = sock->ops->recvmsg(sock, &msg, 12008c2ecf20Sopenharmony_ci sock_len, MSG_DONTWAIT | MSG_TRUNC); 12018c2ecf20Sopenharmony_ci /* Userspace might have consumed the packet meanwhile: 12028c2ecf20Sopenharmony_ci * it's not supposed to do this usually, but might be hard 12038c2ecf20Sopenharmony_ci * to prevent. Discard data we got (if any) and keep going. */ 12048c2ecf20Sopenharmony_ci if (unlikely(err != sock_len)) { 12058c2ecf20Sopenharmony_ci pr_debug("Discarded rx packet: " 12068c2ecf20Sopenharmony_ci " len %d, expected %zd\n", err, sock_len); 12078c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, headcount); 12088c2ecf20Sopenharmony_ci continue; 12098c2ecf20Sopenharmony_ci } 12108c2ecf20Sopenharmony_ci /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ 12118c2ecf20Sopenharmony_ci if (unlikely(vhost_hlen)) { 12128c2ecf20Sopenharmony_ci if (copy_to_iter(&hdr, sizeof(hdr), 12138c2ecf20Sopenharmony_ci &fixup) != sizeof(hdr)) { 12148c2ecf20Sopenharmony_ci vq_err(vq, "Unable to write vnet_hdr " 12158c2ecf20Sopenharmony_ci "at addr %p\n", vq->iov->iov_base); 12168c2ecf20Sopenharmony_ci goto out; 12178c2ecf20Sopenharmony_ci } 12188c2ecf20Sopenharmony_ci } else { 12198c2ecf20Sopenharmony_ci /* Header came from socket; we'll need to patch 12208c2ecf20Sopenharmony_ci * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF 12218c2ecf20Sopenharmony_ci */ 12228c2ecf20Sopenharmony_ci iov_iter_advance(&fixup, sizeof(hdr)); 12238c2ecf20Sopenharmony_ci } 12248c2ecf20Sopenharmony_ci /* TODO: Should check and handle checksum. */ 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_ci num_buffers = cpu_to_vhost16(vq, headcount); 12278c2ecf20Sopenharmony_ci if (likely(mergeable) && 12288c2ecf20Sopenharmony_ci copy_to_iter(&num_buffers, sizeof num_buffers, 12298c2ecf20Sopenharmony_ci &fixup) != sizeof num_buffers) { 12308c2ecf20Sopenharmony_ci vq_err(vq, "Failed num_buffers write"); 12318c2ecf20Sopenharmony_ci vhost_discard_vq_desc(vq, headcount); 12328c2ecf20Sopenharmony_ci goto out; 12338c2ecf20Sopenharmony_ci } 12348c2ecf20Sopenharmony_ci nvq->done_idx += headcount; 12358c2ecf20Sopenharmony_ci if (nvq->done_idx > VHOST_NET_BATCH) 12368c2ecf20Sopenharmony_ci vhost_net_signal_used(nvq); 12378c2ecf20Sopenharmony_ci if (unlikely(vq_log)) 12388c2ecf20Sopenharmony_ci vhost_log_write(vq, vq_log, log, vhost_len, 12398c2ecf20Sopenharmony_ci vq->iov, in); 12408c2ecf20Sopenharmony_ci total_len += vhost_len; 12418c2ecf20Sopenharmony_ci } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci if (unlikely(busyloop_intr)) 12448c2ecf20Sopenharmony_ci vhost_poll_queue(&vq->poll); 12458c2ecf20Sopenharmony_ci else if (!sock_len) 12468c2ecf20Sopenharmony_ci vhost_net_enable_vq(net, vq); 12478c2ecf20Sopenharmony_ciout: 12488c2ecf20Sopenharmony_ci vhost_net_signal_used(nvq); 12498c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 12508c2ecf20Sopenharmony_ci} 12518c2ecf20Sopenharmony_ci 12528c2ecf20Sopenharmony_cistatic void handle_tx_kick(struct vhost_work *work) 12538c2ecf20Sopenharmony_ci{ 12548c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, 12558c2ecf20Sopenharmony_ci poll.work); 12568c2ecf20Sopenharmony_ci struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); 12578c2ecf20Sopenharmony_ci 12588c2ecf20Sopenharmony_ci handle_tx(net); 12598c2ecf20Sopenharmony_ci} 12608c2ecf20Sopenharmony_ci 12618c2ecf20Sopenharmony_cistatic void handle_rx_kick(struct vhost_work *work) 12628c2ecf20Sopenharmony_ci{ 12638c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, 12648c2ecf20Sopenharmony_ci poll.work); 12658c2ecf20Sopenharmony_ci struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); 12668c2ecf20Sopenharmony_ci 12678c2ecf20Sopenharmony_ci handle_rx(net); 12688c2ecf20Sopenharmony_ci} 12698c2ecf20Sopenharmony_ci 12708c2ecf20Sopenharmony_cistatic void handle_tx_net(struct vhost_work *work) 12718c2ecf20Sopenharmony_ci{ 12728c2ecf20Sopenharmony_ci struct vhost_net *net = container_of(work, struct vhost_net, 12738c2ecf20Sopenharmony_ci poll[VHOST_NET_VQ_TX].work); 12748c2ecf20Sopenharmony_ci handle_tx(net); 12758c2ecf20Sopenharmony_ci} 12768c2ecf20Sopenharmony_ci 12778c2ecf20Sopenharmony_cistatic void handle_rx_net(struct vhost_work *work) 12788c2ecf20Sopenharmony_ci{ 12798c2ecf20Sopenharmony_ci struct vhost_net *net = container_of(work, struct vhost_net, 12808c2ecf20Sopenharmony_ci poll[VHOST_NET_VQ_RX].work); 12818c2ecf20Sopenharmony_ci handle_rx(net); 12828c2ecf20Sopenharmony_ci} 12838c2ecf20Sopenharmony_ci 12848c2ecf20Sopenharmony_cistatic int vhost_net_open(struct inode *inode, struct file *f) 12858c2ecf20Sopenharmony_ci{ 12868c2ecf20Sopenharmony_ci struct vhost_net *n; 12878c2ecf20Sopenharmony_ci struct vhost_dev *dev; 12888c2ecf20Sopenharmony_ci struct vhost_virtqueue **vqs; 12898c2ecf20Sopenharmony_ci void **queue; 12908c2ecf20Sopenharmony_ci struct xdp_buff *xdp; 12918c2ecf20Sopenharmony_ci int i; 12928c2ecf20Sopenharmony_ci 12938c2ecf20Sopenharmony_ci n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); 12948c2ecf20Sopenharmony_ci if (!n) 12958c2ecf20Sopenharmony_ci return -ENOMEM; 12968c2ecf20Sopenharmony_ci vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); 12978c2ecf20Sopenharmony_ci if (!vqs) { 12988c2ecf20Sopenharmony_ci kvfree(n); 12998c2ecf20Sopenharmony_ci return -ENOMEM; 13008c2ecf20Sopenharmony_ci } 13018c2ecf20Sopenharmony_ci 13028c2ecf20Sopenharmony_ci queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), 13038c2ecf20Sopenharmony_ci GFP_KERNEL); 13048c2ecf20Sopenharmony_ci if (!queue) { 13058c2ecf20Sopenharmony_ci kfree(vqs); 13068c2ecf20Sopenharmony_ci kvfree(n); 13078c2ecf20Sopenharmony_ci return -ENOMEM; 13088c2ecf20Sopenharmony_ci } 13098c2ecf20Sopenharmony_ci n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); 13128c2ecf20Sopenharmony_ci if (!xdp) { 13138c2ecf20Sopenharmony_ci kfree(vqs); 13148c2ecf20Sopenharmony_ci kvfree(n); 13158c2ecf20Sopenharmony_ci kfree(queue); 13168c2ecf20Sopenharmony_ci return -ENOMEM; 13178c2ecf20Sopenharmony_ci } 13188c2ecf20Sopenharmony_ci n->vqs[VHOST_NET_VQ_TX].xdp = xdp; 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci dev = &n->dev; 13218c2ecf20Sopenharmony_ci vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; 13228c2ecf20Sopenharmony_ci vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; 13238c2ecf20Sopenharmony_ci n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; 13248c2ecf20Sopenharmony_ci n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; 13258c2ecf20Sopenharmony_ci for (i = 0; i < VHOST_NET_VQ_MAX; i++) { 13268c2ecf20Sopenharmony_ci n->vqs[i].ubufs = NULL; 13278c2ecf20Sopenharmony_ci n->vqs[i].ubuf_info = NULL; 13288c2ecf20Sopenharmony_ci n->vqs[i].upend_idx = 0; 13298c2ecf20Sopenharmony_ci n->vqs[i].done_idx = 0; 13308c2ecf20Sopenharmony_ci n->vqs[i].batched_xdp = 0; 13318c2ecf20Sopenharmony_ci n->vqs[i].vhost_hlen = 0; 13328c2ecf20Sopenharmony_ci n->vqs[i].sock_hlen = 0; 13338c2ecf20Sopenharmony_ci n->vqs[i].rx_ring = NULL; 13348c2ecf20Sopenharmony_ci vhost_net_buf_init(&n->vqs[i].rxq); 13358c2ecf20Sopenharmony_ci } 13368c2ecf20Sopenharmony_ci vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, 13378c2ecf20Sopenharmony_ci UIO_MAXIOV + VHOST_NET_BATCH, 13388c2ecf20Sopenharmony_ci VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, 13398c2ecf20Sopenharmony_ci NULL); 13408c2ecf20Sopenharmony_ci 13418c2ecf20Sopenharmony_ci vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev); 13428c2ecf20Sopenharmony_ci vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev); 13438c2ecf20Sopenharmony_ci 13448c2ecf20Sopenharmony_ci f->private_data = n; 13458c2ecf20Sopenharmony_ci n->page_frag.page = NULL; 13468c2ecf20Sopenharmony_ci n->refcnt_bias = 0; 13478c2ecf20Sopenharmony_ci 13488c2ecf20Sopenharmony_ci return 0; 13498c2ecf20Sopenharmony_ci} 13508c2ecf20Sopenharmony_ci 13518c2ecf20Sopenharmony_cistatic struct socket *vhost_net_stop_vq(struct vhost_net *n, 13528c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq) 13538c2ecf20Sopenharmony_ci{ 13548c2ecf20Sopenharmony_ci struct socket *sock; 13558c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq = 13568c2ecf20Sopenharmony_ci container_of(vq, struct vhost_net_virtqueue, vq); 13578c2ecf20Sopenharmony_ci 13588c2ecf20Sopenharmony_ci mutex_lock(&vq->mutex); 13598c2ecf20Sopenharmony_ci sock = vhost_vq_get_backend(vq); 13608c2ecf20Sopenharmony_ci vhost_net_disable_vq(n, vq); 13618c2ecf20Sopenharmony_ci vhost_vq_set_backend(vq, NULL); 13628c2ecf20Sopenharmony_ci vhost_net_buf_unproduce(nvq); 13638c2ecf20Sopenharmony_ci nvq->rx_ring = NULL; 13648c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 13658c2ecf20Sopenharmony_ci return sock; 13668c2ecf20Sopenharmony_ci} 13678c2ecf20Sopenharmony_ci 13688c2ecf20Sopenharmony_cistatic void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, 13698c2ecf20Sopenharmony_ci struct socket **rx_sock) 13708c2ecf20Sopenharmony_ci{ 13718c2ecf20Sopenharmony_ci *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); 13728c2ecf20Sopenharmony_ci *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); 13738c2ecf20Sopenharmony_ci} 13748c2ecf20Sopenharmony_ci 13758c2ecf20Sopenharmony_cistatic void vhost_net_flush_vq(struct vhost_net *n, int index) 13768c2ecf20Sopenharmony_ci{ 13778c2ecf20Sopenharmony_ci vhost_poll_flush(n->poll + index); 13788c2ecf20Sopenharmony_ci vhost_poll_flush(&n->vqs[index].vq.poll); 13798c2ecf20Sopenharmony_ci} 13808c2ecf20Sopenharmony_ci 13818c2ecf20Sopenharmony_cistatic void vhost_net_flush(struct vhost_net *n) 13828c2ecf20Sopenharmony_ci{ 13838c2ecf20Sopenharmony_ci vhost_net_flush_vq(n, VHOST_NET_VQ_TX); 13848c2ecf20Sopenharmony_ci vhost_net_flush_vq(n, VHOST_NET_VQ_RX); 13858c2ecf20Sopenharmony_ci if (n->vqs[VHOST_NET_VQ_TX].ubufs) { 13868c2ecf20Sopenharmony_ci mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 13878c2ecf20Sopenharmony_ci n->tx_flush = true; 13888c2ecf20Sopenharmony_ci mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 13898c2ecf20Sopenharmony_ci /* Wait for all lower device DMAs done. */ 13908c2ecf20Sopenharmony_ci vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); 13918c2ecf20Sopenharmony_ci mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 13928c2ecf20Sopenharmony_ci n->tx_flush = false; 13938c2ecf20Sopenharmony_ci atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); 13948c2ecf20Sopenharmony_ci mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 13958c2ecf20Sopenharmony_ci } 13968c2ecf20Sopenharmony_ci} 13978c2ecf20Sopenharmony_ci 13988c2ecf20Sopenharmony_cistatic int vhost_net_release(struct inode *inode, struct file *f) 13998c2ecf20Sopenharmony_ci{ 14008c2ecf20Sopenharmony_ci struct vhost_net *n = f->private_data; 14018c2ecf20Sopenharmony_ci struct socket *tx_sock; 14028c2ecf20Sopenharmony_ci struct socket *rx_sock; 14038c2ecf20Sopenharmony_ci 14048c2ecf20Sopenharmony_ci vhost_net_stop(n, &tx_sock, &rx_sock); 14058c2ecf20Sopenharmony_ci vhost_net_flush(n); 14068c2ecf20Sopenharmony_ci vhost_dev_stop(&n->dev); 14078c2ecf20Sopenharmony_ci vhost_dev_cleanup(&n->dev); 14088c2ecf20Sopenharmony_ci vhost_net_vq_reset(n); 14098c2ecf20Sopenharmony_ci if (tx_sock) 14108c2ecf20Sopenharmony_ci sockfd_put(tx_sock); 14118c2ecf20Sopenharmony_ci if (rx_sock) 14128c2ecf20Sopenharmony_ci sockfd_put(rx_sock); 14138c2ecf20Sopenharmony_ci /* Make sure no callbacks are outstanding */ 14148c2ecf20Sopenharmony_ci synchronize_rcu(); 14158c2ecf20Sopenharmony_ci /* We do an extra flush before freeing memory, 14168c2ecf20Sopenharmony_ci * since jobs can re-queue themselves. */ 14178c2ecf20Sopenharmony_ci vhost_net_flush(n); 14188c2ecf20Sopenharmony_ci kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); 14198c2ecf20Sopenharmony_ci kfree(n->vqs[VHOST_NET_VQ_TX].xdp); 14208c2ecf20Sopenharmony_ci kfree(n->dev.vqs); 14218c2ecf20Sopenharmony_ci if (n->page_frag.page) 14228c2ecf20Sopenharmony_ci __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); 14238c2ecf20Sopenharmony_ci kvfree(n); 14248c2ecf20Sopenharmony_ci return 0; 14258c2ecf20Sopenharmony_ci} 14268c2ecf20Sopenharmony_ci 14278c2ecf20Sopenharmony_cistatic struct socket *get_raw_socket(int fd) 14288c2ecf20Sopenharmony_ci{ 14298c2ecf20Sopenharmony_ci int r; 14308c2ecf20Sopenharmony_ci struct socket *sock = sockfd_lookup(fd, &r); 14318c2ecf20Sopenharmony_ci 14328c2ecf20Sopenharmony_ci if (!sock) 14338c2ecf20Sopenharmony_ci return ERR_PTR(-ENOTSOCK); 14348c2ecf20Sopenharmony_ci 14358c2ecf20Sopenharmony_ci /* Parameter checking */ 14368c2ecf20Sopenharmony_ci if (sock->sk->sk_type != SOCK_RAW) { 14378c2ecf20Sopenharmony_ci r = -ESOCKTNOSUPPORT; 14388c2ecf20Sopenharmony_ci goto err; 14398c2ecf20Sopenharmony_ci } 14408c2ecf20Sopenharmony_ci 14418c2ecf20Sopenharmony_ci if (sock->sk->sk_family != AF_PACKET) { 14428c2ecf20Sopenharmony_ci r = -EPFNOSUPPORT; 14438c2ecf20Sopenharmony_ci goto err; 14448c2ecf20Sopenharmony_ci } 14458c2ecf20Sopenharmony_ci return sock; 14468c2ecf20Sopenharmony_cierr: 14478c2ecf20Sopenharmony_ci sockfd_put(sock); 14488c2ecf20Sopenharmony_ci return ERR_PTR(r); 14498c2ecf20Sopenharmony_ci} 14508c2ecf20Sopenharmony_ci 14518c2ecf20Sopenharmony_cistatic struct ptr_ring *get_tap_ptr_ring(struct file *file) 14528c2ecf20Sopenharmony_ci{ 14538c2ecf20Sopenharmony_ci struct ptr_ring *ring; 14548c2ecf20Sopenharmony_ci ring = tun_get_tx_ring(file); 14558c2ecf20Sopenharmony_ci if (!IS_ERR(ring)) 14568c2ecf20Sopenharmony_ci goto out; 14578c2ecf20Sopenharmony_ci ring = tap_get_ptr_ring(file); 14588c2ecf20Sopenharmony_ci if (!IS_ERR(ring)) 14598c2ecf20Sopenharmony_ci goto out; 14608c2ecf20Sopenharmony_ci ring = NULL; 14618c2ecf20Sopenharmony_ciout: 14628c2ecf20Sopenharmony_ci return ring; 14638c2ecf20Sopenharmony_ci} 14648c2ecf20Sopenharmony_ci 14658c2ecf20Sopenharmony_cistatic struct socket *get_tap_socket(int fd) 14668c2ecf20Sopenharmony_ci{ 14678c2ecf20Sopenharmony_ci struct file *file = fget(fd); 14688c2ecf20Sopenharmony_ci struct socket *sock; 14698c2ecf20Sopenharmony_ci 14708c2ecf20Sopenharmony_ci if (!file) 14718c2ecf20Sopenharmony_ci return ERR_PTR(-EBADF); 14728c2ecf20Sopenharmony_ci sock = tun_get_socket(file); 14738c2ecf20Sopenharmony_ci if (!IS_ERR(sock)) 14748c2ecf20Sopenharmony_ci return sock; 14758c2ecf20Sopenharmony_ci sock = tap_get_socket(file); 14768c2ecf20Sopenharmony_ci if (IS_ERR(sock)) 14778c2ecf20Sopenharmony_ci fput(file); 14788c2ecf20Sopenharmony_ci return sock; 14798c2ecf20Sopenharmony_ci} 14808c2ecf20Sopenharmony_ci 14818c2ecf20Sopenharmony_cistatic struct socket *get_socket(int fd) 14828c2ecf20Sopenharmony_ci{ 14838c2ecf20Sopenharmony_ci struct socket *sock; 14848c2ecf20Sopenharmony_ci 14858c2ecf20Sopenharmony_ci /* special case to disable backend */ 14868c2ecf20Sopenharmony_ci if (fd == -1) 14878c2ecf20Sopenharmony_ci return NULL; 14888c2ecf20Sopenharmony_ci sock = get_raw_socket(fd); 14898c2ecf20Sopenharmony_ci if (!IS_ERR(sock)) 14908c2ecf20Sopenharmony_ci return sock; 14918c2ecf20Sopenharmony_ci sock = get_tap_socket(fd); 14928c2ecf20Sopenharmony_ci if (!IS_ERR(sock)) 14938c2ecf20Sopenharmony_ci return sock; 14948c2ecf20Sopenharmony_ci return ERR_PTR(-ENOTSOCK); 14958c2ecf20Sopenharmony_ci} 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_cistatic long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) 14988c2ecf20Sopenharmony_ci{ 14998c2ecf20Sopenharmony_ci struct socket *sock, *oldsock; 15008c2ecf20Sopenharmony_ci struct vhost_virtqueue *vq; 15018c2ecf20Sopenharmony_ci struct vhost_net_virtqueue *nvq; 15028c2ecf20Sopenharmony_ci struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; 15038c2ecf20Sopenharmony_ci int r; 15048c2ecf20Sopenharmony_ci 15058c2ecf20Sopenharmony_ci mutex_lock(&n->dev.mutex); 15068c2ecf20Sopenharmony_ci r = vhost_dev_check_owner(&n->dev); 15078c2ecf20Sopenharmony_ci if (r) 15088c2ecf20Sopenharmony_ci goto err; 15098c2ecf20Sopenharmony_ci 15108c2ecf20Sopenharmony_ci if (index >= VHOST_NET_VQ_MAX) { 15118c2ecf20Sopenharmony_ci r = -ENOBUFS; 15128c2ecf20Sopenharmony_ci goto err; 15138c2ecf20Sopenharmony_ci } 15148c2ecf20Sopenharmony_ci vq = &n->vqs[index].vq; 15158c2ecf20Sopenharmony_ci nvq = &n->vqs[index]; 15168c2ecf20Sopenharmony_ci mutex_lock(&vq->mutex); 15178c2ecf20Sopenharmony_ci 15188c2ecf20Sopenharmony_ci if (fd == -1) 15198c2ecf20Sopenharmony_ci vhost_clear_msg(&n->dev); 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_ci /* Verify that ring has been setup correctly. */ 15228c2ecf20Sopenharmony_ci if (!vhost_vq_access_ok(vq)) { 15238c2ecf20Sopenharmony_ci r = -EFAULT; 15248c2ecf20Sopenharmony_ci goto err_vq; 15258c2ecf20Sopenharmony_ci } 15268c2ecf20Sopenharmony_ci sock = get_socket(fd); 15278c2ecf20Sopenharmony_ci if (IS_ERR(sock)) { 15288c2ecf20Sopenharmony_ci r = PTR_ERR(sock); 15298c2ecf20Sopenharmony_ci goto err_vq; 15308c2ecf20Sopenharmony_ci } 15318c2ecf20Sopenharmony_ci 15328c2ecf20Sopenharmony_ci /* start polling new socket */ 15338c2ecf20Sopenharmony_ci oldsock = vhost_vq_get_backend(vq); 15348c2ecf20Sopenharmony_ci if (sock != oldsock) { 15358c2ecf20Sopenharmony_ci ubufs = vhost_net_ubuf_alloc(vq, 15368c2ecf20Sopenharmony_ci sock && vhost_sock_zcopy(sock)); 15378c2ecf20Sopenharmony_ci if (IS_ERR(ubufs)) { 15388c2ecf20Sopenharmony_ci r = PTR_ERR(ubufs); 15398c2ecf20Sopenharmony_ci goto err_ubufs; 15408c2ecf20Sopenharmony_ci } 15418c2ecf20Sopenharmony_ci 15428c2ecf20Sopenharmony_ci vhost_net_disable_vq(n, vq); 15438c2ecf20Sopenharmony_ci vhost_vq_set_backend(vq, sock); 15448c2ecf20Sopenharmony_ci vhost_net_buf_unproduce(nvq); 15458c2ecf20Sopenharmony_ci r = vhost_vq_init_access(vq); 15468c2ecf20Sopenharmony_ci if (r) 15478c2ecf20Sopenharmony_ci goto err_used; 15488c2ecf20Sopenharmony_ci r = vhost_net_enable_vq(n, vq); 15498c2ecf20Sopenharmony_ci if (r) 15508c2ecf20Sopenharmony_ci goto err_used; 15518c2ecf20Sopenharmony_ci if (index == VHOST_NET_VQ_RX) { 15528c2ecf20Sopenharmony_ci if (sock) 15538c2ecf20Sopenharmony_ci nvq->rx_ring = get_tap_ptr_ring(sock->file); 15548c2ecf20Sopenharmony_ci else 15558c2ecf20Sopenharmony_ci nvq->rx_ring = NULL; 15568c2ecf20Sopenharmony_ci } 15578c2ecf20Sopenharmony_ci 15588c2ecf20Sopenharmony_ci oldubufs = nvq->ubufs; 15598c2ecf20Sopenharmony_ci nvq->ubufs = ubufs; 15608c2ecf20Sopenharmony_ci 15618c2ecf20Sopenharmony_ci n->tx_packets = 0; 15628c2ecf20Sopenharmony_ci n->tx_zcopy_err = 0; 15638c2ecf20Sopenharmony_ci n->tx_flush = false; 15648c2ecf20Sopenharmony_ci } 15658c2ecf20Sopenharmony_ci 15668c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 15678c2ecf20Sopenharmony_ci 15688c2ecf20Sopenharmony_ci if (oldubufs) { 15698c2ecf20Sopenharmony_ci vhost_net_ubuf_put_wait_and_free(oldubufs); 15708c2ecf20Sopenharmony_ci mutex_lock(&vq->mutex); 15718c2ecf20Sopenharmony_ci vhost_zerocopy_signal_used(n, vq); 15728c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 15738c2ecf20Sopenharmony_ci } 15748c2ecf20Sopenharmony_ci 15758c2ecf20Sopenharmony_ci if (oldsock) { 15768c2ecf20Sopenharmony_ci vhost_net_flush_vq(n, index); 15778c2ecf20Sopenharmony_ci sockfd_put(oldsock); 15788c2ecf20Sopenharmony_ci } 15798c2ecf20Sopenharmony_ci 15808c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 15818c2ecf20Sopenharmony_ci return 0; 15828c2ecf20Sopenharmony_ci 15838c2ecf20Sopenharmony_cierr_used: 15848c2ecf20Sopenharmony_ci vhost_vq_set_backend(vq, oldsock); 15858c2ecf20Sopenharmony_ci vhost_net_enable_vq(n, vq); 15868c2ecf20Sopenharmony_ci if (ubufs) 15878c2ecf20Sopenharmony_ci vhost_net_ubuf_put_wait_and_free(ubufs); 15888c2ecf20Sopenharmony_cierr_ubufs: 15898c2ecf20Sopenharmony_ci if (sock) 15908c2ecf20Sopenharmony_ci sockfd_put(sock); 15918c2ecf20Sopenharmony_cierr_vq: 15928c2ecf20Sopenharmony_ci mutex_unlock(&vq->mutex); 15938c2ecf20Sopenharmony_cierr: 15948c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 15958c2ecf20Sopenharmony_ci return r; 15968c2ecf20Sopenharmony_ci} 15978c2ecf20Sopenharmony_ci 15988c2ecf20Sopenharmony_cistatic long vhost_net_reset_owner(struct vhost_net *n) 15998c2ecf20Sopenharmony_ci{ 16008c2ecf20Sopenharmony_ci struct socket *tx_sock = NULL; 16018c2ecf20Sopenharmony_ci struct socket *rx_sock = NULL; 16028c2ecf20Sopenharmony_ci long err; 16038c2ecf20Sopenharmony_ci struct vhost_iotlb *umem; 16048c2ecf20Sopenharmony_ci 16058c2ecf20Sopenharmony_ci mutex_lock(&n->dev.mutex); 16068c2ecf20Sopenharmony_ci err = vhost_dev_check_owner(&n->dev); 16078c2ecf20Sopenharmony_ci if (err) 16088c2ecf20Sopenharmony_ci goto done; 16098c2ecf20Sopenharmony_ci umem = vhost_dev_reset_owner_prepare(); 16108c2ecf20Sopenharmony_ci if (!umem) { 16118c2ecf20Sopenharmony_ci err = -ENOMEM; 16128c2ecf20Sopenharmony_ci goto done; 16138c2ecf20Sopenharmony_ci } 16148c2ecf20Sopenharmony_ci vhost_net_stop(n, &tx_sock, &rx_sock); 16158c2ecf20Sopenharmony_ci vhost_net_flush(n); 16168c2ecf20Sopenharmony_ci vhost_dev_stop(&n->dev); 16178c2ecf20Sopenharmony_ci vhost_dev_reset_owner(&n->dev, umem); 16188c2ecf20Sopenharmony_ci vhost_net_vq_reset(n); 16198c2ecf20Sopenharmony_cidone: 16208c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 16218c2ecf20Sopenharmony_ci if (tx_sock) 16228c2ecf20Sopenharmony_ci sockfd_put(tx_sock); 16238c2ecf20Sopenharmony_ci if (rx_sock) 16248c2ecf20Sopenharmony_ci sockfd_put(rx_sock); 16258c2ecf20Sopenharmony_ci return err; 16268c2ecf20Sopenharmony_ci} 16278c2ecf20Sopenharmony_ci 16288c2ecf20Sopenharmony_cistatic int vhost_net_set_features(struct vhost_net *n, u64 features) 16298c2ecf20Sopenharmony_ci{ 16308c2ecf20Sopenharmony_ci size_t vhost_hlen, sock_hlen, hdr_len; 16318c2ecf20Sopenharmony_ci int i; 16328c2ecf20Sopenharmony_ci 16338c2ecf20Sopenharmony_ci hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | 16348c2ecf20Sopenharmony_ci (1ULL << VIRTIO_F_VERSION_1))) ? 16358c2ecf20Sopenharmony_ci sizeof(struct virtio_net_hdr_mrg_rxbuf) : 16368c2ecf20Sopenharmony_ci sizeof(struct virtio_net_hdr); 16378c2ecf20Sopenharmony_ci if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { 16388c2ecf20Sopenharmony_ci /* vhost provides vnet_hdr */ 16398c2ecf20Sopenharmony_ci vhost_hlen = hdr_len; 16408c2ecf20Sopenharmony_ci sock_hlen = 0; 16418c2ecf20Sopenharmony_ci } else { 16428c2ecf20Sopenharmony_ci /* socket provides vnet_hdr */ 16438c2ecf20Sopenharmony_ci vhost_hlen = 0; 16448c2ecf20Sopenharmony_ci sock_hlen = hdr_len; 16458c2ecf20Sopenharmony_ci } 16468c2ecf20Sopenharmony_ci mutex_lock(&n->dev.mutex); 16478c2ecf20Sopenharmony_ci if ((features & (1 << VHOST_F_LOG_ALL)) && 16488c2ecf20Sopenharmony_ci !vhost_log_access_ok(&n->dev)) 16498c2ecf20Sopenharmony_ci goto out_unlock; 16508c2ecf20Sopenharmony_ci 16518c2ecf20Sopenharmony_ci if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { 16528c2ecf20Sopenharmony_ci if (vhost_init_device_iotlb(&n->dev, true)) 16538c2ecf20Sopenharmony_ci goto out_unlock; 16548c2ecf20Sopenharmony_ci } 16558c2ecf20Sopenharmony_ci 16568c2ecf20Sopenharmony_ci for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 16578c2ecf20Sopenharmony_ci mutex_lock(&n->vqs[i].vq.mutex); 16588c2ecf20Sopenharmony_ci n->vqs[i].vq.acked_features = features; 16598c2ecf20Sopenharmony_ci n->vqs[i].vhost_hlen = vhost_hlen; 16608c2ecf20Sopenharmony_ci n->vqs[i].sock_hlen = sock_hlen; 16618c2ecf20Sopenharmony_ci mutex_unlock(&n->vqs[i].vq.mutex); 16628c2ecf20Sopenharmony_ci } 16638c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 16648c2ecf20Sopenharmony_ci return 0; 16658c2ecf20Sopenharmony_ci 16668c2ecf20Sopenharmony_ciout_unlock: 16678c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 16688c2ecf20Sopenharmony_ci return -EFAULT; 16698c2ecf20Sopenharmony_ci} 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_cistatic long vhost_net_set_owner(struct vhost_net *n) 16728c2ecf20Sopenharmony_ci{ 16738c2ecf20Sopenharmony_ci int r; 16748c2ecf20Sopenharmony_ci 16758c2ecf20Sopenharmony_ci mutex_lock(&n->dev.mutex); 16768c2ecf20Sopenharmony_ci if (vhost_dev_has_owner(&n->dev)) { 16778c2ecf20Sopenharmony_ci r = -EBUSY; 16788c2ecf20Sopenharmony_ci goto out; 16798c2ecf20Sopenharmony_ci } 16808c2ecf20Sopenharmony_ci r = vhost_net_set_ubuf_info(n); 16818c2ecf20Sopenharmony_ci if (r) 16828c2ecf20Sopenharmony_ci goto out; 16838c2ecf20Sopenharmony_ci r = vhost_dev_set_owner(&n->dev); 16848c2ecf20Sopenharmony_ci if (r) 16858c2ecf20Sopenharmony_ci vhost_net_clear_ubuf_info(n); 16868c2ecf20Sopenharmony_ci vhost_net_flush(n); 16878c2ecf20Sopenharmony_ciout: 16888c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 16898c2ecf20Sopenharmony_ci return r; 16908c2ecf20Sopenharmony_ci} 16918c2ecf20Sopenharmony_ci 16928c2ecf20Sopenharmony_cistatic long vhost_net_ioctl(struct file *f, unsigned int ioctl, 16938c2ecf20Sopenharmony_ci unsigned long arg) 16948c2ecf20Sopenharmony_ci{ 16958c2ecf20Sopenharmony_ci struct vhost_net *n = f->private_data; 16968c2ecf20Sopenharmony_ci void __user *argp = (void __user *)arg; 16978c2ecf20Sopenharmony_ci u64 __user *featurep = argp; 16988c2ecf20Sopenharmony_ci struct vhost_vring_file backend; 16998c2ecf20Sopenharmony_ci u64 features; 17008c2ecf20Sopenharmony_ci int r; 17018c2ecf20Sopenharmony_ci 17028c2ecf20Sopenharmony_ci switch (ioctl) { 17038c2ecf20Sopenharmony_ci case VHOST_NET_SET_BACKEND: 17048c2ecf20Sopenharmony_ci if (copy_from_user(&backend, argp, sizeof backend)) 17058c2ecf20Sopenharmony_ci return -EFAULT; 17068c2ecf20Sopenharmony_ci return vhost_net_set_backend(n, backend.index, backend.fd); 17078c2ecf20Sopenharmony_ci case VHOST_GET_FEATURES: 17088c2ecf20Sopenharmony_ci features = VHOST_NET_FEATURES; 17098c2ecf20Sopenharmony_ci if (copy_to_user(featurep, &features, sizeof features)) 17108c2ecf20Sopenharmony_ci return -EFAULT; 17118c2ecf20Sopenharmony_ci return 0; 17128c2ecf20Sopenharmony_ci case VHOST_SET_FEATURES: 17138c2ecf20Sopenharmony_ci if (copy_from_user(&features, featurep, sizeof features)) 17148c2ecf20Sopenharmony_ci return -EFAULT; 17158c2ecf20Sopenharmony_ci if (features & ~VHOST_NET_FEATURES) 17168c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 17178c2ecf20Sopenharmony_ci return vhost_net_set_features(n, features); 17188c2ecf20Sopenharmony_ci case VHOST_GET_BACKEND_FEATURES: 17198c2ecf20Sopenharmony_ci features = VHOST_NET_BACKEND_FEATURES; 17208c2ecf20Sopenharmony_ci if (copy_to_user(featurep, &features, sizeof(features))) 17218c2ecf20Sopenharmony_ci return -EFAULT; 17228c2ecf20Sopenharmony_ci return 0; 17238c2ecf20Sopenharmony_ci case VHOST_SET_BACKEND_FEATURES: 17248c2ecf20Sopenharmony_ci if (copy_from_user(&features, featurep, sizeof(features))) 17258c2ecf20Sopenharmony_ci return -EFAULT; 17268c2ecf20Sopenharmony_ci if (features & ~VHOST_NET_BACKEND_FEATURES) 17278c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 17288c2ecf20Sopenharmony_ci vhost_set_backend_features(&n->dev, features); 17298c2ecf20Sopenharmony_ci return 0; 17308c2ecf20Sopenharmony_ci case VHOST_RESET_OWNER: 17318c2ecf20Sopenharmony_ci return vhost_net_reset_owner(n); 17328c2ecf20Sopenharmony_ci case VHOST_SET_OWNER: 17338c2ecf20Sopenharmony_ci return vhost_net_set_owner(n); 17348c2ecf20Sopenharmony_ci default: 17358c2ecf20Sopenharmony_ci mutex_lock(&n->dev.mutex); 17368c2ecf20Sopenharmony_ci r = vhost_dev_ioctl(&n->dev, ioctl, argp); 17378c2ecf20Sopenharmony_ci if (r == -ENOIOCTLCMD) 17388c2ecf20Sopenharmony_ci r = vhost_vring_ioctl(&n->dev, ioctl, argp); 17398c2ecf20Sopenharmony_ci else 17408c2ecf20Sopenharmony_ci vhost_net_flush(n); 17418c2ecf20Sopenharmony_ci mutex_unlock(&n->dev.mutex); 17428c2ecf20Sopenharmony_ci return r; 17438c2ecf20Sopenharmony_ci } 17448c2ecf20Sopenharmony_ci} 17458c2ecf20Sopenharmony_ci 17468c2ecf20Sopenharmony_cistatic ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) 17478c2ecf20Sopenharmony_ci{ 17488c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 17498c2ecf20Sopenharmony_ci struct vhost_net *n = file->private_data; 17508c2ecf20Sopenharmony_ci struct vhost_dev *dev = &n->dev; 17518c2ecf20Sopenharmony_ci int noblock = file->f_flags & O_NONBLOCK; 17528c2ecf20Sopenharmony_ci 17538c2ecf20Sopenharmony_ci return vhost_chr_read_iter(dev, to, noblock); 17548c2ecf20Sopenharmony_ci} 17558c2ecf20Sopenharmony_ci 17568c2ecf20Sopenharmony_cistatic ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, 17578c2ecf20Sopenharmony_ci struct iov_iter *from) 17588c2ecf20Sopenharmony_ci{ 17598c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 17608c2ecf20Sopenharmony_ci struct vhost_net *n = file->private_data; 17618c2ecf20Sopenharmony_ci struct vhost_dev *dev = &n->dev; 17628c2ecf20Sopenharmony_ci 17638c2ecf20Sopenharmony_ci return vhost_chr_write_iter(dev, from); 17648c2ecf20Sopenharmony_ci} 17658c2ecf20Sopenharmony_ci 17668c2ecf20Sopenharmony_cistatic __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) 17678c2ecf20Sopenharmony_ci{ 17688c2ecf20Sopenharmony_ci struct vhost_net *n = file->private_data; 17698c2ecf20Sopenharmony_ci struct vhost_dev *dev = &n->dev; 17708c2ecf20Sopenharmony_ci 17718c2ecf20Sopenharmony_ci return vhost_chr_poll(file, dev, wait); 17728c2ecf20Sopenharmony_ci} 17738c2ecf20Sopenharmony_ci 17748c2ecf20Sopenharmony_cistatic const struct file_operations vhost_net_fops = { 17758c2ecf20Sopenharmony_ci .owner = THIS_MODULE, 17768c2ecf20Sopenharmony_ci .release = vhost_net_release, 17778c2ecf20Sopenharmony_ci .read_iter = vhost_net_chr_read_iter, 17788c2ecf20Sopenharmony_ci .write_iter = vhost_net_chr_write_iter, 17798c2ecf20Sopenharmony_ci .poll = vhost_net_chr_poll, 17808c2ecf20Sopenharmony_ci .unlocked_ioctl = vhost_net_ioctl, 17818c2ecf20Sopenharmony_ci .compat_ioctl = compat_ptr_ioctl, 17828c2ecf20Sopenharmony_ci .open = vhost_net_open, 17838c2ecf20Sopenharmony_ci .llseek = noop_llseek, 17848c2ecf20Sopenharmony_ci}; 17858c2ecf20Sopenharmony_ci 17868c2ecf20Sopenharmony_cistatic struct miscdevice vhost_net_misc = { 17878c2ecf20Sopenharmony_ci .minor = VHOST_NET_MINOR, 17888c2ecf20Sopenharmony_ci .name = "vhost-net", 17898c2ecf20Sopenharmony_ci .fops = &vhost_net_fops, 17908c2ecf20Sopenharmony_ci}; 17918c2ecf20Sopenharmony_ci 17928c2ecf20Sopenharmony_cistatic int vhost_net_init(void) 17938c2ecf20Sopenharmony_ci{ 17948c2ecf20Sopenharmony_ci if (experimental_zcopytx) 17958c2ecf20Sopenharmony_ci vhost_net_enable_zcopy(VHOST_NET_VQ_TX); 17968c2ecf20Sopenharmony_ci return misc_register(&vhost_net_misc); 17978c2ecf20Sopenharmony_ci} 17988c2ecf20Sopenharmony_cimodule_init(vhost_net_init); 17998c2ecf20Sopenharmony_ci 18008c2ecf20Sopenharmony_cistatic void vhost_net_exit(void) 18018c2ecf20Sopenharmony_ci{ 18028c2ecf20Sopenharmony_ci misc_deregister(&vhost_net_misc); 18038c2ecf20Sopenharmony_ci} 18048c2ecf20Sopenharmony_cimodule_exit(vhost_net_exit); 18058c2ecf20Sopenharmony_ci 18068c2ecf20Sopenharmony_ciMODULE_VERSION("0.0.1"); 18078c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL v2"); 18088c2ecf20Sopenharmony_ciMODULE_AUTHOR("Michael S. Tsirkin"); 18098c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("Host kernel accelerator for virtio net"); 18108c2ecf20Sopenharmony_ciMODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); 18118c2ecf20Sopenharmony_ciMODULE_ALIAS("devname:vhost-net"); 1812