xref: /kernel/linux/linux-5.10/drivers/vhost/net.c (revision 8c2ecf20)
18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/* Copyright (C) 2009 Red Hat, Inc.
38c2ecf20Sopenharmony_ci * Author: Michael S. Tsirkin <mst@redhat.com>
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * virtio-net server in host kernel.
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#include <linux/compat.h>
98c2ecf20Sopenharmony_ci#include <linux/eventfd.h>
108c2ecf20Sopenharmony_ci#include <linux/vhost.h>
118c2ecf20Sopenharmony_ci#include <linux/virtio_net.h>
128c2ecf20Sopenharmony_ci#include <linux/miscdevice.h>
138c2ecf20Sopenharmony_ci#include <linux/module.h>
148c2ecf20Sopenharmony_ci#include <linux/moduleparam.h>
158c2ecf20Sopenharmony_ci#include <linux/mutex.h>
168c2ecf20Sopenharmony_ci#include <linux/workqueue.h>
178c2ecf20Sopenharmony_ci#include <linux/file.h>
188c2ecf20Sopenharmony_ci#include <linux/slab.h>
198c2ecf20Sopenharmony_ci#include <linux/sched/clock.h>
208c2ecf20Sopenharmony_ci#include <linux/sched/signal.h>
218c2ecf20Sopenharmony_ci#include <linux/vmalloc.h>
228c2ecf20Sopenharmony_ci
238c2ecf20Sopenharmony_ci#include <linux/net.h>
248c2ecf20Sopenharmony_ci#include <linux/if_packet.h>
258c2ecf20Sopenharmony_ci#include <linux/if_arp.h>
268c2ecf20Sopenharmony_ci#include <linux/if_tun.h>
278c2ecf20Sopenharmony_ci#include <linux/if_macvlan.h>
288c2ecf20Sopenharmony_ci#include <linux/if_tap.h>
298c2ecf20Sopenharmony_ci#include <linux/if_vlan.h>
308c2ecf20Sopenharmony_ci#include <linux/skb_array.h>
318c2ecf20Sopenharmony_ci#include <linux/skbuff.h>
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_ci#include <net/sock.h>
348c2ecf20Sopenharmony_ci#include <net/xdp.h>
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci#include "vhost.h"
378c2ecf20Sopenharmony_ci
388c2ecf20Sopenharmony_cistatic int experimental_zcopytx = 0;
398c2ecf20Sopenharmony_cimodule_param(experimental_zcopytx, int, 0444);
408c2ecf20Sopenharmony_ciMODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
418c2ecf20Sopenharmony_ci		                       " 1 -Enable; 0 - Disable");
428c2ecf20Sopenharmony_ci
/* Byte budget for one pass of the job: after this many bytes the job is
 * requeued, so a single virtqueue cannot starve the others.
 */
#define VHOST_NET_WEIGHT	0x80000

/* Packet budget for one pass of the job: after this many packets the job is
 * requeued, so a virtqueue carrying small packets cannot starve the others.
 */
#define VHOST_NET_PKT_WEIGHT	256

/* Upper bound on TX used buffers that may have zerocopy still outstanding */
#define VHOST_MAX_PEND		128
#define VHOST_GOODCOPY_LEN	256
568c2ecf20Sopenharmony_ci
/*
 * On transmit the "len" field of a used buffer carries no information, so
 * the zerocopy TX path reuses it to record the DMA state of each buffer.
 * The states are ordered: anything at or above DONE counts as finished.
 */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)	/* lower device DMA failed */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)	/* lower device DMA done */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)	/* lower device DMA in progress */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)	/* buffer unused */

/* True for both the DONE and the FAILED state. */
#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_cienum {
738c2ecf20Sopenharmony_ci	VHOST_NET_FEATURES = VHOST_FEATURES |
748c2ecf20Sopenharmony_ci			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
758c2ecf20Sopenharmony_ci			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
768c2ecf20Sopenharmony_ci			 (1ULL << VIRTIO_F_ACCESS_PLATFORM)
778c2ecf20Sopenharmony_ci};
788c2ecf20Sopenharmony_ci
798c2ecf20Sopenharmony_cienum {
808c2ecf20Sopenharmony_ci	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
818c2ecf20Sopenharmony_ci};
828c2ecf20Sopenharmony_ci
/* Indices of the per-device virtqueues; VHOST_NET_VQ_MAX is their count and
 * sizes the vqs[] and poll[] arrays of struct vhost_net.
 */
enum {
	VHOST_NET_VQ_RX,	/* 0 */
	VHOST_NET_VQ_TX,	/* 1 */
	VHOST_NET_VQ_MAX,	/* 2 */
};
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_cistruct vhost_net_ubuf_ref {
908c2ecf20Sopenharmony_ci	/* refcount follows semantics similar to kref:
918c2ecf20Sopenharmony_ci	 *  0: object is released
928c2ecf20Sopenharmony_ci	 *  1: no outstanding ubufs
938c2ecf20Sopenharmony_ci	 * >1: outstanding ubufs
948c2ecf20Sopenharmony_ci	 */
958c2ecf20Sopenharmony_ci	atomic_t refcount;
968c2ecf20Sopenharmony_ci	wait_queue_head_t wait;
978c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq;
988c2ecf20Sopenharmony_ci};
998c2ecf20Sopenharmony_ci
/* Number of entries pulled out of the rx ptr_ring in one go */
#define VHOST_NET_BATCH 64

/* Local batch of pointers taken from the rx ring.  Entries in
 * queue[head .. tail - 1] are still unconsumed; head == tail means empty.
 */
struct vhost_net_buf {
	void **queue;
	int tail;	/* one past the last valid entry */
	int head;	/* next entry to hand out */
};
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_cistruct vhost_net_virtqueue {
1088c2ecf20Sopenharmony_ci	struct vhost_virtqueue vq;
1098c2ecf20Sopenharmony_ci	size_t vhost_hlen;
1108c2ecf20Sopenharmony_ci	size_t sock_hlen;
1118c2ecf20Sopenharmony_ci	/* vhost zerocopy support fields below: */
1128c2ecf20Sopenharmony_ci	/* last used idx for outstanding DMA zerocopy buffers */
1138c2ecf20Sopenharmony_ci	int upend_idx;
1148c2ecf20Sopenharmony_ci	/* For TX, first used idx for DMA done zerocopy buffers
1158c2ecf20Sopenharmony_ci	 * For RX, number of batched heads
1168c2ecf20Sopenharmony_ci	 */
1178c2ecf20Sopenharmony_ci	int done_idx;
1188c2ecf20Sopenharmony_ci	/* Number of XDP frames batched */
1198c2ecf20Sopenharmony_ci	int batched_xdp;
1208c2ecf20Sopenharmony_ci	/* an array of userspace buffers info */
1218c2ecf20Sopenharmony_ci	struct ubuf_info *ubuf_info;
1228c2ecf20Sopenharmony_ci	/* Reference counting for outstanding ubufs.
1238c2ecf20Sopenharmony_ci	 * Protected by vq mutex. Writers must also take device mutex. */
1248c2ecf20Sopenharmony_ci	struct vhost_net_ubuf_ref *ubufs;
1258c2ecf20Sopenharmony_ci	struct ptr_ring *rx_ring;
1268c2ecf20Sopenharmony_ci	struct vhost_net_buf rxq;
1278c2ecf20Sopenharmony_ci	/* Batched XDP buffs */
1288c2ecf20Sopenharmony_ci	struct xdp_buff *xdp;
1298c2ecf20Sopenharmony_ci};
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_cistruct vhost_net {
1328c2ecf20Sopenharmony_ci	struct vhost_dev dev;
1338c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
1348c2ecf20Sopenharmony_ci	struct vhost_poll poll[VHOST_NET_VQ_MAX];
1358c2ecf20Sopenharmony_ci	/* Number of TX recently submitted.
1368c2ecf20Sopenharmony_ci	 * Protected by tx vq lock. */
1378c2ecf20Sopenharmony_ci	unsigned tx_packets;
1388c2ecf20Sopenharmony_ci	/* Number of times zerocopy TX recently failed.
1398c2ecf20Sopenharmony_ci	 * Protected by tx vq lock. */
1408c2ecf20Sopenharmony_ci	unsigned tx_zcopy_err;
1418c2ecf20Sopenharmony_ci	/* Flush in progress. Protected by tx vq lock. */
1428c2ecf20Sopenharmony_ci	bool tx_flush;
1438c2ecf20Sopenharmony_ci	/* Private page frag */
1448c2ecf20Sopenharmony_ci	struct page_frag page_frag;
1458c2ecf20Sopenharmony_ci	/* Refcount bias of page frag */
1468c2ecf20Sopenharmony_ci	int refcnt_bias;
1478c2ecf20Sopenharmony_ci};
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_cistatic unsigned vhost_net_zcopy_mask __read_mostly;
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_cistatic void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
1528c2ecf20Sopenharmony_ci{
1538c2ecf20Sopenharmony_ci	if (rxq->tail != rxq->head)
1548c2ecf20Sopenharmony_ci		return rxq->queue[rxq->head];
1558c2ecf20Sopenharmony_ci	else
1568c2ecf20Sopenharmony_ci		return NULL;
1578c2ecf20Sopenharmony_ci}
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_cistatic int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
1608c2ecf20Sopenharmony_ci{
1618c2ecf20Sopenharmony_ci	return rxq->tail - rxq->head;
1628c2ecf20Sopenharmony_ci}
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_cistatic int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
1658c2ecf20Sopenharmony_ci{
1668c2ecf20Sopenharmony_ci	return rxq->tail == rxq->head;
1678c2ecf20Sopenharmony_ci}
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_cistatic void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
1708c2ecf20Sopenharmony_ci{
1718c2ecf20Sopenharmony_ci	void *ret = vhost_net_buf_get_ptr(rxq);
1728c2ecf20Sopenharmony_ci	++rxq->head;
1738c2ecf20Sopenharmony_ci	return ret;
1748c2ecf20Sopenharmony_ci}
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_cistatic int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
1778c2ecf20Sopenharmony_ci{
1788c2ecf20Sopenharmony_ci	struct vhost_net_buf *rxq = &nvq->rxq;
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci	rxq->head = 0;
1818c2ecf20Sopenharmony_ci	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
1828c2ecf20Sopenharmony_ci					      VHOST_NET_BATCH);
1838c2ecf20Sopenharmony_ci	return rxq->tail;
1848c2ecf20Sopenharmony_ci}
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_cistatic void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
1878c2ecf20Sopenharmony_ci{
1888c2ecf20Sopenharmony_ci	struct vhost_net_buf *rxq = &nvq->rxq;
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
1918c2ecf20Sopenharmony_ci		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
1928c2ecf20Sopenharmony_ci				   vhost_net_buf_get_size(rxq),
1938c2ecf20Sopenharmony_ci				   tun_ptr_free);
1948c2ecf20Sopenharmony_ci		rxq->head = rxq->tail = 0;
1958c2ecf20Sopenharmony_ci	}
1968c2ecf20Sopenharmony_ci}
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_cistatic int vhost_net_buf_peek_len(void *ptr)
1998c2ecf20Sopenharmony_ci{
2008c2ecf20Sopenharmony_ci	if (tun_is_xdp_frame(ptr)) {
2018c2ecf20Sopenharmony_ci		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci		return xdpf->len;
2048c2ecf20Sopenharmony_ci	}
2058c2ecf20Sopenharmony_ci
2068c2ecf20Sopenharmony_ci	return __skb_array_len_with_tag(ptr);
2078c2ecf20Sopenharmony_ci}
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_cistatic int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
2108c2ecf20Sopenharmony_ci{
2118c2ecf20Sopenharmony_ci	struct vhost_net_buf *rxq = &nvq->rxq;
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	if (!vhost_net_buf_is_empty(rxq))
2148c2ecf20Sopenharmony_ci		goto out;
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	if (!vhost_net_buf_produce(nvq))
2178c2ecf20Sopenharmony_ci		return 0;
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_ciout:
2208c2ecf20Sopenharmony_ci	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
2218c2ecf20Sopenharmony_ci}
2228c2ecf20Sopenharmony_ci
2238c2ecf20Sopenharmony_cistatic void vhost_net_buf_init(struct vhost_net_buf *rxq)
2248c2ecf20Sopenharmony_ci{
2258c2ecf20Sopenharmony_ci	rxq->head = rxq->tail = 0;
2268c2ecf20Sopenharmony_ci}
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_cistatic void vhost_net_enable_zcopy(int vq)
2298c2ecf20Sopenharmony_ci{
2308c2ecf20Sopenharmony_ci	vhost_net_zcopy_mask |= 0x1 << vq;
2318c2ecf20Sopenharmony_ci}
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_cistatic struct vhost_net_ubuf_ref *
2348c2ecf20Sopenharmony_civhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
2358c2ecf20Sopenharmony_ci{
2368c2ecf20Sopenharmony_ci	struct vhost_net_ubuf_ref *ubufs;
2378c2ecf20Sopenharmony_ci	/* No zero copy backend? Nothing to count. */
2388c2ecf20Sopenharmony_ci	if (!zcopy)
2398c2ecf20Sopenharmony_ci		return NULL;
2408c2ecf20Sopenharmony_ci	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
2418c2ecf20Sopenharmony_ci	if (!ubufs)
2428c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
2438c2ecf20Sopenharmony_ci	atomic_set(&ubufs->refcount, 1);
2448c2ecf20Sopenharmony_ci	init_waitqueue_head(&ubufs->wait);
2458c2ecf20Sopenharmony_ci	ubufs->vq = vq;
2468c2ecf20Sopenharmony_ci	return ubufs;
2478c2ecf20Sopenharmony_ci}
2488c2ecf20Sopenharmony_ci
2498c2ecf20Sopenharmony_cistatic int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
2508c2ecf20Sopenharmony_ci{
2518c2ecf20Sopenharmony_ci	int r = atomic_sub_return(1, &ubufs->refcount);
2528c2ecf20Sopenharmony_ci	if (unlikely(!r))
2538c2ecf20Sopenharmony_ci		wake_up(&ubufs->wait);
2548c2ecf20Sopenharmony_ci	return r;
2558c2ecf20Sopenharmony_ci}
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_cistatic void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
2588c2ecf20Sopenharmony_ci{
2598c2ecf20Sopenharmony_ci	vhost_net_ubuf_put(ubufs);
2608c2ecf20Sopenharmony_ci	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
2618c2ecf20Sopenharmony_ci}
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_cistatic void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
2648c2ecf20Sopenharmony_ci{
2658c2ecf20Sopenharmony_ci	vhost_net_ubuf_put_and_wait(ubufs);
2668c2ecf20Sopenharmony_ci	kfree(ubufs);
2678c2ecf20Sopenharmony_ci}
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_cistatic void vhost_net_clear_ubuf_info(struct vhost_net *n)
2708c2ecf20Sopenharmony_ci{
2718c2ecf20Sopenharmony_ci	int i;
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ci	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
2748c2ecf20Sopenharmony_ci		kfree(n->vqs[i].ubuf_info);
2758c2ecf20Sopenharmony_ci		n->vqs[i].ubuf_info = NULL;
2768c2ecf20Sopenharmony_ci	}
2778c2ecf20Sopenharmony_ci}
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_cistatic int vhost_net_set_ubuf_info(struct vhost_net *n)
2808c2ecf20Sopenharmony_ci{
2818c2ecf20Sopenharmony_ci	bool zcopy;
2828c2ecf20Sopenharmony_ci	int i;
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
2858c2ecf20Sopenharmony_ci		zcopy = vhost_net_zcopy_mask & (0x1 << i);
2868c2ecf20Sopenharmony_ci		if (!zcopy)
2878c2ecf20Sopenharmony_ci			continue;
2888c2ecf20Sopenharmony_ci		n->vqs[i].ubuf_info =
2898c2ecf20Sopenharmony_ci			kmalloc_array(UIO_MAXIOV,
2908c2ecf20Sopenharmony_ci				      sizeof(*n->vqs[i].ubuf_info),
2918c2ecf20Sopenharmony_ci				      GFP_KERNEL);
2928c2ecf20Sopenharmony_ci		if  (!n->vqs[i].ubuf_info)
2938c2ecf20Sopenharmony_ci			goto err;
2948c2ecf20Sopenharmony_ci	}
2958c2ecf20Sopenharmony_ci	return 0;
2968c2ecf20Sopenharmony_ci
2978c2ecf20Sopenharmony_cierr:
2988c2ecf20Sopenharmony_ci	vhost_net_clear_ubuf_info(n);
2998c2ecf20Sopenharmony_ci	return -ENOMEM;
3008c2ecf20Sopenharmony_ci}
3018c2ecf20Sopenharmony_ci
3028c2ecf20Sopenharmony_cistatic void vhost_net_vq_reset(struct vhost_net *n)
3038c2ecf20Sopenharmony_ci{
3048c2ecf20Sopenharmony_ci	int i;
3058c2ecf20Sopenharmony_ci
3068c2ecf20Sopenharmony_ci	vhost_net_clear_ubuf_info(n);
3078c2ecf20Sopenharmony_ci
3088c2ecf20Sopenharmony_ci	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
3098c2ecf20Sopenharmony_ci		n->vqs[i].done_idx = 0;
3108c2ecf20Sopenharmony_ci		n->vqs[i].upend_idx = 0;
3118c2ecf20Sopenharmony_ci		n->vqs[i].ubufs = NULL;
3128c2ecf20Sopenharmony_ci		n->vqs[i].vhost_hlen = 0;
3138c2ecf20Sopenharmony_ci		n->vqs[i].sock_hlen = 0;
3148c2ecf20Sopenharmony_ci		vhost_net_buf_init(&n->vqs[i].rxq);
3158c2ecf20Sopenharmony_ci	}
3168c2ecf20Sopenharmony_ci
3178c2ecf20Sopenharmony_ci}
3188c2ecf20Sopenharmony_ci
3198c2ecf20Sopenharmony_cistatic void vhost_net_tx_packet(struct vhost_net *net)
3208c2ecf20Sopenharmony_ci{
3218c2ecf20Sopenharmony_ci	++net->tx_packets;
3228c2ecf20Sopenharmony_ci	if (net->tx_packets < 1024)
3238c2ecf20Sopenharmony_ci		return;
3248c2ecf20Sopenharmony_ci	net->tx_packets = 0;
3258c2ecf20Sopenharmony_ci	net->tx_zcopy_err = 0;
3268c2ecf20Sopenharmony_ci}
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_cistatic void vhost_net_tx_err(struct vhost_net *net)
3298c2ecf20Sopenharmony_ci{
3308c2ecf20Sopenharmony_ci	++net->tx_zcopy_err;
3318c2ecf20Sopenharmony_ci}
3328c2ecf20Sopenharmony_ci
3338c2ecf20Sopenharmony_cistatic bool vhost_net_tx_select_zcopy(struct vhost_net *net)
3348c2ecf20Sopenharmony_ci{
3358c2ecf20Sopenharmony_ci	/* TX flush waits for outstanding DMAs to be done.
3368c2ecf20Sopenharmony_ci	 * Don't start new DMAs.
3378c2ecf20Sopenharmony_ci	 */
3388c2ecf20Sopenharmony_ci	return !net->tx_flush &&
3398c2ecf20Sopenharmony_ci		net->tx_packets / 64 >= net->tx_zcopy_err;
3408c2ecf20Sopenharmony_ci}
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_cistatic bool vhost_sock_zcopy(struct socket *sock)
3438c2ecf20Sopenharmony_ci{
3448c2ecf20Sopenharmony_ci	return unlikely(experimental_zcopytx) &&
3458c2ecf20Sopenharmony_ci		sock_flag(sock->sk, SOCK_ZEROCOPY);
3468c2ecf20Sopenharmony_ci}
3478c2ecf20Sopenharmony_ci
3488c2ecf20Sopenharmony_cistatic bool vhost_sock_xdp(struct socket *sock)
3498c2ecf20Sopenharmony_ci{
3508c2ecf20Sopenharmony_ci	return sock_flag(sock->sk, SOCK_XDP);
3518c2ecf20Sopenharmony_ci}
3528c2ecf20Sopenharmony_ci
3538c2ecf20Sopenharmony_ci/* In case of DMA done not in order in lower device driver for some reason.
3548c2ecf20Sopenharmony_ci * upend_idx is used to track end of used idx, done_idx is used to track head
3558c2ecf20Sopenharmony_ci * of used idx. Once lower device DMA done contiguously, we will signal KVM
3568c2ecf20Sopenharmony_ci * guest used idx.
3578c2ecf20Sopenharmony_ci */
3588c2ecf20Sopenharmony_cistatic void vhost_zerocopy_signal_used(struct vhost_net *net,
3598c2ecf20Sopenharmony_ci				       struct vhost_virtqueue *vq)
3608c2ecf20Sopenharmony_ci{
3618c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq =
3628c2ecf20Sopenharmony_ci		container_of(vq, struct vhost_net_virtqueue, vq);
3638c2ecf20Sopenharmony_ci	int i, add;
3648c2ecf20Sopenharmony_ci	int j = 0;
3658c2ecf20Sopenharmony_ci
3668c2ecf20Sopenharmony_ci	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
3678c2ecf20Sopenharmony_ci		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
3688c2ecf20Sopenharmony_ci			vhost_net_tx_err(net);
3698c2ecf20Sopenharmony_ci		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
3708c2ecf20Sopenharmony_ci			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
3718c2ecf20Sopenharmony_ci			++j;
3728c2ecf20Sopenharmony_ci		} else
3738c2ecf20Sopenharmony_ci			break;
3748c2ecf20Sopenharmony_ci	}
3758c2ecf20Sopenharmony_ci	while (j) {
3768c2ecf20Sopenharmony_ci		add = min(UIO_MAXIOV - nvq->done_idx, j);
3778c2ecf20Sopenharmony_ci		vhost_add_used_and_signal_n(vq->dev, vq,
3788c2ecf20Sopenharmony_ci					    &vq->heads[nvq->done_idx], add);
3798c2ecf20Sopenharmony_ci		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
3808c2ecf20Sopenharmony_ci		j -= add;
3818c2ecf20Sopenharmony_ci	}
3828c2ecf20Sopenharmony_ci}
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_cistatic void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
3858c2ecf20Sopenharmony_ci{
3868c2ecf20Sopenharmony_ci	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
3878c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = ubufs->vq;
3888c2ecf20Sopenharmony_ci	int cnt;
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci	rcu_read_lock_bh();
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	/* set len to mark this desc buffers done DMA */
3938c2ecf20Sopenharmony_ci	vq->heads[ubuf->desc].len = success ?
3948c2ecf20Sopenharmony_ci		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
3958c2ecf20Sopenharmony_ci	cnt = vhost_net_ubuf_put(ubufs);
3968c2ecf20Sopenharmony_ci
3978c2ecf20Sopenharmony_ci	/*
3988c2ecf20Sopenharmony_ci	 * Trigger polling thread if guest stopped submitting new buffers:
3998c2ecf20Sopenharmony_ci	 * in this case, the refcount after decrement will eventually reach 1.
4008c2ecf20Sopenharmony_ci	 * We also trigger polling periodically after each 16 packets
4018c2ecf20Sopenharmony_ci	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
4028c2ecf20Sopenharmony_ci	 * less than 10% of times).
4038c2ecf20Sopenharmony_ci	 */
4048c2ecf20Sopenharmony_ci	if (cnt <= 1 || !(cnt % 16))
4058c2ecf20Sopenharmony_ci		vhost_poll_queue(&vq->poll);
4068c2ecf20Sopenharmony_ci
4078c2ecf20Sopenharmony_ci	rcu_read_unlock_bh();
4088c2ecf20Sopenharmony_ci}
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_cistatic inline unsigned long busy_clock(void)
4118c2ecf20Sopenharmony_ci{
4128c2ecf20Sopenharmony_ci	return local_clock() >> 10;
4138c2ecf20Sopenharmony_ci}
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_cistatic bool vhost_can_busy_poll(unsigned long endtime)
4168c2ecf20Sopenharmony_ci{
4178c2ecf20Sopenharmony_ci	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
4188c2ecf20Sopenharmony_ci		      !signal_pending(current));
4198c2ecf20Sopenharmony_ci}
4208c2ecf20Sopenharmony_ci
4218c2ecf20Sopenharmony_cistatic void vhost_net_disable_vq(struct vhost_net *n,
4228c2ecf20Sopenharmony_ci				 struct vhost_virtqueue *vq)
4238c2ecf20Sopenharmony_ci{
4248c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq =
4258c2ecf20Sopenharmony_ci		container_of(vq, struct vhost_net_virtqueue, vq);
4268c2ecf20Sopenharmony_ci	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
4278c2ecf20Sopenharmony_ci	if (!vhost_vq_get_backend(vq))
4288c2ecf20Sopenharmony_ci		return;
4298c2ecf20Sopenharmony_ci	vhost_poll_stop(poll);
4308c2ecf20Sopenharmony_ci}
4318c2ecf20Sopenharmony_ci
4328c2ecf20Sopenharmony_cistatic int vhost_net_enable_vq(struct vhost_net *n,
4338c2ecf20Sopenharmony_ci				struct vhost_virtqueue *vq)
4348c2ecf20Sopenharmony_ci{
4358c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq =
4368c2ecf20Sopenharmony_ci		container_of(vq, struct vhost_net_virtqueue, vq);
4378c2ecf20Sopenharmony_ci	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
4388c2ecf20Sopenharmony_ci	struct socket *sock;
4398c2ecf20Sopenharmony_ci
4408c2ecf20Sopenharmony_ci	sock = vhost_vq_get_backend(vq);
4418c2ecf20Sopenharmony_ci	if (!sock)
4428c2ecf20Sopenharmony_ci		return 0;
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	return vhost_poll_start(poll, sock->file);
4458c2ecf20Sopenharmony_ci}
4468c2ecf20Sopenharmony_ci
4478c2ecf20Sopenharmony_cistatic void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
4488c2ecf20Sopenharmony_ci{
4498c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
4508c2ecf20Sopenharmony_ci	struct vhost_dev *dev = vq->dev;
4518c2ecf20Sopenharmony_ci
4528c2ecf20Sopenharmony_ci	if (!nvq->done_idx)
4538c2ecf20Sopenharmony_ci		return;
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
4568c2ecf20Sopenharmony_ci	nvq->done_idx = 0;
4578c2ecf20Sopenharmony_ci}
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_cistatic void vhost_tx_batch(struct vhost_net *net,
4608c2ecf20Sopenharmony_ci			   struct vhost_net_virtqueue *nvq,
4618c2ecf20Sopenharmony_ci			   struct socket *sock,
4628c2ecf20Sopenharmony_ci			   struct msghdr *msghdr)
4638c2ecf20Sopenharmony_ci{
4648c2ecf20Sopenharmony_ci	struct tun_msg_ctl ctl = {
4658c2ecf20Sopenharmony_ci		.type = TUN_MSG_PTR,
4668c2ecf20Sopenharmony_ci		.num = nvq->batched_xdp,
4678c2ecf20Sopenharmony_ci		.ptr = nvq->xdp,
4688c2ecf20Sopenharmony_ci	};
4698c2ecf20Sopenharmony_ci	int i, err;
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci	if (nvq->batched_xdp == 0)
4728c2ecf20Sopenharmony_ci		goto signal_used;
4738c2ecf20Sopenharmony_ci
4748c2ecf20Sopenharmony_ci	msghdr->msg_control = &ctl;
4758c2ecf20Sopenharmony_ci	msghdr->msg_controllen = sizeof(ctl);
4768c2ecf20Sopenharmony_ci	err = sock->ops->sendmsg(sock, msghdr, 0);
4778c2ecf20Sopenharmony_ci	if (unlikely(err < 0)) {
4788c2ecf20Sopenharmony_ci		vq_err(&nvq->vq, "Fail to batch sending packets\n");
4798c2ecf20Sopenharmony_ci
4808c2ecf20Sopenharmony_ci		/* free pages owned by XDP; since this is an unlikely error path,
4818c2ecf20Sopenharmony_ci		 * keep it simple and avoid more complex bulk update for the
4828c2ecf20Sopenharmony_ci		 * used pages
4838c2ecf20Sopenharmony_ci		 */
4848c2ecf20Sopenharmony_ci		for (i = 0; i < nvq->batched_xdp; ++i)
4858c2ecf20Sopenharmony_ci			put_page(virt_to_head_page(nvq->xdp[i].data));
4868c2ecf20Sopenharmony_ci		nvq->batched_xdp = 0;
4878c2ecf20Sopenharmony_ci		nvq->done_idx = 0;
4888c2ecf20Sopenharmony_ci		return;
4898c2ecf20Sopenharmony_ci	}
4908c2ecf20Sopenharmony_ci
4918c2ecf20Sopenharmony_cisignal_used:
4928c2ecf20Sopenharmony_ci	vhost_net_signal_used(nvq);
4938c2ecf20Sopenharmony_ci	nvq->batched_xdp = 0;
4948c2ecf20Sopenharmony_ci}
4958c2ecf20Sopenharmony_ci
4968c2ecf20Sopenharmony_cistatic int sock_has_rx_data(struct socket *sock)
4978c2ecf20Sopenharmony_ci{
4988c2ecf20Sopenharmony_ci	if (unlikely(!sock))
4998c2ecf20Sopenharmony_ci		return 0;
5008c2ecf20Sopenharmony_ci
5018c2ecf20Sopenharmony_ci	if (sock->ops->peek_len)
5028c2ecf20Sopenharmony_ci		return sock->ops->peek_len(sock);
5038c2ecf20Sopenharmony_ci
5048c2ecf20Sopenharmony_ci	return skb_queue_empty(&sock->sk->sk_receive_queue);
5058c2ecf20Sopenharmony_ci}
5068c2ecf20Sopenharmony_ci
5078c2ecf20Sopenharmony_cistatic void vhost_net_busy_poll_try_queue(struct vhost_net *net,
5088c2ecf20Sopenharmony_ci					  struct vhost_virtqueue *vq)
5098c2ecf20Sopenharmony_ci{
5108c2ecf20Sopenharmony_ci	if (!vhost_vq_avail_empty(&net->dev, vq)) {
5118c2ecf20Sopenharmony_ci		vhost_poll_queue(&vq->poll);
5128c2ecf20Sopenharmony_ci	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
5138c2ecf20Sopenharmony_ci		vhost_disable_notify(&net->dev, vq);
5148c2ecf20Sopenharmony_ci		vhost_poll_queue(&vq->poll);
5158c2ecf20Sopenharmony_ci	}
5168c2ecf20Sopenharmony_ci}
5178c2ecf20Sopenharmony_ci
5188c2ecf20Sopenharmony_cistatic void vhost_net_busy_poll(struct vhost_net *net,
5198c2ecf20Sopenharmony_ci				struct vhost_virtqueue *rvq,
5208c2ecf20Sopenharmony_ci				struct vhost_virtqueue *tvq,
5218c2ecf20Sopenharmony_ci				bool *busyloop_intr,
5228c2ecf20Sopenharmony_ci				bool poll_rx)
5238c2ecf20Sopenharmony_ci{
5248c2ecf20Sopenharmony_ci	unsigned long busyloop_timeout;
5258c2ecf20Sopenharmony_ci	unsigned long endtime;
5268c2ecf20Sopenharmony_ci	struct socket *sock;
5278c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci	/* Try to hold the vq mutex of the paired virtqueue. We can't
5308c2ecf20Sopenharmony_ci	 * use mutex_lock() here since we could not guarantee a
5318c2ecf20Sopenharmony_ci	 * consistenet lock ordering.
5328c2ecf20Sopenharmony_ci	 */
5338c2ecf20Sopenharmony_ci	if (!mutex_trylock(&vq->mutex))
5348c2ecf20Sopenharmony_ci		return;
5358c2ecf20Sopenharmony_ci
5368c2ecf20Sopenharmony_ci	vhost_disable_notify(&net->dev, vq);
5378c2ecf20Sopenharmony_ci	sock = vhost_vq_get_backend(rvq);
5388c2ecf20Sopenharmony_ci
5398c2ecf20Sopenharmony_ci	busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
5408c2ecf20Sopenharmony_ci				     tvq->busyloop_timeout;
5418c2ecf20Sopenharmony_ci
5428c2ecf20Sopenharmony_ci	preempt_disable();
5438c2ecf20Sopenharmony_ci	endtime = busy_clock() + busyloop_timeout;
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_ci	while (vhost_can_busy_poll(endtime)) {
5468c2ecf20Sopenharmony_ci		if (vhost_has_work(&net->dev)) {
5478c2ecf20Sopenharmony_ci			*busyloop_intr = true;
5488c2ecf20Sopenharmony_ci			break;
5498c2ecf20Sopenharmony_ci		}
5508c2ecf20Sopenharmony_ci
5518c2ecf20Sopenharmony_ci		if ((sock_has_rx_data(sock) &&
5528c2ecf20Sopenharmony_ci		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
5538c2ecf20Sopenharmony_ci		    !vhost_vq_avail_empty(&net->dev, tvq))
5548c2ecf20Sopenharmony_ci			break;
5558c2ecf20Sopenharmony_ci
5568c2ecf20Sopenharmony_ci		cpu_relax();
5578c2ecf20Sopenharmony_ci	}
5588c2ecf20Sopenharmony_ci
5598c2ecf20Sopenharmony_ci	preempt_enable();
5608c2ecf20Sopenharmony_ci
5618c2ecf20Sopenharmony_ci	if (poll_rx || sock_has_rx_data(sock))
5628c2ecf20Sopenharmony_ci		vhost_net_busy_poll_try_queue(net, vq);
5638c2ecf20Sopenharmony_ci	else if (!poll_rx) /* On tx here, sock has no rx data. */
5648c2ecf20Sopenharmony_ci		vhost_enable_notify(&net->dev, rvq);
5658c2ecf20Sopenharmony_ci
5668c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
5678c2ecf20Sopenharmony_ci}
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_cistatic int vhost_net_tx_get_vq_desc(struct vhost_net *net,
5708c2ecf20Sopenharmony_ci				    struct vhost_net_virtqueue *tnvq,
5718c2ecf20Sopenharmony_ci				    unsigned int *out_num, unsigned int *in_num,
5728c2ecf20Sopenharmony_ci				    struct msghdr *msghdr, bool *busyloop_intr)
5738c2ecf20Sopenharmony_ci{
5748c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
5758c2ecf20Sopenharmony_ci	struct vhost_virtqueue *rvq = &rnvq->vq;
5768c2ecf20Sopenharmony_ci	struct vhost_virtqueue *tvq = &tnvq->vq;
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_ci	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
5798c2ecf20Sopenharmony_ci				  out_num, in_num, NULL, NULL);
5808c2ecf20Sopenharmony_ci
5818c2ecf20Sopenharmony_ci	if (r == tvq->num && tvq->busyloop_timeout) {
5828c2ecf20Sopenharmony_ci		/* Flush batched packets first */
5838c2ecf20Sopenharmony_ci		if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
5848c2ecf20Sopenharmony_ci			vhost_tx_batch(net, tnvq,
5858c2ecf20Sopenharmony_ci				       vhost_vq_get_backend(tvq),
5868c2ecf20Sopenharmony_ci				       msghdr);
5878c2ecf20Sopenharmony_ci
5888c2ecf20Sopenharmony_ci		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
5898c2ecf20Sopenharmony_ci
5908c2ecf20Sopenharmony_ci		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
5918c2ecf20Sopenharmony_ci				      out_num, in_num, NULL, NULL);
5928c2ecf20Sopenharmony_ci	}
5938c2ecf20Sopenharmony_ci
5948c2ecf20Sopenharmony_ci	return r;
5958c2ecf20Sopenharmony_ci}
5968c2ecf20Sopenharmony_ci
5978c2ecf20Sopenharmony_cistatic bool vhost_exceeds_maxpend(struct vhost_net *net)
5988c2ecf20Sopenharmony_ci{
5998c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
6008c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
6018c2ecf20Sopenharmony_ci
6028c2ecf20Sopenharmony_ci	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
6038c2ecf20Sopenharmony_ci	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
6048c2ecf20Sopenharmony_ci}
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_cistatic size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
6078c2ecf20Sopenharmony_ci			    size_t hdr_size, int out)
6088c2ecf20Sopenharmony_ci{
6098c2ecf20Sopenharmony_ci	/* Skip header. TODO: support TSO. */
6108c2ecf20Sopenharmony_ci	size_t len = iov_length(vq->iov, out);
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci	iov_iter_init(iter, WRITE, vq->iov, out, len);
6138c2ecf20Sopenharmony_ci	iov_iter_advance(iter, hdr_size);
6148c2ecf20Sopenharmony_ci
6158c2ecf20Sopenharmony_ci	return iov_iter_count(iter);
6168c2ecf20Sopenharmony_ci}
6178c2ecf20Sopenharmony_ci
6188c2ecf20Sopenharmony_cistatic int get_tx_bufs(struct vhost_net *net,
6198c2ecf20Sopenharmony_ci		       struct vhost_net_virtqueue *nvq,
6208c2ecf20Sopenharmony_ci		       struct msghdr *msg,
6218c2ecf20Sopenharmony_ci		       unsigned int *out, unsigned int *in,
6228c2ecf20Sopenharmony_ci		       size_t *len, bool *busyloop_intr)
6238c2ecf20Sopenharmony_ci{
6248c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
6258c2ecf20Sopenharmony_ci	int ret;
6268c2ecf20Sopenharmony_ci
6278c2ecf20Sopenharmony_ci	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
6288c2ecf20Sopenharmony_ci
6298c2ecf20Sopenharmony_ci	if (ret < 0 || ret == vq->num)
6308c2ecf20Sopenharmony_ci		return ret;
6318c2ecf20Sopenharmony_ci
6328c2ecf20Sopenharmony_ci	if (*in) {
6338c2ecf20Sopenharmony_ci		vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n",
6348c2ecf20Sopenharmony_ci			*out, *in);
6358c2ecf20Sopenharmony_ci		return -EFAULT;
6368c2ecf20Sopenharmony_ci	}
6378c2ecf20Sopenharmony_ci
6388c2ecf20Sopenharmony_ci	/* Sanity check */
6398c2ecf20Sopenharmony_ci	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
6408c2ecf20Sopenharmony_ci	if (*len == 0) {
6418c2ecf20Sopenharmony_ci		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
6428c2ecf20Sopenharmony_ci			*len, nvq->vhost_hlen);
6438c2ecf20Sopenharmony_ci		return -EFAULT;
6448c2ecf20Sopenharmony_ci	}
6458c2ecf20Sopenharmony_ci
6468c2ecf20Sopenharmony_ci	return ret;
6478c2ecf20Sopenharmony_ci}
6488c2ecf20Sopenharmony_ci
6498c2ecf20Sopenharmony_cistatic bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
6508c2ecf20Sopenharmony_ci{
6518c2ecf20Sopenharmony_ci	return total_len < VHOST_NET_WEIGHT &&
6528c2ecf20Sopenharmony_ci	       !vhost_vq_avail_empty(vq->dev, vq);
6538c2ecf20Sopenharmony_ci}
6548c2ecf20Sopenharmony_ci
6558c2ecf20Sopenharmony_cistatic bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
6568c2ecf20Sopenharmony_ci				       struct page_frag *pfrag, gfp_t gfp)
6578c2ecf20Sopenharmony_ci{
6588c2ecf20Sopenharmony_ci	if (pfrag->page) {
6598c2ecf20Sopenharmony_ci		if (pfrag->offset + sz <= pfrag->size)
6608c2ecf20Sopenharmony_ci			return true;
6618c2ecf20Sopenharmony_ci		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
6628c2ecf20Sopenharmony_ci	}
6638c2ecf20Sopenharmony_ci
6648c2ecf20Sopenharmony_ci	pfrag->offset = 0;
6658c2ecf20Sopenharmony_ci	net->refcnt_bias = 0;
6668c2ecf20Sopenharmony_ci	if (SKB_FRAG_PAGE_ORDER) {
6678c2ecf20Sopenharmony_ci		/* Avoid direct reclaim but allow kswapd to wake */
6688c2ecf20Sopenharmony_ci		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
6698c2ecf20Sopenharmony_ci					  __GFP_COMP | __GFP_NOWARN |
6708c2ecf20Sopenharmony_ci					  __GFP_NORETRY,
6718c2ecf20Sopenharmony_ci					  SKB_FRAG_PAGE_ORDER);
6728c2ecf20Sopenharmony_ci		if (likely(pfrag->page)) {
6738c2ecf20Sopenharmony_ci			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
6748c2ecf20Sopenharmony_ci			goto done;
6758c2ecf20Sopenharmony_ci		}
6768c2ecf20Sopenharmony_ci	}
6778c2ecf20Sopenharmony_ci	pfrag->page = alloc_page(gfp);
6788c2ecf20Sopenharmony_ci	if (likely(pfrag->page)) {
6798c2ecf20Sopenharmony_ci		pfrag->size = PAGE_SIZE;
6808c2ecf20Sopenharmony_ci		goto done;
6818c2ecf20Sopenharmony_ci	}
6828c2ecf20Sopenharmony_ci	return false;
6838c2ecf20Sopenharmony_ci
6848c2ecf20Sopenharmony_cidone:
6858c2ecf20Sopenharmony_ci	net->refcnt_bias = USHRT_MAX;
6868c2ecf20Sopenharmony_ci	page_ref_add(pfrag->page, USHRT_MAX - 1);
6878c2ecf20Sopenharmony_ci	return true;
6888c2ecf20Sopenharmony_ci}
6898c2ecf20Sopenharmony_ci
6908c2ecf20Sopenharmony_ci#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
6918c2ecf20Sopenharmony_ci
6928c2ecf20Sopenharmony_cistatic int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
6938c2ecf20Sopenharmony_ci			       struct iov_iter *from)
6948c2ecf20Sopenharmony_ci{
6958c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
6968c2ecf20Sopenharmony_ci	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
6978c2ecf20Sopenharmony_ci					     dev);
6988c2ecf20Sopenharmony_ci	struct socket *sock = vhost_vq_get_backend(vq);
6998c2ecf20Sopenharmony_ci	struct page_frag *alloc_frag = &net->page_frag;
7008c2ecf20Sopenharmony_ci	struct virtio_net_hdr *gso;
7018c2ecf20Sopenharmony_ci	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
7028c2ecf20Sopenharmony_ci	struct tun_xdp_hdr *hdr;
7038c2ecf20Sopenharmony_ci	size_t len = iov_iter_count(from);
7048c2ecf20Sopenharmony_ci	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
7058c2ecf20Sopenharmony_ci	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
7068c2ecf20Sopenharmony_ci	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
7078c2ecf20Sopenharmony_ci	int sock_hlen = nvq->sock_hlen;
7088c2ecf20Sopenharmony_ci	void *buf;
7098c2ecf20Sopenharmony_ci	int copied;
7108c2ecf20Sopenharmony_ci
7118c2ecf20Sopenharmony_ci	if (unlikely(len < nvq->sock_hlen))
7128c2ecf20Sopenharmony_ci		return -EFAULT;
7138c2ecf20Sopenharmony_ci
7148c2ecf20Sopenharmony_ci	if (SKB_DATA_ALIGN(len + pad) +
7158c2ecf20Sopenharmony_ci	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
7168c2ecf20Sopenharmony_ci		return -ENOSPC;
7178c2ecf20Sopenharmony_ci
7188c2ecf20Sopenharmony_ci	buflen += SKB_DATA_ALIGN(len + pad);
7198c2ecf20Sopenharmony_ci	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
7208c2ecf20Sopenharmony_ci	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
7218c2ecf20Sopenharmony_ci						 alloc_frag, GFP_KERNEL)))
7228c2ecf20Sopenharmony_ci		return -ENOMEM;
7238c2ecf20Sopenharmony_ci
7248c2ecf20Sopenharmony_ci	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
7258c2ecf20Sopenharmony_ci	copied = copy_page_from_iter(alloc_frag->page,
7268c2ecf20Sopenharmony_ci				     alloc_frag->offset +
7278c2ecf20Sopenharmony_ci				     offsetof(struct tun_xdp_hdr, gso),
7288c2ecf20Sopenharmony_ci				     sock_hlen, from);
7298c2ecf20Sopenharmony_ci	if (copied != sock_hlen)
7308c2ecf20Sopenharmony_ci		return -EFAULT;
7318c2ecf20Sopenharmony_ci
7328c2ecf20Sopenharmony_ci	hdr = buf;
7338c2ecf20Sopenharmony_ci	gso = &hdr->gso;
7348c2ecf20Sopenharmony_ci
7358c2ecf20Sopenharmony_ci	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
7368c2ecf20Sopenharmony_ci	    vhost16_to_cpu(vq, gso->csum_start) +
7378c2ecf20Sopenharmony_ci	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
7388c2ecf20Sopenharmony_ci	    vhost16_to_cpu(vq, gso->hdr_len)) {
7398c2ecf20Sopenharmony_ci		gso->hdr_len = cpu_to_vhost16(vq,
7408c2ecf20Sopenharmony_ci			       vhost16_to_cpu(vq, gso->csum_start) +
7418c2ecf20Sopenharmony_ci			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
7428c2ecf20Sopenharmony_ci
7438c2ecf20Sopenharmony_ci		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
7448c2ecf20Sopenharmony_ci			return -EINVAL;
7458c2ecf20Sopenharmony_ci	}
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci	len -= sock_hlen;
7488c2ecf20Sopenharmony_ci	copied = copy_page_from_iter(alloc_frag->page,
7498c2ecf20Sopenharmony_ci				     alloc_frag->offset + pad,
7508c2ecf20Sopenharmony_ci				     len, from);
7518c2ecf20Sopenharmony_ci	if (copied != len)
7528c2ecf20Sopenharmony_ci		return -EFAULT;
7538c2ecf20Sopenharmony_ci
7548c2ecf20Sopenharmony_ci	xdp->data_hard_start = buf;
7558c2ecf20Sopenharmony_ci	xdp->data = buf + pad;
7568c2ecf20Sopenharmony_ci	xdp->data_end = xdp->data + len;
7578c2ecf20Sopenharmony_ci	hdr->buflen = buflen;
7588c2ecf20Sopenharmony_ci	xdp->frame_sz = buflen;
7598c2ecf20Sopenharmony_ci
7608c2ecf20Sopenharmony_ci	--net->refcnt_bias;
7618c2ecf20Sopenharmony_ci	alloc_frag->offset += buflen;
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	++nvq->batched_xdp;
7648c2ecf20Sopenharmony_ci
7658c2ecf20Sopenharmony_ci	return 0;
7668c2ecf20Sopenharmony_ci}
7678c2ecf20Sopenharmony_ci
7688c2ecf20Sopenharmony_cistatic void handle_tx_copy(struct vhost_net *net, struct socket *sock)
7698c2ecf20Sopenharmony_ci{
7708c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
7718c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
7728c2ecf20Sopenharmony_ci	unsigned out, in;
7738c2ecf20Sopenharmony_ci	int head;
7748c2ecf20Sopenharmony_ci	struct msghdr msg = {
7758c2ecf20Sopenharmony_ci		.msg_name = NULL,
7768c2ecf20Sopenharmony_ci		.msg_namelen = 0,
7778c2ecf20Sopenharmony_ci		.msg_control = NULL,
7788c2ecf20Sopenharmony_ci		.msg_controllen = 0,
7798c2ecf20Sopenharmony_ci		.msg_flags = MSG_DONTWAIT,
7808c2ecf20Sopenharmony_ci	};
7818c2ecf20Sopenharmony_ci	size_t len, total_len = 0;
7828c2ecf20Sopenharmony_ci	int err;
7838c2ecf20Sopenharmony_ci	int sent_pkts = 0;
7848c2ecf20Sopenharmony_ci	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
7858c2ecf20Sopenharmony_ci
7868c2ecf20Sopenharmony_ci	do {
7878c2ecf20Sopenharmony_ci		bool busyloop_intr = false;
7888c2ecf20Sopenharmony_ci
7898c2ecf20Sopenharmony_ci		if (nvq->done_idx == VHOST_NET_BATCH)
7908c2ecf20Sopenharmony_ci			vhost_tx_batch(net, nvq, sock, &msg);
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
7938c2ecf20Sopenharmony_ci				   &busyloop_intr);
7948c2ecf20Sopenharmony_ci		/* On error, stop handling until the next kick. */
7958c2ecf20Sopenharmony_ci		if (unlikely(head < 0))
7968c2ecf20Sopenharmony_ci			break;
7978c2ecf20Sopenharmony_ci		/* Nothing new?  Wait for eventfd to tell us they refilled. */
7988c2ecf20Sopenharmony_ci		if (head == vq->num) {
7998c2ecf20Sopenharmony_ci			if (unlikely(busyloop_intr)) {
8008c2ecf20Sopenharmony_ci				vhost_poll_queue(&vq->poll);
8018c2ecf20Sopenharmony_ci			} else if (unlikely(vhost_enable_notify(&net->dev,
8028c2ecf20Sopenharmony_ci								vq))) {
8038c2ecf20Sopenharmony_ci				vhost_disable_notify(&net->dev, vq);
8048c2ecf20Sopenharmony_ci				continue;
8058c2ecf20Sopenharmony_ci			}
8068c2ecf20Sopenharmony_ci			break;
8078c2ecf20Sopenharmony_ci		}
8088c2ecf20Sopenharmony_ci
8098c2ecf20Sopenharmony_ci		total_len += len;
8108c2ecf20Sopenharmony_ci
8118c2ecf20Sopenharmony_ci		/* For simplicity, TX batching is only enabled if
8128c2ecf20Sopenharmony_ci		 * sndbuf is unlimited.
8138c2ecf20Sopenharmony_ci		 */
8148c2ecf20Sopenharmony_ci		if (sock_can_batch) {
8158c2ecf20Sopenharmony_ci			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
8168c2ecf20Sopenharmony_ci			if (!err) {
8178c2ecf20Sopenharmony_ci				goto done;
8188c2ecf20Sopenharmony_ci			} else if (unlikely(err != -ENOSPC)) {
8198c2ecf20Sopenharmony_ci				vhost_tx_batch(net, nvq, sock, &msg);
8208c2ecf20Sopenharmony_ci				vhost_discard_vq_desc(vq, 1);
8218c2ecf20Sopenharmony_ci				vhost_net_enable_vq(net, vq);
8228c2ecf20Sopenharmony_ci				break;
8238c2ecf20Sopenharmony_ci			}
8248c2ecf20Sopenharmony_ci
8258c2ecf20Sopenharmony_ci			/* We can't build XDP buff, go for single
8268c2ecf20Sopenharmony_ci			 * packet path but let's flush batched
8278c2ecf20Sopenharmony_ci			 * packets.
8288c2ecf20Sopenharmony_ci			 */
8298c2ecf20Sopenharmony_ci			vhost_tx_batch(net, nvq, sock, &msg);
8308c2ecf20Sopenharmony_ci			msg.msg_control = NULL;
8318c2ecf20Sopenharmony_ci		} else {
8328c2ecf20Sopenharmony_ci			if (tx_can_batch(vq, total_len))
8338c2ecf20Sopenharmony_ci				msg.msg_flags |= MSG_MORE;
8348c2ecf20Sopenharmony_ci			else
8358c2ecf20Sopenharmony_ci				msg.msg_flags &= ~MSG_MORE;
8368c2ecf20Sopenharmony_ci		}
8378c2ecf20Sopenharmony_ci
8388c2ecf20Sopenharmony_ci		/* TODO: Check specific error and bomb out unless ENOBUFS? */
8398c2ecf20Sopenharmony_ci		err = sock->ops->sendmsg(sock, &msg, len);
8408c2ecf20Sopenharmony_ci		if (unlikely(err < 0)) {
8418c2ecf20Sopenharmony_ci			vhost_discard_vq_desc(vq, 1);
8428c2ecf20Sopenharmony_ci			vhost_net_enable_vq(net, vq);
8438c2ecf20Sopenharmony_ci			break;
8448c2ecf20Sopenharmony_ci		}
8458c2ecf20Sopenharmony_ci		if (err != len)
8468c2ecf20Sopenharmony_ci			pr_debug("Truncated TX packet: len %d != %zd\n",
8478c2ecf20Sopenharmony_ci				 err, len);
8488c2ecf20Sopenharmony_cidone:
8498c2ecf20Sopenharmony_ci		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
8508c2ecf20Sopenharmony_ci		vq->heads[nvq->done_idx].len = 0;
8518c2ecf20Sopenharmony_ci		++nvq->done_idx;
8528c2ecf20Sopenharmony_ci	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
8538c2ecf20Sopenharmony_ci
8548c2ecf20Sopenharmony_ci	vhost_tx_batch(net, nvq, sock, &msg);
8558c2ecf20Sopenharmony_ci}
8568c2ecf20Sopenharmony_ci
8578c2ecf20Sopenharmony_cistatic void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
8588c2ecf20Sopenharmony_ci{
8598c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
8608c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
8618c2ecf20Sopenharmony_ci	unsigned out, in;
8628c2ecf20Sopenharmony_ci	int head;
8638c2ecf20Sopenharmony_ci	struct msghdr msg = {
8648c2ecf20Sopenharmony_ci		.msg_name = NULL,
8658c2ecf20Sopenharmony_ci		.msg_namelen = 0,
8668c2ecf20Sopenharmony_ci		.msg_control = NULL,
8678c2ecf20Sopenharmony_ci		.msg_controllen = 0,
8688c2ecf20Sopenharmony_ci		.msg_flags = MSG_DONTWAIT,
8698c2ecf20Sopenharmony_ci	};
8708c2ecf20Sopenharmony_ci	struct tun_msg_ctl ctl;
8718c2ecf20Sopenharmony_ci	size_t len, total_len = 0;
8728c2ecf20Sopenharmony_ci	int err;
8738c2ecf20Sopenharmony_ci	struct vhost_net_ubuf_ref *ubufs;
8748c2ecf20Sopenharmony_ci	struct ubuf_info *ubuf;
8758c2ecf20Sopenharmony_ci	bool zcopy_used;
8768c2ecf20Sopenharmony_ci	int sent_pkts = 0;
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci	do {
8798c2ecf20Sopenharmony_ci		bool busyloop_intr;
8808c2ecf20Sopenharmony_ci
8818c2ecf20Sopenharmony_ci		/* Release DMAs done buffers first */
8828c2ecf20Sopenharmony_ci		vhost_zerocopy_signal_used(net, vq);
8838c2ecf20Sopenharmony_ci
8848c2ecf20Sopenharmony_ci		busyloop_intr = false;
8858c2ecf20Sopenharmony_ci		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
8868c2ecf20Sopenharmony_ci				   &busyloop_intr);
8878c2ecf20Sopenharmony_ci		/* On error, stop handling until the next kick. */
8888c2ecf20Sopenharmony_ci		if (unlikely(head < 0))
8898c2ecf20Sopenharmony_ci			break;
8908c2ecf20Sopenharmony_ci		/* Nothing new?  Wait for eventfd to tell us they refilled. */
8918c2ecf20Sopenharmony_ci		if (head == vq->num) {
8928c2ecf20Sopenharmony_ci			if (unlikely(busyloop_intr)) {
8938c2ecf20Sopenharmony_ci				vhost_poll_queue(&vq->poll);
8948c2ecf20Sopenharmony_ci			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
8958c2ecf20Sopenharmony_ci				vhost_disable_notify(&net->dev, vq);
8968c2ecf20Sopenharmony_ci				continue;
8978c2ecf20Sopenharmony_ci			}
8988c2ecf20Sopenharmony_ci			break;
8998c2ecf20Sopenharmony_ci		}
9008c2ecf20Sopenharmony_ci
9018c2ecf20Sopenharmony_ci		zcopy_used = len >= VHOST_GOODCOPY_LEN
9028c2ecf20Sopenharmony_ci			     && !vhost_exceeds_maxpend(net)
9038c2ecf20Sopenharmony_ci			     && vhost_net_tx_select_zcopy(net);
9048c2ecf20Sopenharmony_ci
9058c2ecf20Sopenharmony_ci		/* use msg_control to pass vhost zerocopy ubuf info to skb */
9068c2ecf20Sopenharmony_ci		if (zcopy_used) {
9078c2ecf20Sopenharmony_ci			ubuf = nvq->ubuf_info + nvq->upend_idx;
9088c2ecf20Sopenharmony_ci			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
9098c2ecf20Sopenharmony_ci			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
9108c2ecf20Sopenharmony_ci			ubuf->callback = vhost_zerocopy_callback;
9118c2ecf20Sopenharmony_ci			ubuf->ctx = nvq->ubufs;
9128c2ecf20Sopenharmony_ci			ubuf->desc = nvq->upend_idx;
9138c2ecf20Sopenharmony_ci			refcount_set(&ubuf->refcnt, 1);
9148c2ecf20Sopenharmony_ci			msg.msg_control = &ctl;
9158c2ecf20Sopenharmony_ci			ctl.type = TUN_MSG_UBUF;
9168c2ecf20Sopenharmony_ci			ctl.ptr = ubuf;
9178c2ecf20Sopenharmony_ci			msg.msg_controllen = sizeof(ctl);
9188c2ecf20Sopenharmony_ci			ubufs = nvq->ubufs;
9198c2ecf20Sopenharmony_ci			atomic_inc(&ubufs->refcount);
9208c2ecf20Sopenharmony_ci			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
9218c2ecf20Sopenharmony_ci		} else {
9228c2ecf20Sopenharmony_ci			msg.msg_control = NULL;
9238c2ecf20Sopenharmony_ci			ubufs = NULL;
9248c2ecf20Sopenharmony_ci		}
9258c2ecf20Sopenharmony_ci		total_len += len;
9268c2ecf20Sopenharmony_ci		if (tx_can_batch(vq, total_len) &&
9278c2ecf20Sopenharmony_ci		    likely(!vhost_exceeds_maxpend(net))) {
9288c2ecf20Sopenharmony_ci			msg.msg_flags |= MSG_MORE;
9298c2ecf20Sopenharmony_ci		} else {
9308c2ecf20Sopenharmony_ci			msg.msg_flags &= ~MSG_MORE;
9318c2ecf20Sopenharmony_ci		}
9328c2ecf20Sopenharmony_ci
9338c2ecf20Sopenharmony_ci		/* TODO: Check specific error and bomb out unless ENOBUFS? */
9348c2ecf20Sopenharmony_ci		err = sock->ops->sendmsg(sock, &msg, len);
9358c2ecf20Sopenharmony_ci		if (unlikely(err < 0)) {
9368c2ecf20Sopenharmony_ci			if (zcopy_used) {
9378c2ecf20Sopenharmony_ci				if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
9388c2ecf20Sopenharmony_ci					vhost_net_ubuf_put(ubufs);
9398c2ecf20Sopenharmony_ci				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
9408c2ecf20Sopenharmony_ci					% UIO_MAXIOV;
9418c2ecf20Sopenharmony_ci			}
9428c2ecf20Sopenharmony_ci			vhost_discard_vq_desc(vq, 1);
9438c2ecf20Sopenharmony_ci			vhost_net_enable_vq(net, vq);
9448c2ecf20Sopenharmony_ci			break;
9458c2ecf20Sopenharmony_ci		}
9468c2ecf20Sopenharmony_ci		if (err != len)
9478c2ecf20Sopenharmony_ci			pr_debug("Truncated TX packet: "
9488c2ecf20Sopenharmony_ci				 " len %d != %zd\n", err, len);
9498c2ecf20Sopenharmony_ci		if (!zcopy_used)
9508c2ecf20Sopenharmony_ci			vhost_add_used_and_signal(&net->dev, vq, head, 0);
9518c2ecf20Sopenharmony_ci		else
9528c2ecf20Sopenharmony_ci			vhost_zerocopy_signal_used(net, vq);
9538c2ecf20Sopenharmony_ci		vhost_net_tx_packet(net);
9548c2ecf20Sopenharmony_ci	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
9558c2ecf20Sopenharmony_ci}
9568c2ecf20Sopenharmony_ci
9578c2ecf20Sopenharmony_ci/* Expects to be always run from workqueue - which acts as
9588c2ecf20Sopenharmony_ci * read-size critical section for our kind of RCU. */
9598c2ecf20Sopenharmony_cistatic void handle_tx(struct vhost_net *net)
9608c2ecf20Sopenharmony_ci{
9618c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
9628c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
9638c2ecf20Sopenharmony_ci	struct socket *sock;
9648c2ecf20Sopenharmony_ci
9658c2ecf20Sopenharmony_ci	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
9668c2ecf20Sopenharmony_ci	sock = vhost_vq_get_backend(vq);
9678c2ecf20Sopenharmony_ci	if (!sock)
9688c2ecf20Sopenharmony_ci		goto out;
9698c2ecf20Sopenharmony_ci
9708c2ecf20Sopenharmony_ci	if (!vq_meta_prefetch(vq))
9718c2ecf20Sopenharmony_ci		goto out;
9728c2ecf20Sopenharmony_ci
9738c2ecf20Sopenharmony_ci	vhost_disable_notify(&net->dev, vq);
9748c2ecf20Sopenharmony_ci	vhost_net_disable_vq(net, vq);
9758c2ecf20Sopenharmony_ci
9768c2ecf20Sopenharmony_ci	if (vhost_sock_zcopy(sock))
9778c2ecf20Sopenharmony_ci		handle_tx_zerocopy(net, sock);
9788c2ecf20Sopenharmony_ci	else
9798c2ecf20Sopenharmony_ci		handle_tx_copy(net, sock);
9808c2ecf20Sopenharmony_ci
9818c2ecf20Sopenharmony_ciout:
9828c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
9838c2ecf20Sopenharmony_ci}
9848c2ecf20Sopenharmony_ci
9858c2ecf20Sopenharmony_cistatic int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
9868c2ecf20Sopenharmony_ci{
9878c2ecf20Sopenharmony_ci	struct sk_buff *head;
9888c2ecf20Sopenharmony_ci	int len = 0;
9898c2ecf20Sopenharmony_ci	unsigned long flags;
9908c2ecf20Sopenharmony_ci
9918c2ecf20Sopenharmony_ci	if (rvq->rx_ring)
9928c2ecf20Sopenharmony_ci		return vhost_net_buf_peek(rvq);
9938c2ecf20Sopenharmony_ci
9948c2ecf20Sopenharmony_ci	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
9958c2ecf20Sopenharmony_ci	head = skb_peek(&sk->sk_receive_queue);
9968c2ecf20Sopenharmony_ci	if (likely(head)) {
9978c2ecf20Sopenharmony_ci		len = head->len;
9988c2ecf20Sopenharmony_ci		if (skb_vlan_tag_present(head))
9998c2ecf20Sopenharmony_ci			len += VLAN_HLEN;
10008c2ecf20Sopenharmony_ci	}
10018c2ecf20Sopenharmony_ci
10028c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
10038c2ecf20Sopenharmony_ci	return len;
10048c2ecf20Sopenharmony_ci}
10058c2ecf20Sopenharmony_ci
10068c2ecf20Sopenharmony_cistatic int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
10078c2ecf20Sopenharmony_ci				      bool *busyloop_intr)
10088c2ecf20Sopenharmony_ci{
10098c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
10108c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
10118c2ecf20Sopenharmony_ci	struct vhost_virtqueue *rvq = &rnvq->vq;
10128c2ecf20Sopenharmony_ci	struct vhost_virtqueue *tvq = &tnvq->vq;
10138c2ecf20Sopenharmony_ci	int len = peek_head_len(rnvq, sk);
10148c2ecf20Sopenharmony_ci
10158c2ecf20Sopenharmony_ci	if (!len && rvq->busyloop_timeout) {
10168c2ecf20Sopenharmony_ci		/* Flush batched heads first */
10178c2ecf20Sopenharmony_ci		vhost_net_signal_used(rnvq);
10188c2ecf20Sopenharmony_ci		/* Both tx vq and rx socket were polled here */
10198c2ecf20Sopenharmony_ci		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
10208c2ecf20Sopenharmony_ci
10218c2ecf20Sopenharmony_ci		len = peek_head_len(rnvq, sk);
10228c2ecf20Sopenharmony_ci	}
10238c2ecf20Sopenharmony_ci
10248c2ecf20Sopenharmony_ci	return len;
10258c2ecf20Sopenharmony_ci}
10268c2ecf20Sopenharmony_ci
10278c2ecf20Sopenharmony_ci/* This is a multi-buffer version of vhost_get_desc, that works if
10288c2ecf20Sopenharmony_ci *	vq has read descriptors only.
10298c2ecf20Sopenharmony_ci * @vq		- the relevant virtqueue
10308c2ecf20Sopenharmony_ci * @datalen	- data length we'll be reading
10318c2ecf20Sopenharmony_ci * @iovcount	- returned count of io vectors we fill
10328c2ecf20Sopenharmony_ci * @log		- vhost log
10338c2ecf20Sopenharmony_ci * @log_num	- log offset
10348c2ecf20Sopenharmony_ci * @quota       - headcount quota, 1 for big buffer
10358c2ecf20Sopenharmony_ci *	returns number of buffer heads allocated, negative on error
10368c2ecf20Sopenharmony_ci */
10378c2ecf20Sopenharmony_cistatic int get_rx_bufs(struct vhost_virtqueue *vq,
10388c2ecf20Sopenharmony_ci		       struct vring_used_elem *heads,
10398c2ecf20Sopenharmony_ci		       int datalen,
10408c2ecf20Sopenharmony_ci		       unsigned *iovcount,
10418c2ecf20Sopenharmony_ci		       struct vhost_log *log,
10428c2ecf20Sopenharmony_ci		       unsigned *log_num,
10438c2ecf20Sopenharmony_ci		       unsigned int quota)
10448c2ecf20Sopenharmony_ci{
10458c2ecf20Sopenharmony_ci	unsigned int out, in;
10468c2ecf20Sopenharmony_ci	int seg = 0;
10478c2ecf20Sopenharmony_ci	int headcount = 0;
10488c2ecf20Sopenharmony_ci	unsigned d;
10498c2ecf20Sopenharmony_ci	int r, nlogs = 0;
10508c2ecf20Sopenharmony_ci	/* len is always initialized before use since we are always called with
10518c2ecf20Sopenharmony_ci	 * datalen > 0.
10528c2ecf20Sopenharmony_ci	 */
10538c2ecf20Sopenharmony_ci	u32 len;
10548c2ecf20Sopenharmony_ci
10558c2ecf20Sopenharmony_ci	while (datalen > 0 && headcount < quota) {
10568c2ecf20Sopenharmony_ci		if (unlikely(seg >= UIO_MAXIOV)) {
10578c2ecf20Sopenharmony_ci			r = -ENOBUFS;
10588c2ecf20Sopenharmony_ci			goto err;
10598c2ecf20Sopenharmony_ci		}
10608c2ecf20Sopenharmony_ci		r = vhost_get_vq_desc(vq, vq->iov + seg,
10618c2ecf20Sopenharmony_ci				      ARRAY_SIZE(vq->iov) - seg, &out,
10628c2ecf20Sopenharmony_ci				      &in, log, log_num);
10638c2ecf20Sopenharmony_ci		if (unlikely(r < 0))
10648c2ecf20Sopenharmony_ci			goto err;
10658c2ecf20Sopenharmony_ci
10668c2ecf20Sopenharmony_ci		d = r;
10678c2ecf20Sopenharmony_ci		if (d == vq->num) {
10688c2ecf20Sopenharmony_ci			r = 0;
10698c2ecf20Sopenharmony_ci			goto err;
10708c2ecf20Sopenharmony_ci		}
10718c2ecf20Sopenharmony_ci		if (unlikely(out || in <= 0)) {
10728c2ecf20Sopenharmony_ci			vq_err(vq, "unexpected descriptor format for RX: "
10738c2ecf20Sopenharmony_ci				"out %d, in %d\n", out, in);
10748c2ecf20Sopenharmony_ci			r = -EINVAL;
10758c2ecf20Sopenharmony_ci			goto err;
10768c2ecf20Sopenharmony_ci		}
10778c2ecf20Sopenharmony_ci		if (unlikely(log)) {
10788c2ecf20Sopenharmony_ci			nlogs += *log_num;
10798c2ecf20Sopenharmony_ci			log += *log_num;
10808c2ecf20Sopenharmony_ci		}
10818c2ecf20Sopenharmony_ci		heads[headcount].id = cpu_to_vhost32(vq, d);
10828c2ecf20Sopenharmony_ci		len = iov_length(vq->iov + seg, in);
10838c2ecf20Sopenharmony_ci		heads[headcount].len = cpu_to_vhost32(vq, len);
10848c2ecf20Sopenharmony_ci		datalen -= len;
10858c2ecf20Sopenharmony_ci		++headcount;
10868c2ecf20Sopenharmony_ci		seg += in;
10878c2ecf20Sopenharmony_ci	}
10888c2ecf20Sopenharmony_ci	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
10898c2ecf20Sopenharmony_ci	*iovcount = seg;
10908c2ecf20Sopenharmony_ci	if (unlikely(log))
10918c2ecf20Sopenharmony_ci		*log_num = nlogs;
10928c2ecf20Sopenharmony_ci
10938c2ecf20Sopenharmony_ci	/* Detect overrun */
10948c2ecf20Sopenharmony_ci	if (unlikely(datalen > 0)) {
10958c2ecf20Sopenharmony_ci		r = UIO_MAXIOV + 1;
10968c2ecf20Sopenharmony_ci		goto err;
10978c2ecf20Sopenharmony_ci	}
10988c2ecf20Sopenharmony_ci	return headcount;
10998c2ecf20Sopenharmony_cierr:
11008c2ecf20Sopenharmony_ci	vhost_discard_vq_desc(vq, headcount);
11018c2ecf20Sopenharmony_ci	return r;
11028c2ecf20Sopenharmony_ci}
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_ci/* Expects to be always run from workqueue - which acts as
11058c2ecf20Sopenharmony_ci * read-size critical section for our kind of RCU. */
11068c2ecf20Sopenharmony_cistatic void handle_rx(struct vhost_net *net)
11078c2ecf20Sopenharmony_ci{
11088c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
11098c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = &nvq->vq;
11108c2ecf20Sopenharmony_ci	unsigned in, log;
11118c2ecf20Sopenharmony_ci	struct vhost_log *vq_log;
11128c2ecf20Sopenharmony_ci	struct msghdr msg = {
11138c2ecf20Sopenharmony_ci		.msg_name = NULL,
11148c2ecf20Sopenharmony_ci		.msg_namelen = 0,
11158c2ecf20Sopenharmony_ci		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
11168c2ecf20Sopenharmony_ci		.msg_controllen = 0,
11178c2ecf20Sopenharmony_ci		.msg_flags = MSG_DONTWAIT,
11188c2ecf20Sopenharmony_ci	};
11198c2ecf20Sopenharmony_ci	struct virtio_net_hdr hdr = {
11208c2ecf20Sopenharmony_ci		.flags = 0,
11218c2ecf20Sopenharmony_ci		.gso_type = VIRTIO_NET_HDR_GSO_NONE
11228c2ecf20Sopenharmony_ci	};
11238c2ecf20Sopenharmony_ci	size_t total_len = 0;
11248c2ecf20Sopenharmony_ci	int err, mergeable;
11258c2ecf20Sopenharmony_ci	s16 headcount;
11268c2ecf20Sopenharmony_ci	size_t vhost_hlen, sock_hlen;
11278c2ecf20Sopenharmony_ci	size_t vhost_len, sock_len;
11288c2ecf20Sopenharmony_ci	bool busyloop_intr = false;
11298c2ecf20Sopenharmony_ci	struct socket *sock;
11308c2ecf20Sopenharmony_ci	struct iov_iter fixup;
11318c2ecf20Sopenharmony_ci	__virtio16 num_buffers;
11328c2ecf20Sopenharmony_ci	int recv_pkts = 0;
11338c2ecf20Sopenharmony_ci
11348c2ecf20Sopenharmony_ci	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
11358c2ecf20Sopenharmony_ci	sock = vhost_vq_get_backend(vq);
11368c2ecf20Sopenharmony_ci	if (!sock)
11378c2ecf20Sopenharmony_ci		goto out;
11388c2ecf20Sopenharmony_ci
11398c2ecf20Sopenharmony_ci	if (!vq_meta_prefetch(vq))
11408c2ecf20Sopenharmony_ci		goto out;
11418c2ecf20Sopenharmony_ci
11428c2ecf20Sopenharmony_ci	vhost_disable_notify(&net->dev, vq);
11438c2ecf20Sopenharmony_ci	vhost_net_disable_vq(net, vq);
11448c2ecf20Sopenharmony_ci
11458c2ecf20Sopenharmony_ci	vhost_hlen = nvq->vhost_hlen;
11468c2ecf20Sopenharmony_ci	sock_hlen = nvq->sock_hlen;
11478c2ecf20Sopenharmony_ci
11488c2ecf20Sopenharmony_ci	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
11498c2ecf20Sopenharmony_ci		vq->log : NULL;
11508c2ecf20Sopenharmony_ci	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
11518c2ecf20Sopenharmony_ci
11528c2ecf20Sopenharmony_ci	do {
11538c2ecf20Sopenharmony_ci		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
11548c2ecf20Sopenharmony_ci						      &busyloop_intr);
11558c2ecf20Sopenharmony_ci		if (!sock_len)
11568c2ecf20Sopenharmony_ci			break;
11578c2ecf20Sopenharmony_ci		sock_len += sock_hlen;
11588c2ecf20Sopenharmony_ci		vhost_len = sock_len + vhost_hlen;
11598c2ecf20Sopenharmony_ci		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
11608c2ecf20Sopenharmony_ci					vhost_len, &in, vq_log, &log,
11618c2ecf20Sopenharmony_ci					likely(mergeable) ? UIO_MAXIOV : 1);
11628c2ecf20Sopenharmony_ci		/* On error, stop handling until the next kick. */
11638c2ecf20Sopenharmony_ci		if (unlikely(headcount < 0))
11648c2ecf20Sopenharmony_ci			goto out;
11658c2ecf20Sopenharmony_ci		/* OK, now we need to know about added descriptors. */
11668c2ecf20Sopenharmony_ci		if (!headcount) {
11678c2ecf20Sopenharmony_ci			if (unlikely(busyloop_intr)) {
11688c2ecf20Sopenharmony_ci				vhost_poll_queue(&vq->poll);
11698c2ecf20Sopenharmony_ci			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
11708c2ecf20Sopenharmony_ci				/* They have slipped one in as we were
11718c2ecf20Sopenharmony_ci				 * doing that: check again. */
11728c2ecf20Sopenharmony_ci				vhost_disable_notify(&net->dev, vq);
11738c2ecf20Sopenharmony_ci				continue;
11748c2ecf20Sopenharmony_ci			}
11758c2ecf20Sopenharmony_ci			/* Nothing new?  Wait for eventfd to tell us
11768c2ecf20Sopenharmony_ci			 * they refilled. */
11778c2ecf20Sopenharmony_ci			goto out;
11788c2ecf20Sopenharmony_ci		}
11798c2ecf20Sopenharmony_ci		busyloop_intr = false;
11808c2ecf20Sopenharmony_ci		if (nvq->rx_ring)
11818c2ecf20Sopenharmony_ci			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
11828c2ecf20Sopenharmony_ci		/* On overrun, truncate and discard */
11838c2ecf20Sopenharmony_ci		if (unlikely(headcount > UIO_MAXIOV)) {
11848c2ecf20Sopenharmony_ci			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
11858c2ecf20Sopenharmony_ci			err = sock->ops->recvmsg(sock, &msg,
11868c2ecf20Sopenharmony_ci						 1, MSG_DONTWAIT | MSG_TRUNC);
11878c2ecf20Sopenharmony_ci			pr_debug("Discarded rx packet: len %zd\n", sock_len);
11888c2ecf20Sopenharmony_ci			continue;
11898c2ecf20Sopenharmony_ci		}
11908c2ecf20Sopenharmony_ci		/* We don't need to be notified again. */
11918c2ecf20Sopenharmony_ci		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
11928c2ecf20Sopenharmony_ci		fixup = msg.msg_iter;
11938c2ecf20Sopenharmony_ci		if (unlikely((vhost_hlen))) {
11948c2ecf20Sopenharmony_ci			/* We will supply the header ourselves
11958c2ecf20Sopenharmony_ci			 * TODO: support TSO.
11968c2ecf20Sopenharmony_ci			 */
11978c2ecf20Sopenharmony_ci			iov_iter_advance(&msg.msg_iter, vhost_hlen);
11988c2ecf20Sopenharmony_ci		}
11998c2ecf20Sopenharmony_ci		err = sock->ops->recvmsg(sock, &msg,
12008c2ecf20Sopenharmony_ci					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
12018c2ecf20Sopenharmony_ci		/* Userspace might have consumed the packet meanwhile:
12028c2ecf20Sopenharmony_ci		 * it's not supposed to do this usually, but might be hard
12038c2ecf20Sopenharmony_ci		 * to prevent. Discard data we got (if any) and keep going. */
12048c2ecf20Sopenharmony_ci		if (unlikely(err != sock_len)) {
12058c2ecf20Sopenharmony_ci			pr_debug("Discarded rx packet: "
12068c2ecf20Sopenharmony_ci				 " len %d, expected %zd\n", err, sock_len);
12078c2ecf20Sopenharmony_ci			vhost_discard_vq_desc(vq, headcount);
12088c2ecf20Sopenharmony_ci			continue;
12098c2ecf20Sopenharmony_ci		}
12108c2ecf20Sopenharmony_ci		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
12118c2ecf20Sopenharmony_ci		if (unlikely(vhost_hlen)) {
12128c2ecf20Sopenharmony_ci			if (copy_to_iter(&hdr, sizeof(hdr),
12138c2ecf20Sopenharmony_ci					 &fixup) != sizeof(hdr)) {
12148c2ecf20Sopenharmony_ci				vq_err(vq, "Unable to write vnet_hdr "
12158c2ecf20Sopenharmony_ci				       "at addr %p\n", vq->iov->iov_base);
12168c2ecf20Sopenharmony_ci				goto out;
12178c2ecf20Sopenharmony_ci			}
12188c2ecf20Sopenharmony_ci		} else {
12198c2ecf20Sopenharmony_ci			/* Header came from socket; we'll need to patch
12208c2ecf20Sopenharmony_ci			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
12218c2ecf20Sopenharmony_ci			 */
12228c2ecf20Sopenharmony_ci			iov_iter_advance(&fixup, sizeof(hdr));
12238c2ecf20Sopenharmony_ci		}
12248c2ecf20Sopenharmony_ci		/* TODO: Should check and handle checksum. */
12258c2ecf20Sopenharmony_ci
12268c2ecf20Sopenharmony_ci		num_buffers = cpu_to_vhost16(vq, headcount);
12278c2ecf20Sopenharmony_ci		if (likely(mergeable) &&
12288c2ecf20Sopenharmony_ci		    copy_to_iter(&num_buffers, sizeof num_buffers,
12298c2ecf20Sopenharmony_ci				 &fixup) != sizeof num_buffers) {
12308c2ecf20Sopenharmony_ci			vq_err(vq, "Failed num_buffers write");
12318c2ecf20Sopenharmony_ci			vhost_discard_vq_desc(vq, headcount);
12328c2ecf20Sopenharmony_ci			goto out;
12338c2ecf20Sopenharmony_ci		}
12348c2ecf20Sopenharmony_ci		nvq->done_idx += headcount;
12358c2ecf20Sopenharmony_ci		if (nvq->done_idx > VHOST_NET_BATCH)
12368c2ecf20Sopenharmony_ci			vhost_net_signal_used(nvq);
12378c2ecf20Sopenharmony_ci		if (unlikely(vq_log))
12388c2ecf20Sopenharmony_ci			vhost_log_write(vq, vq_log, log, vhost_len,
12398c2ecf20Sopenharmony_ci					vq->iov, in);
12408c2ecf20Sopenharmony_ci		total_len += vhost_len;
12418c2ecf20Sopenharmony_ci	} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
12428c2ecf20Sopenharmony_ci
12438c2ecf20Sopenharmony_ci	if (unlikely(busyloop_intr))
12448c2ecf20Sopenharmony_ci		vhost_poll_queue(&vq->poll);
12458c2ecf20Sopenharmony_ci	else if (!sock_len)
12468c2ecf20Sopenharmony_ci		vhost_net_enable_vq(net, vq);
12478c2ecf20Sopenharmony_ciout:
12488c2ecf20Sopenharmony_ci	vhost_net_signal_used(nvq);
12498c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
12508c2ecf20Sopenharmony_ci}
12518c2ecf20Sopenharmony_ci
12528c2ecf20Sopenharmony_cistatic void handle_tx_kick(struct vhost_work *work)
12538c2ecf20Sopenharmony_ci{
12548c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
12558c2ecf20Sopenharmony_ci						  poll.work);
12568c2ecf20Sopenharmony_ci	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
12578c2ecf20Sopenharmony_ci
12588c2ecf20Sopenharmony_ci	handle_tx(net);
12598c2ecf20Sopenharmony_ci}
12608c2ecf20Sopenharmony_ci
12618c2ecf20Sopenharmony_cistatic void handle_rx_kick(struct vhost_work *work)
12628c2ecf20Sopenharmony_ci{
12638c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
12648c2ecf20Sopenharmony_ci						  poll.work);
12658c2ecf20Sopenharmony_ci	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
12668c2ecf20Sopenharmony_ci
12678c2ecf20Sopenharmony_ci	handle_rx(net);
12688c2ecf20Sopenharmony_ci}
12698c2ecf20Sopenharmony_ci
12708c2ecf20Sopenharmony_cistatic void handle_tx_net(struct vhost_work *work)
12718c2ecf20Sopenharmony_ci{
12728c2ecf20Sopenharmony_ci	struct vhost_net *net = container_of(work, struct vhost_net,
12738c2ecf20Sopenharmony_ci					     poll[VHOST_NET_VQ_TX].work);
12748c2ecf20Sopenharmony_ci	handle_tx(net);
12758c2ecf20Sopenharmony_ci}
12768c2ecf20Sopenharmony_ci
12778c2ecf20Sopenharmony_cistatic void handle_rx_net(struct vhost_work *work)
12788c2ecf20Sopenharmony_ci{
12798c2ecf20Sopenharmony_ci	struct vhost_net *net = container_of(work, struct vhost_net,
12808c2ecf20Sopenharmony_ci					     poll[VHOST_NET_VQ_RX].work);
12818c2ecf20Sopenharmony_ci	handle_rx(net);
12828c2ecf20Sopenharmony_ci}
12838c2ecf20Sopenharmony_ci
12848c2ecf20Sopenharmony_cistatic int vhost_net_open(struct inode *inode, struct file *f)
12858c2ecf20Sopenharmony_ci{
12868c2ecf20Sopenharmony_ci	struct vhost_net *n;
12878c2ecf20Sopenharmony_ci	struct vhost_dev *dev;
12888c2ecf20Sopenharmony_ci	struct vhost_virtqueue **vqs;
12898c2ecf20Sopenharmony_ci	void **queue;
12908c2ecf20Sopenharmony_ci	struct xdp_buff *xdp;
12918c2ecf20Sopenharmony_ci	int i;
12928c2ecf20Sopenharmony_ci
12938c2ecf20Sopenharmony_ci	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
12948c2ecf20Sopenharmony_ci	if (!n)
12958c2ecf20Sopenharmony_ci		return -ENOMEM;
12968c2ecf20Sopenharmony_ci	vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
12978c2ecf20Sopenharmony_ci	if (!vqs) {
12988c2ecf20Sopenharmony_ci		kvfree(n);
12998c2ecf20Sopenharmony_ci		return -ENOMEM;
13008c2ecf20Sopenharmony_ci	}
13018c2ecf20Sopenharmony_ci
13028c2ecf20Sopenharmony_ci	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
13038c2ecf20Sopenharmony_ci			      GFP_KERNEL);
13048c2ecf20Sopenharmony_ci	if (!queue) {
13058c2ecf20Sopenharmony_ci		kfree(vqs);
13068c2ecf20Sopenharmony_ci		kvfree(n);
13078c2ecf20Sopenharmony_ci		return -ENOMEM;
13088c2ecf20Sopenharmony_ci	}
13098c2ecf20Sopenharmony_ci	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
13108c2ecf20Sopenharmony_ci
13118c2ecf20Sopenharmony_ci	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
13128c2ecf20Sopenharmony_ci	if (!xdp) {
13138c2ecf20Sopenharmony_ci		kfree(vqs);
13148c2ecf20Sopenharmony_ci		kvfree(n);
13158c2ecf20Sopenharmony_ci		kfree(queue);
13168c2ecf20Sopenharmony_ci		return -ENOMEM;
13178c2ecf20Sopenharmony_ci	}
13188c2ecf20Sopenharmony_ci	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
13198c2ecf20Sopenharmony_ci
13208c2ecf20Sopenharmony_ci	dev = &n->dev;
13218c2ecf20Sopenharmony_ci	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
13228c2ecf20Sopenharmony_ci	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
13238c2ecf20Sopenharmony_ci	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
13248c2ecf20Sopenharmony_ci	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
13258c2ecf20Sopenharmony_ci	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
13268c2ecf20Sopenharmony_ci		n->vqs[i].ubufs = NULL;
13278c2ecf20Sopenharmony_ci		n->vqs[i].ubuf_info = NULL;
13288c2ecf20Sopenharmony_ci		n->vqs[i].upend_idx = 0;
13298c2ecf20Sopenharmony_ci		n->vqs[i].done_idx = 0;
13308c2ecf20Sopenharmony_ci		n->vqs[i].batched_xdp = 0;
13318c2ecf20Sopenharmony_ci		n->vqs[i].vhost_hlen = 0;
13328c2ecf20Sopenharmony_ci		n->vqs[i].sock_hlen = 0;
13338c2ecf20Sopenharmony_ci		n->vqs[i].rx_ring = NULL;
13348c2ecf20Sopenharmony_ci		vhost_net_buf_init(&n->vqs[i].rxq);
13358c2ecf20Sopenharmony_ci	}
13368c2ecf20Sopenharmony_ci	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
13378c2ecf20Sopenharmony_ci		       UIO_MAXIOV + VHOST_NET_BATCH,
13388c2ecf20Sopenharmony_ci		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
13398c2ecf20Sopenharmony_ci		       NULL);
13408c2ecf20Sopenharmony_ci
13418c2ecf20Sopenharmony_ci	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
13428c2ecf20Sopenharmony_ci	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
13438c2ecf20Sopenharmony_ci
13448c2ecf20Sopenharmony_ci	f->private_data = n;
13458c2ecf20Sopenharmony_ci	n->page_frag.page = NULL;
13468c2ecf20Sopenharmony_ci	n->refcnt_bias = 0;
13478c2ecf20Sopenharmony_ci
13488c2ecf20Sopenharmony_ci	return 0;
13498c2ecf20Sopenharmony_ci}
13508c2ecf20Sopenharmony_ci
13518c2ecf20Sopenharmony_cistatic struct socket *vhost_net_stop_vq(struct vhost_net *n,
13528c2ecf20Sopenharmony_ci					struct vhost_virtqueue *vq)
13538c2ecf20Sopenharmony_ci{
13548c2ecf20Sopenharmony_ci	struct socket *sock;
13558c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq =
13568c2ecf20Sopenharmony_ci		container_of(vq, struct vhost_net_virtqueue, vq);
13578c2ecf20Sopenharmony_ci
13588c2ecf20Sopenharmony_ci	mutex_lock(&vq->mutex);
13598c2ecf20Sopenharmony_ci	sock = vhost_vq_get_backend(vq);
13608c2ecf20Sopenharmony_ci	vhost_net_disable_vq(n, vq);
13618c2ecf20Sopenharmony_ci	vhost_vq_set_backend(vq, NULL);
13628c2ecf20Sopenharmony_ci	vhost_net_buf_unproduce(nvq);
13638c2ecf20Sopenharmony_ci	nvq->rx_ring = NULL;
13648c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
13658c2ecf20Sopenharmony_ci	return sock;
13668c2ecf20Sopenharmony_ci}
13678c2ecf20Sopenharmony_ci
13688c2ecf20Sopenharmony_cistatic void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
13698c2ecf20Sopenharmony_ci			   struct socket **rx_sock)
13708c2ecf20Sopenharmony_ci{
13718c2ecf20Sopenharmony_ci	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
13728c2ecf20Sopenharmony_ci	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
13738c2ecf20Sopenharmony_ci}
13748c2ecf20Sopenharmony_ci
13758c2ecf20Sopenharmony_cistatic void vhost_net_flush_vq(struct vhost_net *n, int index)
13768c2ecf20Sopenharmony_ci{
13778c2ecf20Sopenharmony_ci	vhost_poll_flush(n->poll + index);
13788c2ecf20Sopenharmony_ci	vhost_poll_flush(&n->vqs[index].vq.poll);
13798c2ecf20Sopenharmony_ci}
13808c2ecf20Sopenharmony_ci
13818c2ecf20Sopenharmony_cistatic void vhost_net_flush(struct vhost_net *n)
13828c2ecf20Sopenharmony_ci{
13838c2ecf20Sopenharmony_ci	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
13848c2ecf20Sopenharmony_ci	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
13858c2ecf20Sopenharmony_ci	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
13868c2ecf20Sopenharmony_ci		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13878c2ecf20Sopenharmony_ci		n->tx_flush = true;
13888c2ecf20Sopenharmony_ci		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13898c2ecf20Sopenharmony_ci		/* Wait for all lower device DMAs done. */
13908c2ecf20Sopenharmony_ci		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
13918c2ecf20Sopenharmony_ci		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13928c2ecf20Sopenharmony_ci		n->tx_flush = false;
13938c2ecf20Sopenharmony_ci		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
13948c2ecf20Sopenharmony_ci		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13958c2ecf20Sopenharmony_ci	}
13968c2ecf20Sopenharmony_ci}
13978c2ecf20Sopenharmony_ci
13988c2ecf20Sopenharmony_cistatic int vhost_net_release(struct inode *inode, struct file *f)
13998c2ecf20Sopenharmony_ci{
14008c2ecf20Sopenharmony_ci	struct vhost_net *n = f->private_data;
14018c2ecf20Sopenharmony_ci	struct socket *tx_sock;
14028c2ecf20Sopenharmony_ci	struct socket *rx_sock;
14038c2ecf20Sopenharmony_ci
14048c2ecf20Sopenharmony_ci	vhost_net_stop(n, &tx_sock, &rx_sock);
14058c2ecf20Sopenharmony_ci	vhost_net_flush(n);
14068c2ecf20Sopenharmony_ci	vhost_dev_stop(&n->dev);
14078c2ecf20Sopenharmony_ci	vhost_dev_cleanup(&n->dev);
14088c2ecf20Sopenharmony_ci	vhost_net_vq_reset(n);
14098c2ecf20Sopenharmony_ci	if (tx_sock)
14108c2ecf20Sopenharmony_ci		sockfd_put(tx_sock);
14118c2ecf20Sopenharmony_ci	if (rx_sock)
14128c2ecf20Sopenharmony_ci		sockfd_put(rx_sock);
14138c2ecf20Sopenharmony_ci	/* Make sure no callbacks are outstanding */
14148c2ecf20Sopenharmony_ci	synchronize_rcu();
14158c2ecf20Sopenharmony_ci	/* We do an extra flush before freeing memory,
14168c2ecf20Sopenharmony_ci	 * since jobs can re-queue themselves. */
14178c2ecf20Sopenharmony_ci	vhost_net_flush(n);
14188c2ecf20Sopenharmony_ci	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
14198c2ecf20Sopenharmony_ci	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
14208c2ecf20Sopenharmony_ci	kfree(n->dev.vqs);
14218c2ecf20Sopenharmony_ci	if (n->page_frag.page)
14228c2ecf20Sopenharmony_ci		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
14238c2ecf20Sopenharmony_ci	kvfree(n);
14248c2ecf20Sopenharmony_ci	return 0;
14258c2ecf20Sopenharmony_ci}
14268c2ecf20Sopenharmony_ci
14278c2ecf20Sopenharmony_cistatic struct socket *get_raw_socket(int fd)
14288c2ecf20Sopenharmony_ci{
14298c2ecf20Sopenharmony_ci	int r;
14308c2ecf20Sopenharmony_ci	struct socket *sock = sockfd_lookup(fd, &r);
14318c2ecf20Sopenharmony_ci
14328c2ecf20Sopenharmony_ci	if (!sock)
14338c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOTSOCK);
14348c2ecf20Sopenharmony_ci
14358c2ecf20Sopenharmony_ci	/* Parameter checking */
14368c2ecf20Sopenharmony_ci	if (sock->sk->sk_type != SOCK_RAW) {
14378c2ecf20Sopenharmony_ci		r = -ESOCKTNOSUPPORT;
14388c2ecf20Sopenharmony_ci		goto err;
14398c2ecf20Sopenharmony_ci	}
14408c2ecf20Sopenharmony_ci
14418c2ecf20Sopenharmony_ci	if (sock->sk->sk_family != AF_PACKET) {
14428c2ecf20Sopenharmony_ci		r = -EPFNOSUPPORT;
14438c2ecf20Sopenharmony_ci		goto err;
14448c2ecf20Sopenharmony_ci	}
14458c2ecf20Sopenharmony_ci	return sock;
14468c2ecf20Sopenharmony_cierr:
14478c2ecf20Sopenharmony_ci	sockfd_put(sock);
14488c2ecf20Sopenharmony_ci	return ERR_PTR(r);
14498c2ecf20Sopenharmony_ci}
14508c2ecf20Sopenharmony_ci
14518c2ecf20Sopenharmony_cistatic struct ptr_ring *get_tap_ptr_ring(struct file *file)
14528c2ecf20Sopenharmony_ci{
14538c2ecf20Sopenharmony_ci	struct ptr_ring *ring;
14548c2ecf20Sopenharmony_ci	ring = tun_get_tx_ring(file);
14558c2ecf20Sopenharmony_ci	if (!IS_ERR(ring))
14568c2ecf20Sopenharmony_ci		goto out;
14578c2ecf20Sopenharmony_ci	ring = tap_get_ptr_ring(file);
14588c2ecf20Sopenharmony_ci	if (!IS_ERR(ring))
14598c2ecf20Sopenharmony_ci		goto out;
14608c2ecf20Sopenharmony_ci	ring = NULL;
14618c2ecf20Sopenharmony_ciout:
14628c2ecf20Sopenharmony_ci	return ring;
14638c2ecf20Sopenharmony_ci}
14648c2ecf20Sopenharmony_ci
14658c2ecf20Sopenharmony_cistatic struct socket *get_tap_socket(int fd)
14668c2ecf20Sopenharmony_ci{
14678c2ecf20Sopenharmony_ci	struct file *file = fget(fd);
14688c2ecf20Sopenharmony_ci	struct socket *sock;
14698c2ecf20Sopenharmony_ci
14708c2ecf20Sopenharmony_ci	if (!file)
14718c2ecf20Sopenharmony_ci		return ERR_PTR(-EBADF);
14728c2ecf20Sopenharmony_ci	sock = tun_get_socket(file);
14738c2ecf20Sopenharmony_ci	if (!IS_ERR(sock))
14748c2ecf20Sopenharmony_ci		return sock;
14758c2ecf20Sopenharmony_ci	sock = tap_get_socket(file);
14768c2ecf20Sopenharmony_ci	if (IS_ERR(sock))
14778c2ecf20Sopenharmony_ci		fput(file);
14788c2ecf20Sopenharmony_ci	return sock;
14798c2ecf20Sopenharmony_ci}
14808c2ecf20Sopenharmony_ci
14818c2ecf20Sopenharmony_cistatic struct socket *get_socket(int fd)
14828c2ecf20Sopenharmony_ci{
14838c2ecf20Sopenharmony_ci	struct socket *sock;
14848c2ecf20Sopenharmony_ci
14858c2ecf20Sopenharmony_ci	/* special case to disable backend */
14868c2ecf20Sopenharmony_ci	if (fd == -1)
14878c2ecf20Sopenharmony_ci		return NULL;
14888c2ecf20Sopenharmony_ci	sock = get_raw_socket(fd);
14898c2ecf20Sopenharmony_ci	if (!IS_ERR(sock))
14908c2ecf20Sopenharmony_ci		return sock;
14918c2ecf20Sopenharmony_ci	sock = get_tap_socket(fd);
14928c2ecf20Sopenharmony_ci	if (!IS_ERR(sock))
14938c2ecf20Sopenharmony_ci		return sock;
14948c2ecf20Sopenharmony_ci	return ERR_PTR(-ENOTSOCK);
14958c2ecf20Sopenharmony_ci}
14968c2ecf20Sopenharmony_ci
14978c2ecf20Sopenharmony_cistatic long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
14988c2ecf20Sopenharmony_ci{
14998c2ecf20Sopenharmony_ci	struct socket *sock, *oldsock;
15008c2ecf20Sopenharmony_ci	struct vhost_virtqueue *vq;
15018c2ecf20Sopenharmony_ci	struct vhost_net_virtqueue *nvq;
15028c2ecf20Sopenharmony_ci	struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
15038c2ecf20Sopenharmony_ci	int r;
15048c2ecf20Sopenharmony_ci
15058c2ecf20Sopenharmony_ci	mutex_lock(&n->dev.mutex);
15068c2ecf20Sopenharmony_ci	r = vhost_dev_check_owner(&n->dev);
15078c2ecf20Sopenharmony_ci	if (r)
15088c2ecf20Sopenharmony_ci		goto err;
15098c2ecf20Sopenharmony_ci
15108c2ecf20Sopenharmony_ci	if (index >= VHOST_NET_VQ_MAX) {
15118c2ecf20Sopenharmony_ci		r = -ENOBUFS;
15128c2ecf20Sopenharmony_ci		goto err;
15138c2ecf20Sopenharmony_ci	}
15148c2ecf20Sopenharmony_ci	vq = &n->vqs[index].vq;
15158c2ecf20Sopenharmony_ci	nvq = &n->vqs[index];
15168c2ecf20Sopenharmony_ci	mutex_lock(&vq->mutex);
15178c2ecf20Sopenharmony_ci
15188c2ecf20Sopenharmony_ci	if (fd == -1)
15198c2ecf20Sopenharmony_ci		vhost_clear_msg(&n->dev);
15208c2ecf20Sopenharmony_ci
15218c2ecf20Sopenharmony_ci	/* Verify that ring has been setup correctly. */
15228c2ecf20Sopenharmony_ci	if (!vhost_vq_access_ok(vq)) {
15238c2ecf20Sopenharmony_ci		r = -EFAULT;
15248c2ecf20Sopenharmony_ci		goto err_vq;
15258c2ecf20Sopenharmony_ci	}
15268c2ecf20Sopenharmony_ci	sock = get_socket(fd);
15278c2ecf20Sopenharmony_ci	if (IS_ERR(sock)) {
15288c2ecf20Sopenharmony_ci		r = PTR_ERR(sock);
15298c2ecf20Sopenharmony_ci		goto err_vq;
15308c2ecf20Sopenharmony_ci	}
15318c2ecf20Sopenharmony_ci
15328c2ecf20Sopenharmony_ci	/* start polling new socket */
15338c2ecf20Sopenharmony_ci	oldsock = vhost_vq_get_backend(vq);
15348c2ecf20Sopenharmony_ci	if (sock != oldsock) {
15358c2ecf20Sopenharmony_ci		ubufs = vhost_net_ubuf_alloc(vq,
15368c2ecf20Sopenharmony_ci					     sock && vhost_sock_zcopy(sock));
15378c2ecf20Sopenharmony_ci		if (IS_ERR(ubufs)) {
15388c2ecf20Sopenharmony_ci			r = PTR_ERR(ubufs);
15398c2ecf20Sopenharmony_ci			goto err_ubufs;
15408c2ecf20Sopenharmony_ci		}
15418c2ecf20Sopenharmony_ci
15428c2ecf20Sopenharmony_ci		vhost_net_disable_vq(n, vq);
15438c2ecf20Sopenharmony_ci		vhost_vq_set_backend(vq, sock);
15448c2ecf20Sopenharmony_ci		vhost_net_buf_unproduce(nvq);
15458c2ecf20Sopenharmony_ci		r = vhost_vq_init_access(vq);
15468c2ecf20Sopenharmony_ci		if (r)
15478c2ecf20Sopenharmony_ci			goto err_used;
15488c2ecf20Sopenharmony_ci		r = vhost_net_enable_vq(n, vq);
15498c2ecf20Sopenharmony_ci		if (r)
15508c2ecf20Sopenharmony_ci			goto err_used;
15518c2ecf20Sopenharmony_ci		if (index == VHOST_NET_VQ_RX) {
15528c2ecf20Sopenharmony_ci			if (sock)
15538c2ecf20Sopenharmony_ci				nvq->rx_ring = get_tap_ptr_ring(sock->file);
15548c2ecf20Sopenharmony_ci			else
15558c2ecf20Sopenharmony_ci				nvq->rx_ring = NULL;
15568c2ecf20Sopenharmony_ci		}
15578c2ecf20Sopenharmony_ci
15588c2ecf20Sopenharmony_ci		oldubufs = nvq->ubufs;
15598c2ecf20Sopenharmony_ci		nvq->ubufs = ubufs;
15608c2ecf20Sopenharmony_ci
15618c2ecf20Sopenharmony_ci		n->tx_packets = 0;
15628c2ecf20Sopenharmony_ci		n->tx_zcopy_err = 0;
15638c2ecf20Sopenharmony_ci		n->tx_flush = false;
15648c2ecf20Sopenharmony_ci	}
15658c2ecf20Sopenharmony_ci
15668c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
15678c2ecf20Sopenharmony_ci
15688c2ecf20Sopenharmony_ci	if (oldubufs) {
15698c2ecf20Sopenharmony_ci		vhost_net_ubuf_put_wait_and_free(oldubufs);
15708c2ecf20Sopenharmony_ci		mutex_lock(&vq->mutex);
15718c2ecf20Sopenharmony_ci		vhost_zerocopy_signal_used(n, vq);
15728c2ecf20Sopenharmony_ci		mutex_unlock(&vq->mutex);
15738c2ecf20Sopenharmony_ci	}
15748c2ecf20Sopenharmony_ci
15758c2ecf20Sopenharmony_ci	if (oldsock) {
15768c2ecf20Sopenharmony_ci		vhost_net_flush_vq(n, index);
15778c2ecf20Sopenharmony_ci		sockfd_put(oldsock);
15788c2ecf20Sopenharmony_ci	}
15798c2ecf20Sopenharmony_ci
15808c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
15818c2ecf20Sopenharmony_ci	return 0;
15828c2ecf20Sopenharmony_ci
15838c2ecf20Sopenharmony_cierr_used:
15848c2ecf20Sopenharmony_ci	vhost_vq_set_backend(vq, oldsock);
15858c2ecf20Sopenharmony_ci	vhost_net_enable_vq(n, vq);
15868c2ecf20Sopenharmony_ci	if (ubufs)
15878c2ecf20Sopenharmony_ci		vhost_net_ubuf_put_wait_and_free(ubufs);
15888c2ecf20Sopenharmony_cierr_ubufs:
15898c2ecf20Sopenharmony_ci	if (sock)
15908c2ecf20Sopenharmony_ci		sockfd_put(sock);
15918c2ecf20Sopenharmony_cierr_vq:
15928c2ecf20Sopenharmony_ci	mutex_unlock(&vq->mutex);
15938c2ecf20Sopenharmony_cierr:
15948c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
15958c2ecf20Sopenharmony_ci	return r;
15968c2ecf20Sopenharmony_ci}
15978c2ecf20Sopenharmony_ci
15988c2ecf20Sopenharmony_cistatic long vhost_net_reset_owner(struct vhost_net *n)
15998c2ecf20Sopenharmony_ci{
16008c2ecf20Sopenharmony_ci	struct socket *tx_sock = NULL;
16018c2ecf20Sopenharmony_ci	struct socket *rx_sock = NULL;
16028c2ecf20Sopenharmony_ci	long err;
16038c2ecf20Sopenharmony_ci	struct vhost_iotlb *umem;
16048c2ecf20Sopenharmony_ci
16058c2ecf20Sopenharmony_ci	mutex_lock(&n->dev.mutex);
16068c2ecf20Sopenharmony_ci	err = vhost_dev_check_owner(&n->dev);
16078c2ecf20Sopenharmony_ci	if (err)
16088c2ecf20Sopenharmony_ci		goto done;
16098c2ecf20Sopenharmony_ci	umem = vhost_dev_reset_owner_prepare();
16108c2ecf20Sopenharmony_ci	if (!umem) {
16118c2ecf20Sopenharmony_ci		err = -ENOMEM;
16128c2ecf20Sopenharmony_ci		goto done;
16138c2ecf20Sopenharmony_ci	}
16148c2ecf20Sopenharmony_ci	vhost_net_stop(n, &tx_sock, &rx_sock);
16158c2ecf20Sopenharmony_ci	vhost_net_flush(n);
16168c2ecf20Sopenharmony_ci	vhost_dev_stop(&n->dev);
16178c2ecf20Sopenharmony_ci	vhost_dev_reset_owner(&n->dev, umem);
16188c2ecf20Sopenharmony_ci	vhost_net_vq_reset(n);
16198c2ecf20Sopenharmony_cidone:
16208c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
16218c2ecf20Sopenharmony_ci	if (tx_sock)
16228c2ecf20Sopenharmony_ci		sockfd_put(tx_sock);
16238c2ecf20Sopenharmony_ci	if (rx_sock)
16248c2ecf20Sopenharmony_ci		sockfd_put(rx_sock);
16258c2ecf20Sopenharmony_ci	return err;
16268c2ecf20Sopenharmony_ci}
16278c2ecf20Sopenharmony_ci
16288c2ecf20Sopenharmony_cistatic int vhost_net_set_features(struct vhost_net *n, u64 features)
16298c2ecf20Sopenharmony_ci{
16308c2ecf20Sopenharmony_ci	size_t vhost_hlen, sock_hlen, hdr_len;
16318c2ecf20Sopenharmony_ci	int i;
16328c2ecf20Sopenharmony_ci
16338c2ecf20Sopenharmony_ci	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
16348c2ecf20Sopenharmony_ci			       (1ULL << VIRTIO_F_VERSION_1))) ?
16358c2ecf20Sopenharmony_ci			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
16368c2ecf20Sopenharmony_ci			sizeof(struct virtio_net_hdr);
16378c2ecf20Sopenharmony_ci	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
16388c2ecf20Sopenharmony_ci		/* vhost provides vnet_hdr */
16398c2ecf20Sopenharmony_ci		vhost_hlen = hdr_len;
16408c2ecf20Sopenharmony_ci		sock_hlen = 0;
16418c2ecf20Sopenharmony_ci	} else {
16428c2ecf20Sopenharmony_ci		/* socket provides vnet_hdr */
16438c2ecf20Sopenharmony_ci		vhost_hlen = 0;
16448c2ecf20Sopenharmony_ci		sock_hlen = hdr_len;
16458c2ecf20Sopenharmony_ci	}
16468c2ecf20Sopenharmony_ci	mutex_lock(&n->dev.mutex);
16478c2ecf20Sopenharmony_ci	if ((features & (1 << VHOST_F_LOG_ALL)) &&
16488c2ecf20Sopenharmony_ci	    !vhost_log_access_ok(&n->dev))
16498c2ecf20Sopenharmony_ci		goto out_unlock;
16508c2ecf20Sopenharmony_ci
16518c2ecf20Sopenharmony_ci	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
16528c2ecf20Sopenharmony_ci		if (vhost_init_device_iotlb(&n->dev, true))
16538c2ecf20Sopenharmony_ci			goto out_unlock;
16548c2ecf20Sopenharmony_ci	}
16558c2ecf20Sopenharmony_ci
16568c2ecf20Sopenharmony_ci	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
16578c2ecf20Sopenharmony_ci		mutex_lock(&n->vqs[i].vq.mutex);
16588c2ecf20Sopenharmony_ci		n->vqs[i].vq.acked_features = features;
16598c2ecf20Sopenharmony_ci		n->vqs[i].vhost_hlen = vhost_hlen;
16608c2ecf20Sopenharmony_ci		n->vqs[i].sock_hlen = sock_hlen;
16618c2ecf20Sopenharmony_ci		mutex_unlock(&n->vqs[i].vq.mutex);
16628c2ecf20Sopenharmony_ci	}
16638c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
16648c2ecf20Sopenharmony_ci	return 0;
16658c2ecf20Sopenharmony_ci
16668c2ecf20Sopenharmony_ciout_unlock:
16678c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
16688c2ecf20Sopenharmony_ci	return -EFAULT;
16698c2ecf20Sopenharmony_ci}
16708c2ecf20Sopenharmony_ci
16718c2ecf20Sopenharmony_cistatic long vhost_net_set_owner(struct vhost_net *n)
16728c2ecf20Sopenharmony_ci{
16738c2ecf20Sopenharmony_ci	int r;
16748c2ecf20Sopenharmony_ci
16758c2ecf20Sopenharmony_ci	mutex_lock(&n->dev.mutex);
16768c2ecf20Sopenharmony_ci	if (vhost_dev_has_owner(&n->dev)) {
16778c2ecf20Sopenharmony_ci		r = -EBUSY;
16788c2ecf20Sopenharmony_ci		goto out;
16798c2ecf20Sopenharmony_ci	}
16808c2ecf20Sopenharmony_ci	r = vhost_net_set_ubuf_info(n);
16818c2ecf20Sopenharmony_ci	if (r)
16828c2ecf20Sopenharmony_ci		goto out;
16838c2ecf20Sopenharmony_ci	r = vhost_dev_set_owner(&n->dev);
16848c2ecf20Sopenharmony_ci	if (r)
16858c2ecf20Sopenharmony_ci		vhost_net_clear_ubuf_info(n);
16868c2ecf20Sopenharmony_ci	vhost_net_flush(n);
16878c2ecf20Sopenharmony_ciout:
16888c2ecf20Sopenharmony_ci	mutex_unlock(&n->dev.mutex);
16898c2ecf20Sopenharmony_ci	return r;
16908c2ecf20Sopenharmony_ci}
16918c2ecf20Sopenharmony_ci
16928c2ecf20Sopenharmony_cistatic long vhost_net_ioctl(struct file *f, unsigned int ioctl,
16938c2ecf20Sopenharmony_ci			    unsigned long arg)
16948c2ecf20Sopenharmony_ci{
16958c2ecf20Sopenharmony_ci	struct vhost_net *n = f->private_data;
16968c2ecf20Sopenharmony_ci	void __user *argp = (void __user *)arg;
16978c2ecf20Sopenharmony_ci	u64 __user *featurep = argp;
16988c2ecf20Sopenharmony_ci	struct vhost_vring_file backend;
16998c2ecf20Sopenharmony_ci	u64 features;
17008c2ecf20Sopenharmony_ci	int r;
17018c2ecf20Sopenharmony_ci
17028c2ecf20Sopenharmony_ci	switch (ioctl) {
17038c2ecf20Sopenharmony_ci	case VHOST_NET_SET_BACKEND:
17048c2ecf20Sopenharmony_ci		if (copy_from_user(&backend, argp, sizeof backend))
17058c2ecf20Sopenharmony_ci			return -EFAULT;
17068c2ecf20Sopenharmony_ci		return vhost_net_set_backend(n, backend.index, backend.fd);
17078c2ecf20Sopenharmony_ci	case VHOST_GET_FEATURES:
17088c2ecf20Sopenharmony_ci		features = VHOST_NET_FEATURES;
17098c2ecf20Sopenharmony_ci		if (copy_to_user(featurep, &features, sizeof features))
17108c2ecf20Sopenharmony_ci			return -EFAULT;
17118c2ecf20Sopenharmony_ci		return 0;
17128c2ecf20Sopenharmony_ci	case VHOST_SET_FEATURES:
17138c2ecf20Sopenharmony_ci		if (copy_from_user(&features, featurep, sizeof features))
17148c2ecf20Sopenharmony_ci			return -EFAULT;
17158c2ecf20Sopenharmony_ci		if (features & ~VHOST_NET_FEATURES)
17168c2ecf20Sopenharmony_ci			return -EOPNOTSUPP;
17178c2ecf20Sopenharmony_ci		return vhost_net_set_features(n, features);
17188c2ecf20Sopenharmony_ci	case VHOST_GET_BACKEND_FEATURES:
17198c2ecf20Sopenharmony_ci		features = VHOST_NET_BACKEND_FEATURES;
17208c2ecf20Sopenharmony_ci		if (copy_to_user(featurep, &features, sizeof(features)))
17218c2ecf20Sopenharmony_ci			return -EFAULT;
17228c2ecf20Sopenharmony_ci		return 0;
17238c2ecf20Sopenharmony_ci	case VHOST_SET_BACKEND_FEATURES:
17248c2ecf20Sopenharmony_ci		if (copy_from_user(&features, featurep, sizeof(features)))
17258c2ecf20Sopenharmony_ci			return -EFAULT;
17268c2ecf20Sopenharmony_ci		if (features & ~VHOST_NET_BACKEND_FEATURES)
17278c2ecf20Sopenharmony_ci			return -EOPNOTSUPP;
17288c2ecf20Sopenharmony_ci		vhost_set_backend_features(&n->dev, features);
17298c2ecf20Sopenharmony_ci		return 0;
17308c2ecf20Sopenharmony_ci	case VHOST_RESET_OWNER:
17318c2ecf20Sopenharmony_ci		return vhost_net_reset_owner(n);
17328c2ecf20Sopenharmony_ci	case VHOST_SET_OWNER:
17338c2ecf20Sopenharmony_ci		return vhost_net_set_owner(n);
17348c2ecf20Sopenharmony_ci	default:
17358c2ecf20Sopenharmony_ci		mutex_lock(&n->dev.mutex);
17368c2ecf20Sopenharmony_ci		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
17378c2ecf20Sopenharmony_ci		if (r == -ENOIOCTLCMD)
17388c2ecf20Sopenharmony_ci			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
17398c2ecf20Sopenharmony_ci		else
17408c2ecf20Sopenharmony_ci			vhost_net_flush(n);
17418c2ecf20Sopenharmony_ci		mutex_unlock(&n->dev.mutex);
17428c2ecf20Sopenharmony_ci		return r;
17438c2ecf20Sopenharmony_ci	}
17448c2ecf20Sopenharmony_ci}
17458c2ecf20Sopenharmony_ci
17468c2ecf20Sopenharmony_cistatic ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
17478c2ecf20Sopenharmony_ci{
17488c2ecf20Sopenharmony_ci	struct file *file = iocb->ki_filp;
17498c2ecf20Sopenharmony_ci	struct vhost_net *n = file->private_data;
17508c2ecf20Sopenharmony_ci	struct vhost_dev *dev = &n->dev;
17518c2ecf20Sopenharmony_ci	int noblock = file->f_flags & O_NONBLOCK;
17528c2ecf20Sopenharmony_ci
17538c2ecf20Sopenharmony_ci	return vhost_chr_read_iter(dev, to, noblock);
17548c2ecf20Sopenharmony_ci}
17558c2ecf20Sopenharmony_ci
17568c2ecf20Sopenharmony_cistatic ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
17578c2ecf20Sopenharmony_ci					struct iov_iter *from)
17588c2ecf20Sopenharmony_ci{
17598c2ecf20Sopenharmony_ci	struct file *file = iocb->ki_filp;
17608c2ecf20Sopenharmony_ci	struct vhost_net *n = file->private_data;
17618c2ecf20Sopenharmony_ci	struct vhost_dev *dev = &n->dev;
17628c2ecf20Sopenharmony_ci
17638c2ecf20Sopenharmony_ci	return vhost_chr_write_iter(dev, from);
17648c2ecf20Sopenharmony_ci}
17658c2ecf20Sopenharmony_ci
17668c2ecf20Sopenharmony_cistatic __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
17678c2ecf20Sopenharmony_ci{
17688c2ecf20Sopenharmony_ci	struct vhost_net *n = file->private_data;
17698c2ecf20Sopenharmony_ci	struct vhost_dev *dev = &n->dev;
17708c2ecf20Sopenharmony_ci
17718c2ecf20Sopenharmony_ci	return vhost_chr_poll(file, dev, wait);
17728c2ecf20Sopenharmony_ci}
17738c2ecf20Sopenharmony_ci
17748c2ecf20Sopenharmony_cistatic const struct file_operations vhost_net_fops = {
17758c2ecf20Sopenharmony_ci	.owner          = THIS_MODULE,
17768c2ecf20Sopenharmony_ci	.release        = vhost_net_release,
17778c2ecf20Sopenharmony_ci	.read_iter      = vhost_net_chr_read_iter,
17788c2ecf20Sopenharmony_ci	.write_iter     = vhost_net_chr_write_iter,
17798c2ecf20Sopenharmony_ci	.poll           = vhost_net_chr_poll,
17808c2ecf20Sopenharmony_ci	.unlocked_ioctl = vhost_net_ioctl,
17818c2ecf20Sopenharmony_ci	.compat_ioctl   = compat_ptr_ioctl,
17828c2ecf20Sopenharmony_ci	.open           = vhost_net_open,
17838c2ecf20Sopenharmony_ci	.llseek		= noop_llseek,
17848c2ecf20Sopenharmony_ci};
17858c2ecf20Sopenharmony_ci
17868c2ecf20Sopenharmony_cistatic struct miscdevice vhost_net_misc = {
17878c2ecf20Sopenharmony_ci	.minor = VHOST_NET_MINOR,
17888c2ecf20Sopenharmony_ci	.name = "vhost-net",
17898c2ecf20Sopenharmony_ci	.fops = &vhost_net_fops,
17908c2ecf20Sopenharmony_ci};
17918c2ecf20Sopenharmony_ci
17928c2ecf20Sopenharmony_cistatic int vhost_net_init(void)
17938c2ecf20Sopenharmony_ci{
17948c2ecf20Sopenharmony_ci	if (experimental_zcopytx)
17958c2ecf20Sopenharmony_ci		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
17968c2ecf20Sopenharmony_ci	return misc_register(&vhost_net_misc);
17978c2ecf20Sopenharmony_ci}
17988c2ecf20Sopenharmony_cimodule_init(vhost_net_init);
17998c2ecf20Sopenharmony_ci
18008c2ecf20Sopenharmony_cistatic void vhost_net_exit(void)
18018c2ecf20Sopenharmony_ci{
18028c2ecf20Sopenharmony_ci	misc_deregister(&vhost_net_misc);
18038c2ecf20Sopenharmony_ci}
18048c2ecf20Sopenharmony_cimodule_exit(vhost_net_exit);
18058c2ecf20Sopenharmony_ci
18068c2ecf20Sopenharmony_ciMODULE_VERSION("0.0.1");
18078c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL v2");
18088c2ecf20Sopenharmony_ciMODULE_AUTHOR("Michael S. Tsirkin");
18098c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("Host kernel accelerator for virtio net");
18108c2ecf20Sopenharmony_ciMODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
18118c2ecf20Sopenharmony_ciMODULE_ALIAS("devname:vhost-net");
1812