18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
38c2ecf20Sopenharmony_ci
48c2ecf20Sopenharmony_ci#include <linux/skmsg.h>
58c2ecf20Sopenharmony_ci#include <linux/filter.h>
68c2ecf20Sopenharmony_ci#include <linux/bpf.h>
78c2ecf20Sopenharmony_ci#include <linux/init.h>
88c2ecf20Sopenharmony_ci#include <linux/wait.h>
98c2ecf20Sopenharmony_ci#include <linux/util_macros.h>
108c2ecf20Sopenharmony_ci
118c2ecf20Sopenharmony_ci#include <net/inet_common.h>
128c2ecf20Sopenharmony_ci#include <net/tls.h>
138c2ecf20Sopenharmony_ci
/* Copy up to @len bytes from the sk_msg entries queued on
 * psock->ingress_msg into the iov_iter of @msg.
 *
 * With MSG_PEEK set in @flags the queued messages are walked but left
 * untouched. Otherwise every copied byte is consumed: the scatterlist
 * element is advanced, msg_rx->sg.size is reduced, and a message whose
 * elements have all been drained is unlinked from the list and freed.
 *
 * This function takes no lock itself; tcp_bpf_recvmsg() in this file calls
 * it between lock_sock() and release_sock().
 *
 * Returns the number of bytes copied (possibly 0 when nothing is queued or
 * @len is 0), or -EFAULT when copy_page_to_iter() copied nothing and no
 * byte had been copied before.
 */
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
		      struct msghdr *msg, int len, int flags)
{
	struct iov_iter *iter = &msg->msg_iter;
	int peek = flags & MSG_PEEK;
	struct sk_msg *msg_rx;
	int i, copied = 0;

	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
					  struct sk_msg, list);

	while (copied != len) {
		struct scatterlist *sge;

		if (unlikely(!msg_rx))
			break;

		/* Walk the scatterlist ring of this message from sg.start. */
		i = msg_rx->sg.start;
		do {
			struct page *page;
			int copy;

			sge = sk_msg_elem(msg_rx, i);
			copy = sge->length;
			page = sg_page(sge);
			/* Clamp to what the caller still has room for. */
			if (copied + copy > len)
				copy = len - copied;
			copy = copy_page_to_iter(page, sge->offset, copy, iter);
			/* No progress: report what was copied so far, or
			 * -EFAULT if that is nothing.
			 */
			if (!copy)
				return copied ? copied : -EFAULT;

			copied += copy;
			if (likely(!peek)) {
				/* Consume the copied bytes from the element. */
				sge->offset += copy;
				sge->length -= copy;
				/* Memory is only uncharged here for messages
				 * that are not backed by an skb.
				 */
				if (!msg_rx->skb)
					sk_mem_uncharge(sk, copy);
				msg_rx->sg.size -= copy;

				if (!sge->length) {
					sk_msg_iter_var_next(i);
					/* Page reference is dropped only for
					 * messages without an skb.
					 */
					if (!msg_rx->skb)
						put_page(page);
				}
			} else {
				/* The peek path is not optimized: if the
				 * element was not copied in full, stop here
				 * and return what has been copied.
				 */
				if (copy != sge->length)
					return copied;
				sk_msg_iter_var_next(i);
			}

			if (copied == len)
				break;
		} while (i != msg_rx->sg.end);

		if (unlikely(peek)) {
			/* Peeking never modifies the list; just step to the
			 * next queued message, if there is one.
			 */
			if (msg_rx == list_last_entry(&psock->ingress_msg,
						      struct sk_msg, list))
				break;
			msg_rx = list_next_entry(msg_rx, list);
			continue;
		}

		/* Record how far this message has been consumed and free it
		 * once the last visited element is empty and the ring has
		 * been walked to its end.
		 */
		msg_rx->sg.start = i;
		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
			list_del(&msg_rx->list);
			if (msg_rx->skb)
				consume_skb(msg_rx->skb);
			kfree(msg_rx);
		}
		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
						  struct sk_msg, list);
	}

	return copied;
}
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
938c2ecf20Sopenharmony_ci
/* Move data from @msg onto the ingress queue of @psock on socket @sk.
 *
 * A new sk_msg is allocated and, under lock_sock(), scatterlist elements are
 * transferred into it from @msg starting at msg->sg.start. When @apply_bytes
 * is non-zero at most that many bytes are moved; otherwise the walk runs up
 * to msg->sg.end. Each transferred chunk is charged to @sk. On success the
 * new message is queued with sk_psock_queue_msg() and the reader is notified
 * with sk_psock_data_ready(). @flags is not used in this function.
 *
 * Returns 0 on success (including a partial transfer), or -ENOMEM when the
 * allocation fails or memory cannot be scheduled before anything was moved.
 */
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
			   struct sk_msg *msg, u32 apply_bytes, int flags)
{
	/* Remember whether a byte limit was requested; apply_bytes itself is
	 * counted down below.
	 */
	bool apply = apply_bytes;
	struct scatterlist *sge;
	u32 size, copied = 0;
	struct sk_msg *tmp;
	int i, ret = 0;

	tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
	if (unlikely(!tmp))
		return -ENOMEM;

	lock_sock(sk);
	tmp->sg.start = msg->sg.start;
	i = msg->sg.start;
	do {
		sge = sk_msg_elem(msg, i);
		/* Take the whole element unless the byte limit is smaller. */
		size = (apply && apply_bytes < sge->length) ?
			apply_bytes : sge->length;
		if (!sk_wmem_schedule(sk, size)) {
			/* Only fail if nothing was moved yet; otherwise the
			 * partial message is still queued below.
			 */
			if (!copied)
				ret = -ENOMEM;
			break;
		}

		sk_mem_charge(sk, size);
		sk_msg_xfer(tmp, msg, i, size);
		copied += size;
		/* NOTE(review): sge->length is re-read after sk_msg_xfer();
		 * presumably a non-zero value means the source element still
		 * holds data on the same page, hence the extra page
		 * reference - confirm against sk_msg_xfer().
		 */
		if (sge->length)
			get_page(sk_msg_page(tmp, i));
		sk_msg_iter_var_next(i);
		tmp->sg.end = i;
		if (apply) {
			apply_bytes -= size;
			if (!apply_bytes) {
				/* Limit reached mid-element: step back so
				 * msg->sg.start keeps pointing at the element
				 * that still has data.
				 */
				if (sge->length)
					sk_msg_iter_var_prev(i);
				break;
			}
		}
	} while (i != msg->sg.end);

	if (!ret) {
		msg->sg.start = i;
		sk_psock_queue_msg(psock, tmp);
		sk_psock_data_ready(sk, psock);
	} else {
		sk_msg_free(sk, tmp);
		kfree(tmp);
	}

	release_sock(sk);
	return ret;
}
1498c2ecf20Sopenharmony_ci
/* Transmit the data of @msg on @sk, one scatterlist element at a time,
 * starting at msg->sg.start.
 *
 * When @apply_bytes is non-zero, sending stops once that many bytes have
 * been pushed; otherwise it continues until sg.start reaches sg.end. When
 * @uncharge is true every sent byte is uncharged from @sk.
 *
 * No socket lock is taken here; tcp_bpf_push_locked() in this file wraps
 * this function in lock_sock()/release_sock().
 *
 * Returns 0 when the requested data was sent, or the value returned by the
 * underlying send call when that value is <= 0.
 */
static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
			int flags, bool uncharge)
{
	/* Remember whether a byte limit was requested; apply_bytes itself is
	 * counted down below.
	 */
	bool apply = apply_bytes;
	struct scatterlist *sge;
	struct page *page;
	int size, ret = 0;
	u32 off;

	while (1) {
		bool has_tx_ulp;

		sge = sk_msg_elem(msg, msg->sg.start);
		/* Send the whole element unless the byte limit is smaller. */
		size = (apply && apply_bytes < sge->length) ?
			apply_bytes : sge->length;
		off  = sge->offset;
		page = sg_page(sge);

		tcp_rate_check_app_limited(sk);
retry:
		/* With a TLS software TX context, send through
		 * kernel_sendpage_locked() with MSG_SENDPAGE_NOPOLICY set;
		 * otherwise hand the page straight to do_tcp_sendpages().
		 */
		has_tx_ulp = tls_sw_has_ctx_tx(sk);
		if (has_tx_ulp) {
			flags |= MSG_SENDPAGE_NOPOLICY;
			ret = kernel_sendpage_locked(sk,
						     page, off, size, flags);
		} else {
			ret = do_tcp_sendpages(sk, page, off, size, flags);
		}

		if (ret <= 0)
			return ret;
		/* Account for the bytes that actually went out. */
		if (apply)
			apply_bytes -= ret;
		msg->sg.size -= ret;
		sge->offset += ret;
		sge->length -= ret;
		if (uncharge)
			sk_mem_uncharge(sk, ret);
		/* Short send: retry with the remainder of this chunk. */
		if (ret != size) {
			size -= ret;
			off  += ret;
			goto retry;
		}
		/* Element fully sent: drop the page, advance sg.start and
		 * reset the now-unused slot.
		 */
		if (!sge->length) {
			put_page(page);
			sk_msg_iter_next(msg, start);
			sg_init_table(sge, 1);
			if (msg->sg.start == msg->sg.end)
				break;
		}
		if (apply && !apply_bytes)
			break;
	}

	return 0;
}
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_cistatic int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
2088c2ecf20Sopenharmony_ci			       u32 apply_bytes, int flags, bool uncharge)
2098c2ecf20Sopenharmony_ci{
2108c2ecf20Sopenharmony_ci	int ret;
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ci	lock_sock(sk);
2138c2ecf20Sopenharmony_ci	ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
2148c2ecf20Sopenharmony_ci	release_sock(sk);
2158c2ecf20Sopenharmony_ci	return ret;
2168c2ecf20Sopenharmony_ci}
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ciint tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
2198c2ecf20Sopenharmony_ci			  u32 bytes, int flags)
2208c2ecf20Sopenharmony_ci{
2218c2ecf20Sopenharmony_ci	bool ingress = sk_msg_to_ingress(msg);
2228c2ecf20Sopenharmony_ci	struct sk_psock *psock = sk_psock_get(sk);
2238c2ecf20Sopenharmony_ci	int ret;
2248c2ecf20Sopenharmony_ci
2258c2ecf20Sopenharmony_ci	if (unlikely(!psock))
2268c2ecf20Sopenharmony_ci		return -EPIPE;
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_ci	ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
2298c2ecf20Sopenharmony_ci			tcp_bpf_push_locked(sk, msg, bytes, flags, false);
2308c2ecf20Sopenharmony_ci	sk_psock_put(sk, psock);
2318c2ecf20Sopenharmony_ci	return ret;
2328c2ecf20Sopenharmony_ci}
2338c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
2348c2ecf20Sopenharmony_ci
2358c2ecf20Sopenharmony_ci#ifdef CONFIG_BPF_STREAM_PARSER
2368c2ecf20Sopenharmony_cistatic bool tcp_bpf_stream_read(const struct sock *sk)
2378c2ecf20Sopenharmony_ci{
2388c2ecf20Sopenharmony_ci	struct sk_psock *psock;
2398c2ecf20Sopenharmony_ci	bool empty = true;
2408c2ecf20Sopenharmony_ci
2418c2ecf20Sopenharmony_ci	rcu_read_lock();
2428c2ecf20Sopenharmony_ci	psock = sk_psock(sk);
2438c2ecf20Sopenharmony_ci	if (likely(psock))
2448c2ecf20Sopenharmony_ci		empty = list_empty(&psock->ingress_msg);
2458c2ecf20Sopenharmony_ci	rcu_read_unlock();
2468c2ecf20Sopenharmony_ci	return !empty;
2478c2ecf20Sopenharmony_ci}
2488c2ecf20Sopenharmony_ci
2498c2ecf20Sopenharmony_cistatic int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
2508c2ecf20Sopenharmony_ci			     int flags, long timeo, int *err)
2518c2ecf20Sopenharmony_ci{
2528c2ecf20Sopenharmony_ci	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2538c2ecf20Sopenharmony_ci	int ret = 0;
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci	if (sk->sk_shutdown & RCV_SHUTDOWN)
2568c2ecf20Sopenharmony_ci		return 1;
2578c2ecf20Sopenharmony_ci
2588c2ecf20Sopenharmony_ci	if (!timeo)
2598c2ecf20Sopenharmony_ci		return ret;
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci	add_wait_queue(sk_sleep(sk), &wait);
2628c2ecf20Sopenharmony_ci	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2638c2ecf20Sopenharmony_ci	ret = sk_wait_event(sk, &timeo,
2648c2ecf20Sopenharmony_ci			    !list_empty(&psock->ingress_msg) ||
2658c2ecf20Sopenharmony_ci			    !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait);
2668c2ecf20Sopenharmony_ci	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2678c2ecf20Sopenharmony_ci	remove_wait_queue(sk_sleep(sk), &wait);
2688c2ecf20Sopenharmony_ci	return ret;
2698c2ecf20Sopenharmony_ci}
2708c2ecf20Sopenharmony_ci
2718c2ecf20Sopenharmony_cistatic int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
2728c2ecf20Sopenharmony_ci		    int nonblock, int flags, int *addr_len)
2738c2ecf20Sopenharmony_ci{
2748c2ecf20Sopenharmony_ci	struct sk_psock *psock;
2758c2ecf20Sopenharmony_ci	int copied, ret;
2768c2ecf20Sopenharmony_ci
2778c2ecf20Sopenharmony_ci	if (unlikely(flags & MSG_ERRQUEUE))
2788c2ecf20Sopenharmony_ci		return inet_recv_error(sk, msg, len, addr_len);
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci	psock = sk_psock_get(sk);
2818c2ecf20Sopenharmony_ci	if (unlikely(!psock))
2828c2ecf20Sopenharmony_ci		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
2838c2ecf20Sopenharmony_ci	if (!skb_queue_empty(&sk->sk_receive_queue) &&
2848c2ecf20Sopenharmony_ci	    sk_psock_queue_empty(psock)) {
2858c2ecf20Sopenharmony_ci		sk_psock_put(sk, psock);
2868c2ecf20Sopenharmony_ci		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
2878c2ecf20Sopenharmony_ci	}
2888c2ecf20Sopenharmony_ci	lock_sock(sk);
2898c2ecf20Sopenharmony_cimsg_bytes_ready:
2908c2ecf20Sopenharmony_ci	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
2918c2ecf20Sopenharmony_ci	if (!copied) {
2928c2ecf20Sopenharmony_ci		int data, err = 0;
2938c2ecf20Sopenharmony_ci		long timeo;
2948c2ecf20Sopenharmony_ci
2958c2ecf20Sopenharmony_ci		timeo = sock_rcvtimeo(sk, nonblock);
2968c2ecf20Sopenharmony_ci		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
2978c2ecf20Sopenharmony_ci		if (data) {
2988c2ecf20Sopenharmony_ci			if (!sk_psock_queue_empty(psock))
2998c2ecf20Sopenharmony_ci				goto msg_bytes_ready;
3008c2ecf20Sopenharmony_ci			release_sock(sk);
3018c2ecf20Sopenharmony_ci			sk_psock_put(sk, psock);
3028c2ecf20Sopenharmony_ci			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
3038c2ecf20Sopenharmony_ci		}
3048c2ecf20Sopenharmony_ci		if (err) {
3058c2ecf20Sopenharmony_ci			ret = err;
3068c2ecf20Sopenharmony_ci			goto out;
3078c2ecf20Sopenharmony_ci		}
3088c2ecf20Sopenharmony_ci		copied = -EAGAIN;
3098c2ecf20Sopenharmony_ci	}
3108c2ecf20Sopenharmony_ci	ret = copied;
3118c2ecf20Sopenharmony_ciout:
3128c2ecf20Sopenharmony_ci	release_sock(sk);
3138c2ecf20Sopenharmony_ci	sk_psock_put(sk, psock);
3148c2ecf20Sopenharmony_ci	return ret;
3158c2ecf20Sopenharmony_ci}
3168c2ecf20Sopenharmony_ci
/* Apply the msg verdict to @msg and act on it.
 *
 * If psock->eval is __SK_NONE the verdict is obtained from
 * sk_psock_msg_verdict(). If the message asks for more cork bytes than it
 * currently holds (and it was not full on entry), it is copied into
 * psock->cork and the function returns 0 without sending. Otherwise up to
 * psock->apply_bytes (or the whole message) is handled according to the
 * verdict:
 *   __SK_PASS     - transmitted on @sk via tcp_bpf_push()
 *   __SK_REDIRECT - handed to psock->sk_redir via tcp_bpf_sendmsg_redir(),
 *                   with the lock of @sk released around that call
 *   __SK_DROP / anything else - freed, returning -EACCES
 * If data remains in @msg afterwards the whole sequence repeats.
 *
 * @copied is the caller's running byte count; bytes that are freed instead
 * of sent are subtracted from it.
 *
 * The function releases and re-takes the lock of @sk in the redirect case,
 * so the caller must hold it (tcp_bpf_sendmsg() and tcp_bpf_sendpage() in
 * this file do).
 *
 * Returns 0 on success or a negative errno.
 */
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
				struct sk_msg *msg, int *copied, int flags)
{
	/* enospc: the message ring was already full on entry, so corking
	 * more data into it is not possible.
	 */
	bool cork = false, enospc = sk_msg_full(msg);
	struct sock *sk_redir;
	u32 tosend, origsize, sent, delta = 0;
	u32 eval;
	int ret;

more_data:
	if (psock->eval == __SK_NONE) {
		/* Track delta in msg size to add/subtract it on SK_DROP from
		 * returned to user copied size. This ensures user doesn't
		 * get a positive return code with msg_cut_data and SK_DROP
		 * verdict.
		 */
		delta = msg->sg.size;
		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
		delta -= msg->sg.size;
	}

	/* More cork bytes requested than currently buffered: stash the
	 * message in psock->cork and wait for further data.
	 */
	if (msg->cork_bytes &&
	    msg->cork_bytes > msg->sg.size && !enospc) {
		psock->cork_bytes = msg->cork_bytes - msg->sg.size;
		if (!psock->cork) {
			psock->cork = kzalloc(sizeof(*psock->cork),
					      GFP_ATOMIC | __GFP_NOWARN);
			if (!psock->cork)
				return -ENOMEM;
		}
		memcpy(psock->cork, msg, sizeof(*msg));
		return 0;
	}

	/* Handle the whole message unless apply_bytes limits it. */
	tosend = msg->sg.size;
	if (psock->apply_bytes && psock->apply_bytes < tosend)
		tosend = psock->apply_bytes;
	eval = __SK_NONE;

	switch (psock->eval) {
	case __SK_PASS:
		ret = tcp_bpf_push(sk, msg, tosend, flags, true);
		if (unlikely(ret)) {
			/* Send failed: free what is left and do not count it
			 * as copied.
			 */
			*copied -= sk_msg_free(sk, msg);
			break;
		}
		sk_msg_apply_bytes(psock, tosend);
		break;
	case __SK_REDIRECT:
		sk_redir = psock->sk_redir;
		sk_msg_apply_bytes(psock, tosend);
		if (!psock->apply_bytes) {
			/* Clean up before releasing the sock lock. */
			eval = psock->eval;
			psock->eval = __SK_NONE;
			psock->sk_redir = NULL;
		}
		/* Detach the cork buffer from the psock; from here on this
		 * function owns it and frees it below.
		 */
		if (psock->cork) {
			cork = true;
			psock->cork = NULL;
		}
		sk_msg_return(sk, msg, tosend);
		release_sock(sk);

		/* Measure how much the redirect actually consumed. */
		origsize = msg->sg.size;
		ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
		sent = origsize - msg->sg.size;

		/* psock->sk_redir was cleared above, so the reference on the
		 * redirect socket is dropped here instead of at the end.
		 */
		if (eval == __SK_REDIRECT)
			sock_put(sk_redir);

		lock_sock(sk);
		if (unlikely(ret < 0)) {
			int free = sk_msg_free_nocharge(sk, msg);

			if (!cork)
				*copied -= free;
		}
		if (cork) {
			sk_msg_free(sk, msg);
			kfree(msg);
			msg = NULL;
			ret = 0;
		}
		break;
	case __SK_DROP:
	default:
		sk_msg_free_partial(sk, msg, tosend);
		sk_msg_apply_bytes(psock, tosend);
		*copied -= (tosend + delta);
		return -EACCES;
	}

	if (likely(!ret)) {
		/* The verdict has been applied to all requested bytes:
		 * reset it and drop any remaining redirect reference.
		 */
		if (!psock->apply_bytes) {
			psock->eval =  __SK_NONE;
			if (psock->sk_redir) {
				sock_put(psock->sk_redir);
				psock->sk_redir = NULL;
			}
		}
		/* Data is still left in the message: go around again. */
		if (msg &&
		    msg->sg.data[msg->sg.start].page_link &&
		    msg->sg.data[msg->sg.start].length) {
			/* Re-charge the part that was returned before the
			 * redirect but not actually sent.
			 */
			if (eval == __SK_REDIRECT)
				sk_mem_charge(sk, tosend - sent);
			goto more_data;
		}
	}
	return ret;
}
4288c2ecf20Sopenharmony_ci
/* sendmsg handler used for sockets that carry a psock.
 *
 * Without a psock the call is forwarded to tcp_sendmsg(). Otherwise, under
 * the socket lock, user data is copied into an sk_msg (the pending
 * psock->cork buffer if there is one, else a message on the stack) and
 * passed to tcp_bpf_send_verdict(), repeating until the user buffer is
 * drained. While cork bytes are outstanding, data is only accumulated and
 * the verdict is deferred.
 *
 * Returns the number of bytes accepted when that is positive, otherwise the
 * last error (0 or a negative errno).
 */
static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct sk_msg tmp, *msg_tx = NULL;
	int copied = 0, err = 0;
	struct sk_psock *psock;
	long timeo;
	int flags;

	/* Don't let internal do_tcp_sendpages() flags through */
	flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED);
	flags |= MSG_NO_SHARED_FRAGS;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return tcp_sendmsg(sk, msg, size);

	lock_sock(sk);
	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	while (msg_data_left(msg)) {
		bool enospc = false;
		u32 copy, osize;

		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		}

		copy = msg_data_left(msg);
		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;
		/* Keep appending to a pending cork buffer; otherwise start a
		 * fresh message on the stack.
		 */
		if (psock->cork) {
			msg_tx = psock->cork;
		} else {
			msg_tx = &tmp;
			sk_msg_init(msg_tx);
		}

		/* osize: message size before this iteration's data. */
		osize = msg_tx->sg.size;
		err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
		if (err) {
			if (err != -ENOSPC)
				goto wait_for_memory;
			/* Ring is full: copy only as much as was allocated. */
			enospc = true;
			copy = msg_tx->sg.size - osize;
		}

		err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
					       copy);
		if (err < 0) {
			/* Undo this iteration's allocation. */
			sk_msg_trim(sk, msg_tx, osize);
			goto out_err;
		}

		copied += copy;
		if (psock->cork_bytes) {
			if (size > psock->cork_bytes)
				psock->cork_bytes = 0;
			else
				psock->cork_bytes -= size;
			/* Still corking and room left: return to the caller
			 * without running the verdict.
			 */
			if (psock->cork_bytes && !enospc)
				goto out_err;
			/* All cork bytes are accounted, rerun the prog. */
			psock->eval = __SK_NONE;
			psock->cork_bytes = 0;
		}

		err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
		if (unlikely(err < 0))
			goto out_err;
		continue;
wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		err = sk_stream_wait_memory(sk, &timeo);
		if (err) {
			/* Free the stack message; a cork buffer stays owned
			 * by the psock.
			 */
			if (msg_tx && msg_tx != psock->cork)
				sk_msg_free(sk, msg_tx);
			goto out_err;
		}
	}
out_err:
	if (err < 0)
		err = sk_stream_error(sk, msg->msg_flags, err);
	release_sock(sk);
	sk_psock_put(sk, psock);
	return copied > 0 ? copied : err;
}
5168c2ecf20Sopenharmony_ci
5178c2ecf20Sopenharmony_cistatic int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
5188c2ecf20Sopenharmony_ci			    size_t size, int flags)
5198c2ecf20Sopenharmony_ci{
5208c2ecf20Sopenharmony_ci	struct sk_msg tmp, *msg = NULL;
5218c2ecf20Sopenharmony_ci	int err = 0, copied = 0;
5228c2ecf20Sopenharmony_ci	struct sk_psock *psock;
5238c2ecf20Sopenharmony_ci	bool enospc = false;
5248c2ecf20Sopenharmony_ci
5258c2ecf20Sopenharmony_ci	psock = sk_psock_get(sk);
5268c2ecf20Sopenharmony_ci	if (unlikely(!psock))
5278c2ecf20Sopenharmony_ci		return tcp_sendpage(sk, page, offset, size, flags);
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci	lock_sock(sk);
5308c2ecf20Sopenharmony_ci	if (psock->cork) {
5318c2ecf20Sopenharmony_ci		msg = psock->cork;
5328c2ecf20Sopenharmony_ci	} else {
5338c2ecf20Sopenharmony_ci		msg = &tmp;
5348c2ecf20Sopenharmony_ci		sk_msg_init(msg);
5358c2ecf20Sopenharmony_ci	}
5368c2ecf20Sopenharmony_ci
5378c2ecf20Sopenharmony_ci	/* Catch case where ring is full and sendpage is stalled. */
5388c2ecf20Sopenharmony_ci	if (unlikely(sk_msg_full(msg)))
5398c2ecf20Sopenharmony_ci		goto out_err;
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci	sk_msg_page_add(msg, page, size, offset);
5428c2ecf20Sopenharmony_ci	sk_mem_charge(sk, size);
5438c2ecf20Sopenharmony_ci	copied = size;
5448c2ecf20Sopenharmony_ci	if (sk_msg_full(msg))
5458c2ecf20Sopenharmony_ci		enospc = true;
5468c2ecf20Sopenharmony_ci	if (psock->cork_bytes) {
5478c2ecf20Sopenharmony_ci		if (size > psock->cork_bytes)
5488c2ecf20Sopenharmony_ci			psock->cork_bytes = 0;
5498c2ecf20Sopenharmony_ci		else
5508c2ecf20Sopenharmony_ci			psock->cork_bytes -= size;
5518c2ecf20Sopenharmony_ci		if (psock->cork_bytes && !enospc)
5528c2ecf20Sopenharmony_ci			goto out_err;
5538c2ecf20Sopenharmony_ci		/* All cork bytes are accounted, rerun the prog. */
5548c2ecf20Sopenharmony_ci		psock->eval = __SK_NONE;
5558c2ecf20Sopenharmony_ci		psock->cork_bytes = 0;
5568c2ecf20Sopenharmony_ci	}
5578c2ecf20Sopenharmony_ci
5588c2ecf20Sopenharmony_ci	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
5598c2ecf20Sopenharmony_ciout_err:
5608c2ecf20Sopenharmony_ci	release_sock(sk);
5618c2ecf20Sopenharmony_ci	sk_psock_put(sk, psock);
5628c2ecf20Sopenharmony_ci	return copied ? copied : err;
5638c2ecf20Sopenharmony_ci}
5648c2ecf20Sopenharmony_ci
/* First index into tcp_bpf_prots: address family of the socket. */
enum {
	TCP_BPF_IPV4,
	TCP_BPF_IPV6,
	TCP_BPF_NUM_PROTS,
};

/* Second index into tcp_bpf_prots: which set of callbacks is overridden.
 * tcp_bpf_rebuild_protos() fills BASE with close/recvmsg/stream_memory_read
 * overrides and TX with BASE plus sendmsg/sendpage overrides.
 */
enum {
	TCP_BPF_BASE,
	TCP_BPF_TX,
	TCP_BPF_NUM_CFGS,
};

/* Base proto the IPv6 entries of tcp_bpf_prots were last built from;
 * written under tcpv6_prot_lock in tcp_bpf_check_v6_needs_rebuild().
 */
static struct proto *tcpv6_prot_saved __read_mostly;
static DEFINE_SPINLOCK(tcpv6_prot_lock);
/* Proto variants handed out by tcp_bpf_get_proto(), indexed [family][config]. */
static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
5808c2ecf20Sopenharmony_ci
5818c2ecf20Sopenharmony_cistatic void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
5828c2ecf20Sopenharmony_ci				   struct proto *base)
5838c2ecf20Sopenharmony_ci{
5848c2ecf20Sopenharmony_ci	prot[TCP_BPF_BASE]			= *base;
5858c2ecf20Sopenharmony_ci	prot[TCP_BPF_BASE].close		= sock_map_close;
5868c2ecf20Sopenharmony_ci	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
5878c2ecf20Sopenharmony_ci	prot[TCP_BPF_BASE].stream_memory_read	= tcp_bpf_stream_read;
5888c2ecf20Sopenharmony_ci
5898c2ecf20Sopenharmony_ci	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
5908c2ecf20Sopenharmony_ci	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
5918c2ecf20Sopenharmony_ci	prot[TCP_BPF_TX].sendpage		= tcp_bpf_sendpage;
5928c2ecf20Sopenharmony_ci}
5938c2ecf20Sopenharmony_ci
5948c2ecf20Sopenharmony_cistatic void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
5958c2ecf20Sopenharmony_ci{
5968c2ecf20Sopenharmony_ci	if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
5978c2ecf20Sopenharmony_ci		spin_lock_bh(&tcpv6_prot_lock);
5988c2ecf20Sopenharmony_ci		if (likely(ops != tcpv6_prot_saved)) {
5998c2ecf20Sopenharmony_ci			tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
6008c2ecf20Sopenharmony_ci			smp_store_release(&tcpv6_prot_saved, ops);
6018c2ecf20Sopenharmony_ci		}
6028c2ecf20Sopenharmony_ci		spin_unlock_bh(&tcpv6_prot_lock);
6038c2ecf20Sopenharmony_ci	}
6048c2ecf20Sopenharmony_ci}
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_cistatic int __init tcp_bpf_v4_build_proto(void)
6078c2ecf20Sopenharmony_ci{
6088c2ecf20Sopenharmony_ci	tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
6098c2ecf20Sopenharmony_ci	return 0;
6108c2ecf20Sopenharmony_ci}
6118c2ecf20Sopenharmony_cilate_initcall(tcp_bpf_v4_build_proto);
6128c2ecf20Sopenharmony_ci
6138c2ecf20Sopenharmony_cistatic int tcp_bpf_assert_proto_ops(struct proto *ops)
6148c2ecf20Sopenharmony_ci{
6158c2ecf20Sopenharmony_ci	/* In order to avoid retpoline, we make assumptions when we call
6168c2ecf20Sopenharmony_ci	 * into ops if e.g. a psock is not present. Make sure they are
6178c2ecf20Sopenharmony_ci	 * indeed valid assumptions.
6188c2ecf20Sopenharmony_ci	 */
6198c2ecf20Sopenharmony_ci	return ops->recvmsg  == tcp_recvmsg &&
6208c2ecf20Sopenharmony_ci	       ops->sendmsg  == tcp_sendmsg &&
6218c2ecf20Sopenharmony_ci	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
6228c2ecf20Sopenharmony_ci}
6238c2ecf20Sopenharmony_ci
6248c2ecf20Sopenharmony_cistruct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
6258c2ecf20Sopenharmony_ci{
6268c2ecf20Sopenharmony_ci	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
6278c2ecf20Sopenharmony_ci	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
6288c2ecf20Sopenharmony_ci
6298c2ecf20Sopenharmony_ci	if (sk->sk_family == AF_INET6) {
6308c2ecf20Sopenharmony_ci		if (tcp_bpf_assert_proto_ops(psock->sk_proto))
6318c2ecf20Sopenharmony_ci			return ERR_PTR(-EINVAL);
6328c2ecf20Sopenharmony_ci
6338c2ecf20Sopenharmony_ci		tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
6348c2ecf20Sopenharmony_ci	}
6358c2ecf20Sopenharmony_ci
6368c2ecf20Sopenharmony_ci	return &tcp_bpf_prots[family][config];
6378c2ecf20Sopenharmony_ci}
6388c2ecf20Sopenharmony_ci
6398c2ecf20Sopenharmony_ci/* If a child got cloned from a listening socket that had tcp_bpf
6408c2ecf20Sopenharmony_ci * protocol callbacks installed, we need to restore the callbacks to
6418c2ecf20Sopenharmony_ci * the default ones because the child does not inherit the psock state
6428c2ecf20Sopenharmony_ci * that tcp_bpf callbacks expect.
6438c2ecf20Sopenharmony_ci */
6448c2ecf20Sopenharmony_civoid tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
6458c2ecf20Sopenharmony_ci{
6468c2ecf20Sopenharmony_ci	struct proto *prot = newsk->sk_prot;
6478c2ecf20Sopenharmony_ci
6488c2ecf20Sopenharmony_ci	if (is_insidevar(prot, tcp_bpf_prots))
6498c2ecf20Sopenharmony_ci		newsk->sk_prot = sk->sk_prot_creator;
6508c2ecf20Sopenharmony_ci}
6518c2ecf20Sopenharmony_ci#endif /* CONFIG_BPF_STREAM_PARSER */
652