18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ 38c2ecf20Sopenharmony_ci 48c2ecf20Sopenharmony_ci#include <linux/skmsg.h> 58c2ecf20Sopenharmony_ci#include <linux/filter.h> 68c2ecf20Sopenharmony_ci#include <linux/bpf.h> 78c2ecf20Sopenharmony_ci#include <linux/init.h> 88c2ecf20Sopenharmony_ci#include <linux/wait.h> 98c2ecf20Sopenharmony_ci#include <linux/util_macros.h> 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci#include <net/inet_common.h> 128c2ecf20Sopenharmony_ci#include <net/tls.h> 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ciint __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, 158c2ecf20Sopenharmony_ci struct msghdr *msg, int len, int flags) 168c2ecf20Sopenharmony_ci{ 178c2ecf20Sopenharmony_ci struct iov_iter *iter = &msg->msg_iter; 188c2ecf20Sopenharmony_ci int peek = flags & MSG_PEEK; 198c2ecf20Sopenharmony_ci struct sk_msg *msg_rx; 208c2ecf20Sopenharmony_ci int i, copied = 0; 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_ci msg_rx = list_first_entry_or_null(&psock->ingress_msg, 238c2ecf20Sopenharmony_ci struct sk_msg, list); 248c2ecf20Sopenharmony_ci 258c2ecf20Sopenharmony_ci while (copied != len) { 268c2ecf20Sopenharmony_ci struct scatterlist *sge; 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci if (unlikely(!msg_rx)) 298c2ecf20Sopenharmony_ci break; 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci i = msg_rx->sg.start; 328c2ecf20Sopenharmony_ci do { 338c2ecf20Sopenharmony_ci struct page *page; 348c2ecf20Sopenharmony_ci int copy; 358c2ecf20Sopenharmony_ci 368c2ecf20Sopenharmony_ci sge = sk_msg_elem(msg_rx, i); 378c2ecf20Sopenharmony_ci copy = sge->length; 388c2ecf20Sopenharmony_ci page = sg_page(sge); 398c2ecf20Sopenharmony_ci if (copied + copy > len) 408c2ecf20Sopenharmony_ci copy = len - copied; 418c2ecf20Sopenharmony_ci copy = copy_page_to_iter(page, sge->offset, copy, iter); 428c2ecf20Sopenharmony_ci if (!copy) 438c2ecf20Sopenharmony_ci return copied ? copied : -EFAULT; 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci copied += copy; 468c2ecf20Sopenharmony_ci if (likely(!peek)) { 478c2ecf20Sopenharmony_ci sge->offset += copy; 488c2ecf20Sopenharmony_ci sge->length -= copy; 498c2ecf20Sopenharmony_ci if (!msg_rx->skb) 508c2ecf20Sopenharmony_ci sk_mem_uncharge(sk, copy); 518c2ecf20Sopenharmony_ci msg_rx->sg.size -= copy; 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci if (!sge->length) { 548c2ecf20Sopenharmony_ci sk_msg_iter_var_next(i); 558c2ecf20Sopenharmony_ci if (!msg_rx->skb) 568c2ecf20Sopenharmony_ci put_page(page); 578c2ecf20Sopenharmony_ci } 588c2ecf20Sopenharmony_ci } else { 598c2ecf20Sopenharmony_ci /* Lets not optimize peek case if copy_page_to_iter 608c2ecf20Sopenharmony_ci * didn't copy the entire length lets just break. 618c2ecf20Sopenharmony_ci */ 628c2ecf20Sopenharmony_ci if (copy != sge->length) 638c2ecf20Sopenharmony_ci return copied; 648c2ecf20Sopenharmony_ci sk_msg_iter_var_next(i); 658c2ecf20Sopenharmony_ci } 668c2ecf20Sopenharmony_ci 678c2ecf20Sopenharmony_ci if (copied == len) 688c2ecf20Sopenharmony_ci break; 698c2ecf20Sopenharmony_ci } while (i != msg_rx->sg.end); 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci if (unlikely(peek)) { 728c2ecf20Sopenharmony_ci if (msg_rx == list_last_entry(&psock->ingress_msg, 738c2ecf20Sopenharmony_ci struct sk_msg, list)) 748c2ecf20Sopenharmony_ci break; 758c2ecf20Sopenharmony_ci msg_rx = list_next_entry(msg_rx, list); 768c2ecf20Sopenharmony_ci continue; 778c2ecf20Sopenharmony_ci } 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci msg_rx->sg.start = i; 808c2ecf20Sopenharmony_ci if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { 818c2ecf20Sopenharmony_ci list_del(&msg_rx->list); 828c2ecf20Sopenharmony_ci if (msg_rx->skb) 838c2ecf20Sopenharmony_ci consume_skb(msg_rx->skb); 848c2ecf20Sopenharmony_ci kfree(msg_rx); 858c2ecf20Sopenharmony_ci } 868c2ecf20Sopenharmony_ci msg_rx = list_first_entry_or_null(&psock->ingress_msg, 878c2ecf20Sopenharmony_ci struct sk_msg, list); 888c2ecf20Sopenharmony_ci } 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci return copied; 918c2ecf20Sopenharmony_ci} 928c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_cistatic int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, 958c2ecf20Sopenharmony_ci struct sk_msg *msg, u32 apply_bytes, int flags) 968c2ecf20Sopenharmony_ci{ 978c2ecf20Sopenharmony_ci bool apply = apply_bytes; 988c2ecf20Sopenharmony_ci struct scatterlist *sge; 998c2ecf20Sopenharmony_ci u32 size, copied = 0; 1008c2ecf20Sopenharmony_ci struct sk_msg *tmp; 1018c2ecf20Sopenharmony_ci int i, ret = 0; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); 1048c2ecf20Sopenharmony_ci if (unlikely(!tmp)) 1058c2ecf20Sopenharmony_ci return -ENOMEM; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci lock_sock(sk); 1088c2ecf20Sopenharmony_ci tmp->sg.start = msg->sg.start; 1098c2ecf20Sopenharmony_ci i = msg->sg.start; 1108c2ecf20Sopenharmony_ci do { 1118c2ecf20Sopenharmony_ci sge = sk_msg_elem(msg, i); 1128c2ecf20Sopenharmony_ci size = (apply && apply_bytes < sge->length) ? 1138c2ecf20Sopenharmony_ci apply_bytes : sge->length; 1148c2ecf20Sopenharmony_ci if (!sk_wmem_schedule(sk, size)) { 1158c2ecf20Sopenharmony_ci if (!copied) 1168c2ecf20Sopenharmony_ci ret = -ENOMEM; 1178c2ecf20Sopenharmony_ci break; 1188c2ecf20Sopenharmony_ci } 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci sk_mem_charge(sk, size); 1218c2ecf20Sopenharmony_ci sk_msg_xfer(tmp, msg, i, size); 1228c2ecf20Sopenharmony_ci copied += size; 1238c2ecf20Sopenharmony_ci if (sge->length) 1248c2ecf20Sopenharmony_ci get_page(sk_msg_page(tmp, i)); 1258c2ecf20Sopenharmony_ci sk_msg_iter_var_next(i); 1268c2ecf20Sopenharmony_ci tmp->sg.end = i; 1278c2ecf20Sopenharmony_ci if (apply) { 1288c2ecf20Sopenharmony_ci apply_bytes -= size; 1298c2ecf20Sopenharmony_ci if (!apply_bytes) { 1308c2ecf20Sopenharmony_ci if (sge->length) 1318c2ecf20Sopenharmony_ci sk_msg_iter_var_prev(i); 1328c2ecf20Sopenharmony_ci break; 1338c2ecf20Sopenharmony_ci } 1348c2ecf20Sopenharmony_ci } 1358c2ecf20Sopenharmony_ci } while (i != msg->sg.end); 1368c2ecf20Sopenharmony_ci 1378c2ecf20Sopenharmony_ci if (!ret) { 1388c2ecf20Sopenharmony_ci msg->sg.start = i; 1398c2ecf20Sopenharmony_ci sk_psock_queue_msg(psock, tmp); 1408c2ecf20Sopenharmony_ci sk_psock_data_ready(sk, psock); 1418c2ecf20Sopenharmony_ci } else { 1428c2ecf20Sopenharmony_ci sk_msg_free(sk, tmp); 1438c2ecf20Sopenharmony_ci kfree(tmp); 1448c2ecf20Sopenharmony_ci } 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci release_sock(sk); 1478c2ecf20Sopenharmony_ci return ret; 1488c2ecf20Sopenharmony_ci} 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_cistatic int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, 1518c2ecf20Sopenharmony_ci int flags, bool uncharge) 1528c2ecf20Sopenharmony_ci{ 1538c2ecf20Sopenharmony_ci bool apply = apply_bytes; 1548c2ecf20Sopenharmony_ci struct scatterlist *sge; 1558c2ecf20Sopenharmony_ci struct page *page; 1568c2ecf20Sopenharmony_ci int size, ret = 0; 1578c2ecf20Sopenharmony_ci u32 off; 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci while (1) { 1608c2ecf20Sopenharmony_ci bool has_tx_ulp; 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci sge = sk_msg_elem(msg, msg->sg.start); 1638c2ecf20Sopenharmony_ci size = (apply && apply_bytes < sge->length) ? 1648c2ecf20Sopenharmony_ci apply_bytes : sge->length; 1658c2ecf20Sopenharmony_ci off = sge->offset; 1668c2ecf20Sopenharmony_ci page = sg_page(sge); 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci tcp_rate_check_app_limited(sk); 1698c2ecf20Sopenharmony_ciretry: 1708c2ecf20Sopenharmony_ci has_tx_ulp = tls_sw_has_ctx_tx(sk); 1718c2ecf20Sopenharmony_ci if (has_tx_ulp) { 1728c2ecf20Sopenharmony_ci flags |= MSG_SENDPAGE_NOPOLICY; 1738c2ecf20Sopenharmony_ci ret = kernel_sendpage_locked(sk, 1748c2ecf20Sopenharmony_ci page, off, size, flags); 1758c2ecf20Sopenharmony_ci } else { 1768c2ecf20Sopenharmony_ci ret = do_tcp_sendpages(sk, page, off, size, flags); 1778c2ecf20Sopenharmony_ci } 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci if (ret <= 0) 1808c2ecf20Sopenharmony_ci return ret; 1818c2ecf20Sopenharmony_ci if (apply) 1828c2ecf20Sopenharmony_ci apply_bytes -= ret; 1838c2ecf20Sopenharmony_ci msg->sg.size -= ret; 1848c2ecf20Sopenharmony_ci sge->offset += ret; 1858c2ecf20Sopenharmony_ci sge->length -= ret; 1868c2ecf20Sopenharmony_ci if (uncharge) 1878c2ecf20Sopenharmony_ci sk_mem_uncharge(sk, ret); 1888c2ecf20Sopenharmony_ci if (ret != size) { 1898c2ecf20Sopenharmony_ci size -= ret; 1908c2ecf20Sopenharmony_ci off += ret; 1918c2ecf20Sopenharmony_ci goto retry; 1928c2ecf20Sopenharmony_ci } 1938c2ecf20Sopenharmony_ci if (!sge->length) { 1948c2ecf20Sopenharmony_ci put_page(page); 1958c2ecf20Sopenharmony_ci sk_msg_iter_next(msg, start); 1968c2ecf20Sopenharmony_ci sg_init_table(sge, 1); 1978c2ecf20Sopenharmony_ci if (msg->sg.start == msg->sg.end) 1988c2ecf20Sopenharmony_ci break; 1998c2ecf20Sopenharmony_ci } 2008c2ecf20Sopenharmony_ci if (apply && !apply_bytes) 2018c2ecf20Sopenharmony_ci break; 2028c2ecf20Sopenharmony_ci } 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci return 0; 2058c2ecf20Sopenharmony_ci} 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_cistatic int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, 2088c2ecf20Sopenharmony_ci u32 apply_bytes, int flags, bool uncharge) 2098c2ecf20Sopenharmony_ci{ 2108c2ecf20Sopenharmony_ci int ret; 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci lock_sock(sk); 2138c2ecf20Sopenharmony_ci ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); 2148c2ecf20Sopenharmony_ci release_sock(sk); 2158c2ecf20Sopenharmony_ci return ret; 2168c2ecf20Sopenharmony_ci} 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_ciint tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, 2198c2ecf20Sopenharmony_ci u32 bytes, int flags) 2208c2ecf20Sopenharmony_ci{ 2218c2ecf20Sopenharmony_ci bool ingress = sk_msg_to_ingress(msg); 2228c2ecf20Sopenharmony_ci struct sk_psock *psock = sk_psock_get(sk); 2238c2ecf20Sopenharmony_ci int ret; 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_ci if (unlikely(!psock)) 2268c2ecf20Sopenharmony_ci return -EPIPE; 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_ci ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : 2298c2ecf20Sopenharmony_ci tcp_bpf_push_locked(sk, msg, bytes, flags, false); 2308c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 2318c2ecf20Sopenharmony_ci return ret; 2328c2ecf20Sopenharmony_ci} 2338c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); 2348c2ecf20Sopenharmony_ci 2358c2ecf20Sopenharmony_ci#ifdef CONFIG_BPF_STREAM_PARSER 2368c2ecf20Sopenharmony_cistatic bool tcp_bpf_stream_read(const struct sock *sk) 2378c2ecf20Sopenharmony_ci{ 2388c2ecf20Sopenharmony_ci struct sk_psock *psock; 2398c2ecf20Sopenharmony_ci bool empty = true; 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci rcu_read_lock(); 2428c2ecf20Sopenharmony_ci psock = sk_psock(sk); 2438c2ecf20Sopenharmony_ci if (likely(psock)) 2448c2ecf20Sopenharmony_ci empty = list_empty(&psock->ingress_msg); 2458c2ecf20Sopenharmony_ci rcu_read_unlock(); 2468c2ecf20Sopenharmony_ci return !empty; 2478c2ecf20Sopenharmony_ci} 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_cistatic int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, 2508c2ecf20Sopenharmony_ci int flags, long timeo, int *err) 2518c2ecf20Sopenharmony_ci{ 2528c2ecf20Sopenharmony_ci DEFINE_WAIT_FUNC(wait, woken_wake_function); 2538c2ecf20Sopenharmony_ci int ret = 0; 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci if (sk->sk_shutdown & RCV_SHUTDOWN) 2568c2ecf20Sopenharmony_ci return 1; 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_ci if (!timeo) 2598c2ecf20Sopenharmony_ci return ret; 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci add_wait_queue(sk_sleep(sk), &wait); 2628c2ecf20Sopenharmony_ci sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2638c2ecf20Sopenharmony_ci ret = sk_wait_event(sk, &timeo, 2648c2ecf20Sopenharmony_ci !list_empty(&psock->ingress_msg) || 2658c2ecf20Sopenharmony_ci !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); 2668c2ecf20Sopenharmony_ci sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2678c2ecf20Sopenharmony_ci remove_wait_queue(sk_sleep(sk), &wait); 2688c2ecf20Sopenharmony_ci return ret; 2698c2ecf20Sopenharmony_ci} 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_cistatic int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 2728c2ecf20Sopenharmony_ci int nonblock, int flags, int *addr_len) 2738c2ecf20Sopenharmony_ci{ 2748c2ecf20Sopenharmony_ci struct sk_psock *psock; 2758c2ecf20Sopenharmony_ci int copied, ret; 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci if (unlikely(flags & MSG_ERRQUEUE)) 2788c2ecf20Sopenharmony_ci return inet_recv_error(sk, msg, len, addr_len); 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci psock = sk_psock_get(sk); 2818c2ecf20Sopenharmony_ci if (unlikely(!psock)) 2828c2ecf20Sopenharmony_ci return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 2838c2ecf20Sopenharmony_ci if (!skb_queue_empty(&sk->sk_receive_queue) && 2848c2ecf20Sopenharmony_ci sk_psock_queue_empty(psock)) { 2858c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 2868c2ecf20Sopenharmony_ci return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 2878c2ecf20Sopenharmony_ci } 2888c2ecf20Sopenharmony_ci lock_sock(sk); 2898c2ecf20Sopenharmony_cimsg_bytes_ready: 2908c2ecf20Sopenharmony_ci copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); 2918c2ecf20Sopenharmony_ci if (!copied) { 2928c2ecf20Sopenharmony_ci int data, err = 0; 2938c2ecf20Sopenharmony_ci long timeo; 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci timeo = sock_rcvtimeo(sk, nonblock); 2968c2ecf20Sopenharmony_ci data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); 2978c2ecf20Sopenharmony_ci if (data) { 2988c2ecf20Sopenharmony_ci if (!sk_psock_queue_empty(psock)) 2998c2ecf20Sopenharmony_ci goto msg_bytes_ready; 3008c2ecf20Sopenharmony_ci release_sock(sk); 3018c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 3028c2ecf20Sopenharmony_ci return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 3038c2ecf20Sopenharmony_ci } 3048c2ecf20Sopenharmony_ci if (err) { 3058c2ecf20Sopenharmony_ci ret = err; 3068c2ecf20Sopenharmony_ci goto out; 3078c2ecf20Sopenharmony_ci } 3088c2ecf20Sopenharmony_ci copied = -EAGAIN; 3098c2ecf20Sopenharmony_ci } 3108c2ecf20Sopenharmony_ci ret = copied; 3118c2ecf20Sopenharmony_ciout: 3128c2ecf20Sopenharmony_ci release_sock(sk); 3138c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 3148c2ecf20Sopenharmony_ci return ret; 3158c2ecf20Sopenharmony_ci} 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_cistatic int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, 3188c2ecf20Sopenharmony_ci struct sk_msg *msg, int *copied, int flags) 3198c2ecf20Sopenharmony_ci{ 3208c2ecf20Sopenharmony_ci bool cork = false, enospc = sk_msg_full(msg); 3218c2ecf20Sopenharmony_ci struct sock *sk_redir; 3228c2ecf20Sopenharmony_ci u32 tosend, origsize, sent, delta = 0; 3238c2ecf20Sopenharmony_ci u32 eval; 3248c2ecf20Sopenharmony_ci int ret; 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_cimore_data: 3278c2ecf20Sopenharmony_ci if (psock->eval == __SK_NONE) { 3288c2ecf20Sopenharmony_ci /* Track delta in msg size to add/subtract it on SK_DROP from 3298c2ecf20Sopenharmony_ci * returned to user copied size. This ensures user doesn't 3308c2ecf20Sopenharmony_ci * get a positive return code with msg_cut_data and SK_DROP 3318c2ecf20Sopenharmony_ci * verdict. 3328c2ecf20Sopenharmony_ci */ 3338c2ecf20Sopenharmony_ci delta = msg->sg.size; 3348c2ecf20Sopenharmony_ci psock->eval = sk_psock_msg_verdict(sk, psock, msg); 3358c2ecf20Sopenharmony_ci delta -= msg->sg.size; 3368c2ecf20Sopenharmony_ci } 3378c2ecf20Sopenharmony_ci 3388c2ecf20Sopenharmony_ci if (msg->cork_bytes && 3398c2ecf20Sopenharmony_ci msg->cork_bytes > msg->sg.size && !enospc) { 3408c2ecf20Sopenharmony_ci psock->cork_bytes = msg->cork_bytes - msg->sg.size; 3418c2ecf20Sopenharmony_ci if (!psock->cork) { 3428c2ecf20Sopenharmony_ci psock->cork = kzalloc(sizeof(*psock->cork), 3438c2ecf20Sopenharmony_ci GFP_ATOMIC | __GFP_NOWARN); 3448c2ecf20Sopenharmony_ci if (!psock->cork) 3458c2ecf20Sopenharmony_ci return -ENOMEM; 3468c2ecf20Sopenharmony_ci } 3478c2ecf20Sopenharmony_ci memcpy(psock->cork, msg, sizeof(*msg)); 3488c2ecf20Sopenharmony_ci return 0; 3498c2ecf20Sopenharmony_ci } 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci tosend = msg->sg.size; 3528c2ecf20Sopenharmony_ci if (psock->apply_bytes && psock->apply_bytes < tosend) 3538c2ecf20Sopenharmony_ci tosend = psock->apply_bytes; 3548c2ecf20Sopenharmony_ci eval = __SK_NONE; 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_ci switch (psock->eval) { 3578c2ecf20Sopenharmony_ci case __SK_PASS: 3588c2ecf20Sopenharmony_ci ret = tcp_bpf_push(sk, msg, tosend, flags, true); 3598c2ecf20Sopenharmony_ci if (unlikely(ret)) { 3608c2ecf20Sopenharmony_ci *copied -= sk_msg_free(sk, msg); 3618c2ecf20Sopenharmony_ci break; 3628c2ecf20Sopenharmony_ci } 3638c2ecf20Sopenharmony_ci sk_msg_apply_bytes(psock, tosend); 3648c2ecf20Sopenharmony_ci break; 3658c2ecf20Sopenharmony_ci case __SK_REDIRECT: 3668c2ecf20Sopenharmony_ci sk_redir = psock->sk_redir; 3678c2ecf20Sopenharmony_ci sk_msg_apply_bytes(psock, tosend); 3688c2ecf20Sopenharmony_ci if (!psock->apply_bytes) { 3698c2ecf20Sopenharmony_ci /* Clean up before releasing the sock lock. */ 3708c2ecf20Sopenharmony_ci eval = psock->eval; 3718c2ecf20Sopenharmony_ci psock->eval = __SK_NONE; 3728c2ecf20Sopenharmony_ci psock->sk_redir = NULL; 3738c2ecf20Sopenharmony_ci } 3748c2ecf20Sopenharmony_ci if (psock->cork) { 3758c2ecf20Sopenharmony_ci cork = true; 3768c2ecf20Sopenharmony_ci psock->cork = NULL; 3778c2ecf20Sopenharmony_ci } 3788c2ecf20Sopenharmony_ci sk_msg_return(sk, msg, tosend); 3798c2ecf20Sopenharmony_ci release_sock(sk); 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci origsize = msg->sg.size; 3828c2ecf20Sopenharmony_ci ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); 3838c2ecf20Sopenharmony_ci sent = origsize - msg->sg.size; 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci if (eval == __SK_REDIRECT) 3868c2ecf20Sopenharmony_ci sock_put(sk_redir); 3878c2ecf20Sopenharmony_ci 3888c2ecf20Sopenharmony_ci lock_sock(sk); 3898c2ecf20Sopenharmony_ci if (unlikely(ret < 0)) { 3908c2ecf20Sopenharmony_ci int free = sk_msg_free_nocharge(sk, msg); 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci if (!cork) 3938c2ecf20Sopenharmony_ci *copied -= free; 3948c2ecf20Sopenharmony_ci } 3958c2ecf20Sopenharmony_ci if (cork) { 3968c2ecf20Sopenharmony_ci sk_msg_free(sk, msg); 3978c2ecf20Sopenharmony_ci kfree(msg); 3988c2ecf20Sopenharmony_ci msg = NULL; 3998c2ecf20Sopenharmony_ci ret = 0; 4008c2ecf20Sopenharmony_ci } 4018c2ecf20Sopenharmony_ci break; 4028c2ecf20Sopenharmony_ci case __SK_DROP: 4038c2ecf20Sopenharmony_ci default: 4048c2ecf20Sopenharmony_ci sk_msg_free_partial(sk, msg, tosend); 4058c2ecf20Sopenharmony_ci sk_msg_apply_bytes(psock, tosend); 4068c2ecf20Sopenharmony_ci *copied -= (tosend + delta); 4078c2ecf20Sopenharmony_ci return -EACCES; 4088c2ecf20Sopenharmony_ci } 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci if (likely(!ret)) { 4118c2ecf20Sopenharmony_ci if (!psock->apply_bytes) { 4128c2ecf20Sopenharmony_ci psock->eval = __SK_NONE; 4138c2ecf20Sopenharmony_ci if (psock->sk_redir) { 4148c2ecf20Sopenharmony_ci sock_put(psock->sk_redir); 4158c2ecf20Sopenharmony_ci psock->sk_redir = NULL; 4168c2ecf20Sopenharmony_ci } 4178c2ecf20Sopenharmony_ci } 4188c2ecf20Sopenharmony_ci if (msg && 4198c2ecf20Sopenharmony_ci msg->sg.data[msg->sg.start].page_link && 4208c2ecf20Sopenharmony_ci msg->sg.data[msg->sg.start].length) { 4218c2ecf20Sopenharmony_ci if (eval == __SK_REDIRECT) 4228c2ecf20Sopenharmony_ci sk_mem_charge(sk, tosend - sent); 4238c2ecf20Sopenharmony_ci goto more_data; 4248c2ecf20Sopenharmony_ci } 4258c2ecf20Sopenharmony_ci } 4268c2ecf20Sopenharmony_ci return ret; 4278c2ecf20Sopenharmony_ci} 4288c2ecf20Sopenharmony_ci 4298c2ecf20Sopenharmony_cistatic int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) 4308c2ecf20Sopenharmony_ci{ 4318c2ecf20Sopenharmony_ci struct sk_msg tmp, *msg_tx = NULL; 4328c2ecf20Sopenharmony_ci int copied = 0, err = 0; 4338c2ecf20Sopenharmony_ci struct sk_psock *psock; 4348c2ecf20Sopenharmony_ci long timeo; 4358c2ecf20Sopenharmony_ci int flags; 4368c2ecf20Sopenharmony_ci 4378c2ecf20Sopenharmony_ci /* Don't let internal do_tcp_sendpages() flags through */ 4388c2ecf20Sopenharmony_ci flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); 4398c2ecf20Sopenharmony_ci flags |= MSG_NO_SHARED_FRAGS; 4408c2ecf20Sopenharmony_ci 4418c2ecf20Sopenharmony_ci psock = sk_psock_get(sk); 4428c2ecf20Sopenharmony_ci if (unlikely(!psock)) 4438c2ecf20Sopenharmony_ci return tcp_sendmsg(sk, msg, size); 4448c2ecf20Sopenharmony_ci 4458c2ecf20Sopenharmony_ci lock_sock(sk); 4468c2ecf20Sopenharmony_ci timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 4478c2ecf20Sopenharmony_ci while (msg_data_left(msg)) { 4488c2ecf20Sopenharmony_ci bool enospc = false; 4498c2ecf20Sopenharmony_ci u32 copy, osize; 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci if (sk->sk_err) { 4528c2ecf20Sopenharmony_ci err = -sk->sk_err; 4538c2ecf20Sopenharmony_ci goto out_err; 4548c2ecf20Sopenharmony_ci } 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci copy = msg_data_left(msg); 4578c2ecf20Sopenharmony_ci if (!sk_stream_memory_free(sk)) 4588c2ecf20Sopenharmony_ci goto wait_for_sndbuf; 4598c2ecf20Sopenharmony_ci if (psock->cork) { 4608c2ecf20Sopenharmony_ci msg_tx = psock->cork; 4618c2ecf20Sopenharmony_ci } else { 4628c2ecf20Sopenharmony_ci msg_tx = &tmp; 4638c2ecf20Sopenharmony_ci sk_msg_init(msg_tx); 4648c2ecf20Sopenharmony_ci } 4658c2ecf20Sopenharmony_ci 4668c2ecf20Sopenharmony_ci osize = msg_tx->sg.size; 4678c2ecf20Sopenharmony_ci err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); 4688c2ecf20Sopenharmony_ci if (err) { 4698c2ecf20Sopenharmony_ci if (err != -ENOSPC) 4708c2ecf20Sopenharmony_ci goto wait_for_memory; 4718c2ecf20Sopenharmony_ci enospc = true; 4728c2ecf20Sopenharmony_ci copy = msg_tx->sg.size - osize; 4738c2ecf20Sopenharmony_ci } 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_ci err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, 4768c2ecf20Sopenharmony_ci copy); 4778c2ecf20Sopenharmony_ci if (err < 0) { 4788c2ecf20Sopenharmony_ci sk_msg_trim(sk, msg_tx, osize); 4798c2ecf20Sopenharmony_ci goto out_err; 4808c2ecf20Sopenharmony_ci } 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci copied += copy; 4838c2ecf20Sopenharmony_ci if (psock->cork_bytes) { 4848c2ecf20Sopenharmony_ci if (size > psock->cork_bytes) 4858c2ecf20Sopenharmony_ci psock->cork_bytes = 0; 4868c2ecf20Sopenharmony_ci else 4878c2ecf20Sopenharmony_ci psock->cork_bytes -= size; 4888c2ecf20Sopenharmony_ci if (psock->cork_bytes && !enospc) 4898c2ecf20Sopenharmony_ci goto out_err; 4908c2ecf20Sopenharmony_ci /* All cork bytes are accounted, rerun the prog. */ 4918c2ecf20Sopenharmony_ci psock->eval = __SK_NONE; 4928c2ecf20Sopenharmony_ci psock->cork_bytes = 0; 4938c2ecf20Sopenharmony_ci } 4948c2ecf20Sopenharmony_ci 4958c2ecf20Sopenharmony_ci err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); 4968c2ecf20Sopenharmony_ci if (unlikely(err < 0)) 4978c2ecf20Sopenharmony_ci goto out_err; 4988c2ecf20Sopenharmony_ci continue; 4998c2ecf20Sopenharmony_ciwait_for_sndbuf: 5008c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 5018c2ecf20Sopenharmony_ciwait_for_memory: 5028c2ecf20Sopenharmony_ci err = sk_stream_wait_memory(sk, &timeo); 5038c2ecf20Sopenharmony_ci if (err) { 5048c2ecf20Sopenharmony_ci if (msg_tx && msg_tx != psock->cork) 5058c2ecf20Sopenharmony_ci sk_msg_free(sk, msg_tx); 5068c2ecf20Sopenharmony_ci goto out_err; 5078c2ecf20Sopenharmony_ci } 5088c2ecf20Sopenharmony_ci } 5098c2ecf20Sopenharmony_ciout_err: 5108c2ecf20Sopenharmony_ci if (err < 0) 5118c2ecf20Sopenharmony_ci err = sk_stream_error(sk, msg->msg_flags, err); 5128c2ecf20Sopenharmony_ci release_sock(sk); 5138c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 5148c2ecf20Sopenharmony_ci return copied > 0 ? copied : err; 5158c2ecf20Sopenharmony_ci} 5168c2ecf20Sopenharmony_ci 5178c2ecf20Sopenharmony_cistatic int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, 5188c2ecf20Sopenharmony_ci size_t size, int flags) 5198c2ecf20Sopenharmony_ci{ 5208c2ecf20Sopenharmony_ci struct sk_msg tmp, *msg = NULL; 5218c2ecf20Sopenharmony_ci int err = 0, copied = 0; 5228c2ecf20Sopenharmony_ci struct sk_psock *psock; 5238c2ecf20Sopenharmony_ci bool enospc = false; 5248c2ecf20Sopenharmony_ci 5258c2ecf20Sopenharmony_ci psock = sk_psock_get(sk); 5268c2ecf20Sopenharmony_ci if (unlikely(!psock)) 5278c2ecf20Sopenharmony_ci return tcp_sendpage(sk, page, offset, size, flags); 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ci lock_sock(sk); 5308c2ecf20Sopenharmony_ci if (psock->cork) { 5318c2ecf20Sopenharmony_ci msg = psock->cork; 5328c2ecf20Sopenharmony_ci } else { 5338c2ecf20Sopenharmony_ci msg = &tmp; 5348c2ecf20Sopenharmony_ci sk_msg_init(msg); 5358c2ecf20Sopenharmony_ci } 5368c2ecf20Sopenharmony_ci 5378c2ecf20Sopenharmony_ci /* Catch case where ring is full and sendpage is stalled. */ 5388c2ecf20Sopenharmony_ci if (unlikely(sk_msg_full(msg))) 5398c2ecf20Sopenharmony_ci goto out_err; 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci sk_msg_page_add(msg, page, size, offset); 5428c2ecf20Sopenharmony_ci sk_mem_charge(sk, size); 5438c2ecf20Sopenharmony_ci copied = size; 5448c2ecf20Sopenharmony_ci if (sk_msg_full(msg)) 5458c2ecf20Sopenharmony_ci enospc = true; 5468c2ecf20Sopenharmony_ci if (psock->cork_bytes) { 5478c2ecf20Sopenharmony_ci if (size > psock->cork_bytes) 5488c2ecf20Sopenharmony_ci psock->cork_bytes = 0; 5498c2ecf20Sopenharmony_ci else 5508c2ecf20Sopenharmony_ci psock->cork_bytes -= size; 5518c2ecf20Sopenharmony_ci if (psock->cork_bytes && !enospc) 5528c2ecf20Sopenharmony_ci goto out_err; 5538c2ecf20Sopenharmony_ci /* All cork bytes are accounted, rerun the prog. */ 5548c2ecf20Sopenharmony_ci psock->eval = __SK_NONE; 5558c2ecf20Sopenharmony_ci psock->cork_bytes = 0; 5568c2ecf20Sopenharmony_ci } 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); 5598c2ecf20Sopenharmony_ciout_err: 5608c2ecf20Sopenharmony_ci release_sock(sk); 5618c2ecf20Sopenharmony_ci sk_psock_put(sk, psock); 5628c2ecf20Sopenharmony_ci return copied ? copied : err; 5638c2ecf20Sopenharmony_ci} 5648c2ecf20Sopenharmony_ci 5658c2ecf20Sopenharmony_cienum { 5668c2ecf20Sopenharmony_ci TCP_BPF_IPV4, 5678c2ecf20Sopenharmony_ci TCP_BPF_IPV6, 5688c2ecf20Sopenharmony_ci TCP_BPF_NUM_PROTS, 5698c2ecf20Sopenharmony_ci}; 5708c2ecf20Sopenharmony_ci 5718c2ecf20Sopenharmony_cienum { 5728c2ecf20Sopenharmony_ci TCP_BPF_BASE, 5738c2ecf20Sopenharmony_ci TCP_BPF_TX, 5748c2ecf20Sopenharmony_ci TCP_BPF_NUM_CFGS, 5758c2ecf20Sopenharmony_ci}; 5768c2ecf20Sopenharmony_ci 5778c2ecf20Sopenharmony_cistatic struct proto *tcpv6_prot_saved __read_mostly; 5788c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(tcpv6_prot_lock); 5798c2ecf20Sopenharmony_cistatic struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_cistatic void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], 5828c2ecf20Sopenharmony_ci struct proto *base) 5838c2ecf20Sopenharmony_ci{ 5848c2ecf20Sopenharmony_ci prot[TCP_BPF_BASE] = *base; 5858c2ecf20Sopenharmony_ci prot[TCP_BPF_BASE].close = sock_map_close; 5868c2ecf20Sopenharmony_ci prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; 5878c2ecf20Sopenharmony_ci prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; 5888c2ecf20Sopenharmony_ci 5898c2ecf20Sopenharmony_ci prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; 5908c2ecf20Sopenharmony_ci prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; 5918c2ecf20Sopenharmony_ci prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; 5928c2ecf20Sopenharmony_ci} 5938c2ecf20Sopenharmony_ci 5948c2ecf20Sopenharmony_cistatic void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) 5958c2ecf20Sopenharmony_ci{ 5968c2ecf20Sopenharmony_ci if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { 5978c2ecf20Sopenharmony_ci spin_lock_bh(&tcpv6_prot_lock); 5988c2ecf20Sopenharmony_ci if (likely(ops != tcpv6_prot_saved)) { 5998c2ecf20Sopenharmony_ci tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); 6008c2ecf20Sopenharmony_ci smp_store_release(&tcpv6_prot_saved, ops); 6018c2ecf20Sopenharmony_ci } 6028c2ecf20Sopenharmony_ci spin_unlock_bh(&tcpv6_prot_lock); 6038c2ecf20Sopenharmony_ci } 6048c2ecf20Sopenharmony_ci} 6058c2ecf20Sopenharmony_ci 6068c2ecf20Sopenharmony_cistatic int __init tcp_bpf_v4_build_proto(void) 6078c2ecf20Sopenharmony_ci{ 6088c2ecf20Sopenharmony_ci tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); 6098c2ecf20Sopenharmony_ci return 0; 6108c2ecf20Sopenharmony_ci} 6118c2ecf20Sopenharmony_cilate_initcall(tcp_bpf_v4_build_proto); 6128c2ecf20Sopenharmony_ci 6138c2ecf20Sopenharmony_cistatic int tcp_bpf_assert_proto_ops(struct proto *ops) 6148c2ecf20Sopenharmony_ci{ 6158c2ecf20Sopenharmony_ci /* In order to avoid retpoline, we make assumptions when we call 6168c2ecf20Sopenharmony_ci * into ops if e.g. a psock is not present. Make sure they are 6178c2ecf20Sopenharmony_ci * indeed valid assumptions. 6188c2ecf20Sopenharmony_ci */ 6198c2ecf20Sopenharmony_ci return ops->recvmsg == tcp_recvmsg && 6208c2ecf20Sopenharmony_ci ops->sendmsg == tcp_sendmsg && 6218c2ecf20Sopenharmony_ci ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; 6228c2ecf20Sopenharmony_ci} 6238c2ecf20Sopenharmony_ci 6248c2ecf20Sopenharmony_cistruct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) 6258c2ecf20Sopenharmony_ci{ 6268c2ecf20Sopenharmony_ci int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; 6278c2ecf20Sopenharmony_ci int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; 6288c2ecf20Sopenharmony_ci 6298c2ecf20Sopenharmony_ci if (sk->sk_family == AF_INET6) { 6308c2ecf20Sopenharmony_ci if (tcp_bpf_assert_proto_ops(psock->sk_proto)) 6318c2ecf20Sopenharmony_ci return ERR_PTR(-EINVAL); 6328c2ecf20Sopenharmony_ci 6338c2ecf20Sopenharmony_ci tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); 6348c2ecf20Sopenharmony_ci } 6358c2ecf20Sopenharmony_ci 6368c2ecf20Sopenharmony_ci return &tcp_bpf_prots[family][config]; 6378c2ecf20Sopenharmony_ci} 6388c2ecf20Sopenharmony_ci 6398c2ecf20Sopenharmony_ci/* If a child got cloned from a listening socket that had tcp_bpf 6408c2ecf20Sopenharmony_ci * protocol callbacks installed, we need to restore the callbacks to 6418c2ecf20Sopenharmony_ci * the default ones because the child does not inherit the psock state 6428c2ecf20Sopenharmony_ci * that tcp_bpf callbacks expect. 6438c2ecf20Sopenharmony_ci */ 6448c2ecf20Sopenharmony_civoid tcp_bpf_clone(const struct sock *sk, struct sock *newsk) 6458c2ecf20Sopenharmony_ci{ 6468c2ecf20Sopenharmony_ci struct proto *prot = newsk->sk_prot; 6478c2ecf20Sopenharmony_ci 6488c2ecf20Sopenharmony_ci if (is_insidevar(prot, tcp_bpf_prots)) 6498c2ecf20Sopenharmony_ci newsk->sk_prot = sk->sk_prot_creator; 6508c2ecf20Sopenharmony_ci} 6518c2ecf20Sopenharmony_ci#endif /* CONFIG_BPF_STREAM_PARSER */ 652