18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/in.h> 358c2ecf20Sopenharmony_ci#include <net/tcp.h> 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci#include "rds_single_path.h" 388c2ecf20Sopenharmony_ci#include "rds.h" 398c2ecf20Sopenharmony_ci#include "tcp.h" 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_civoid rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) 428c2ecf20Sopenharmony_ci{ 438c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci tcp_sock_set_cork(tc->t_sock->sk, true); 468c2ecf20Sopenharmony_ci} 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_civoid rds_tcp_xmit_path_complete(struct rds_conn_path *cp) 498c2ecf20Sopenharmony_ci{ 508c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci tcp_sock_set_cork(tc->t_sock->sk, false); 538c2ecf20Sopenharmony_ci} 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_ci/* the core send_sem serializes this with other xmit and shutdown */ 568c2ecf20Sopenharmony_cistatic int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) 578c2ecf20Sopenharmony_ci{ 588c2ecf20Sopenharmony_ci struct kvec vec = { 598c2ecf20Sopenharmony_ci .iov_base = data, 608c2ecf20Sopenharmony_ci .iov_len = len, 618c2ecf20Sopenharmony_ci }; 628c2ecf20Sopenharmony_ci struct msghdr msg = { 638c2ecf20Sopenharmony_ci .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, 648c2ecf20Sopenharmony_ci }; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); 678c2ecf20Sopenharmony_ci} 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci/* the core send_sem serializes this with other xmit and shutdown */ 708c2ecf20Sopenharmony_ciint rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, 718c2ecf20Sopenharmony_ci unsigned int hdr_off, unsigned int sg, unsigned int off) 728c2ecf20Sopenharmony_ci{ 738c2ecf20Sopenharmony_ci struct rds_conn_path *cp = rm->m_inc.i_conn_path; 748c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 758c2ecf20Sopenharmony_ci int done = 0; 768c2ecf20Sopenharmony_ci int ret = 0; 778c2ecf20Sopenharmony_ci int more; 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci if (hdr_off == 0) { 808c2ecf20Sopenharmony_ci /* 818c2ecf20Sopenharmony_ci * m_ack_seq is set to the sequence number of the last byte of 828c2ecf20Sopenharmony_ci * header and data. see rds_tcp_is_acked(). 838c2ecf20Sopenharmony_ci */ 848c2ecf20Sopenharmony_ci tc->t_last_sent_nxt = rds_tcp_write_seq(tc); 858c2ecf20Sopenharmony_ci rm->m_ack_seq = tc->t_last_sent_nxt + 868c2ecf20Sopenharmony_ci sizeof(struct rds_header) + 878c2ecf20Sopenharmony_ci be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; 888c2ecf20Sopenharmony_ci smp_mb__before_atomic(); 898c2ecf20Sopenharmony_ci set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); 908c2ecf20Sopenharmony_ci tc->t_last_expected_una = rm->m_ack_seq + 1; 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) 938c2ecf20Sopenharmony_ci rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", 968c2ecf20Sopenharmony_ci rm, rds_tcp_write_seq(tc), 978c2ecf20Sopenharmony_ci (unsigned long long)rm->m_ack_seq); 988c2ecf20Sopenharmony_ci } 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci if (hdr_off < sizeof(struct rds_header)) { 1018c2ecf20Sopenharmony_ci /* see rds_tcp_write_space() */ 1028c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci ret = rds_tcp_sendmsg(tc->t_sock, 1058c2ecf20Sopenharmony_ci (void *)&rm->m_inc.i_hdr + hdr_off, 1068c2ecf20Sopenharmony_ci sizeof(rm->m_inc.i_hdr) - hdr_off); 1078c2ecf20Sopenharmony_ci if (ret < 0) 1088c2ecf20Sopenharmony_ci goto out; 1098c2ecf20Sopenharmony_ci done += ret; 1108c2ecf20Sopenharmony_ci if (hdr_off + done != sizeof(struct rds_header)) 1118c2ecf20Sopenharmony_ci goto out; 1128c2ecf20Sopenharmony_ci } 1138c2ecf20Sopenharmony_ci 1148c2ecf20Sopenharmony_ci more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0; 1158c2ecf20Sopenharmony_ci while (sg < rm->data.op_nents) { 1168c2ecf20Sopenharmony_ci int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_ci ret = tc->t_sock->ops->sendpage(tc->t_sock, 1198c2ecf20Sopenharmony_ci sg_page(&rm->data.op_sg[sg]), 1208c2ecf20Sopenharmony_ci rm->data.op_sg[sg].offset + off, 1218c2ecf20Sopenharmony_ci rm->data.op_sg[sg].length - off, 1228c2ecf20Sopenharmony_ci flags); 1238c2ecf20Sopenharmony_ci rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), 1248c2ecf20Sopenharmony_ci rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, 1258c2ecf20Sopenharmony_ci ret); 1268c2ecf20Sopenharmony_ci if (ret <= 0) 1278c2ecf20Sopenharmony_ci break; 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci off += ret; 1308c2ecf20Sopenharmony_ci done += ret; 1318c2ecf20Sopenharmony_ci if (off == rm->data.op_sg[sg].length) { 1328c2ecf20Sopenharmony_ci off = 0; 1338c2ecf20Sopenharmony_ci sg++; 1348c2ecf20Sopenharmony_ci } 1358c2ecf20Sopenharmony_ci if (sg == rm->data.op_nents - 1) 1368c2ecf20Sopenharmony_ci more = 0; 1378c2ecf20Sopenharmony_ci } 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_ciout: 1408c2ecf20Sopenharmony_ci if (ret <= 0) { 1418c2ecf20Sopenharmony_ci /* write_space will hit after EAGAIN, all else fatal */ 1428c2ecf20Sopenharmony_ci if (ret == -EAGAIN) { 1438c2ecf20Sopenharmony_ci rds_tcp_stats_inc(s_tcp_sndbuf_full); 1448c2ecf20Sopenharmony_ci ret = 0; 1458c2ecf20Sopenharmony_ci } else { 1468c2ecf20Sopenharmony_ci /* No need to disconnect/reconnect if path_drop 1478c2ecf20Sopenharmony_ci * has already been triggered, because, e.g., of 1488c2ecf20Sopenharmony_ci * an incoming RST. 1498c2ecf20Sopenharmony_ci */ 1508c2ecf20Sopenharmony_ci if (rds_conn_path_up(cp)) { 1518c2ecf20Sopenharmony_ci pr_warn("RDS/tcp: send to %pI6c on cp [%d]" 1528c2ecf20Sopenharmony_ci "returned %d, " 1538c2ecf20Sopenharmony_ci "disconnecting and reconnecting\n", 1548c2ecf20Sopenharmony_ci &conn->c_faddr, cp->cp_index, ret); 1558c2ecf20Sopenharmony_ci rds_conn_path_drop(cp, false); 1568c2ecf20Sopenharmony_ci } 1578c2ecf20Sopenharmony_ci } 1588c2ecf20Sopenharmony_ci } 1598c2ecf20Sopenharmony_ci if (done == 0) 1608c2ecf20Sopenharmony_ci done = ret; 1618c2ecf20Sopenharmony_ci return done; 1628c2ecf20Sopenharmony_ci} 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci/* 1658c2ecf20Sopenharmony_ci * rm->m_ack_seq is set to the tcp sequence number that corresponds to the 1668c2ecf20Sopenharmony_ci * last byte of the message, including the header. This means that the 1678c2ecf20Sopenharmony_ci * entire message has been received if rm->m_ack_seq is "before" the next 1688c2ecf20Sopenharmony_ci * unacked byte of the TCP sequence space. We have to do very careful 1698c2ecf20Sopenharmony_ci * wrapping 32bit comparisons here. 1708c2ecf20Sopenharmony_ci */ 1718c2ecf20Sopenharmony_cistatic int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) 1728c2ecf20Sopenharmony_ci{ 1738c2ecf20Sopenharmony_ci if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) 1748c2ecf20Sopenharmony_ci return 0; 1758c2ecf20Sopenharmony_ci return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; 1768c2ecf20Sopenharmony_ci} 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_civoid rds_tcp_write_space(struct sock *sk) 1798c2ecf20Sopenharmony_ci{ 1808c2ecf20Sopenharmony_ci void (*write_space)(struct sock *sk); 1818c2ecf20Sopenharmony_ci struct rds_conn_path *cp; 1828c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc; 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_ci read_lock_bh(&sk->sk_callback_lock); 1858c2ecf20Sopenharmony_ci cp = sk->sk_user_data; 1868c2ecf20Sopenharmony_ci if (!cp) { 1878c2ecf20Sopenharmony_ci write_space = sk->sk_write_space; 1888c2ecf20Sopenharmony_ci goto out; 1898c2ecf20Sopenharmony_ci } 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci tc = cp->cp_transport_data; 1928c2ecf20Sopenharmony_ci rdsdebug("write_space for tc %p\n", tc); 1938c2ecf20Sopenharmony_ci write_space = tc->t_orig_write_space; 1948c2ecf20Sopenharmony_ci rds_tcp_stats_inc(s_tcp_write_space_calls); 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); 1978c2ecf20Sopenharmony_ci tc->t_last_seen_una = rds_tcp_snd_una(tc); 1988c2ecf20Sopenharmony_ci rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci rcu_read_lock(); 2018c2ecf20Sopenharmony_ci if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && 2028c2ecf20Sopenharmony_ci !rds_destroy_pending(cp->cp_conn)) 2038c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 0); 2048c2ecf20Sopenharmony_ci rcu_read_unlock(); 2058c2ecf20Sopenharmony_ci 2068c2ecf20Sopenharmony_ciout: 2078c2ecf20Sopenharmony_ci read_unlock_bh(&sk->sk_callback_lock); 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci /* 2108c2ecf20Sopenharmony_ci * write_space is only called when data leaves tcp's send queue if 2118c2ecf20Sopenharmony_ci * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put 2128c2ecf20Sopenharmony_ci * data in tcp's send queue because we use write_space to parse the 2138c2ecf20Sopenharmony_ci * sequence numbers and notice that rds messages have been fully 2148c2ecf20Sopenharmony_ci * received. 2158c2ecf20Sopenharmony_ci * 2168c2ecf20Sopenharmony_ci * tcp's write_space clears SOCK_NOSPACE if the send queue has more 2178c2ecf20Sopenharmony_ci * than a certain amount of space. So we need to set it again *after* 2188c2ecf20Sopenharmony_ci * we call tcp's write_space or else we might only get called on the 2198c2ecf20Sopenharmony_ci * first of a series of incoming tcp acks. 2208c2ecf20Sopenharmony_ci */ 2218c2ecf20Sopenharmony_ci write_space(sk); 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_ci if (sk->sk_socket) 2248c2ecf20Sopenharmony_ci set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2258c2ecf20Sopenharmony_ci} 226