18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/in.h> 358c2ecf20Sopenharmony_ci#include <net/tcp.h> 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci#include "rds.h" 388c2ecf20Sopenharmony_ci#include "tcp.h" 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_civoid rds_tcp_state_change(struct sock *sk) 418c2ecf20Sopenharmony_ci{ 428c2ecf20Sopenharmony_ci void (*state_change)(struct sock *sk); 438c2ecf20Sopenharmony_ci struct rds_conn_path *cp; 448c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc; 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci read_lock_bh(&sk->sk_callback_lock); 478c2ecf20Sopenharmony_ci cp = sk->sk_user_data; 488c2ecf20Sopenharmony_ci if (!cp) { 498c2ecf20Sopenharmony_ci state_change = sk->sk_state_change; 508c2ecf20Sopenharmony_ci goto out; 518c2ecf20Sopenharmony_ci } 528c2ecf20Sopenharmony_ci tc = cp->cp_transport_data; 538c2ecf20Sopenharmony_ci state_change = tc->t_orig_state_change; 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_ci rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); 568c2ecf20Sopenharmony_ci 578c2ecf20Sopenharmony_ci switch (sk->sk_state) { 588c2ecf20Sopenharmony_ci /* ignore connecting sockets as they make progress */ 598c2ecf20Sopenharmony_ci case TCP_SYN_SENT: 608c2ecf20Sopenharmony_ci case TCP_SYN_RECV: 618c2ecf20Sopenharmony_ci break; 628c2ecf20Sopenharmony_ci case TCP_ESTABLISHED: 638c2ecf20Sopenharmony_ci /* Force the peer to reconnect so that we have the 648c2ecf20Sopenharmony_ci * TCP ports going from <smaller-ip>.<transient> to 658c2ecf20Sopenharmony_ci * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the 668c2ecf20Sopenharmony_ci * RDS connection as RDS_CONN_UP until the reconnect, 678c2ecf20Sopenharmony_ci * to avoid RDS datagram loss. 688c2ecf20Sopenharmony_ci */ 698c2ecf20Sopenharmony_ci if (rds_addr_cmp(&cp->cp_conn->c_laddr, 708c2ecf20Sopenharmony_ci &cp->cp_conn->c_faddr) >= 0 && 718c2ecf20Sopenharmony_ci rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 728c2ecf20Sopenharmony_ci RDS_CONN_ERROR)) { 738c2ecf20Sopenharmony_ci rds_conn_path_drop(cp, false); 748c2ecf20Sopenharmony_ci } else { 758c2ecf20Sopenharmony_ci rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 768c2ecf20Sopenharmony_ci } 778c2ecf20Sopenharmony_ci break; 788c2ecf20Sopenharmony_ci case TCP_CLOSE_WAIT: 798c2ecf20Sopenharmony_ci case TCP_CLOSE: 808c2ecf20Sopenharmony_ci rds_conn_path_drop(cp, false); 818c2ecf20Sopenharmony_ci default: 828c2ecf20Sopenharmony_ci break; 838c2ecf20Sopenharmony_ci } 848c2ecf20Sopenharmony_ciout: 858c2ecf20Sopenharmony_ci read_unlock_bh(&sk->sk_callback_lock); 868c2ecf20Sopenharmony_ci state_change(sk); 878c2ecf20Sopenharmony_ci} 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ciint rds_tcp_conn_path_connect(struct rds_conn_path *cp) 908c2ecf20Sopenharmony_ci{ 918c2ecf20Sopenharmony_ci struct socket *sock = NULL; 928c2ecf20Sopenharmony_ci struct sockaddr_in6 sin6; 938c2ecf20Sopenharmony_ci struct sockaddr_in sin; 948c2ecf20Sopenharmony_ci struct sockaddr *addr; 958c2ecf20Sopenharmony_ci int addrlen; 968c2ecf20Sopenharmony_ci bool isv6; 978c2ecf20Sopenharmony_ci int ret; 988c2ecf20Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 998c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci /* for multipath rds,we only trigger the connection after 1028c2ecf20Sopenharmony_ci * the handshake probe has determined the number of paths. 1038c2ecf20Sopenharmony_ci */ 1048c2ecf20Sopenharmony_ci if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) 1058c2ecf20Sopenharmony_ci return -EAGAIN; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci mutex_lock(&tc->t_conn_path_lock); 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci if (rds_conn_path_up(cp)) { 1108c2ecf20Sopenharmony_ci mutex_unlock(&tc->t_conn_path_lock); 1118c2ecf20Sopenharmony_ci return 0; 1128c2ecf20Sopenharmony_ci } 1138c2ecf20Sopenharmony_ci if (ipv6_addr_v4mapped(&conn->c_laddr)) { 1148c2ecf20Sopenharmony_ci ret = sock_create_kern(rds_conn_net(conn), PF_INET, 1158c2ecf20Sopenharmony_ci SOCK_STREAM, IPPROTO_TCP, &sock); 1168c2ecf20Sopenharmony_ci isv6 = false; 1178c2ecf20Sopenharmony_ci } else { 1188c2ecf20Sopenharmony_ci ret = sock_create_kern(rds_conn_net(conn), PF_INET6, 1198c2ecf20Sopenharmony_ci SOCK_STREAM, IPPROTO_TCP, &sock); 1208c2ecf20Sopenharmony_ci isv6 = true; 1218c2ecf20Sopenharmony_ci } 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci if (ret < 0) 1248c2ecf20Sopenharmony_ci goto out; 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci rds_tcp_tune(sock); 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_ci if (isv6) { 1298c2ecf20Sopenharmony_ci sin6.sin6_family = AF_INET6; 1308c2ecf20Sopenharmony_ci sin6.sin6_addr = conn->c_laddr; 1318c2ecf20Sopenharmony_ci sin6.sin6_port = 0; 1328c2ecf20Sopenharmony_ci sin6.sin6_flowinfo = 0; 1338c2ecf20Sopenharmony_ci sin6.sin6_scope_id = conn->c_dev_if; 1348c2ecf20Sopenharmony_ci addr = (struct sockaddr *)&sin6; 1358c2ecf20Sopenharmony_ci addrlen = sizeof(sin6); 1368c2ecf20Sopenharmony_ci } else { 1378c2ecf20Sopenharmony_ci sin.sin_family = AF_INET; 1388c2ecf20Sopenharmony_ci sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 1398c2ecf20Sopenharmony_ci sin.sin_port = 0; 1408c2ecf20Sopenharmony_ci addr = (struct sockaddr *)&sin; 1418c2ecf20Sopenharmony_ci addrlen = sizeof(sin); 1428c2ecf20Sopenharmony_ci } 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci ret = kernel_bind(sock, addr, addrlen); 1458c2ecf20Sopenharmony_ci if (ret) { 1468c2ecf20Sopenharmony_ci rdsdebug("bind failed with %d at address %pI6c\n", 1478c2ecf20Sopenharmony_ci ret, &conn->c_laddr); 1488c2ecf20Sopenharmony_ci goto out; 1498c2ecf20Sopenharmony_ci } 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci if (isv6) { 1528c2ecf20Sopenharmony_ci sin6.sin6_family = AF_INET6; 1538c2ecf20Sopenharmony_ci sin6.sin6_addr = conn->c_faddr; 1548c2ecf20Sopenharmony_ci sin6.sin6_port = htons(RDS_TCP_PORT); 1558c2ecf20Sopenharmony_ci sin6.sin6_flowinfo = 0; 1568c2ecf20Sopenharmony_ci sin6.sin6_scope_id = conn->c_dev_if; 1578c2ecf20Sopenharmony_ci addr = (struct sockaddr *)&sin6; 1588c2ecf20Sopenharmony_ci addrlen = sizeof(sin6); 1598c2ecf20Sopenharmony_ci } else { 1608c2ecf20Sopenharmony_ci sin.sin_family = AF_INET; 1618c2ecf20Sopenharmony_ci sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 1628c2ecf20Sopenharmony_ci sin.sin_port = htons(RDS_TCP_PORT); 1638c2ecf20Sopenharmony_ci addr = (struct sockaddr *)&sin; 1648c2ecf20Sopenharmony_ci addrlen = sizeof(sin); 1658c2ecf20Sopenharmony_ci } 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci /* 1688c2ecf20Sopenharmony_ci * once we call connect() we can start getting callbacks and they 1698c2ecf20Sopenharmony_ci * own the socket 1708c2ecf20Sopenharmony_ci */ 1718c2ecf20Sopenharmony_ci rds_tcp_set_callbacks(sock, cp); 1728c2ecf20Sopenharmony_ci ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_ci rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); 1758c2ecf20Sopenharmony_ci if (ret == -EINPROGRESS) 1768c2ecf20Sopenharmony_ci ret = 0; 1778c2ecf20Sopenharmony_ci if (ret == 0) { 1788c2ecf20Sopenharmony_ci rds_tcp_keepalive(sock); 1798c2ecf20Sopenharmony_ci sock = NULL; 1808c2ecf20Sopenharmony_ci } else { 1818c2ecf20Sopenharmony_ci rds_tcp_restore_callbacks(sock, cp->cp_transport_data); 1828c2ecf20Sopenharmony_ci } 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_ciout: 1858c2ecf20Sopenharmony_ci mutex_unlock(&tc->t_conn_path_lock); 1868c2ecf20Sopenharmony_ci if (sock) 1878c2ecf20Sopenharmony_ci sock_release(sock); 1888c2ecf20Sopenharmony_ci return ret; 1898c2ecf20Sopenharmony_ci} 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci/* 1928c2ecf20Sopenharmony_ci * Before killing the tcp socket this needs to serialize with callbacks. The 1938c2ecf20Sopenharmony_ci * caller has already grabbed the sending sem so we're serialized with other 1948c2ecf20Sopenharmony_ci * senders. 1958c2ecf20Sopenharmony_ci * 1968c2ecf20Sopenharmony_ci * TCP calls the callbacks with the sock lock so we hold it while we reset the 1978c2ecf20Sopenharmony_ci * callbacks to those set by TCP. Our callbacks won't execute again once we 1988c2ecf20Sopenharmony_ci * hold the sock lock. 1998c2ecf20Sopenharmony_ci */ 2008c2ecf20Sopenharmony_civoid rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) 2018c2ecf20Sopenharmony_ci{ 2028c2ecf20Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 2038c2ecf20Sopenharmony_ci struct socket *sock = tc->t_sock; 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci rdsdebug("shutting down conn %p tc %p sock %p\n", 2068c2ecf20Sopenharmony_ci cp->cp_conn, tc, sock); 2078c2ecf20Sopenharmony_ci 2088c2ecf20Sopenharmony_ci if (sock) { 2098c2ecf20Sopenharmony_ci if (rds_destroy_pending(cp->cp_conn)) 2108c2ecf20Sopenharmony_ci sock_no_linger(sock->sk); 2118c2ecf20Sopenharmony_ci sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); 2128c2ecf20Sopenharmony_ci lock_sock(sock->sk); 2138c2ecf20Sopenharmony_ci rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ 2148c2ecf20Sopenharmony_ci 2158c2ecf20Sopenharmony_ci release_sock(sock->sk); 2168c2ecf20Sopenharmony_ci sock_release(sock); 2178c2ecf20Sopenharmony_ci } 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci if (tc->t_tinc) { 2208c2ecf20Sopenharmony_ci rds_inc_put(&tc->t_tinc->ti_inc); 2218c2ecf20Sopenharmony_ci tc->t_tinc = NULL; 2228c2ecf20Sopenharmony_ci } 2238c2ecf20Sopenharmony_ci tc->t_tinc_hdr_rem = sizeof(struct rds_header); 2248c2ecf20Sopenharmony_ci tc->t_tinc_data_rem = 0; 2258c2ecf20Sopenharmony_ci} 226