162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * This software is available to you under a choice of one of two 562306a36Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 662306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 762306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the 862306a36Sopenharmony_ci * OpenIB.org BSD license below: 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * Redistribution and use in source and binary forms, with or 1162306a36Sopenharmony_ci * without modification, are permitted provided that the following 1262306a36Sopenharmony_ci * conditions are met: 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * - Redistributions of source code must retain the above 1562306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 1662306a36Sopenharmony_ci * disclaimer. 1762306a36Sopenharmony_ci * 1862306a36Sopenharmony_ci * - Redistributions in binary form must reproduce the above 1962306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 2062306a36Sopenharmony_ci * disclaimer in the documentation and/or other materials 2162306a36Sopenharmony_ci * provided with the distribution. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2462306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2562306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2662306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2762306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2862306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2962306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3062306a36Sopenharmony_ci * SOFTWARE. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci */ 3362306a36Sopenharmony_ci#include <linux/kernel.h> 3462306a36Sopenharmony_ci#include <linux/in.h> 3562306a36Sopenharmony_ci#include <net/tcp.h> 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#include "rds.h" 3862306a36Sopenharmony_ci#include "tcp.h" 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_civoid rds_tcp_state_change(struct sock *sk) 4162306a36Sopenharmony_ci{ 4262306a36Sopenharmony_ci void (*state_change)(struct sock *sk); 4362306a36Sopenharmony_ci struct rds_conn_path *cp; 4462306a36Sopenharmony_ci struct rds_tcp_connection *tc; 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci read_lock_bh(&sk->sk_callback_lock); 4762306a36Sopenharmony_ci cp = sk->sk_user_data; 4862306a36Sopenharmony_ci if (!cp) { 4962306a36Sopenharmony_ci state_change = sk->sk_state_change; 5062306a36Sopenharmony_ci goto out; 5162306a36Sopenharmony_ci } 5262306a36Sopenharmony_ci tc = cp->cp_transport_data; 5362306a36Sopenharmony_ci state_change = tc->t_orig_state_change; 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_ci rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_ci switch (sk->sk_state) { 5862306a36Sopenharmony_ci /* ignore connecting sockets as they make progress */ 5962306a36Sopenharmony_ci case TCP_SYN_SENT: 6062306a36Sopenharmony_ci case TCP_SYN_RECV: 6162306a36Sopenharmony_ci break; 6262306a36Sopenharmony_ci case TCP_ESTABLISHED: 6362306a36Sopenharmony_ci /* Force the peer to reconnect so that we have the 6462306a36Sopenharmony_ci * TCP ports going from <smaller-ip>.<transient> to 6562306a36Sopenharmony_ci * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the 6662306a36Sopenharmony_ci * RDS connection as RDS_CONN_UP until the reconnect, 6762306a36Sopenharmony_ci * to avoid RDS datagram loss. 6862306a36Sopenharmony_ci */ 6962306a36Sopenharmony_ci if (rds_addr_cmp(&cp->cp_conn->c_laddr, 7062306a36Sopenharmony_ci &cp->cp_conn->c_faddr) >= 0 && 7162306a36Sopenharmony_ci rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 7262306a36Sopenharmony_ci RDS_CONN_ERROR)) { 7362306a36Sopenharmony_ci rds_conn_path_drop(cp, false); 7462306a36Sopenharmony_ci } else { 7562306a36Sopenharmony_ci rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 7662306a36Sopenharmony_ci } 7762306a36Sopenharmony_ci break; 7862306a36Sopenharmony_ci case TCP_CLOSE_WAIT: 7962306a36Sopenharmony_ci case TCP_CLOSE: 8062306a36Sopenharmony_ci rds_conn_path_drop(cp, false); 8162306a36Sopenharmony_ci break; 8262306a36Sopenharmony_ci default: 8362306a36Sopenharmony_ci break; 8462306a36Sopenharmony_ci } 8562306a36Sopenharmony_ciout: 8662306a36Sopenharmony_ci read_unlock_bh(&sk->sk_callback_lock); 8762306a36Sopenharmony_ci state_change(sk); 8862306a36Sopenharmony_ci} 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ciint rds_tcp_conn_path_connect(struct rds_conn_path *cp) 9162306a36Sopenharmony_ci{ 9262306a36Sopenharmony_ci struct socket *sock = NULL; 9362306a36Sopenharmony_ci struct sockaddr_in6 sin6; 9462306a36Sopenharmony_ci struct sockaddr_in sin; 9562306a36Sopenharmony_ci struct sockaddr *addr; 9662306a36Sopenharmony_ci int addrlen; 9762306a36Sopenharmony_ci bool isv6; 9862306a36Sopenharmony_ci int ret; 9962306a36Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 10062306a36Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci /* for multipath rds,we only trigger the connection after 10362306a36Sopenharmony_ci * the handshake probe has determined the number of paths. 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_ci if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) 10662306a36Sopenharmony_ci return -EAGAIN; 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci mutex_lock(&tc->t_conn_path_lock); 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci if (rds_conn_path_up(cp)) { 11162306a36Sopenharmony_ci mutex_unlock(&tc->t_conn_path_lock); 11262306a36Sopenharmony_ci return 0; 11362306a36Sopenharmony_ci } 11462306a36Sopenharmony_ci if (ipv6_addr_v4mapped(&conn->c_laddr)) { 11562306a36Sopenharmony_ci ret = sock_create_kern(rds_conn_net(conn), PF_INET, 11662306a36Sopenharmony_ci SOCK_STREAM, IPPROTO_TCP, &sock); 11762306a36Sopenharmony_ci isv6 = false; 11862306a36Sopenharmony_ci } else { 11962306a36Sopenharmony_ci ret = sock_create_kern(rds_conn_net(conn), PF_INET6, 12062306a36Sopenharmony_ci SOCK_STREAM, IPPROTO_TCP, &sock); 12162306a36Sopenharmony_ci isv6 = true; 12262306a36Sopenharmony_ci } 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci if (ret < 0) 12562306a36Sopenharmony_ci goto out; 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci if (!rds_tcp_tune(sock)) { 12862306a36Sopenharmony_ci ret = -EINVAL; 12962306a36Sopenharmony_ci goto out; 13062306a36Sopenharmony_ci } 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_ci if (isv6) { 13362306a36Sopenharmony_ci sin6.sin6_family = AF_INET6; 13462306a36Sopenharmony_ci sin6.sin6_addr = conn->c_laddr; 13562306a36Sopenharmony_ci sin6.sin6_port = 0; 13662306a36Sopenharmony_ci sin6.sin6_flowinfo = 0; 13762306a36Sopenharmony_ci sin6.sin6_scope_id = conn->c_dev_if; 13862306a36Sopenharmony_ci addr = (struct sockaddr *)&sin6; 13962306a36Sopenharmony_ci addrlen = sizeof(sin6); 14062306a36Sopenharmony_ci } else { 14162306a36Sopenharmony_ci sin.sin_family = AF_INET; 14262306a36Sopenharmony_ci sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 14362306a36Sopenharmony_ci sin.sin_port = 0; 14462306a36Sopenharmony_ci addr = (struct sockaddr *)&sin; 14562306a36Sopenharmony_ci addrlen = sizeof(sin); 14662306a36Sopenharmony_ci } 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci ret = kernel_bind(sock, addr, addrlen); 14962306a36Sopenharmony_ci if (ret) { 15062306a36Sopenharmony_ci rdsdebug("bind failed with %d at address %pI6c\n", 15162306a36Sopenharmony_ci ret, &conn->c_laddr); 15262306a36Sopenharmony_ci goto out; 15362306a36Sopenharmony_ci } 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci if (isv6) { 15662306a36Sopenharmony_ci sin6.sin6_family = AF_INET6; 15762306a36Sopenharmony_ci sin6.sin6_addr = conn->c_faddr; 15862306a36Sopenharmony_ci sin6.sin6_port = htons(RDS_TCP_PORT); 15962306a36Sopenharmony_ci sin6.sin6_flowinfo = 0; 16062306a36Sopenharmony_ci sin6.sin6_scope_id = conn->c_dev_if; 16162306a36Sopenharmony_ci addr = (struct sockaddr *)&sin6; 16262306a36Sopenharmony_ci addrlen = sizeof(sin6); 16362306a36Sopenharmony_ci } else { 16462306a36Sopenharmony_ci sin.sin_family = AF_INET; 16562306a36Sopenharmony_ci sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 16662306a36Sopenharmony_ci sin.sin_port = htons(RDS_TCP_PORT); 16762306a36Sopenharmony_ci addr = (struct sockaddr *)&sin; 16862306a36Sopenharmony_ci addrlen = sizeof(sin); 16962306a36Sopenharmony_ci } 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci /* 17262306a36Sopenharmony_ci * once we call connect() we can start getting callbacks and they 17362306a36Sopenharmony_ci * own the socket 17462306a36Sopenharmony_ci */ 17562306a36Sopenharmony_ci rds_tcp_set_callbacks(sock, cp); 17662306a36Sopenharmony_ci ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); 17962306a36Sopenharmony_ci if (ret == -EINPROGRESS) 18062306a36Sopenharmony_ci ret = 0; 18162306a36Sopenharmony_ci if (ret == 0) { 18262306a36Sopenharmony_ci rds_tcp_keepalive(sock); 18362306a36Sopenharmony_ci sock = NULL; 18462306a36Sopenharmony_ci } else { 18562306a36Sopenharmony_ci rds_tcp_restore_callbacks(sock, cp->cp_transport_data); 18662306a36Sopenharmony_ci } 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ciout: 18962306a36Sopenharmony_ci mutex_unlock(&tc->t_conn_path_lock); 19062306a36Sopenharmony_ci if (sock) 19162306a36Sopenharmony_ci sock_release(sock); 19262306a36Sopenharmony_ci return ret; 19362306a36Sopenharmony_ci} 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci/* 19662306a36Sopenharmony_ci * Before killing the tcp socket this needs to serialize with callbacks. The 19762306a36Sopenharmony_ci * caller has already grabbed the sending sem so we're serialized with other 19862306a36Sopenharmony_ci * senders. 19962306a36Sopenharmony_ci * 20062306a36Sopenharmony_ci * TCP calls the callbacks with the sock lock so we hold it while we reset the 20162306a36Sopenharmony_ci * callbacks to those set by TCP. Our callbacks won't execute again once we 20262306a36Sopenharmony_ci * hold the sock lock. 20362306a36Sopenharmony_ci */ 20462306a36Sopenharmony_civoid rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) 20562306a36Sopenharmony_ci{ 20662306a36Sopenharmony_ci struct rds_tcp_connection *tc = cp->cp_transport_data; 20762306a36Sopenharmony_ci struct socket *sock = tc->t_sock; 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci rdsdebug("shutting down conn %p tc %p sock %p\n", 21062306a36Sopenharmony_ci cp->cp_conn, tc, sock); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci if (sock) { 21362306a36Sopenharmony_ci if (rds_destroy_pending(cp->cp_conn)) 21462306a36Sopenharmony_ci sock_no_linger(sock->sk); 21562306a36Sopenharmony_ci sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); 21662306a36Sopenharmony_ci lock_sock(sock->sk); 21762306a36Sopenharmony_ci rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci release_sock(sock->sk); 22062306a36Sopenharmony_ci sock_release(sock); 22162306a36Sopenharmony_ci } 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci if (tc->t_tinc) { 22462306a36Sopenharmony_ci rds_inc_put(&tc->t_tinc->ti_inc); 22562306a36Sopenharmony_ci tc->t_tinc = NULL; 22662306a36Sopenharmony_ci } 22762306a36Sopenharmony_ci tc->t_tinc_hdr_rem = sizeof(struct rds_header); 22862306a36Sopenharmony_ci tc->t_tinc_data_rem = 0; 22962306a36Sopenharmony_ci} 230