18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two
58c2ecf20Sopenharmony_ci * licenses.  You may choose to be licensed under the terms of the GNU
68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file
78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the
88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below:
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci *     Redistribution and use in source and binary forms, with or
118c2ecf20Sopenharmony_ci *     without modification, are permitted provided that the following
128c2ecf20Sopenharmony_ci *     conditions are met:
138c2ecf20Sopenharmony_ci *
148c2ecf20Sopenharmony_ci *      - Redistributions of source code must retain the above
158c2ecf20Sopenharmony_ci *        copyright notice, this list of conditions and the following
168c2ecf20Sopenharmony_ci *        disclaimer.
178c2ecf20Sopenharmony_ci *
188c2ecf20Sopenharmony_ci *      - Redistributions in binary form must reproduce the above
198c2ecf20Sopenharmony_ci *        copyright notice, this list of conditions and the following
208c2ecf20Sopenharmony_ci *        disclaimer in the documentation and/or other materials
218c2ecf20Sopenharmony_ci *        provided with the distribution.
228c2ecf20Sopenharmony_ci *
238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
308c2ecf20Sopenharmony_ci * SOFTWARE.
318c2ecf20Sopenharmony_ci *
328c2ecf20Sopenharmony_ci */
338c2ecf20Sopenharmony_ci#include <linux/kernel.h>
348c2ecf20Sopenharmony_ci#include <linux/in.h>
358c2ecf20Sopenharmony_ci#include <net/tcp.h>
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci#include "rds.h"
388c2ecf20Sopenharmony_ci#include "tcp.h"
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_civoid rds_tcp_state_change(struct sock *sk)
418c2ecf20Sopenharmony_ci{
428c2ecf20Sopenharmony_ci	void (*state_change)(struct sock *sk);
438c2ecf20Sopenharmony_ci	struct rds_conn_path *cp;
448c2ecf20Sopenharmony_ci	struct rds_tcp_connection *tc;
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	read_lock_bh(&sk->sk_callback_lock);
478c2ecf20Sopenharmony_ci	cp = sk->sk_user_data;
488c2ecf20Sopenharmony_ci	if (!cp) {
498c2ecf20Sopenharmony_ci		state_change = sk->sk_state_change;
508c2ecf20Sopenharmony_ci		goto out;
518c2ecf20Sopenharmony_ci	}
528c2ecf20Sopenharmony_ci	tc = cp->cp_transport_data;
538c2ecf20Sopenharmony_ci	state_change = tc->t_orig_state_change;
548c2ecf20Sopenharmony_ci
558c2ecf20Sopenharmony_ci	rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci	switch (sk->sk_state) {
588c2ecf20Sopenharmony_ci	/* ignore connecting sockets as they make progress */
598c2ecf20Sopenharmony_ci	case TCP_SYN_SENT:
608c2ecf20Sopenharmony_ci	case TCP_SYN_RECV:
618c2ecf20Sopenharmony_ci		break;
628c2ecf20Sopenharmony_ci	case TCP_ESTABLISHED:
638c2ecf20Sopenharmony_ci		/* Force the peer to reconnect so that we have the
648c2ecf20Sopenharmony_ci		 * TCP ports going from <smaller-ip>.<transient> to
658c2ecf20Sopenharmony_ci		 * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
668c2ecf20Sopenharmony_ci		 * RDS connection as RDS_CONN_UP until the reconnect,
678c2ecf20Sopenharmony_ci		 * to avoid RDS datagram loss.
688c2ecf20Sopenharmony_ci		 */
698c2ecf20Sopenharmony_ci		if (rds_addr_cmp(&cp->cp_conn->c_laddr,
708c2ecf20Sopenharmony_ci				 &cp->cp_conn->c_faddr) >= 0 &&
718c2ecf20Sopenharmony_ci		    rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
728c2ecf20Sopenharmony_ci					     RDS_CONN_ERROR)) {
738c2ecf20Sopenharmony_ci			rds_conn_path_drop(cp, false);
748c2ecf20Sopenharmony_ci		} else {
758c2ecf20Sopenharmony_ci			rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
768c2ecf20Sopenharmony_ci		}
778c2ecf20Sopenharmony_ci		break;
788c2ecf20Sopenharmony_ci	case TCP_CLOSE_WAIT:
798c2ecf20Sopenharmony_ci	case TCP_CLOSE:
808c2ecf20Sopenharmony_ci		rds_conn_path_drop(cp, false);
818c2ecf20Sopenharmony_ci	default:
828c2ecf20Sopenharmony_ci		break;
838c2ecf20Sopenharmony_ci	}
848c2ecf20Sopenharmony_ciout:
858c2ecf20Sopenharmony_ci	read_unlock_bh(&sk->sk_callback_lock);
868c2ecf20Sopenharmony_ci	state_change(sk);
878c2ecf20Sopenharmony_ci}
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ciint rds_tcp_conn_path_connect(struct rds_conn_path *cp)
908c2ecf20Sopenharmony_ci{
918c2ecf20Sopenharmony_ci	struct socket *sock = NULL;
928c2ecf20Sopenharmony_ci	struct sockaddr_in6 sin6;
938c2ecf20Sopenharmony_ci	struct sockaddr_in sin;
948c2ecf20Sopenharmony_ci	struct sockaddr *addr;
958c2ecf20Sopenharmony_ci	int addrlen;
968c2ecf20Sopenharmony_ci	bool isv6;
978c2ecf20Sopenharmony_ci	int ret;
988c2ecf20Sopenharmony_ci	struct rds_connection *conn = cp->cp_conn;
998c2ecf20Sopenharmony_ci	struct rds_tcp_connection *tc = cp->cp_transport_data;
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_ci	/* for multipath rds,we only trigger the connection after
1028c2ecf20Sopenharmony_ci	 * the handshake probe has determined the number of paths.
1038c2ecf20Sopenharmony_ci	 */
1048c2ecf20Sopenharmony_ci	if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
1058c2ecf20Sopenharmony_ci		return -EAGAIN;
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	mutex_lock(&tc->t_conn_path_lock);
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci	if (rds_conn_path_up(cp)) {
1108c2ecf20Sopenharmony_ci		mutex_unlock(&tc->t_conn_path_lock);
1118c2ecf20Sopenharmony_ci		return 0;
1128c2ecf20Sopenharmony_ci	}
1138c2ecf20Sopenharmony_ci	if (ipv6_addr_v4mapped(&conn->c_laddr)) {
1148c2ecf20Sopenharmony_ci		ret = sock_create_kern(rds_conn_net(conn), PF_INET,
1158c2ecf20Sopenharmony_ci				       SOCK_STREAM, IPPROTO_TCP, &sock);
1168c2ecf20Sopenharmony_ci		isv6 = false;
1178c2ecf20Sopenharmony_ci	} else {
1188c2ecf20Sopenharmony_ci		ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
1198c2ecf20Sopenharmony_ci				       SOCK_STREAM, IPPROTO_TCP, &sock);
1208c2ecf20Sopenharmony_ci		isv6 = true;
1218c2ecf20Sopenharmony_ci	}
1228c2ecf20Sopenharmony_ci
1238c2ecf20Sopenharmony_ci	if (ret < 0)
1248c2ecf20Sopenharmony_ci		goto out;
1258c2ecf20Sopenharmony_ci
1268c2ecf20Sopenharmony_ci	rds_tcp_tune(sock);
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci	if (isv6) {
1298c2ecf20Sopenharmony_ci		sin6.sin6_family = AF_INET6;
1308c2ecf20Sopenharmony_ci		sin6.sin6_addr = conn->c_laddr;
1318c2ecf20Sopenharmony_ci		sin6.sin6_port = 0;
1328c2ecf20Sopenharmony_ci		sin6.sin6_flowinfo = 0;
1338c2ecf20Sopenharmony_ci		sin6.sin6_scope_id = conn->c_dev_if;
1348c2ecf20Sopenharmony_ci		addr = (struct sockaddr *)&sin6;
1358c2ecf20Sopenharmony_ci		addrlen = sizeof(sin6);
1368c2ecf20Sopenharmony_ci	} else {
1378c2ecf20Sopenharmony_ci		sin.sin_family = AF_INET;
1388c2ecf20Sopenharmony_ci		sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
1398c2ecf20Sopenharmony_ci		sin.sin_port = 0;
1408c2ecf20Sopenharmony_ci		addr = (struct sockaddr *)&sin;
1418c2ecf20Sopenharmony_ci		addrlen = sizeof(sin);
1428c2ecf20Sopenharmony_ci	}
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	ret = kernel_bind(sock, addr, addrlen);
1458c2ecf20Sopenharmony_ci	if (ret) {
1468c2ecf20Sopenharmony_ci		rdsdebug("bind failed with %d at address %pI6c\n",
1478c2ecf20Sopenharmony_ci			 ret, &conn->c_laddr);
1488c2ecf20Sopenharmony_ci		goto out;
1498c2ecf20Sopenharmony_ci	}
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci	if (isv6) {
1528c2ecf20Sopenharmony_ci		sin6.sin6_family = AF_INET6;
1538c2ecf20Sopenharmony_ci		sin6.sin6_addr = conn->c_faddr;
1548c2ecf20Sopenharmony_ci		sin6.sin6_port = htons(RDS_TCP_PORT);
1558c2ecf20Sopenharmony_ci		sin6.sin6_flowinfo = 0;
1568c2ecf20Sopenharmony_ci		sin6.sin6_scope_id = conn->c_dev_if;
1578c2ecf20Sopenharmony_ci		addr = (struct sockaddr *)&sin6;
1588c2ecf20Sopenharmony_ci		addrlen = sizeof(sin6);
1598c2ecf20Sopenharmony_ci	} else {
1608c2ecf20Sopenharmony_ci		sin.sin_family = AF_INET;
1618c2ecf20Sopenharmony_ci		sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
1628c2ecf20Sopenharmony_ci		sin.sin_port = htons(RDS_TCP_PORT);
1638c2ecf20Sopenharmony_ci		addr = (struct sockaddr *)&sin;
1648c2ecf20Sopenharmony_ci		addrlen = sizeof(sin);
1658c2ecf20Sopenharmony_ci	}
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ci	/*
1688c2ecf20Sopenharmony_ci	 * once we call connect() we can start getting callbacks and they
1698c2ecf20Sopenharmony_ci	 * own the socket
1708c2ecf20Sopenharmony_ci	 */
1718c2ecf20Sopenharmony_ci	rds_tcp_set_callbacks(sock, cp);
1728c2ecf20Sopenharmony_ci	ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK);
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci	rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
1758c2ecf20Sopenharmony_ci	if (ret == -EINPROGRESS)
1768c2ecf20Sopenharmony_ci		ret = 0;
1778c2ecf20Sopenharmony_ci	if (ret == 0) {
1788c2ecf20Sopenharmony_ci		rds_tcp_keepalive(sock);
1798c2ecf20Sopenharmony_ci		sock = NULL;
1808c2ecf20Sopenharmony_ci	} else {
1818c2ecf20Sopenharmony_ci		rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
1828c2ecf20Sopenharmony_ci	}
1838c2ecf20Sopenharmony_ci
1848c2ecf20Sopenharmony_ciout:
1858c2ecf20Sopenharmony_ci	mutex_unlock(&tc->t_conn_path_lock);
1868c2ecf20Sopenharmony_ci	if (sock)
1878c2ecf20Sopenharmony_ci		sock_release(sock);
1888c2ecf20Sopenharmony_ci	return ret;
1898c2ecf20Sopenharmony_ci}
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci/*
1928c2ecf20Sopenharmony_ci * Before killing the tcp socket this needs to serialize with callbacks.  The
1938c2ecf20Sopenharmony_ci * caller has already grabbed the sending sem so we're serialized with other
1948c2ecf20Sopenharmony_ci * senders.
1958c2ecf20Sopenharmony_ci *
1968c2ecf20Sopenharmony_ci * TCP calls the callbacks with the sock lock so we hold it while we reset the
1978c2ecf20Sopenharmony_ci * callbacks to those set by TCP.  Our callbacks won't execute again once we
1988c2ecf20Sopenharmony_ci * hold the sock lock.
1998c2ecf20Sopenharmony_ci */
2008c2ecf20Sopenharmony_civoid rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
2018c2ecf20Sopenharmony_ci{
2028c2ecf20Sopenharmony_ci	struct rds_tcp_connection *tc = cp->cp_transport_data;
2038c2ecf20Sopenharmony_ci	struct socket *sock = tc->t_sock;
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	rdsdebug("shutting down conn %p tc %p sock %p\n",
2068c2ecf20Sopenharmony_ci		 cp->cp_conn, tc, sock);
2078c2ecf20Sopenharmony_ci
2088c2ecf20Sopenharmony_ci	if (sock) {
2098c2ecf20Sopenharmony_ci		if (rds_destroy_pending(cp->cp_conn))
2108c2ecf20Sopenharmony_ci			sock_no_linger(sock->sk);
2118c2ecf20Sopenharmony_ci		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
2128c2ecf20Sopenharmony_ci		lock_sock(sock->sk);
2138c2ecf20Sopenharmony_ci		rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
2148c2ecf20Sopenharmony_ci
2158c2ecf20Sopenharmony_ci		release_sock(sock->sk);
2168c2ecf20Sopenharmony_ci		sock_release(sock);
2178c2ecf20Sopenharmony_ci	}
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_ci	if (tc->t_tinc) {
2208c2ecf20Sopenharmony_ci		rds_inc_put(&tc->t_tinc->ti_inc);
2218c2ecf20Sopenharmony_ci		tc->t_tinc = NULL;
2228c2ecf20Sopenharmony_ci	}
2238c2ecf20Sopenharmony_ci	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
2248c2ecf20Sopenharmony_ci	tc->t_tinc_data_rem = 0;
2258c2ecf20Sopenharmony_ci}
226