162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * This software is available to you under a choice of one of two
562306a36Sopenharmony_ci * licenses.  You may choose to be licensed under the terms of the GNU
662306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file
762306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the
862306a36Sopenharmony_ci * OpenIB.org BSD license below:
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci *     Redistribution and use in source and binary forms, with or
1162306a36Sopenharmony_ci *     without modification, are permitted provided that the following
1262306a36Sopenharmony_ci *     conditions are met:
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci *      - Redistributions of source code must retain the above
1562306a36Sopenharmony_ci *        copyright notice, this list of conditions and the following
1662306a36Sopenharmony_ci *        disclaimer.
1762306a36Sopenharmony_ci *
1862306a36Sopenharmony_ci *      - Redistributions in binary form must reproduce the above
1962306a36Sopenharmony_ci *        copyright notice, this list of conditions and the following
2062306a36Sopenharmony_ci *        disclaimer in the documentation and/or other materials
2162306a36Sopenharmony_ci *        provided with the distribution.
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2462306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2562306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2662306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2762306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2862306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2962306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3062306a36Sopenharmony_ci * SOFTWARE.
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci */
3362306a36Sopenharmony_ci#include <linux/kernel.h>
3462306a36Sopenharmony_ci#include <linux/random.h>
3562306a36Sopenharmony_ci#include <linux/export.h>
3662306a36Sopenharmony_ci
3762306a36Sopenharmony_ci#include "rds.h"
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci/*
4062306a36Sopenharmony_ci * All of connection management is simplified by serializing it through
4162306a36Sopenharmony_ci * work queues that execute in a connection managing thread.
4262306a36Sopenharmony_ci *
4362306a36Sopenharmony_ci * TCP wants to send acks through sendpage() in response to data_ready(),
4462306a36Sopenharmony_ci * but it needs a process context to do so.
4562306a36Sopenharmony_ci *
4662306a36Sopenharmony_ci * The receive paths need to allocate but can't drop packets (!) so we have
4762306a36Sopenharmony_ci * a thread around to block allocating if the receive fast path sees an
4862306a36Sopenharmony_ci * allocation failure.
4962306a36Sopenharmony_ci */
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci/* Grand Unified Theory of connection life cycle:
5262306a36Sopenharmony_ci * At any point in time, the connection can be in one of these states:
5362306a36Sopenharmony_ci * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
5462306a36Sopenharmony_ci *
5562306a36Sopenharmony_ci * The following transitions are possible:
5662306a36Sopenharmony_ci *  ANY		  -> ERROR
5762306a36Sopenharmony_ci *  UP		  -> DISCONNECTING
5862306a36Sopenharmony_ci *  ERROR	  -> DISCONNECTING
5962306a36Sopenharmony_ci *  DISCONNECTING -> DOWN
6062306a36Sopenharmony_ci *  DOWN	  -> CONNECTING
6162306a36Sopenharmony_ci *  CONNECTING	  -> UP
6262306a36Sopenharmony_ci *
6362306a36Sopenharmony_ci * Transition to state DISCONNECTING/DOWN:
6462306a36Sopenharmony_ci *  -	Inside the shutdown worker; synchronizes with xmit path
6562306a36Sopenharmony_ci *	through RDS_IN_XMIT, and with connection management callbacks
6662306a36Sopenharmony_ci *	via c_cm_lock.
6762306a36Sopenharmony_ci *
6862306a36Sopenharmony_ci *	For receive callbacks, we rely on the underlying transport
6962306a36Sopenharmony_ci *	(TCP, IB/RDMA) to provide the necessary synchronisation.
7062306a36Sopenharmony_ci */
/*
 * Workqueue used by the RDS worker functions in this file.  It is created
 * as the single-threaded "krdsd" workqueue in rds_threads_init(), destroyed
 * in rds_threads_exit(), and is the queue onto which the per-path connect,
 * send and receive delayed work items are placed.  Exported (GPL-only) so
 * that code outside this file can queue work on it as well.
 */
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_civoid rds_connect_path_complete(struct rds_conn_path *cp, int curr)
7562306a36Sopenharmony_ci{
7662306a36Sopenharmony_ci	if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) {
7762306a36Sopenharmony_ci		printk(KERN_WARNING "%s: Cannot transition to state UP, "
7862306a36Sopenharmony_ci				"current state is %d\n",
7962306a36Sopenharmony_ci				__func__,
8062306a36Sopenharmony_ci				atomic_read(&cp->cp_state));
8162306a36Sopenharmony_ci		rds_conn_path_drop(cp, false);
8262306a36Sopenharmony_ci		return;
8362306a36Sopenharmony_ci	}
8462306a36Sopenharmony_ci
8562306a36Sopenharmony_ci	rdsdebug("conn %p for %pI6c to %pI6c complete\n",
8662306a36Sopenharmony_ci		 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
8762306a36Sopenharmony_ci
8862306a36Sopenharmony_ci	cp->cp_reconnect_jiffies = 0;
8962306a36Sopenharmony_ci	set_bit(0, &cp->cp_conn->c_map_queued);
9062306a36Sopenharmony_ci	rcu_read_lock();
9162306a36Sopenharmony_ci	if (!rds_destroy_pending(cp->cp_conn)) {
9262306a36Sopenharmony_ci		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
9362306a36Sopenharmony_ci		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
9462306a36Sopenharmony_ci	}
9562306a36Sopenharmony_ci	rcu_read_unlock();
9662306a36Sopenharmony_ci	cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
9762306a36Sopenharmony_ci}
9862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_path_complete);
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_civoid rds_connect_complete(struct rds_connection *conn)
10162306a36Sopenharmony_ci{
10262306a36Sopenharmony_ci	rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING);
10362306a36Sopenharmony_ci}
10462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_complete);
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci/*
10762306a36Sopenharmony_ci * This random exponential backoff is relied on to eventually resolve racing
10862306a36Sopenharmony_ci * connects.
10962306a36Sopenharmony_ci *
11062306a36Sopenharmony_ci * If connect attempts race then both parties drop both connections and come
11162306a36Sopenharmony_ci * here to wait for a random amount of time before trying again.  Eventually
11262306a36Sopenharmony_ci * the backoff range will be so much greater than the time it takes to
11362306a36Sopenharmony_ci * establish a connection that one of the pair will establish the connection
11462306a36Sopenharmony_ci * before the other's random delay fires.
11562306a36Sopenharmony_ci *
11662306a36Sopenharmony_ci * Connection attempts that arrive while a connection is already established
11762306a36Sopenharmony_ci * are also considered to be racing connects.  This lets a connection from
11862306a36Sopenharmony_ci * a rebooted machine replace an existing stale connection before the transport
11962306a36Sopenharmony_ci * notices that the connection has failed.
12062306a36Sopenharmony_ci *
12162306a36Sopenharmony_ci * We should *always* start with a random backoff; otherwise a broken connection
12262306a36Sopenharmony_ci * will always take several iterations to be re-established.
12362306a36Sopenharmony_ci */
12462306a36Sopenharmony_civoid rds_queue_reconnect(struct rds_conn_path *cp)
12562306a36Sopenharmony_ci{
12662306a36Sopenharmony_ci	unsigned long rand;
12762306a36Sopenharmony_ci	struct rds_connection *conn = cp->cp_conn;
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci	rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
13062306a36Sopenharmony_ci		 conn, &conn->c_laddr, &conn->c_faddr,
13162306a36Sopenharmony_ci		 cp->cp_reconnect_jiffies);
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci	/* let peer with smaller addr initiate reconnect, to avoid duels */
13462306a36Sopenharmony_ci	if (conn->c_trans->t_type == RDS_TRANS_TCP &&
13562306a36Sopenharmony_ci	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
13662306a36Sopenharmony_ci		return;
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
13962306a36Sopenharmony_ci	if (cp->cp_reconnect_jiffies == 0) {
14062306a36Sopenharmony_ci		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
14162306a36Sopenharmony_ci		rcu_read_lock();
14262306a36Sopenharmony_ci		if (!rds_destroy_pending(cp->cp_conn))
14362306a36Sopenharmony_ci			queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
14462306a36Sopenharmony_ci		rcu_read_unlock();
14562306a36Sopenharmony_ci		return;
14662306a36Sopenharmony_ci	}
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_ci	get_random_bytes(&rand, sizeof(rand));
14962306a36Sopenharmony_ci	rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
15062306a36Sopenharmony_ci		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
15162306a36Sopenharmony_ci		 conn, &conn->c_laddr, &conn->c_faddr);
15262306a36Sopenharmony_ci	rcu_read_lock();
15362306a36Sopenharmony_ci	if (!rds_destroy_pending(cp->cp_conn))
15462306a36Sopenharmony_ci		queue_delayed_work(rds_wq, &cp->cp_conn_w,
15562306a36Sopenharmony_ci				   rand % cp->cp_reconnect_jiffies);
15662306a36Sopenharmony_ci	rcu_read_unlock();
15762306a36Sopenharmony_ci
15862306a36Sopenharmony_ci	cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
15962306a36Sopenharmony_ci					rds_sysctl_reconnect_max_jiffies);
16062306a36Sopenharmony_ci}
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_civoid rds_connect_worker(struct work_struct *work)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	struct rds_conn_path *cp = container_of(work,
16562306a36Sopenharmony_ci						struct rds_conn_path,
16662306a36Sopenharmony_ci						cp_conn_w.work);
16762306a36Sopenharmony_ci	struct rds_connection *conn = cp->cp_conn;
16862306a36Sopenharmony_ci	int ret;
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	if (cp->cp_index > 0 &&
17162306a36Sopenharmony_ci	    rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
17262306a36Sopenharmony_ci		return;
17362306a36Sopenharmony_ci	clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
17462306a36Sopenharmony_ci	ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
17562306a36Sopenharmony_ci	if (ret) {
17662306a36Sopenharmony_ci		ret = conn->c_trans->conn_path_connect(cp);
17762306a36Sopenharmony_ci		rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
17862306a36Sopenharmony_ci			 conn, &conn->c_laddr, &conn->c_faddr, ret);
17962306a36Sopenharmony_ci
18062306a36Sopenharmony_ci		if (ret) {
18162306a36Sopenharmony_ci			if (rds_conn_path_transition(cp,
18262306a36Sopenharmony_ci						     RDS_CONN_CONNECTING,
18362306a36Sopenharmony_ci						     RDS_CONN_DOWN))
18462306a36Sopenharmony_ci				rds_queue_reconnect(cp);
18562306a36Sopenharmony_ci			else
18662306a36Sopenharmony_ci				rds_conn_path_error(cp, "connect failed\n");
18762306a36Sopenharmony_ci		}
18862306a36Sopenharmony_ci	}
18962306a36Sopenharmony_ci}
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_civoid rds_send_worker(struct work_struct *work)
19262306a36Sopenharmony_ci{
19362306a36Sopenharmony_ci	struct rds_conn_path *cp = container_of(work,
19462306a36Sopenharmony_ci						struct rds_conn_path,
19562306a36Sopenharmony_ci						cp_send_w.work);
19662306a36Sopenharmony_ci	int ret;
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
19962306a36Sopenharmony_ci		clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
20062306a36Sopenharmony_ci		ret = rds_send_xmit(cp);
20162306a36Sopenharmony_ci		cond_resched();
20262306a36Sopenharmony_ci		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
20362306a36Sopenharmony_ci		switch (ret) {
20462306a36Sopenharmony_ci		case -EAGAIN:
20562306a36Sopenharmony_ci			rds_stats_inc(s_send_immediate_retry);
20662306a36Sopenharmony_ci			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
20762306a36Sopenharmony_ci			break;
20862306a36Sopenharmony_ci		case -ENOMEM:
20962306a36Sopenharmony_ci			rds_stats_inc(s_send_delayed_retry);
21062306a36Sopenharmony_ci			queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
21162306a36Sopenharmony_ci			break;
21262306a36Sopenharmony_ci		default:
21362306a36Sopenharmony_ci			break;
21462306a36Sopenharmony_ci		}
21562306a36Sopenharmony_ci	}
21662306a36Sopenharmony_ci}
21762306a36Sopenharmony_ci
21862306a36Sopenharmony_civoid rds_recv_worker(struct work_struct *work)
21962306a36Sopenharmony_ci{
22062306a36Sopenharmony_ci	struct rds_conn_path *cp = container_of(work,
22162306a36Sopenharmony_ci						struct rds_conn_path,
22262306a36Sopenharmony_ci						cp_recv_w.work);
22362306a36Sopenharmony_ci	int ret;
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_ci	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
22662306a36Sopenharmony_ci		ret = cp->cp_conn->c_trans->recv_path(cp);
22762306a36Sopenharmony_ci		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
22862306a36Sopenharmony_ci		switch (ret) {
22962306a36Sopenharmony_ci		case -EAGAIN:
23062306a36Sopenharmony_ci			rds_stats_inc(s_recv_immediate_retry);
23162306a36Sopenharmony_ci			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
23262306a36Sopenharmony_ci			break;
23362306a36Sopenharmony_ci		case -ENOMEM:
23462306a36Sopenharmony_ci			rds_stats_inc(s_recv_delayed_retry);
23562306a36Sopenharmony_ci			queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
23662306a36Sopenharmony_ci			break;
23762306a36Sopenharmony_ci		default:
23862306a36Sopenharmony_ci			break;
23962306a36Sopenharmony_ci		}
24062306a36Sopenharmony_ci	}
24162306a36Sopenharmony_ci}
24262306a36Sopenharmony_ci
24362306a36Sopenharmony_civoid rds_shutdown_worker(struct work_struct *work)
24462306a36Sopenharmony_ci{
24562306a36Sopenharmony_ci	struct rds_conn_path *cp = container_of(work,
24662306a36Sopenharmony_ci						struct rds_conn_path,
24762306a36Sopenharmony_ci						cp_down_w);
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	rds_conn_shutdown(cp);
25062306a36Sopenharmony_ci}
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_civoid rds_threads_exit(void)
25362306a36Sopenharmony_ci{
25462306a36Sopenharmony_ci	destroy_workqueue(rds_wq);
25562306a36Sopenharmony_ci}
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ciint rds_threads_init(void)
25862306a36Sopenharmony_ci{
25962306a36Sopenharmony_ci	rds_wq = create_singlethread_workqueue("krdsd");
26062306a36Sopenharmony_ci	if (!rds_wq)
26162306a36Sopenharmony_ci		return -ENOMEM;
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ci	return 0;
26462306a36Sopenharmony_ci}
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci/* Compare two IPv6 addresses.  Return 0 if the two addresses are equal.
26762306a36Sopenharmony_ci * Return 1 if the first is greater.  Return -1 if the second is greater.
26862306a36Sopenharmony_ci */
26962306a36Sopenharmony_ciint rds_addr_cmp(const struct in6_addr *addr1,
27062306a36Sopenharmony_ci		 const struct in6_addr *addr2)
27162306a36Sopenharmony_ci{
27262306a36Sopenharmony_ci#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
27362306a36Sopenharmony_ci	const __be64 *a1, *a2;
27462306a36Sopenharmony_ci	u64 x, y;
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	a1 = (__be64 *)addr1;
27762306a36Sopenharmony_ci	a2 = (__be64 *)addr2;
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	if (*a1 != *a2) {
28062306a36Sopenharmony_ci		if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
28162306a36Sopenharmony_ci			return -1;
28262306a36Sopenharmony_ci		else
28362306a36Sopenharmony_ci			return 1;
28462306a36Sopenharmony_ci	} else {
28562306a36Sopenharmony_ci		x = be64_to_cpu(*++a1);
28662306a36Sopenharmony_ci		y = be64_to_cpu(*++a2);
28762306a36Sopenharmony_ci		if (x < y)
28862306a36Sopenharmony_ci			return -1;
28962306a36Sopenharmony_ci		else if (x > y)
29062306a36Sopenharmony_ci			return 1;
29162306a36Sopenharmony_ci		else
29262306a36Sopenharmony_ci			return 0;
29362306a36Sopenharmony_ci	}
29462306a36Sopenharmony_ci#else
29562306a36Sopenharmony_ci	u32 a, b;
29662306a36Sopenharmony_ci	int i;
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	for (i = 0; i < 4; i++) {
29962306a36Sopenharmony_ci		if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
30062306a36Sopenharmony_ci			a = ntohl(addr1->s6_addr32[i]);
30162306a36Sopenharmony_ci			b = ntohl(addr2->s6_addr32[i]);
30262306a36Sopenharmony_ci			if (a < b)
30362306a36Sopenharmony_ci				return -1;
30462306a36Sopenharmony_ci			else if (a > b)
30562306a36Sopenharmony_ci				return 1;
30662306a36Sopenharmony_ci		}
30762306a36Sopenharmony_ci	}
30862306a36Sopenharmony_ci	return 0;
30962306a36Sopenharmony_ci#endif
31062306a36Sopenharmony_ci}
31162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_addr_cmp);
312