162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * This software is available to you under a choice of one of two 562306a36Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 662306a36Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 762306a36Sopenharmony_ci * COPYING in the main directory of this source tree, or the 862306a36Sopenharmony_ci * OpenIB.org BSD license below: 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * Redistribution and use in source and binary forms, with or 1162306a36Sopenharmony_ci * without modification, are permitted provided that the following 1262306a36Sopenharmony_ci * conditions are met: 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * - Redistributions of source code must retain the above 1562306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 1662306a36Sopenharmony_ci * disclaimer. 1762306a36Sopenharmony_ci * 1862306a36Sopenharmony_ci * - Redistributions in binary form must reproduce the above 1962306a36Sopenharmony_ci * copyright notice, this list of conditions and the following 2062306a36Sopenharmony_ci * disclaimer in the documentation and/or other materials 2162306a36Sopenharmony_ci * provided with the distribution. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2462306a36Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2562306a36Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2662306a36Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2762306a36Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2862306a36Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2962306a36Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3062306a36Sopenharmony_ci * SOFTWARE. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci */ 3362306a36Sopenharmony_ci#include <linux/kernel.h> 3462306a36Sopenharmony_ci#include <linux/random.h> 3562306a36Sopenharmony_ci#include <linux/export.h> 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#include "rds.h" 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci/* 4062306a36Sopenharmony_ci * All of connection management is simplified by serializing it through 4162306a36Sopenharmony_ci * work queues that execute in a connection managing thread. 4262306a36Sopenharmony_ci * 4362306a36Sopenharmony_ci * TCP wants to send acks through sendpage() in response to data_ready(), 4462306a36Sopenharmony_ci * but it needs a process context to do so. 4562306a36Sopenharmony_ci * 4662306a36Sopenharmony_ci * The receive paths need to allocate but can't drop packets (!) so we have 4762306a36Sopenharmony_ci * a thread around to block allocating if the receive fast path sees an 4862306a36Sopenharmony_ci * allocation failure. 4962306a36Sopenharmony_ci */ 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci/* Grand Unified Theory of connection life cycle: 5262306a36Sopenharmony_ci * At any point in time, the connection can be in one of these states: 5362306a36Sopenharmony_ci * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 5462306a36Sopenharmony_ci * 5562306a36Sopenharmony_ci * The following transitions are possible: 5662306a36Sopenharmony_ci * ANY -> ERROR 5762306a36Sopenharmony_ci * UP -> DISCONNECTING 5862306a36Sopenharmony_ci * ERROR -> DISCONNECTING 5962306a36Sopenharmony_ci * DISCONNECTING -> DOWN 6062306a36Sopenharmony_ci * DOWN -> CONNECTING 6162306a36Sopenharmony_ci * CONNECTING -> UP 6262306a36Sopenharmony_ci * 6362306a36Sopenharmony_ci * Transition to state DISCONNECTING/DOWN: 6462306a36Sopenharmony_ci * - Inside the shutdown worker; synchronizes with xmit path 6562306a36Sopenharmony_ci * through RDS_IN_XMIT, and with connection management callbacks 6662306a36Sopenharmony_ci * via c_cm_lock. 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * For receive callbacks, we rely on the underlying transport 6962306a36Sopenharmony_ci * (TCP, IB/RDMA) to provide the necessary synchronisation. 7062306a36Sopenharmony_ci */ 7162306a36Sopenharmony_cistruct workqueue_struct *rds_wq; 7262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_wq); 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_civoid rds_connect_path_complete(struct rds_conn_path *cp, int curr) 7562306a36Sopenharmony_ci{ 7662306a36Sopenharmony_ci if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { 7762306a36Sopenharmony_ci printk(KERN_WARNING "%s: Cannot transition to state UP, " 7862306a36Sopenharmony_ci "current state is %d\n", 7962306a36Sopenharmony_ci __func__, 8062306a36Sopenharmony_ci atomic_read(&cp->cp_state)); 8162306a36Sopenharmony_ci rds_conn_path_drop(cp, false); 8262306a36Sopenharmony_ci return; 8362306a36Sopenharmony_ci } 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c complete\n", 8662306a36Sopenharmony_ci cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci cp->cp_reconnect_jiffies = 0; 8962306a36Sopenharmony_ci set_bit(0, &cp->cp_conn->c_map_queued); 9062306a36Sopenharmony_ci rcu_read_lock(); 9162306a36Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) { 9262306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 0); 9362306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 9462306a36Sopenharmony_ci } 9562306a36Sopenharmony_ci rcu_read_unlock(); 9662306a36Sopenharmony_ci cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; 9762306a36Sopenharmony_ci} 9862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_path_complete); 9962306a36Sopenharmony_ci 10062306a36Sopenharmony_civoid rds_connect_complete(struct rds_connection *conn) 10162306a36Sopenharmony_ci{ 10262306a36Sopenharmony_ci rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); 10362306a36Sopenharmony_ci} 10462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_complete); 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci/* 10762306a36Sopenharmony_ci * This random exponential backoff is relied on to eventually resolve racing 10862306a36Sopenharmony_ci * connects. 10962306a36Sopenharmony_ci * 11062306a36Sopenharmony_ci * If connect attempts race then both parties drop both connections and come 11162306a36Sopenharmony_ci * here to wait for a random amount of time before trying again. Eventually 11262306a36Sopenharmony_ci * the backoff range will be so much greater than the time it takes to 11362306a36Sopenharmony_ci * establish a connection that one of the pair will establish the connection 11462306a36Sopenharmony_ci * before the other's random delay fires. 11562306a36Sopenharmony_ci * 11662306a36Sopenharmony_ci * Connection attempts that arrive while a connection is already established 11762306a36Sopenharmony_ci * are also considered to be racing connects. This lets a connection from 11862306a36Sopenharmony_ci * a rebooted machine replace an existing stale connection before the transport 11962306a36Sopenharmony_ci * notices that the connection has failed. 12062306a36Sopenharmony_ci * 12162306a36Sopenharmony_ci * We should *always* start with a random backoff; otherwise a broken connection 12262306a36Sopenharmony_ci * will always take several iterations to be re-established. 12362306a36Sopenharmony_ci */ 12462306a36Sopenharmony_civoid rds_queue_reconnect(struct rds_conn_path *cp) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci unsigned long rand; 12762306a36Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", 13062306a36Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr, 13162306a36Sopenharmony_ci cp->cp_reconnect_jiffies); 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci /* let peer with smaller addr initiate reconnect, to avoid duels */ 13462306a36Sopenharmony_ci if (conn->c_trans->t_type == RDS_TRANS_TCP && 13562306a36Sopenharmony_ci rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) 13662306a36Sopenharmony_ci return; 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 13962306a36Sopenharmony_ci if (cp->cp_reconnect_jiffies == 0) { 14062306a36Sopenharmony_ci cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; 14162306a36Sopenharmony_ci rcu_read_lock(); 14262306a36Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 14362306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); 14462306a36Sopenharmony_ci rcu_read_unlock(); 14562306a36Sopenharmony_ci return; 14662306a36Sopenharmony_ci } 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci get_random_bytes(&rand, sizeof(rand)); 14962306a36Sopenharmony_ci rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", 15062306a36Sopenharmony_ci rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, 15162306a36Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr); 15262306a36Sopenharmony_ci rcu_read_lock(); 15362306a36Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 15462306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_conn_w, 15562306a36Sopenharmony_ci rand % cp->cp_reconnect_jiffies); 15662306a36Sopenharmony_ci rcu_read_unlock(); 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, 15962306a36Sopenharmony_ci rds_sysctl_reconnect_max_jiffies); 16062306a36Sopenharmony_ci} 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_civoid rds_connect_worker(struct work_struct *work) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 16562306a36Sopenharmony_ci struct rds_conn_path, 16662306a36Sopenharmony_ci cp_conn_w.work); 16762306a36Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 16862306a36Sopenharmony_ci int ret; 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci if (cp->cp_index > 0 && 17162306a36Sopenharmony_ci rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) 17262306a36Sopenharmony_ci return; 17362306a36Sopenharmony_ci clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 17462306a36Sopenharmony_ci ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); 17562306a36Sopenharmony_ci if (ret) { 17662306a36Sopenharmony_ci ret = conn->c_trans->conn_path_connect(cp); 17762306a36Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", 17862306a36Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr, ret); 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci if (ret) { 18162306a36Sopenharmony_ci if (rds_conn_path_transition(cp, 18262306a36Sopenharmony_ci RDS_CONN_CONNECTING, 18362306a36Sopenharmony_ci RDS_CONN_DOWN)) 18462306a36Sopenharmony_ci rds_queue_reconnect(cp); 18562306a36Sopenharmony_ci else 18662306a36Sopenharmony_ci rds_conn_path_error(cp, "connect failed\n"); 18762306a36Sopenharmony_ci } 18862306a36Sopenharmony_ci } 18962306a36Sopenharmony_ci} 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_civoid rds_send_worker(struct work_struct *work) 19262306a36Sopenharmony_ci{ 19362306a36Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 19462306a36Sopenharmony_ci struct rds_conn_path, 19562306a36Sopenharmony_ci cp_send_w.work); 19662306a36Sopenharmony_ci int ret; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci if (rds_conn_path_state(cp) == RDS_CONN_UP) { 19962306a36Sopenharmony_ci clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); 20062306a36Sopenharmony_ci ret = rds_send_xmit(cp); 20162306a36Sopenharmony_ci cond_resched(); 20262306a36Sopenharmony_ci rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); 20362306a36Sopenharmony_ci switch (ret) { 20462306a36Sopenharmony_ci case -EAGAIN: 20562306a36Sopenharmony_ci rds_stats_inc(s_send_immediate_retry); 20662306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 0); 20762306a36Sopenharmony_ci break; 20862306a36Sopenharmony_ci case -ENOMEM: 20962306a36Sopenharmony_ci rds_stats_inc(s_send_delayed_retry); 21062306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 2); 21162306a36Sopenharmony_ci break; 21262306a36Sopenharmony_ci default: 21362306a36Sopenharmony_ci break; 21462306a36Sopenharmony_ci } 21562306a36Sopenharmony_ci } 21662306a36Sopenharmony_ci} 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_civoid rds_recv_worker(struct work_struct *work) 21962306a36Sopenharmony_ci{ 22062306a36Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 22162306a36Sopenharmony_ci struct rds_conn_path, 22262306a36Sopenharmony_ci cp_recv_w.work); 22362306a36Sopenharmony_ci int ret; 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci if (rds_conn_path_state(cp) == RDS_CONN_UP) { 22662306a36Sopenharmony_ci ret = cp->cp_conn->c_trans->recv_path(cp); 22762306a36Sopenharmony_ci rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); 22862306a36Sopenharmony_ci switch (ret) { 22962306a36Sopenharmony_ci case -EAGAIN: 23062306a36Sopenharmony_ci rds_stats_inc(s_recv_immediate_retry); 23162306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 23262306a36Sopenharmony_ci break; 23362306a36Sopenharmony_ci case -ENOMEM: 23462306a36Sopenharmony_ci rds_stats_inc(s_recv_delayed_retry); 23562306a36Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); 23662306a36Sopenharmony_ci break; 23762306a36Sopenharmony_ci default: 23862306a36Sopenharmony_ci break; 23962306a36Sopenharmony_ci } 24062306a36Sopenharmony_ci } 24162306a36Sopenharmony_ci} 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_civoid rds_shutdown_worker(struct work_struct *work) 24462306a36Sopenharmony_ci{ 24562306a36Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 24662306a36Sopenharmony_ci struct rds_conn_path, 24762306a36Sopenharmony_ci cp_down_w); 24862306a36Sopenharmony_ci 24962306a36Sopenharmony_ci rds_conn_shutdown(cp); 25062306a36Sopenharmony_ci} 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_civoid rds_threads_exit(void) 25362306a36Sopenharmony_ci{ 25462306a36Sopenharmony_ci destroy_workqueue(rds_wq); 25562306a36Sopenharmony_ci} 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ciint rds_threads_init(void) 25862306a36Sopenharmony_ci{ 25962306a36Sopenharmony_ci rds_wq = create_singlethread_workqueue("krdsd"); 26062306a36Sopenharmony_ci if (!rds_wq) 26162306a36Sopenharmony_ci return -ENOMEM; 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci return 0; 26462306a36Sopenharmony_ci} 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci/* Compare two IPv6 addresses. Return 0 if the two addresses are equal. 26762306a36Sopenharmony_ci * Return 1 if the first is greater. Return -1 if the second is greater. 26862306a36Sopenharmony_ci */ 26962306a36Sopenharmony_ciint rds_addr_cmp(const struct in6_addr *addr1, 27062306a36Sopenharmony_ci const struct in6_addr *addr2) 27162306a36Sopenharmony_ci{ 27262306a36Sopenharmony_ci#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 27362306a36Sopenharmony_ci const __be64 *a1, *a2; 27462306a36Sopenharmony_ci u64 x, y; 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci a1 = (__be64 *)addr1; 27762306a36Sopenharmony_ci a2 = (__be64 *)addr2; 27862306a36Sopenharmony_ci 27962306a36Sopenharmony_ci if (*a1 != *a2) { 28062306a36Sopenharmony_ci if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) 28162306a36Sopenharmony_ci return -1; 28262306a36Sopenharmony_ci else 28362306a36Sopenharmony_ci return 1; 28462306a36Sopenharmony_ci } else { 28562306a36Sopenharmony_ci x = be64_to_cpu(*++a1); 28662306a36Sopenharmony_ci y = be64_to_cpu(*++a2); 28762306a36Sopenharmony_ci if (x < y) 28862306a36Sopenharmony_ci return -1; 28962306a36Sopenharmony_ci else if (x > y) 29062306a36Sopenharmony_ci return 1; 29162306a36Sopenharmony_ci else 29262306a36Sopenharmony_ci return 0; 29362306a36Sopenharmony_ci } 29462306a36Sopenharmony_ci#else 29562306a36Sopenharmony_ci u32 a, b; 29662306a36Sopenharmony_ci int i; 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci for (i = 0; i < 4; i++) { 29962306a36Sopenharmony_ci if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { 30062306a36Sopenharmony_ci a = ntohl(addr1->s6_addr32[i]); 30162306a36Sopenharmony_ci b = ntohl(addr2->s6_addr32[i]); 30262306a36Sopenharmony_ci if (a < b) 30362306a36Sopenharmony_ci return -1; 30462306a36Sopenharmony_ci else if (a > b) 30562306a36Sopenharmony_ci return 1; 30662306a36Sopenharmony_ci } 30762306a36Sopenharmony_ci } 30862306a36Sopenharmony_ci return 0; 30962306a36Sopenharmony_ci#endif 31062306a36Sopenharmony_ci} 31162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_addr_cmp); 312