18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/random.h> 358c2ecf20Sopenharmony_ci#include <linux/export.h> 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci#include "rds.h" 388c2ecf20Sopenharmony_ci 398c2ecf20Sopenharmony_ci/* 408c2ecf20Sopenharmony_ci * All of connection management is simplified by serializing it through 418c2ecf20Sopenharmony_ci * work queues that execute in a connection managing thread. 428c2ecf20Sopenharmony_ci * 438c2ecf20Sopenharmony_ci * TCP wants to send acks through sendpage() in response to data_ready(), 448c2ecf20Sopenharmony_ci * but it needs a process context to do so. 458c2ecf20Sopenharmony_ci * 468c2ecf20Sopenharmony_ci * The receive paths need to allocate but can't drop packets (!) so we have 478c2ecf20Sopenharmony_ci * a thread around to block allocating if the receive fast path sees an 488c2ecf20Sopenharmony_ci * allocation failure. 498c2ecf20Sopenharmony_ci */ 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_ci/* Grand Unified Theory of connection life cycle: 528c2ecf20Sopenharmony_ci * At any point in time, the connection can be in one of these states: 538c2ecf20Sopenharmony_ci * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 548c2ecf20Sopenharmony_ci * 558c2ecf20Sopenharmony_ci * The following transitions are possible: 568c2ecf20Sopenharmony_ci * ANY -> ERROR 578c2ecf20Sopenharmony_ci * UP -> DISCONNECTING 588c2ecf20Sopenharmony_ci * ERROR -> DISCONNECTING 598c2ecf20Sopenharmony_ci * DISCONNECTING -> DOWN 608c2ecf20Sopenharmony_ci * DOWN -> CONNECTING 618c2ecf20Sopenharmony_ci * CONNECTING -> UP 628c2ecf20Sopenharmony_ci * 638c2ecf20Sopenharmony_ci * Transition to state DISCONNECTING/DOWN: 648c2ecf20Sopenharmony_ci * - Inside the shutdown worker; synchronizes with xmit path 658c2ecf20Sopenharmony_ci * through RDS_IN_XMIT, and with connection management callbacks 668c2ecf20Sopenharmony_ci * via c_cm_lock. 678c2ecf20Sopenharmony_ci * 688c2ecf20Sopenharmony_ci * For receive callbacks, we rely on the underlying transport 698c2ecf20Sopenharmony_ci * (TCP, IB/RDMA) to provide the necessary synchronisation. 708c2ecf20Sopenharmony_ci */ 718c2ecf20Sopenharmony_cistruct workqueue_struct *rds_wq; 728c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_wq); 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_civoid rds_connect_path_complete(struct rds_conn_path *cp, int curr) 758c2ecf20Sopenharmony_ci{ 768c2ecf20Sopenharmony_ci if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { 778c2ecf20Sopenharmony_ci printk(KERN_WARNING "%s: Cannot transition to state UP, " 788c2ecf20Sopenharmony_ci "current state is %d\n", 798c2ecf20Sopenharmony_ci __func__, 808c2ecf20Sopenharmony_ci atomic_read(&cp->cp_state)); 818c2ecf20Sopenharmony_ci rds_conn_path_drop(cp, false); 828c2ecf20Sopenharmony_ci return; 838c2ecf20Sopenharmony_ci } 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c complete\n", 868c2ecf20Sopenharmony_ci cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci cp->cp_reconnect_jiffies = 0; 898c2ecf20Sopenharmony_ci set_bit(0, &cp->cp_conn->c_map_queued); 908c2ecf20Sopenharmony_ci rcu_read_lock(); 918c2ecf20Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) { 928c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 0); 938c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 948c2ecf20Sopenharmony_ci } 958c2ecf20Sopenharmony_ci rcu_read_unlock(); 968c2ecf20Sopenharmony_ci cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; 978c2ecf20Sopenharmony_ci} 988c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_path_complete); 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_civoid rds_connect_complete(struct rds_connection *conn) 1018c2ecf20Sopenharmony_ci{ 1028c2ecf20Sopenharmony_ci rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); 1038c2ecf20Sopenharmony_ci} 1048c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_connect_complete); 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci/* 1078c2ecf20Sopenharmony_ci * This random exponential backoff is relied on to eventually resolve racing 1088c2ecf20Sopenharmony_ci * connects. 1098c2ecf20Sopenharmony_ci * 1108c2ecf20Sopenharmony_ci * If connect attempts race then both parties drop both connections and come 1118c2ecf20Sopenharmony_ci * here to wait for a random amount of time before trying again. Eventually 1128c2ecf20Sopenharmony_ci * the backoff range will be so much greater than the time it takes to 1138c2ecf20Sopenharmony_ci * establish a connection that one of the pair will establish the connection 1148c2ecf20Sopenharmony_ci * before the other's random delay fires. 1158c2ecf20Sopenharmony_ci * 1168c2ecf20Sopenharmony_ci * Connection attempts that arrive while a connection is already established 1178c2ecf20Sopenharmony_ci * are also considered to be racing connects. This lets a connection from 1188c2ecf20Sopenharmony_ci * a rebooted machine replace an existing stale connection before the transport 1198c2ecf20Sopenharmony_ci * notices that the connection has failed. 1208c2ecf20Sopenharmony_ci * 1218c2ecf20Sopenharmony_ci * We should *always* start with a random backoff; otherwise a broken connection 1228c2ecf20Sopenharmony_ci * will always take several iterations to be re-established. 1238c2ecf20Sopenharmony_ci */ 1248c2ecf20Sopenharmony_civoid rds_queue_reconnect(struct rds_conn_path *cp) 1258c2ecf20Sopenharmony_ci{ 1268c2ecf20Sopenharmony_ci unsigned long rand; 1278c2ecf20Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", 1308c2ecf20Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr, 1318c2ecf20Sopenharmony_ci cp->cp_reconnect_jiffies); 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci /* let peer with smaller addr initiate reconnect, to avoid duels */ 1348c2ecf20Sopenharmony_ci if (conn->c_trans->t_type == RDS_TRANS_TCP && 1358c2ecf20Sopenharmony_ci rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) 1368c2ecf20Sopenharmony_ci return; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 1398c2ecf20Sopenharmony_ci if (cp->cp_reconnect_jiffies == 0) { 1408c2ecf20Sopenharmony_ci cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; 1418c2ecf20Sopenharmony_ci rcu_read_lock(); 1428c2ecf20Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 1438c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); 1448c2ecf20Sopenharmony_ci rcu_read_unlock(); 1458c2ecf20Sopenharmony_ci return; 1468c2ecf20Sopenharmony_ci } 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_ci get_random_bytes(&rand, sizeof(rand)); 1498c2ecf20Sopenharmony_ci rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", 1508c2ecf20Sopenharmony_ci rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, 1518c2ecf20Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr); 1528c2ecf20Sopenharmony_ci rcu_read_lock(); 1538c2ecf20Sopenharmony_ci if (!rds_destroy_pending(cp->cp_conn)) 1548c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_conn_w, 1558c2ecf20Sopenharmony_ci rand % cp->cp_reconnect_jiffies); 1568c2ecf20Sopenharmony_ci rcu_read_unlock(); 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_ci cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, 1598c2ecf20Sopenharmony_ci rds_sysctl_reconnect_max_jiffies); 1608c2ecf20Sopenharmony_ci} 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_civoid rds_connect_worker(struct work_struct *work) 1638c2ecf20Sopenharmony_ci{ 1648c2ecf20Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 1658c2ecf20Sopenharmony_ci struct rds_conn_path, 1668c2ecf20Sopenharmony_ci cp_conn_w.work); 1678c2ecf20Sopenharmony_ci struct rds_connection *conn = cp->cp_conn; 1688c2ecf20Sopenharmony_ci int ret; 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_ci if (cp->cp_index > 0 && 1718c2ecf20Sopenharmony_ci rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) 1728c2ecf20Sopenharmony_ci return; 1738c2ecf20Sopenharmony_ci clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 1748c2ecf20Sopenharmony_ci ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); 1758c2ecf20Sopenharmony_ci if (ret) { 1768c2ecf20Sopenharmony_ci ret = conn->c_trans->conn_path_connect(cp); 1778c2ecf20Sopenharmony_ci rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", 1788c2ecf20Sopenharmony_ci conn, &conn->c_laddr, &conn->c_faddr, ret); 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_ci if (ret) { 1818c2ecf20Sopenharmony_ci if (rds_conn_path_transition(cp, 1828c2ecf20Sopenharmony_ci RDS_CONN_CONNECTING, 1838c2ecf20Sopenharmony_ci RDS_CONN_DOWN)) 1848c2ecf20Sopenharmony_ci rds_queue_reconnect(cp); 1858c2ecf20Sopenharmony_ci else 1868c2ecf20Sopenharmony_ci rds_conn_path_error(cp, "connect failed\n"); 1878c2ecf20Sopenharmony_ci } 1888c2ecf20Sopenharmony_ci } 1898c2ecf20Sopenharmony_ci} 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_civoid rds_send_worker(struct work_struct *work) 1928c2ecf20Sopenharmony_ci{ 1938c2ecf20Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 1948c2ecf20Sopenharmony_ci struct rds_conn_path, 1958c2ecf20Sopenharmony_ci cp_send_w.work); 1968c2ecf20Sopenharmony_ci int ret; 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_ci if (rds_conn_path_state(cp) == RDS_CONN_UP) { 1998c2ecf20Sopenharmony_ci clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); 2008c2ecf20Sopenharmony_ci ret = rds_send_xmit(cp); 2018c2ecf20Sopenharmony_ci cond_resched(); 2028c2ecf20Sopenharmony_ci rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); 2038c2ecf20Sopenharmony_ci switch (ret) { 2048c2ecf20Sopenharmony_ci case -EAGAIN: 2058c2ecf20Sopenharmony_ci rds_stats_inc(s_send_immediate_retry); 2068c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 0); 2078c2ecf20Sopenharmony_ci break; 2088c2ecf20Sopenharmony_ci case -ENOMEM: 2098c2ecf20Sopenharmony_ci rds_stats_inc(s_send_delayed_retry); 2108c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_send_w, 2); 2118c2ecf20Sopenharmony_ci default: 2128c2ecf20Sopenharmony_ci break; 2138c2ecf20Sopenharmony_ci } 2148c2ecf20Sopenharmony_ci } 2158c2ecf20Sopenharmony_ci} 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_civoid rds_recv_worker(struct work_struct *work) 2188c2ecf20Sopenharmony_ci{ 2198c2ecf20Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 2208c2ecf20Sopenharmony_ci struct rds_conn_path, 2218c2ecf20Sopenharmony_ci cp_recv_w.work); 2228c2ecf20Sopenharmony_ci int ret; 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci if (rds_conn_path_state(cp) == RDS_CONN_UP) { 2258c2ecf20Sopenharmony_ci ret = cp->cp_conn->c_trans->recv_path(cp); 2268c2ecf20Sopenharmony_ci rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); 2278c2ecf20Sopenharmony_ci switch (ret) { 2288c2ecf20Sopenharmony_ci case -EAGAIN: 2298c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_immediate_retry); 2308c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 2318c2ecf20Sopenharmony_ci break; 2328c2ecf20Sopenharmony_ci case -ENOMEM: 2338c2ecf20Sopenharmony_ci rds_stats_inc(s_recv_delayed_retry); 2348c2ecf20Sopenharmony_ci queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); 2358c2ecf20Sopenharmony_ci default: 2368c2ecf20Sopenharmony_ci break; 2378c2ecf20Sopenharmony_ci } 2388c2ecf20Sopenharmony_ci } 2398c2ecf20Sopenharmony_ci} 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_civoid rds_shutdown_worker(struct work_struct *work) 2428c2ecf20Sopenharmony_ci{ 2438c2ecf20Sopenharmony_ci struct rds_conn_path *cp = container_of(work, 2448c2ecf20Sopenharmony_ci struct rds_conn_path, 2458c2ecf20Sopenharmony_ci cp_down_w); 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_ci rds_conn_shutdown(cp); 2488c2ecf20Sopenharmony_ci} 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_civoid rds_threads_exit(void) 2518c2ecf20Sopenharmony_ci{ 2528c2ecf20Sopenharmony_ci destroy_workqueue(rds_wq); 2538c2ecf20Sopenharmony_ci} 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ciint rds_threads_init(void) 2568c2ecf20Sopenharmony_ci{ 2578c2ecf20Sopenharmony_ci rds_wq = create_singlethread_workqueue("krdsd"); 2588c2ecf20Sopenharmony_ci if (!rds_wq) 2598c2ecf20Sopenharmony_ci return -ENOMEM; 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci return 0; 2628c2ecf20Sopenharmony_ci} 2638c2ecf20Sopenharmony_ci 2648c2ecf20Sopenharmony_ci/* Compare two IPv6 addresses. Return 0 if the two addresses are equal. 2658c2ecf20Sopenharmony_ci * Return 1 if the first is greater. Return -1 if the second is greater. 2668c2ecf20Sopenharmony_ci */ 2678c2ecf20Sopenharmony_ciint rds_addr_cmp(const struct in6_addr *addr1, 2688c2ecf20Sopenharmony_ci const struct in6_addr *addr2) 2698c2ecf20Sopenharmony_ci{ 2708c2ecf20Sopenharmony_ci#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 2718c2ecf20Sopenharmony_ci const __be64 *a1, *a2; 2728c2ecf20Sopenharmony_ci u64 x, y; 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_ci a1 = (__be64 *)addr1; 2758c2ecf20Sopenharmony_ci a2 = (__be64 *)addr2; 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci if (*a1 != *a2) { 2788c2ecf20Sopenharmony_ci if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) 2798c2ecf20Sopenharmony_ci return -1; 2808c2ecf20Sopenharmony_ci else 2818c2ecf20Sopenharmony_ci return 1; 2828c2ecf20Sopenharmony_ci } else { 2838c2ecf20Sopenharmony_ci x = be64_to_cpu(*++a1); 2848c2ecf20Sopenharmony_ci y = be64_to_cpu(*++a2); 2858c2ecf20Sopenharmony_ci if (x < y) 2868c2ecf20Sopenharmony_ci return -1; 2878c2ecf20Sopenharmony_ci else if (x > y) 2888c2ecf20Sopenharmony_ci return 1; 2898c2ecf20Sopenharmony_ci else 2908c2ecf20Sopenharmony_ci return 0; 2918c2ecf20Sopenharmony_ci } 2928c2ecf20Sopenharmony_ci#else 2938c2ecf20Sopenharmony_ci u32 a, b; 2948c2ecf20Sopenharmony_ci int i; 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci for (i = 0; i < 4; i++) { 2978c2ecf20Sopenharmony_ci if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { 2988c2ecf20Sopenharmony_ci a = ntohl(addr1->s6_addr32[i]); 2998c2ecf20Sopenharmony_ci b = ntohl(addr2->s6_addr32[i]); 3008c2ecf20Sopenharmony_ci if (a < b) 3018c2ecf20Sopenharmony_ci return -1; 3028c2ecf20Sopenharmony_ci else if (a > b) 3038c2ecf20Sopenharmony_ci return 1; 3048c2ecf20Sopenharmony_ci } 3058c2ecf20Sopenharmony_ci } 3068c2ecf20Sopenharmony_ci return 0; 3078c2ecf20Sopenharmony_ci#endif 3088c2ecf20Sopenharmony_ci} 3098c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(rds_addr_cmp); 310