18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This software is available to you under a choice of one of two 58c2ecf20Sopenharmony_ci * licenses. You may choose to be licensed under the terms of the GNU 68c2ecf20Sopenharmony_ci * General Public License (GPL) Version 2, available from the file 78c2ecf20Sopenharmony_ci * COPYING in the main directory of this source tree, or the 88c2ecf20Sopenharmony_ci * OpenIB.org BSD license below: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * Redistribution and use in source and binary forms, with or 118c2ecf20Sopenharmony_ci * without modification, are permitted provided that the following 128c2ecf20Sopenharmony_ci * conditions are met: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * - Redistributions of source code must retain the above 158c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 168c2ecf20Sopenharmony_ci * disclaimer. 178c2ecf20Sopenharmony_ci * 188c2ecf20Sopenharmony_ci * - Redistributions in binary form must reproduce the above 198c2ecf20Sopenharmony_ci * copyright notice, this list of conditions and the following 208c2ecf20Sopenharmony_ci * disclaimer in the documentation and/or other materials 218c2ecf20Sopenharmony_ci * provided with the distribution. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 248c2ecf20Sopenharmony_ci * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 258c2ecf20Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 268c2ecf20Sopenharmony_ci * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 278c2ecf20Sopenharmony_ci * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 288c2ecf20Sopenharmony_ci * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 298c2ecf20Sopenharmony_ci * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 308c2ecf20Sopenharmony_ci * SOFTWARE. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci#include <linux/kernel.h> 348c2ecf20Sopenharmony_ci#include <linux/in.h> 358c2ecf20Sopenharmony_ci#include <linux/if.h> 368c2ecf20Sopenharmony_ci#include <linux/netdevice.h> 378c2ecf20Sopenharmony_ci#include <linux/inetdevice.h> 388c2ecf20Sopenharmony_ci#include <linux/if_arp.h> 398c2ecf20Sopenharmony_ci#include <linux/delay.h> 408c2ecf20Sopenharmony_ci#include <linux/slab.h> 418c2ecf20Sopenharmony_ci#include <linux/module.h> 428c2ecf20Sopenharmony_ci#include <net/addrconf.h> 438c2ecf20Sopenharmony_ci 448c2ecf20Sopenharmony_ci#include "rds_single_path.h" 458c2ecf20Sopenharmony_ci#include "rds.h" 468c2ecf20Sopenharmony_ci#include "ib.h" 478c2ecf20Sopenharmony_ci#include "ib_mr.h" 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_cistatic unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; 508c2ecf20Sopenharmony_cistatic unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; 518c2ecf20Sopenharmony_ciunsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 528c2ecf20Sopenharmony_cistatic atomic_t rds_ib_unloading; 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_cimodule_param(rds_ib_mr_1m_pool_size, int, 0444); 558c2ecf20Sopenharmony_ciMODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); 568c2ecf20Sopenharmony_cimodule_param(rds_ib_mr_8k_pool_size, int, 0444); 578c2ecf20Sopenharmony_ciMODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); 588c2ecf20Sopenharmony_cimodule_param(rds_ib_retry_count, int, 0444); 598c2ecf20Sopenharmony_ciMODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci/* 628c2ecf20Sopenharmony_ci * we have a clumsy combination of RCU and a rwsem protecting this list 638c2ecf20Sopenharmony_ci * because it is used both in the get_mr fast path and while blocking in 648c2ecf20Sopenharmony_ci * the FMR flushing path. 658c2ecf20Sopenharmony_ci */ 668c2ecf20Sopenharmony_ciDECLARE_RWSEM(rds_ib_devices_lock); 678c2ecf20Sopenharmony_cistruct list_head rds_ib_devices; 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci/* NOTE: if also grabbing ibdev lock, grab this first */ 708c2ecf20Sopenharmony_ciDEFINE_SPINLOCK(ib_nodev_conns_lock); 718c2ecf20Sopenharmony_ciLIST_HEAD(ib_nodev_conns); 728c2ecf20Sopenharmony_ci 738c2ecf20Sopenharmony_cistatic void rds_ib_nodev_connect(void) 748c2ecf20Sopenharmony_ci{ 758c2ecf20Sopenharmony_ci struct rds_ib_connection *ic; 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci spin_lock(&ib_nodev_conns_lock); 788c2ecf20Sopenharmony_ci list_for_each_entry(ic, &ib_nodev_conns, ib_node) 798c2ecf20Sopenharmony_ci rds_conn_connect_if_down(ic->conn); 808c2ecf20Sopenharmony_ci spin_unlock(&ib_nodev_conns_lock); 818c2ecf20Sopenharmony_ci} 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_cistatic void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) 848c2ecf20Sopenharmony_ci{ 858c2ecf20Sopenharmony_ci struct rds_ib_connection *ic; 868c2ecf20Sopenharmony_ci unsigned long flags; 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci spin_lock_irqsave(&rds_ibdev->spinlock, flags); 898c2ecf20Sopenharmony_ci list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) 908c2ecf20Sopenharmony_ci rds_conn_path_drop(&ic->conn->c_path[0], true); 918c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); 928c2ecf20Sopenharmony_ci} 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci/* 958c2ecf20Sopenharmony_ci * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references 968c2ecf20Sopenharmony_ci * from interrupt context so we push freing off into a work struct in krdsd. 978c2ecf20Sopenharmony_ci */ 988c2ecf20Sopenharmony_cistatic void rds_ib_dev_free(struct work_struct *work) 998c2ecf20Sopenharmony_ci{ 1008c2ecf20Sopenharmony_ci struct rds_ib_ipaddr *i_ipaddr, *i_next; 1018c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev = container_of(work, 1028c2ecf20Sopenharmony_ci struct rds_ib_device, free_work); 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci if (rds_ibdev->mr_8k_pool) 1058c2ecf20Sopenharmony_ci rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); 1068c2ecf20Sopenharmony_ci if (rds_ibdev->mr_1m_pool) 1078c2ecf20Sopenharmony_ci rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); 1088c2ecf20Sopenharmony_ci if (rds_ibdev->pd) 1098c2ecf20Sopenharmony_ci ib_dealloc_pd(rds_ibdev->pd); 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { 1128c2ecf20Sopenharmony_ci list_del(&i_ipaddr->list); 1138c2ecf20Sopenharmony_ci kfree(i_ipaddr); 1148c2ecf20Sopenharmony_ci } 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_ci kfree(rds_ibdev->vector_load); 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_ci kfree(rds_ibdev); 1198c2ecf20Sopenharmony_ci} 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_civoid rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 1228c2ecf20Sopenharmony_ci{ 1238c2ecf20Sopenharmony_ci BUG_ON(refcount_read(&rds_ibdev->refcount) == 0); 1248c2ecf20Sopenharmony_ci if (refcount_dec_and_test(&rds_ibdev->refcount)) 1258c2ecf20Sopenharmony_ci queue_work(rds_wq, &rds_ibdev->free_work); 1268c2ecf20Sopenharmony_ci} 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_cistatic int rds_ib_add_one(struct ib_device *device) 1298c2ecf20Sopenharmony_ci{ 1308c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev; 1318c2ecf20Sopenharmony_ci int ret; 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci /* Only handle IB (no iWARP) devices */ 1348c2ecf20Sopenharmony_ci if (device->node_type != RDMA_NODE_IB_CA) 1358c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 1368c2ecf20Sopenharmony_ci 1378c2ecf20Sopenharmony_ci /* Device must support FRWR */ 1388c2ecf20Sopenharmony_ci if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) 1398c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, 1428c2ecf20Sopenharmony_ci ibdev_to_node(device)); 1438c2ecf20Sopenharmony_ci if (!rds_ibdev) 1448c2ecf20Sopenharmony_ci return -ENOMEM; 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci spin_lock_init(&rds_ibdev->spinlock); 1478c2ecf20Sopenharmony_ci refcount_set(&rds_ibdev->refcount, 1); 1488c2ecf20Sopenharmony_ci INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 1518c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&rds_ibdev->conn_list); 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci rds_ibdev->max_wrs = device->attrs.max_qp_wr; 1548c2ecf20Sopenharmony_ci rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci rds_ibdev->odp_capable = 1578c2ecf20Sopenharmony_ci !!(device->attrs.device_cap_flags & 1588c2ecf20Sopenharmony_ci IB_DEVICE_ON_DEMAND_PAGING) && 1598c2ecf20Sopenharmony_ci !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & 1608c2ecf20Sopenharmony_ci IB_ODP_SUPPORT_WRITE) && 1618c2ecf20Sopenharmony_ci !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & 1628c2ecf20Sopenharmony_ci IB_ODP_SUPPORT_READ); 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci rds_ibdev->max_1m_mrs = device->attrs.max_mr ? 1658c2ecf20Sopenharmony_ci min_t(unsigned int, (device->attrs.max_mr / 2), 1668c2ecf20Sopenharmony_ci rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci rds_ibdev->max_8k_mrs = device->attrs.max_mr ? 1698c2ecf20Sopenharmony_ci min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), 1708c2ecf20Sopenharmony_ci rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; 1738c2ecf20Sopenharmony_ci rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci rds_ibdev->vector_load = kcalloc(device->num_comp_vectors, 1768c2ecf20Sopenharmony_ci sizeof(int), 1778c2ecf20Sopenharmony_ci GFP_KERNEL); 1788c2ecf20Sopenharmony_ci if (!rds_ibdev->vector_load) { 1798c2ecf20Sopenharmony_ci pr_err("RDS/IB: %s failed to allocate vector memory\n", 1808c2ecf20Sopenharmony_ci __func__); 1818c2ecf20Sopenharmony_ci ret = -ENOMEM; 1828c2ecf20Sopenharmony_ci goto put_dev; 1838c2ecf20Sopenharmony_ci } 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci rds_ibdev->dev = device; 1868c2ecf20Sopenharmony_ci rds_ibdev->pd = ib_alloc_pd(device, 0); 1878c2ecf20Sopenharmony_ci if (IS_ERR(rds_ibdev->pd)) { 1888c2ecf20Sopenharmony_ci ret = PTR_ERR(rds_ibdev->pd); 1898c2ecf20Sopenharmony_ci rds_ibdev->pd = NULL; 1908c2ecf20Sopenharmony_ci goto put_dev; 1918c2ecf20Sopenharmony_ci } 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci rds_ibdev->mr_1m_pool = 1948c2ecf20Sopenharmony_ci rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); 1958c2ecf20Sopenharmony_ci if (IS_ERR(rds_ibdev->mr_1m_pool)) { 1968c2ecf20Sopenharmony_ci ret = PTR_ERR(rds_ibdev->mr_1m_pool); 1978c2ecf20Sopenharmony_ci rds_ibdev->mr_1m_pool = NULL; 1988c2ecf20Sopenharmony_ci goto put_dev; 1998c2ecf20Sopenharmony_ci } 2008c2ecf20Sopenharmony_ci 2018c2ecf20Sopenharmony_ci rds_ibdev->mr_8k_pool = 2028c2ecf20Sopenharmony_ci rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); 2038c2ecf20Sopenharmony_ci if (IS_ERR(rds_ibdev->mr_8k_pool)) { 2048c2ecf20Sopenharmony_ci ret = PTR_ERR(rds_ibdev->mr_8k_pool); 2058c2ecf20Sopenharmony_ci rds_ibdev->mr_8k_pool = NULL; 2068c2ecf20Sopenharmony_ci goto put_dev; 2078c2ecf20Sopenharmony_ci } 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", 2108c2ecf20Sopenharmony_ci device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, 2118c2ecf20Sopenharmony_ci rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci pr_info("RDS/IB: %s: added\n", device->name); 2148c2ecf20Sopenharmony_ci 2158c2ecf20Sopenharmony_ci down_write(&rds_ib_devices_lock); 2168c2ecf20Sopenharmony_ci list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); 2178c2ecf20Sopenharmony_ci up_write(&rds_ib_devices_lock); 2188c2ecf20Sopenharmony_ci refcount_inc(&rds_ibdev->refcount); 2198c2ecf20Sopenharmony_ci 2208c2ecf20Sopenharmony_ci ib_set_client_data(device, &rds_ib_client, rds_ibdev); 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci rds_ib_nodev_connect(); 2238c2ecf20Sopenharmony_ci return 0; 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_ciput_dev: 2268c2ecf20Sopenharmony_ci rds_ib_dev_put(rds_ibdev); 2278c2ecf20Sopenharmony_ci return ret; 2288c2ecf20Sopenharmony_ci} 2298c2ecf20Sopenharmony_ci 2308c2ecf20Sopenharmony_ci/* 2318c2ecf20Sopenharmony_ci * New connections use this to find the device to associate with the 2328c2ecf20Sopenharmony_ci * connection. It's not in the fast path so we're not concerned about the 2338c2ecf20Sopenharmony_ci * performance of the IB call. (As of this writing, it uses an interrupt 2348c2ecf20Sopenharmony_ci * blocking spinlock to serialize walking a per-device list of all registered 2358c2ecf20Sopenharmony_ci * clients.) 2368c2ecf20Sopenharmony_ci * 2378c2ecf20Sopenharmony_ci * RCU is used to handle incoming connections racing with device teardown. 2388c2ecf20Sopenharmony_ci * Rather than use a lock to serialize removal from the client_data and 2398c2ecf20Sopenharmony_ci * getting a new reference, we use an RCU grace period. The destruction 2408c2ecf20Sopenharmony_ci * path removes the device from client_data and then waits for all RCU 2418c2ecf20Sopenharmony_ci * readers to finish. 2428c2ecf20Sopenharmony_ci * 2438c2ecf20Sopenharmony_ci * A new connection can get NULL from this if its arriving on a 2448c2ecf20Sopenharmony_ci * device that is in the process of being removed. 2458c2ecf20Sopenharmony_ci */ 2468c2ecf20Sopenharmony_cistruct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) 2478c2ecf20Sopenharmony_ci{ 2488c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev; 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_ci rcu_read_lock(); 2518c2ecf20Sopenharmony_ci rds_ibdev = ib_get_client_data(device, &rds_ib_client); 2528c2ecf20Sopenharmony_ci if (rds_ibdev) 2538c2ecf20Sopenharmony_ci refcount_inc(&rds_ibdev->refcount); 2548c2ecf20Sopenharmony_ci rcu_read_unlock(); 2558c2ecf20Sopenharmony_ci return rds_ibdev; 2568c2ecf20Sopenharmony_ci} 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_ci/* 2598c2ecf20Sopenharmony_ci * The IB stack is letting us know that a device is going away. This can 2608c2ecf20Sopenharmony_ci * happen if the underlying HCA driver is removed or if PCI hotplug is removing 2618c2ecf20Sopenharmony_ci * the pci function, for example. 2628c2ecf20Sopenharmony_ci * 2638c2ecf20Sopenharmony_ci * This can be called at any time and can be racing with any other RDS path. 2648c2ecf20Sopenharmony_ci */ 2658c2ecf20Sopenharmony_cistatic void rds_ib_remove_one(struct ib_device *device, void *client_data) 2668c2ecf20Sopenharmony_ci{ 2678c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev = client_data; 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci rds_ib_dev_shutdown(rds_ibdev); 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci /* stop connection attempts from getting a reference to this device. */ 2728c2ecf20Sopenharmony_ci ib_set_client_data(device, &rds_ib_client, NULL); 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_ci down_write(&rds_ib_devices_lock); 2758c2ecf20Sopenharmony_ci list_del_rcu(&rds_ibdev->list); 2768c2ecf20Sopenharmony_ci up_write(&rds_ib_devices_lock); 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci /* 2798c2ecf20Sopenharmony_ci * This synchronize rcu is waiting for readers of both the ib 2808c2ecf20Sopenharmony_ci * client data and the devices list to finish before we drop 2818c2ecf20Sopenharmony_ci * both of those references. 2828c2ecf20Sopenharmony_ci */ 2838c2ecf20Sopenharmony_ci synchronize_rcu(); 2848c2ecf20Sopenharmony_ci rds_ib_dev_put(rds_ibdev); 2858c2ecf20Sopenharmony_ci rds_ib_dev_put(rds_ibdev); 2868c2ecf20Sopenharmony_ci} 2878c2ecf20Sopenharmony_ci 2888c2ecf20Sopenharmony_cistruct ib_client rds_ib_client = { 2898c2ecf20Sopenharmony_ci .name = "rds_ib", 2908c2ecf20Sopenharmony_ci .add = rds_ib_add_one, 2918c2ecf20Sopenharmony_ci .remove = rds_ib_remove_one 2928c2ecf20Sopenharmony_ci}; 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_cistatic int rds_ib_conn_info_visitor(struct rds_connection *conn, 2958c2ecf20Sopenharmony_ci void *buffer) 2968c2ecf20Sopenharmony_ci{ 2978c2ecf20Sopenharmony_ci struct rds_info_rdma_connection *iinfo = buffer; 2988c2ecf20Sopenharmony_ci struct rds_ib_connection *ic = conn->c_transport_data; 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci /* We will only ever look at IB transports */ 3018c2ecf20Sopenharmony_ci if (conn->c_trans != &rds_ib_transport) 3028c2ecf20Sopenharmony_ci return 0; 3038c2ecf20Sopenharmony_ci if (conn->c_isv6) 3048c2ecf20Sopenharmony_ci return 0; 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci iinfo->src_addr = conn->c_laddr.s6_addr32[3]; 3078c2ecf20Sopenharmony_ci iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; 3088c2ecf20Sopenharmony_ci if (ic) { 3098c2ecf20Sopenharmony_ci iinfo->tos = conn->c_tos; 3108c2ecf20Sopenharmony_ci iinfo->sl = ic->i_sl; 3118c2ecf20Sopenharmony_ci } 3128c2ecf20Sopenharmony_ci 3138c2ecf20Sopenharmony_ci memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 3148c2ecf20Sopenharmony_ci memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); 3158c2ecf20Sopenharmony_ci if (rds_conn_state(conn) == RDS_CONN_UP) { 3168c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev; 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, 3198c2ecf20Sopenharmony_ci (union ib_gid *)&iinfo->dst_gid); 3208c2ecf20Sopenharmony_ci 3218c2ecf20Sopenharmony_ci rds_ibdev = ic->rds_ibdev; 3228c2ecf20Sopenharmony_ci iinfo->max_send_wr = ic->i_send_ring.w_nr; 3238c2ecf20Sopenharmony_ci iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 3248c2ecf20Sopenharmony_ci iinfo->max_send_sge = rds_ibdev->max_sge; 3258c2ecf20Sopenharmony_ci rds_ib_get_mr_info(rds_ibdev, iinfo); 3268c2ecf20Sopenharmony_ci iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); 3278c2ecf20Sopenharmony_ci } 3288c2ecf20Sopenharmony_ci return 1; 3298c2ecf20Sopenharmony_ci} 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 3328c2ecf20Sopenharmony_ci/* IPv6 version of rds_ib_conn_info_visitor(). */ 3338c2ecf20Sopenharmony_cistatic int rds6_ib_conn_info_visitor(struct rds_connection *conn, 3348c2ecf20Sopenharmony_ci void *buffer) 3358c2ecf20Sopenharmony_ci{ 3368c2ecf20Sopenharmony_ci struct rds6_info_rdma_connection *iinfo6 = buffer; 3378c2ecf20Sopenharmony_ci struct rds_ib_connection *ic = conn->c_transport_data; 3388c2ecf20Sopenharmony_ci 3398c2ecf20Sopenharmony_ci /* We will only ever look at IB transports */ 3408c2ecf20Sopenharmony_ci if (conn->c_trans != &rds_ib_transport) 3418c2ecf20Sopenharmony_ci return 0; 3428c2ecf20Sopenharmony_ci 3438c2ecf20Sopenharmony_ci iinfo6->src_addr = conn->c_laddr; 3448c2ecf20Sopenharmony_ci iinfo6->dst_addr = conn->c_faddr; 3458c2ecf20Sopenharmony_ci if (ic) { 3468c2ecf20Sopenharmony_ci iinfo6->tos = conn->c_tos; 3478c2ecf20Sopenharmony_ci iinfo6->sl = ic->i_sl; 3488c2ecf20Sopenharmony_ci } 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); 3518c2ecf20Sopenharmony_ci memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci if (rds_conn_state(conn) == RDS_CONN_UP) { 3548c2ecf20Sopenharmony_ci struct rds_ib_device *rds_ibdev; 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_ci rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, 3578c2ecf20Sopenharmony_ci (union ib_gid *)&iinfo6->dst_gid); 3588c2ecf20Sopenharmony_ci rds_ibdev = ic->rds_ibdev; 3598c2ecf20Sopenharmony_ci iinfo6->max_send_wr = ic->i_send_ring.w_nr; 3608c2ecf20Sopenharmony_ci iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; 3618c2ecf20Sopenharmony_ci iinfo6->max_send_sge = rds_ibdev->max_sge; 3628c2ecf20Sopenharmony_ci rds6_ib_get_mr_info(rds_ibdev, iinfo6); 3638c2ecf20Sopenharmony_ci iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); 3648c2ecf20Sopenharmony_ci } 3658c2ecf20Sopenharmony_ci return 1; 3668c2ecf20Sopenharmony_ci} 3678c2ecf20Sopenharmony_ci#endif 3688c2ecf20Sopenharmony_ci 3698c2ecf20Sopenharmony_cistatic void rds_ib_ic_info(struct socket *sock, unsigned int len, 3708c2ecf20Sopenharmony_ci struct rds_info_iterator *iter, 3718c2ecf20Sopenharmony_ci struct rds_info_lengths *lens) 3728c2ecf20Sopenharmony_ci{ 3738c2ecf20Sopenharmony_ci u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci rds_for_each_conn_info(sock, len, iter, lens, 3768c2ecf20Sopenharmony_ci rds_ib_conn_info_visitor, 3778c2ecf20Sopenharmony_ci buffer, 3788c2ecf20Sopenharmony_ci sizeof(struct rds_info_rdma_connection)); 3798c2ecf20Sopenharmony_ci} 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 3828c2ecf20Sopenharmony_ci/* IPv6 version of rds_ib_ic_info(). */ 3838c2ecf20Sopenharmony_cistatic void rds6_ib_ic_info(struct socket *sock, unsigned int len, 3848c2ecf20Sopenharmony_ci struct rds_info_iterator *iter, 3858c2ecf20Sopenharmony_ci struct rds_info_lengths *lens) 3868c2ecf20Sopenharmony_ci{ 3878c2ecf20Sopenharmony_ci u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8]; 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci rds_for_each_conn_info(sock, len, iter, lens, 3908c2ecf20Sopenharmony_ci rds6_ib_conn_info_visitor, 3918c2ecf20Sopenharmony_ci buffer, 3928c2ecf20Sopenharmony_ci sizeof(struct rds6_info_rdma_connection)); 3938c2ecf20Sopenharmony_ci} 3948c2ecf20Sopenharmony_ci#endif 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_ci/* 3978c2ecf20Sopenharmony_ci * Early RDS/IB was built to only bind to an address if there is an IPoIB 3988c2ecf20Sopenharmony_ci * device with that address set. 3998c2ecf20Sopenharmony_ci * 4008c2ecf20Sopenharmony_ci * If it were me, I'd advocate for something more flexible. Sending and 4018c2ecf20Sopenharmony_ci * receiving should be device-agnostic. Transports would try and maintain 4028c2ecf20Sopenharmony_ci * connections between peers who have messages queued. Userspace would be 4038c2ecf20Sopenharmony_ci * allowed to influence which paths have priority. We could call userspace 4048c2ecf20Sopenharmony_ci * asserting this policy "routing". 4058c2ecf20Sopenharmony_ci */ 4068c2ecf20Sopenharmony_cistatic int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, 4078c2ecf20Sopenharmony_ci __u32 scope_id) 4088c2ecf20Sopenharmony_ci{ 4098c2ecf20Sopenharmony_ci int ret; 4108c2ecf20Sopenharmony_ci struct rdma_cm_id *cm_id; 4118c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 4128c2ecf20Sopenharmony_ci struct sockaddr_in6 sin6; 4138c2ecf20Sopenharmony_ci#endif 4148c2ecf20Sopenharmony_ci struct sockaddr_in sin; 4158c2ecf20Sopenharmony_ci struct sockaddr *sa; 4168c2ecf20Sopenharmony_ci bool isv4; 4178c2ecf20Sopenharmony_ci 4188c2ecf20Sopenharmony_ci isv4 = ipv6_addr_v4mapped(addr); 4198c2ecf20Sopenharmony_ci /* Create a CMA ID and try to bind it. This catches both 4208c2ecf20Sopenharmony_ci * IB and iWARP capable NICs. 4218c2ecf20Sopenharmony_ci */ 4228c2ecf20Sopenharmony_ci cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, 4238c2ecf20Sopenharmony_ci NULL, RDMA_PS_TCP, IB_QPT_RC); 4248c2ecf20Sopenharmony_ci if (IS_ERR(cm_id)) 4258c2ecf20Sopenharmony_ci return PTR_ERR(cm_id); 4268c2ecf20Sopenharmony_ci 4278c2ecf20Sopenharmony_ci if (isv4) { 4288c2ecf20Sopenharmony_ci memset(&sin, 0, sizeof(sin)); 4298c2ecf20Sopenharmony_ci sin.sin_family = AF_INET; 4308c2ecf20Sopenharmony_ci sin.sin_addr.s_addr = addr->s6_addr32[3]; 4318c2ecf20Sopenharmony_ci sa = (struct sockaddr *)&sin; 4328c2ecf20Sopenharmony_ci } else { 4338c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 4348c2ecf20Sopenharmony_ci memset(&sin6, 0, sizeof(sin6)); 4358c2ecf20Sopenharmony_ci sin6.sin6_family = AF_INET6; 4368c2ecf20Sopenharmony_ci sin6.sin6_addr = *addr; 4378c2ecf20Sopenharmony_ci sin6.sin6_scope_id = scope_id; 4388c2ecf20Sopenharmony_ci sa = (struct sockaddr *)&sin6; 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci /* XXX Do a special IPv6 link local address check here. The 4418c2ecf20Sopenharmony_ci * reason is that rdma_bind_addr() always succeeds with IPv6 4428c2ecf20Sopenharmony_ci * link local address regardless it is indeed configured in a 4438c2ecf20Sopenharmony_ci * system. 4448c2ecf20Sopenharmony_ci */ 4458c2ecf20Sopenharmony_ci if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { 4468c2ecf20Sopenharmony_ci struct net_device *dev; 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci if (scope_id == 0) { 4498c2ecf20Sopenharmony_ci ret = -EADDRNOTAVAIL; 4508c2ecf20Sopenharmony_ci goto out; 4518c2ecf20Sopenharmony_ci } 4528c2ecf20Sopenharmony_ci 4538c2ecf20Sopenharmony_ci /* Use init_net for now as RDS is not network 4548c2ecf20Sopenharmony_ci * name space aware. 4558c2ecf20Sopenharmony_ci */ 4568c2ecf20Sopenharmony_ci dev = dev_get_by_index(&init_net, scope_id); 4578c2ecf20Sopenharmony_ci if (!dev) { 4588c2ecf20Sopenharmony_ci ret = -EADDRNOTAVAIL; 4598c2ecf20Sopenharmony_ci goto out; 4608c2ecf20Sopenharmony_ci } 4618c2ecf20Sopenharmony_ci if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { 4628c2ecf20Sopenharmony_ci dev_put(dev); 4638c2ecf20Sopenharmony_ci ret = -EADDRNOTAVAIL; 4648c2ecf20Sopenharmony_ci goto out; 4658c2ecf20Sopenharmony_ci } 4668c2ecf20Sopenharmony_ci dev_put(dev); 4678c2ecf20Sopenharmony_ci } 4688c2ecf20Sopenharmony_ci#else 4698c2ecf20Sopenharmony_ci ret = -EADDRNOTAVAIL; 4708c2ecf20Sopenharmony_ci goto out; 4718c2ecf20Sopenharmony_ci#endif 4728c2ecf20Sopenharmony_ci } 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci /* rdma_bind_addr will only succeed for IB & iWARP devices */ 4758c2ecf20Sopenharmony_ci ret = rdma_bind_addr(cm_id, sa); 4768c2ecf20Sopenharmony_ci /* due to this, we will claim to support iWARP devices unless we 4778c2ecf20Sopenharmony_ci check node_type. */ 4788c2ecf20Sopenharmony_ci if (ret || !cm_id->device || 4798c2ecf20Sopenharmony_ci cm_id->device->node_type != RDMA_NODE_IB_CA) 4808c2ecf20Sopenharmony_ci ret = -EADDRNOTAVAIL; 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci rdsdebug("addr %pI6c%%%u ret %d node type %d\n", 4838c2ecf20Sopenharmony_ci addr, scope_id, ret, 4848c2ecf20Sopenharmony_ci cm_id->device ? cm_id->device->node_type : -1); 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ciout: 4878c2ecf20Sopenharmony_ci rdma_destroy_id(cm_id); 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci return ret; 4908c2ecf20Sopenharmony_ci} 4918c2ecf20Sopenharmony_ci 4928c2ecf20Sopenharmony_cistatic void rds_ib_unregister_client(void) 4938c2ecf20Sopenharmony_ci{ 4948c2ecf20Sopenharmony_ci ib_unregister_client(&rds_ib_client); 4958c2ecf20Sopenharmony_ci /* wait for rds_ib_dev_free() to complete */ 4968c2ecf20Sopenharmony_ci flush_workqueue(rds_wq); 4978c2ecf20Sopenharmony_ci} 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_cistatic void rds_ib_set_unloading(void) 5008c2ecf20Sopenharmony_ci{ 5018c2ecf20Sopenharmony_ci atomic_set(&rds_ib_unloading, 1); 5028c2ecf20Sopenharmony_ci} 5038c2ecf20Sopenharmony_ci 5048c2ecf20Sopenharmony_cistatic bool rds_ib_is_unloading(struct rds_connection *conn) 5058c2ecf20Sopenharmony_ci{ 5068c2ecf20Sopenharmony_ci struct rds_conn_path *cp = &conn->c_path[0]; 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) || 5098c2ecf20Sopenharmony_ci atomic_read(&rds_ib_unloading) != 0); 5108c2ecf20Sopenharmony_ci} 5118c2ecf20Sopenharmony_ci 5128c2ecf20Sopenharmony_civoid rds_ib_exit(void) 5138c2ecf20Sopenharmony_ci{ 5148c2ecf20Sopenharmony_ci rds_ib_set_unloading(); 5158c2ecf20Sopenharmony_ci synchronize_rcu(); 5168c2ecf20Sopenharmony_ci rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 5178c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 5188c2ecf20Sopenharmony_ci rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); 5198c2ecf20Sopenharmony_ci#endif 5208c2ecf20Sopenharmony_ci rds_ib_unregister_client(); 5218c2ecf20Sopenharmony_ci rds_ib_destroy_nodev_conns(); 5228c2ecf20Sopenharmony_ci rds_ib_sysctl_exit(); 5238c2ecf20Sopenharmony_ci rds_ib_recv_exit(); 5248c2ecf20Sopenharmony_ci rds_trans_unregister(&rds_ib_transport); 5258c2ecf20Sopenharmony_ci rds_ib_mr_exit(); 5268c2ecf20Sopenharmony_ci} 5278c2ecf20Sopenharmony_ci 5288c2ecf20Sopenharmony_cistatic u8 rds_ib_get_tos_map(u8 tos) 5298c2ecf20Sopenharmony_ci{ 5308c2ecf20Sopenharmony_ci /* 1:1 user to transport map for RDMA transport. 5318c2ecf20Sopenharmony_ci * In future, if custom map is desired, hook can export 5328c2ecf20Sopenharmony_ci * user configurable map. 5338c2ecf20Sopenharmony_ci */ 5348c2ecf20Sopenharmony_ci return tos; 5358c2ecf20Sopenharmony_ci} 5368c2ecf20Sopenharmony_ci 5378c2ecf20Sopenharmony_cistruct rds_transport rds_ib_transport = { 5388c2ecf20Sopenharmony_ci .laddr_check = rds_ib_laddr_check, 5398c2ecf20Sopenharmony_ci .xmit_path_complete = rds_ib_xmit_path_complete, 5408c2ecf20Sopenharmony_ci .xmit = rds_ib_xmit, 5418c2ecf20Sopenharmony_ci .xmit_rdma = rds_ib_xmit_rdma, 5428c2ecf20Sopenharmony_ci .xmit_atomic = rds_ib_xmit_atomic, 5438c2ecf20Sopenharmony_ci .recv_path = rds_ib_recv_path, 5448c2ecf20Sopenharmony_ci .conn_alloc = rds_ib_conn_alloc, 5458c2ecf20Sopenharmony_ci .conn_free = rds_ib_conn_free, 5468c2ecf20Sopenharmony_ci .conn_path_connect = rds_ib_conn_path_connect, 5478c2ecf20Sopenharmony_ci .conn_path_shutdown = rds_ib_conn_path_shutdown, 5488c2ecf20Sopenharmony_ci .inc_copy_to_user = rds_ib_inc_copy_to_user, 5498c2ecf20Sopenharmony_ci .inc_free = rds_ib_inc_free, 5508c2ecf20Sopenharmony_ci .cm_initiate_connect = rds_ib_cm_initiate_connect, 5518c2ecf20Sopenharmony_ci .cm_handle_connect = rds_ib_cm_handle_connect, 5528c2ecf20Sopenharmony_ci .cm_connect_complete = rds_ib_cm_connect_complete, 5538c2ecf20Sopenharmony_ci .stats_info_copy = rds_ib_stats_info_copy, 5548c2ecf20Sopenharmony_ci .exit = rds_ib_exit, 5558c2ecf20Sopenharmony_ci .get_mr = rds_ib_get_mr, 5568c2ecf20Sopenharmony_ci .sync_mr = rds_ib_sync_mr, 5578c2ecf20Sopenharmony_ci .free_mr = rds_ib_free_mr, 5588c2ecf20Sopenharmony_ci .flush_mrs = rds_ib_flush_mrs, 5598c2ecf20Sopenharmony_ci .get_tos_map = rds_ib_get_tos_map, 5608c2ecf20Sopenharmony_ci .t_owner = THIS_MODULE, 5618c2ecf20Sopenharmony_ci .t_name = "infiniband", 5628c2ecf20Sopenharmony_ci .t_unloading = rds_ib_is_unloading, 5638c2ecf20Sopenharmony_ci .t_type = RDS_TRANS_IB 5648c2ecf20Sopenharmony_ci}; 5658c2ecf20Sopenharmony_ci 5668c2ecf20Sopenharmony_ciint rds_ib_init(void) 5678c2ecf20Sopenharmony_ci{ 5688c2ecf20Sopenharmony_ci int ret; 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&rds_ib_devices); 5718c2ecf20Sopenharmony_ci 5728c2ecf20Sopenharmony_ci ret = rds_ib_mr_init(); 5738c2ecf20Sopenharmony_ci if (ret) 5748c2ecf20Sopenharmony_ci goto out; 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_ci ret = ib_register_client(&rds_ib_client); 5778c2ecf20Sopenharmony_ci if (ret) 5788c2ecf20Sopenharmony_ci goto out_mr_exit; 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci ret = rds_ib_sysctl_init(); 5818c2ecf20Sopenharmony_ci if (ret) 5828c2ecf20Sopenharmony_ci goto out_ibreg; 5838c2ecf20Sopenharmony_ci 5848c2ecf20Sopenharmony_ci ret = rds_ib_recv_init(); 5858c2ecf20Sopenharmony_ci if (ret) 5868c2ecf20Sopenharmony_ci goto out_sysctl; 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_ci rds_trans_register(&rds_ib_transport); 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 5918c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_IPV6) 5928c2ecf20Sopenharmony_ci rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); 5938c2ecf20Sopenharmony_ci#endif 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci goto out; 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_ciout_sysctl: 5988c2ecf20Sopenharmony_ci rds_ib_sysctl_exit(); 5998c2ecf20Sopenharmony_ciout_ibreg: 6008c2ecf20Sopenharmony_ci rds_ib_unregister_client(); 6018c2ecf20Sopenharmony_ciout_mr_exit: 6028c2ecf20Sopenharmony_ci rds_ib_mr_exit(); 6038c2ecf20Sopenharmony_ciout: 6048c2ecf20Sopenharmony_ci return ret; 6058c2ecf20Sopenharmony_ci} 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 608