1/* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34#include <linux/module.h> 35#include <linux/string.h> 36#include <linux/errno.h> 37#include <linux/kernel.h> 38#include <linux/slab.h> 39#include <linux/init.h> 40#include <linux/netdevice.h> 41#include <net/net_namespace.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/hashtable.h> 45#include <rdma/rdma_netlink.h> 46#include <rdma/ib_addr.h> 47#include <rdma/ib_cache.h> 48#include <rdma/rdma_counter.h> 49 50#include "core_priv.h" 51#include "restrack.h" 52 53MODULE_AUTHOR("Roland Dreier"); 54MODULE_DESCRIPTION("core kernel InfiniBand API"); 55MODULE_LICENSE("Dual BSD/GPL"); 56 57struct workqueue_struct *ib_comp_wq; 58struct workqueue_struct *ib_comp_unbound_wq; 59struct workqueue_struct *ib_wq; 60EXPORT_SYMBOL_GPL(ib_wq); 61 62/* 63 * Each of the three rwsem locks (devices, clients, client_data) protects the 64 * xarray of the same name. Specifically it allows the caller to assert that 65 * the MARK will/will not be changing under the lock, and for devices and 66 * clients, that the value in the xarray is still a valid pointer. Change of 67 * the MARK is linked to the object state, so holding the lock and testing the 68 * MARK also asserts that the contained object is in a certain state. 69 * 70 * This is used to build a two stage register/unregister flow where objects 71 * can continue to be in the xarray even though they are still in progress to 72 * register/unregister. 73 * 74 * The xarray itself provides additional locking, and restartable iteration, 75 * which is also relied on. 76 * 77 * Locks should not be nested, with the exception of client_data, which is 78 * allowed to nest under the read side of the other two locks. 79 * 80 * The devices_rwsem also protects the device name list, any change or 81 * assignment of device name must also hold the write side to guarantee unique 82 * names. 83 */ 84 85/* 86 * devices contains devices that have had their names assigned. The 87 * devices may not be registered. 
 * Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
        if (refcount_dec_and_test(&client->uses))
                complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
                 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *                           from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the rdma device's
 * net namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
        return (ib_devices_shared_netns ||
                net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
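 *
 * For example, the intended use looks roughly like this (an illustrative
 * sketch only; it mirrors how xan_for_each_marked() is used later in this
 * file, and use_entry() is just a stand-in). Note that client_data may
 * legitimately be NULL inside the loop body:
 *
 *        unsigned long index;
 *        void *client_data;
 *
 *        xan_for_each_marked(&device->client_data, index, client_data,
 *                            CLIENT_DATA_REGISTERED)
 *                use_entry(index, client_data);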
153 */ 154static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 155 xa_mark_t filter) 156{ 157 XA_STATE(xas, xa, *indexp); 158 void *entry; 159 160 rcu_read_lock(); 161 do { 162 entry = xas_find_marked(&xas, ULONG_MAX, filter); 163 if (xa_is_zero(entry)) 164 break; 165 } while (xas_retry(&xas, entry)); 166 rcu_read_unlock(); 167 168 if (entry) { 169 *indexp = xas.xa_index; 170 if (xa_is_zero(entry)) 171 return NULL; 172 return entry; 173 } 174 return XA_ERROR(-ENOENT); 175} 176#define xan_for_each_marked(xa, index, entry, filter) \ 177 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 178 !xa_is_err(entry); \ 179 (index)++, entry = xan_find_marked(xa, &(index), filter)) 180 181/* RCU hash table mapping netdevice pointers to struct ib_port_data */ 182static DEFINE_SPINLOCK(ndev_hash_lock); 183static DECLARE_HASHTABLE(ndev_hash, 5); 184 185static void free_netdevs(struct ib_device *ib_dev); 186static void ib_unregister_work(struct work_struct *work); 187static void __ib_unregister_device(struct ib_device *device); 188static int ib_security_change(struct notifier_block *nb, unsigned long event, 189 void *lsm_data); 190static void ib_policy_change_task(struct work_struct *work); 191static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 192 193static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 194 struct va_format *vaf) 195{ 196 if (ibdev && ibdev->dev.parent) 197 dev_printk_emit(level[1] - '0', 198 ibdev->dev.parent, 199 "%s %s %s: %pV", 200 dev_driver_string(ibdev->dev.parent), 201 dev_name(ibdev->dev.parent), 202 dev_name(&ibdev->dev), 203 vaf); 204 else if (ibdev) 205 printk("%s%s: %pV", 206 level, dev_name(&ibdev->dev), vaf); 207 else 208 printk("%s(NULL ib_device): %pV", level, vaf); 209} 210 211void ibdev_printk(const char *level, const struct ib_device *ibdev, 212 const char *format, ...) 213{ 214 struct va_format vaf; 215 va_list args; 216 217 va_start(args, format); 218 219 vaf.fmt = format; 220 vaf.va = &args; 221 222 __ibdev_printk(level, ibdev, &vaf); 223 224 va_end(args); 225} 226EXPORT_SYMBOL(ibdev_printk); 227 228#define define_ibdev_printk_level(func, level) \ 229void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 230{ \ 231 struct va_format vaf; \ 232 va_list args; \ 233 \ 234 va_start(args, fmt); \ 235 \ 236 vaf.fmt = fmt; \ 237 vaf.va = &args; \ 238 \ 239 __ibdev_printk(level, ibdev, &vaf); \ 240 \ 241 va_end(args); \ 242} \ 243EXPORT_SYMBOL(func); 244 245define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 246define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 247define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 248define_ibdev_printk_level(ibdev_err, KERN_ERR); 249define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 250define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 251define_ibdev_printk_level(ibdev_info, KERN_INFO); 252 253static struct notifier_block ibdev_lsm_nb = { 254 .notifier_call = ib_security_change, 255}; 256 257static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 258 struct net *net); 259 260/* Pointer to the RCU head at the start of the ib_port_data array */ 261struct ib_port_data_rcu { 262 struct rcu_head rcu_head; 263 struct ib_port_data pdata[]; 264}; 265 266static void ib_device_check_mandatory(struct ib_device *device) 267{ 268#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 269 static const struct { 270 size_t offset; 271 char *name; 272 } mandatory_table[] = { 273 IB_MANDATORY_FUNC(query_device), 274 IB_MANDATORY_FUNC(query_port), 275 IB_MANDATORY_FUNC(alloc_pd), 276 IB_MANDATORY_FUNC(dealloc_pd), 277 IB_MANDATORY_FUNC(create_qp), 278 IB_MANDATORY_FUNC(modify_qp), 279 IB_MANDATORY_FUNC(destroy_qp), 280 IB_MANDATORY_FUNC(post_send), 281 IB_MANDATORY_FUNC(post_recv), 282 IB_MANDATORY_FUNC(create_cq), 283 IB_MANDATORY_FUNC(destroy_cq), 284 IB_MANDATORY_FUNC(poll_cq), 285 IB_MANDATORY_FUNC(req_notify_cq), 286 IB_MANDATORY_FUNC(get_dma_mr), 287 IB_MANDATORY_FUNC(dereg_mr), 288 IB_MANDATORY_FUNC(get_port_immutable) 289 }; 290 int i; 291 292 device->kverbs_provider = true; 293 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 294 if (!*(void **) ((void *) &device->ops + 295 mandatory_table[i].offset)) { 296 device->kverbs_provider = false; 297 break; 298 } 299 } 300} 301 302/* 303 * Caller must perform ib_device_put() to return the device reference count 304 * when ib_device_get_by_index() returns valid device pointer. 305 */ 306struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 307{ 308 struct ib_device *device; 309 310 down_read(&devices_rwsem); 311 device = xa_load(&devices, index); 312 if (device) { 313 if (!rdma_dev_access_netns(device, net)) { 314 device = NULL; 315 goto out; 316 } 317 318 if (!ib_device_try_get(device)) 319 device = NULL; 320 } 321out: 322 up_read(&devices_rwsem); 323 return device; 324} 325 326/** 327 * ib_device_put - Release IB device reference 328 * @device: device whose reference to be released 329 * 330 * ib_device_put() releases reference to the IB device to allow it to be 331 * unregistered and eventually free. 
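 *
 * A typical caller pairs it with one of the lookup helpers in this file,
 * for instance (an illustrative sketch only):
 *
 *        struct ib_device *dev = ib_device_get_by_index(net, index);
 *
 *        if (dev) {
 *                do_something(dev);
 *                ib_device_put(dev);
 *        }
 *
 * where do_something() stands in for whatever work needs the device held.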
332 */ 333void ib_device_put(struct ib_device *device) 334{ 335 if (refcount_dec_and_test(&device->refcount)) 336 complete(&device->unreg_completion); 337} 338EXPORT_SYMBOL(ib_device_put); 339 340static struct ib_device *__ib_device_get_by_name(const char *name) 341{ 342 struct ib_device *device; 343 unsigned long index; 344 345 xa_for_each (&devices, index, device) 346 if (!strcmp(name, dev_name(&device->dev))) 347 return device; 348 349 return NULL; 350} 351 352/** 353 * ib_device_get_by_name - Find an IB device by name 354 * @name: The name to look for 355 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 356 * 357 * Find and hold an ib_device by its name. The caller must call 358 * ib_device_put() on the returned pointer. 359 */ 360struct ib_device *ib_device_get_by_name(const char *name, 361 enum rdma_driver_id driver_id) 362{ 363 struct ib_device *device; 364 365 down_read(&devices_rwsem); 366 device = __ib_device_get_by_name(name); 367 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 368 device->ops.driver_id != driver_id) 369 device = NULL; 370 371 if (device) { 372 if (!ib_device_try_get(device)) 373 device = NULL; 374 } 375 up_read(&devices_rwsem); 376 return device; 377} 378EXPORT_SYMBOL(ib_device_get_by_name); 379 380static int rename_compat_devs(struct ib_device *device) 381{ 382 struct ib_core_device *cdev; 383 unsigned long index; 384 int ret = 0; 385 386 mutex_lock(&device->compat_devs_mutex); 387 xa_for_each (&device->compat_devs, index, cdev) { 388 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 389 if (ret) { 390 dev_warn(&cdev->dev, 391 "Fail to rename compatdev to new name %s\n", 392 dev_name(&device->dev)); 393 break; 394 } 395 } 396 mutex_unlock(&device->compat_devs_mutex); 397 return ret; 398} 399 400int ib_device_rename(struct ib_device *ibdev, const char *name) 401{ 402 unsigned long index; 403 void *client_data; 404 int ret; 405 406 down_write(&devices_rwsem); 407 if (!strcmp(name, dev_name(&ibdev->dev))) { 408 up_write(&devices_rwsem); 409 return 0; 410 } 411 412 if (__ib_device_get_by_name(name)) { 413 up_write(&devices_rwsem); 414 return -EEXIST; 415 } 416 417 ret = device_rename(&ibdev->dev, name); 418 if (ret) { 419 up_write(&devices_rwsem); 420 return ret; 421 } 422 423 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 424 ret = rename_compat_devs(ibdev); 425 426 downgrade_write(&devices_rwsem); 427 down_read(&ibdev->client_data_rwsem); 428 xan_for_each_marked(&ibdev->client_data, index, client_data, 429 CLIENT_DATA_REGISTERED) { 430 struct ib_client *client = xa_load(&clients, index); 431 432 if (!client || !client->rename) 433 continue; 434 435 client->rename(ibdev, client_data); 436 } 437 up_read(&ibdev->client_data_rwsem); 438 up_read(&devices_rwsem); 439 return 0; 440} 441 442int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 443{ 444 if (use_dim > 1) 445 return -EINVAL; 446 ibdev->use_cq_dim = use_dim; 447 448 return 0; 449} 450 451static int alloc_name(struct ib_device *ibdev, const char *name) 452{ 453 struct ib_device *device; 454 unsigned long index; 455 struct ida inuse; 456 int rc; 457 int i; 458 459 lockdep_assert_held_write(&devices_rwsem); 460 ida_init(&inuse); 461 xa_for_each (&devices, index, device) { 462 char buf[IB_DEVICE_NAME_MAX]; 463 464 if (sscanf(dev_name(&device->dev), name, &i) != 1) 465 continue; 466 if (i < 0 || i >= INT_MAX) 467 continue; 468 snprintf(buf, sizeof buf, name, i); 469 if (strcmp(buf, dev_name(&device->dev)) != 0) 470 continue; 471 472 rc = ida_alloc_range(&inuse, i, i, 
GFP_KERNEL); 473 if (rc < 0) 474 goto out; 475 } 476 477 rc = ida_alloc(&inuse, GFP_KERNEL); 478 if (rc < 0) 479 goto out; 480 481 rc = dev_set_name(&ibdev->dev, name, rc); 482out: 483 ida_destroy(&inuse); 484 return rc; 485} 486 487static void ib_device_release(struct device *device) 488{ 489 struct ib_device *dev = container_of(device, struct ib_device, dev); 490 491 free_netdevs(dev); 492 WARN_ON(refcount_read(&dev->refcount)); 493 if (dev->port_data) { 494 ib_cache_release_one(dev); 495 ib_security_release_port_pkey_list(dev); 496 rdma_counter_release(dev); 497 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 498 pdata[0]), 499 rcu_head); 500 } 501 502 mutex_destroy(&dev->unregistration_lock); 503 mutex_destroy(&dev->compat_devs_mutex); 504 505 xa_destroy(&dev->compat_devs); 506 xa_destroy(&dev->client_data); 507 kfree_rcu(dev, rcu_head); 508} 509 510static int ib_device_uevent(struct device *device, 511 struct kobj_uevent_env *env) 512{ 513 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 514 return -ENOMEM; 515 516 /* 517 * It would be nice to pass the node GUID with the event... 518 */ 519 520 return 0; 521} 522 523static const void *net_namespace(struct device *d) 524{ 525 struct ib_core_device *coredev = 526 container_of(d, struct ib_core_device, dev); 527 528 return read_pnet(&coredev->rdma_net); 529} 530 531static struct class ib_class = { 532 .name = "infiniband", 533 .dev_release = ib_device_release, 534 .dev_uevent = ib_device_uevent, 535 .ns_type = &net_ns_type_operations, 536 .namespace = net_namespace, 537}; 538 539static void rdma_init_coredev(struct ib_core_device *coredev, 540 struct ib_device *dev, struct net *net) 541{ 542 /* This BUILD_BUG_ON is intended to catch layout change 543 * of union of ib_core_device and device. 544 * dev must be the first element as ib_core and providers 545 * driver uses it. Adding anything in ib_core_device before 546 * device will break this assumption. 547 */ 548 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 549 offsetof(struct ib_device, dev)); 550 551 coredev->dev.class = &ib_class; 552 coredev->dev.groups = dev->groups; 553 device_initialize(&coredev->dev); 554 coredev->owner = dev; 555 INIT_LIST_HEAD(&coredev->port_list); 556 write_pnet(&coredev->rdma_net, net); 557} 558 559/** 560 * _ib_alloc_device - allocate an IB device struct 561 * @size:size of structure to allocate 562 * 563 * Low-level drivers should use ib_alloc_device() to allocate &struct 564 * ib_device. @size is the size of the structure to be allocated, 565 * including any private data used by the low-level driver. 566 * ib_dealloc_device() must be used to free structures allocated with 567 * ib_alloc_device(). 568 */ 569struct ib_device *_ib_alloc_device(size_t size) 570{ 571 struct ib_device *device; 572 573 if (WARN_ON(size < sizeof(struct ib_device))) 574 return NULL; 575 576 device = kzalloc(size, GFP_KERNEL); 577 if (!device) 578 return NULL; 579 580 if (rdma_restrack_init(device)) { 581 kfree(device); 582 return NULL; 583 } 584 585 device->groups[0] = &ib_dev_attr_group; 586 rdma_init_coredev(&device->coredev, device, &init_net); 587 588 INIT_LIST_HEAD(&device->event_handler_list); 589 spin_lock_init(&device->qp_open_list_lock); 590 init_rwsem(&device->event_handler_rwsem); 591 mutex_init(&device->unregistration_lock); 592 /* 593 * client_data needs to be alloc because we don't want our mark to be 594 * destroyed if the user stores NULL in the client data. 
595 */ 596 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 597 init_rwsem(&device->client_data_rwsem); 598 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 599 mutex_init(&device->compat_devs_mutex); 600 init_completion(&device->unreg_completion); 601 INIT_WORK(&device->unregistration_work, ib_unregister_work); 602 603 device->uverbs_ex_cmd_mask = 604 BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_FLOW) | 605 BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | 606 BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_WQ) | 607 BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | 608 BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL) | 609 BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_WQ) | 610 BIT_ULL(IB_USER_VERBS_EX_CMD_MODIFY_CQ) | 611 BIT_ULL(IB_USER_VERBS_EX_CMD_MODIFY_WQ) | 612 BIT_ULL(IB_USER_VERBS_EX_CMD_QUERY_DEVICE); 613 614 return device; 615} 616EXPORT_SYMBOL(_ib_alloc_device); 617 618/** 619 * ib_dealloc_device - free an IB device struct 620 * @device:structure to free 621 * 622 * Free a structure allocated with ib_alloc_device(). 623 */ 624void ib_dealloc_device(struct ib_device *device) 625{ 626 if (device->ops.dealloc_driver) 627 device->ops.dealloc_driver(device); 628 629 /* 630 * ib_unregister_driver() requires all devices to remain in the xarray 631 * while their ops are callable. The last op we call is dealloc_driver 632 * above. This is needed to create a fence on op callbacks prior to 633 * allowing the driver module to unload. 634 */ 635 down_write(&devices_rwsem); 636 if (xa_load(&devices, device->index) == device) 637 xa_erase(&devices, device->index); 638 up_write(&devices_rwsem); 639 640 /* Expedite releasing netdev references */ 641 free_netdevs(device); 642 643 WARN_ON(!xa_empty(&device->compat_devs)); 644 WARN_ON(!xa_empty(&device->client_data)); 645 WARN_ON(refcount_read(&device->refcount)); 646 rdma_restrack_clean(device); 647 /* Balances with device_initialize */ 648 put_device(&device->dev); 649} 650EXPORT_SYMBOL(ib_dealloc_device); 651 652/* 653 * add_client_context() and remove_client_context() must be safe against 654 * parallel calls on the same device - registration/unregistration of both the 655 * device and client can be occurring in parallel. 656 * 657 * The routines need to be a fence, any caller must not return until the add 658 * or remove is fully completed. 659 */ 660static int add_client_context(struct ib_device *device, 661 struct ib_client *client) 662{ 663 int ret = 0; 664 665 if (!device->kverbs_provider && !client->no_kverbs_req) 666 return 0; 667 668 down_write(&device->client_data_rwsem); 669 /* 670 * So long as the client is registered hold both the client and device 671 * unregistration locks. 672 */ 673 if (!refcount_inc_not_zero(&client->uses)) 674 goto out_unlock; 675 refcount_inc(&device->refcount); 676 677 /* 678 * Another caller to add_client_context got here first and has already 679 * completely initialized context. 680 */ 681 if (xa_get_mark(&device->client_data, client->client_id, 682 CLIENT_DATA_REGISTERED)) 683 goto out; 684 685 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 686 GFP_KERNEL)); 687 if (ret) 688 goto out; 689 downgrade_write(&device->client_data_rwsem); 690 if (client->add) { 691 if (client->add(device)) { 692 /* 693 * If a client fails to add then the error code is 694 * ignored, but we won't call any more ops on this 695 * client. 
                         */
                        xa_erase(&device->client_data, client->client_id);
                        up_read(&device->client_data_rwsem);
                        ib_device_put(device);
                        ib_client_put(client);
                        return 0;
                }
        }

        /* Readers shall not see a client until add has been completed */
        xa_set_mark(&device->client_data, client->client_id,
                    CLIENT_DATA_REGISTERED);
        up_read(&device->client_data_rwsem);
        return 0;

out:
        ib_device_put(device);
        ib_client_put(client);
out_unlock:
        up_write(&device->client_data_rwsem);
        return ret;
}

static void remove_client_context(struct ib_device *device,
                                  unsigned int client_id)
{
        struct ib_client *client;
        void *client_data;

        down_write(&device->client_data_rwsem);
        if (!xa_get_mark(&device->client_data, client_id,
                         CLIENT_DATA_REGISTERED)) {
                up_write(&device->client_data_rwsem);
                return;
        }
        client_data = xa_load(&device->client_data, client_id);
        xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
        client = xa_load(&clients, client_id);
        up_write(&device->client_data_rwsem);

        /*
         * Notice we cannot be holding any exclusive locks when calling the
         * remove callback as the remove callback can recurse back into any
         * public functions in this module and thus try for any locks those
         * functions take.
         *
         * For this reason clients and drivers should not call the
         * unregistration functions while holding any locks.
         */
        if (client->remove)
                client->remove(device, client_data);

        xa_erase(&device->client_data, client_id);
        ib_device_put(device);
        ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
        struct ib_port_data_rcu *pdata_rcu;
        unsigned int port;

        if (device->port_data)
                return 0;

        /* This can only be called once the physical port range is defined */
        if (WARN_ON(!device->phys_port_cnt))
                return -EINVAL;

        /*
         * device->port_data is indexed directly by the port number to make
         * access to this data as efficient as possible.
         *
         * Therefore port_data is declared as a 1-based array with potential
         * empty slots at the beginning.
         */
        pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
                                        rdma_end_port(device) + 1),
                            GFP_KERNEL);
        if (!pdata_rcu)
                return -ENOMEM;
        /*
         * The rcu_head is put in front of the port data array and the stored
         * pointer is adjusted since we never need to see that member until
         * kfree_rcu.
781 */ 782 device->port_data = pdata_rcu->pdata; 783 784 rdma_for_each_port (device, port) { 785 struct ib_port_data *pdata = &device->port_data[port]; 786 787 pdata->ib_dev = device; 788 spin_lock_init(&pdata->pkey_list_lock); 789 INIT_LIST_HEAD(&pdata->pkey_list); 790 spin_lock_init(&pdata->netdev_lock); 791 INIT_HLIST_NODE(&pdata->ndev_hash_link); 792 } 793 return 0; 794} 795 796static int verify_immutable(const struct ib_device *dev, u8 port) 797{ 798 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 799 rdma_max_mad_size(dev, port) != 0); 800} 801 802static int setup_port_data(struct ib_device *device) 803{ 804 unsigned int port; 805 int ret; 806 807 ret = alloc_port_data(device); 808 if (ret) 809 return ret; 810 811 rdma_for_each_port (device, port) { 812 struct ib_port_data *pdata = &device->port_data[port]; 813 814 ret = device->ops.get_port_immutable(device, port, 815 &pdata->immutable); 816 if (ret) 817 return ret; 818 819 if (verify_immutable(device, port)) 820 return -EINVAL; 821 } 822 return 0; 823} 824 825void ib_get_device_fw_str(struct ib_device *dev, char *str) 826{ 827 if (dev->ops.get_dev_fw_str) 828 dev->ops.get_dev_fw_str(dev, str); 829 else 830 str[0] = '\0'; 831} 832EXPORT_SYMBOL(ib_get_device_fw_str); 833 834static void ib_policy_change_task(struct work_struct *work) 835{ 836 struct ib_device *dev; 837 unsigned long index; 838 839 down_read(&devices_rwsem); 840 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 841 unsigned int i; 842 843 rdma_for_each_port (dev, i) { 844 u64 sp; 845 int ret = ib_get_cached_subnet_prefix(dev, 846 i, 847 &sp); 848 849 WARN_ONCE(ret, 850 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", 851 ret); 852 if (!ret) 853 ib_security_cache_change(dev, i, sp); 854 } 855 } 856 up_read(&devices_rwsem); 857} 858 859static int ib_security_change(struct notifier_block *nb, unsigned long event, 860 void *lsm_data) 861{ 862 if (event != LSM_POLICY_CHANGE) 863 return NOTIFY_DONE; 864 865 schedule_work(&ib_policy_change_work); 866 ib_mad_agent_security_change(); 867 868 return NOTIFY_OK; 869} 870 871static void compatdev_release(struct device *dev) 872{ 873 struct ib_core_device *cdev = 874 container_of(dev, struct ib_core_device, dev); 875 876 kfree(cdev); 877} 878 879static int add_one_compat_dev(struct ib_device *device, 880 struct rdma_dev_net *rnet) 881{ 882 struct ib_core_device *cdev; 883 int ret; 884 885 lockdep_assert_held(&rdma_nets_rwsem); 886 if (!ib_devices_shared_netns) 887 return 0; 888 889 /* 890 * Create and add compat device in all namespaces other than where it 891 * is currently bound to. 892 */ 893 if (net_eq(read_pnet(&rnet->net), 894 read_pnet(&device->coredev.rdma_net))) 895 return 0; 896 897 /* 898 * The first of init_net() or ib_register_device() to take the 899 * compat_devs_mutex wins and gets to add the device. Others will wait 900 * for completion here. 
901 */ 902 mutex_lock(&device->compat_devs_mutex); 903 cdev = xa_load(&device->compat_devs, rnet->id); 904 if (cdev) { 905 ret = 0; 906 goto done; 907 } 908 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 909 if (ret) 910 goto done; 911 912 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 913 if (!cdev) { 914 ret = -ENOMEM; 915 goto cdev_err; 916 } 917 918 cdev->dev.parent = device->dev.parent; 919 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 920 cdev->dev.release = compatdev_release; 921 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 922 if (ret) 923 goto add_err; 924 925 ret = device_add(&cdev->dev); 926 if (ret) 927 goto add_err; 928 ret = ib_setup_port_attrs(cdev); 929 if (ret) 930 goto port_err; 931 932 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 933 cdev, GFP_KERNEL)); 934 if (ret) 935 goto insert_err; 936 937 mutex_unlock(&device->compat_devs_mutex); 938 return 0; 939 940insert_err: 941 ib_free_port_attrs(cdev); 942port_err: 943 device_del(&cdev->dev); 944add_err: 945 put_device(&cdev->dev); 946cdev_err: 947 xa_release(&device->compat_devs, rnet->id); 948done: 949 mutex_unlock(&device->compat_devs_mutex); 950 return ret; 951} 952 953static void remove_one_compat_dev(struct ib_device *device, u32 id) 954{ 955 struct ib_core_device *cdev; 956 957 mutex_lock(&device->compat_devs_mutex); 958 cdev = xa_erase(&device->compat_devs, id); 959 mutex_unlock(&device->compat_devs_mutex); 960 if (cdev) { 961 ib_free_port_attrs(cdev); 962 device_del(&cdev->dev); 963 put_device(&cdev->dev); 964 } 965} 966 967static void remove_compat_devs(struct ib_device *device) 968{ 969 struct ib_core_device *cdev; 970 unsigned long index; 971 972 xa_for_each (&device->compat_devs, index, cdev) 973 remove_one_compat_dev(device, index); 974} 975 976static int add_compat_devs(struct ib_device *device) 977{ 978 struct rdma_dev_net *rnet; 979 unsigned long index; 980 int ret = 0; 981 982 lockdep_assert_held(&devices_rwsem); 983 984 down_read(&rdma_nets_rwsem); 985 xa_for_each (&rdma_nets, index, rnet) { 986 ret = add_one_compat_dev(device, rnet); 987 if (ret) 988 break; 989 } 990 up_read(&rdma_nets_rwsem); 991 return ret; 992} 993 994static void remove_all_compat_devs(void) 995{ 996 struct ib_compat_device *cdev; 997 struct ib_device *dev; 998 unsigned long index; 999 1000 down_read(&devices_rwsem); 1001 xa_for_each (&devices, index, dev) { 1002 unsigned long c_index = 0; 1003 1004 /* Hold nets_rwsem so that any other thread modifying this 1005 * system param can sync with this thread. 1006 */ 1007 down_read(&rdma_nets_rwsem); 1008 xa_for_each (&dev->compat_devs, c_index, cdev) 1009 remove_one_compat_dev(dev, c_index); 1010 up_read(&rdma_nets_rwsem); 1011 } 1012 up_read(&devices_rwsem); 1013} 1014 1015static int add_all_compat_devs(void) 1016{ 1017 struct rdma_dev_net *rnet; 1018 struct ib_device *dev; 1019 unsigned long index; 1020 int ret = 0; 1021 1022 down_read(&devices_rwsem); 1023 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1024 unsigned long net_index = 0; 1025 1026 /* Hold nets_rwsem so that any other thread modifying this 1027 * system param can sync with this thread. 
1028 */ 1029 down_read(&rdma_nets_rwsem); 1030 xa_for_each (&rdma_nets, net_index, rnet) { 1031 ret = add_one_compat_dev(dev, rnet); 1032 if (ret) 1033 break; 1034 } 1035 up_read(&rdma_nets_rwsem); 1036 } 1037 up_read(&devices_rwsem); 1038 if (ret) 1039 remove_all_compat_devs(); 1040 return ret; 1041} 1042 1043int rdma_compatdev_set(u8 enable) 1044{ 1045 struct rdma_dev_net *rnet; 1046 unsigned long index; 1047 int ret = 0; 1048 1049 down_write(&rdma_nets_rwsem); 1050 if (ib_devices_shared_netns == enable) { 1051 up_write(&rdma_nets_rwsem); 1052 return 0; 1053 } 1054 1055 /* enable/disable of compat devices is not supported 1056 * when more than default init_net exists. 1057 */ 1058 xa_for_each (&rdma_nets, index, rnet) { 1059 ret++; 1060 break; 1061 } 1062 if (!ret) 1063 ib_devices_shared_netns = enable; 1064 up_write(&rdma_nets_rwsem); 1065 if (ret) 1066 return -EBUSY; 1067 1068 if (enable) 1069 ret = add_all_compat_devs(); 1070 else 1071 remove_all_compat_devs(); 1072 return ret; 1073} 1074 1075static void rdma_dev_exit_net(struct net *net) 1076{ 1077 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1078 struct ib_device *dev; 1079 unsigned long index; 1080 int ret; 1081 1082 down_write(&rdma_nets_rwsem); 1083 /* 1084 * Prevent the ID from being re-used and hide the id from xa_for_each. 1085 */ 1086 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1087 WARN_ON(ret); 1088 up_write(&rdma_nets_rwsem); 1089 1090 down_read(&devices_rwsem); 1091 xa_for_each (&devices, index, dev) { 1092 get_device(&dev->dev); 1093 /* 1094 * Release the devices_rwsem so that pontentially blocking 1095 * device_del, doesn't hold the devices_rwsem for too long. 1096 */ 1097 up_read(&devices_rwsem); 1098 1099 remove_one_compat_dev(dev, rnet->id); 1100 1101 /* 1102 * If the real device is in the NS then move it back to init. 1103 */ 1104 rdma_dev_change_netns(dev, net, &init_net); 1105 1106 put_device(&dev->dev); 1107 down_read(&devices_rwsem); 1108 } 1109 up_read(&devices_rwsem); 1110 1111 rdma_nl_net_exit(rnet); 1112 xa_erase(&rdma_nets, rnet->id); 1113} 1114 1115static __net_init int rdma_dev_init_net(struct net *net) 1116{ 1117 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1118 unsigned long index; 1119 struct ib_device *dev; 1120 int ret; 1121 1122 write_pnet(&rnet->net, net); 1123 1124 ret = rdma_nl_net_init(rnet); 1125 if (ret) 1126 return ret; 1127 1128 /* No need to create any compat devices in default init_net. */ 1129 if (net_eq(net, &init_net)) 1130 return 0; 1131 1132 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1133 if (ret) { 1134 rdma_nl_net_exit(rnet); 1135 return ret; 1136 } 1137 1138 down_read(&devices_rwsem); 1139 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1140 /* Hold nets_rwsem so that netlink command cannot change 1141 * system configuration for device sharing mode. 1142 */ 1143 down_read(&rdma_nets_rwsem); 1144 ret = add_one_compat_dev(dev, rnet); 1145 up_read(&rdma_nets_rwsem); 1146 if (ret) 1147 break; 1148 } 1149 up_read(&devices_rwsem); 1150 1151 if (ret) 1152 rdma_dev_exit_net(net); 1153 1154 return ret; 1155} 1156 1157/* 1158 * Assign the unique string device name and the unique device index. This is 1159 * undone by ib_dealloc_device. 
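 *
 * Note the name may contain a printf-style "%d" pattern, in which case
 * alloc_name() picks the lowest free index. As an illustration only (the
 * "foo%d" name and dma_dev argument are hypothetical):
 *
 *        ret = ib_register_device(device, "foo%d", dma_dev);
 *
 * would end up with dev_name(&device->dev) being "foo0", "foo1", and so on.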
1160 */ 1161static int assign_name(struct ib_device *device, const char *name) 1162{ 1163 static u32 last_id; 1164 int ret; 1165 1166 down_write(&devices_rwsem); 1167 /* Assign a unique name to the device */ 1168 if (strchr(name, '%')) 1169 ret = alloc_name(device, name); 1170 else 1171 ret = dev_set_name(&device->dev, name); 1172 if (ret) 1173 goto out; 1174 1175 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1176 ret = -ENFILE; 1177 goto out; 1178 } 1179 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1180 1181 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1182 &last_id, GFP_KERNEL); 1183 if (ret > 0) 1184 ret = 0; 1185 1186out: 1187 up_write(&devices_rwsem); 1188 return ret; 1189} 1190 1191/* 1192 * setup_device() allocates memory and sets up data that requires calling the 1193 * device ops, this is the only reason these actions are not done during 1194 * ib_alloc_device. It is undone by ib_dealloc_device(). 1195 */ 1196static int setup_device(struct ib_device *device) 1197{ 1198 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1199 int ret; 1200 1201 ib_device_check_mandatory(device); 1202 1203 ret = setup_port_data(device); 1204 if (ret) { 1205 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1206 return ret; 1207 } 1208 1209 memset(&device->attrs, 0, sizeof(device->attrs)); 1210 ret = device->ops.query_device(device, &device->attrs, &uhw); 1211 if (ret) { 1212 dev_warn(&device->dev, 1213 "Couldn't query the device attributes\n"); 1214 return ret; 1215 } 1216 1217 return 0; 1218} 1219 1220static void disable_device(struct ib_device *device) 1221{ 1222 u32 cid; 1223 1224 WARN_ON(!refcount_read(&device->refcount)); 1225 1226 down_write(&devices_rwsem); 1227 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1228 up_write(&devices_rwsem); 1229 1230 /* 1231 * Remove clients in LIFO order, see assign_client_id. This could be 1232 * more efficient if xarray learns to reverse iterate. Since no new 1233 * clients can be added to this ib_device past this point we only need 1234 * the maximum possible client_id value here. 1235 */ 1236 down_read(&clients_rwsem); 1237 cid = highest_client_id; 1238 up_read(&clients_rwsem); 1239 while (cid) { 1240 cid--; 1241 remove_client_context(device, cid); 1242 } 1243 1244 ib_cq_pool_destroy(device); 1245 1246 /* Pairs with refcount_set in enable_device */ 1247 ib_device_put(device); 1248 wait_for_completion(&device->unreg_completion); 1249 1250 /* 1251 * compat devices must be removed after device refcount drops to zero. 1252 * Otherwise init_net() may add more compatdevs after removing compat 1253 * devices and before device is disabled. 1254 */ 1255 remove_compat_devs(device); 1256} 1257 1258/* 1259 * An enabled device is visible to all clients and to all the public facing 1260 * APIs that return a device pointer. This always returns with a new get, even 1261 * if it fails. 1262 */ 1263static int enable_device_and_get(struct ib_device *device) 1264{ 1265 struct ib_client *client; 1266 unsigned long index; 1267 int ret = 0; 1268 1269 /* 1270 * One ref belongs to the xa and the other belongs to this 1271 * thread. This is needed to guard against parallel unregistration. 1272 */ 1273 refcount_set(&device->refcount, 2); 1274 down_write(&devices_rwsem); 1275 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1276 1277 /* 1278 * By using downgrade_write() we ensure that no other thread can clear 1279 * DEVICE_REGISTERED while we are completing the client setup. 
1280 */ 1281 downgrade_write(&devices_rwsem); 1282 1283 if (device->ops.enable_driver) { 1284 ret = device->ops.enable_driver(device); 1285 if (ret) 1286 goto out; 1287 } 1288 1289 ib_cq_pool_init(device); 1290 1291 down_read(&clients_rwsem); 1292 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1293 ret = add_client_context(device, client); 1294 if (ret) 1295 break; 1296 } 1297 up_read(&clients_rwsem); 1298 if (!ret) 1299 ret = add_compat_devs(device); 1300out: 1301 up_read(&devices_rwsem); 1302 return ret; 1303} 1304 1305static void prevent_dealloc_device(struct ib_device *ib_dev) 1306{ 1307} 1308 1309/** 1310 * ib_register_device - Register an IB device with IB core 1311 * @device: Device to register 1312 * @name: unique string device name. This may include a '%' which will 1313 * cause a unique index to be added to the passed device name. 1314 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB 1315 * device will be used. In this case the caller should fully 1316 * setup the ibdev for DMA. This usually means using dma_virt_ops. 1317 * 1318 * Low-level drivers use ib_register_device() to register their 1319 * devices with the IB core. All registered clients will receive a 1320 * callback for each device that is added. @device must be allocated 1321 * with ib_alloc_device(). 1322 * 1323 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1324 * asynchronously then the device pointer may become freed as soon as this 1325 * function returns. 1326 */ 1327int ib_register_device(struct ib_device *device, const char *name, 1328 struct device *dma_device) 1329{ 1330 int ret; 1331 1332 ret = assign_name(device, name); 1333 if (ret) 1334 return ret; 1335 1336 /* 1337 * If the caller does not provide a DMA capable device then the IB core 1338 * will set up ib_sge and scatterlist structures that stash the kernel 1339 * virtual address into the address field. 1340 */ 1341 WARN_ON(dma_device && !dma_device->dma_parms); 1342 device->dma_device = dma_device; 1343 1344 ret = setup_device(device); 1345 if (ret) 1346 return ret; 1347 1348 ret = ib_cache_setup_one(device); 1349 if (ret) { 1350 dev_warn(&device->dev, 1351 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1352 return ret; 1353 } 1354 1355 ib_device_register_rdmacg(device); 1356 1357 rdma_counter_init(device); 1358 1359 /* 1360 * Ensure that ADD uevent is not fired because it 1361 * is too early amd device is not initialized yet. 1362 */ 1363 dev_set_uevent_suppress(&device->dev, true); 1364 ret = device_add(&device->dev); 1365 if (ret) 1366 goto cg_cleanup; 1367 1368 ret = ib_device_register_sysfs(device); 1369 if (ret) { 1370 dev_warn(&device->dev, 1371 "Couldn't register device with driver model\n"); 1372 goto dev_cleanup; 1373 } 1374 1375 ret = enable_device_and_get(device); 1376 if (ret) { 1377 void (*dealloc_fn)(struct ib_device *); 1378 1379 /* 1380 * If we hit this error flow then we don't want to 1381 * automatically dealloc the device since the caller is 1382 * expected to call ib_dealloc_device() after 1383 * ib_register_device() fails. This is tricky due to the 1384 * possibility for a parallel unregistration along with this 1385 * error flow. Since we have a refcount here we know any 1386 * parallel flow is stopped in disable_device and will see the 1387 * special dealloc_driver pointer, causing the responsibility to 1388 * ib_dealloc_device() to revert back to this thread. 
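                 *
                 * From the driver's point of view the expected pattern is
                 * therefore roughly (a sketch with hypothetical names):
                 *
                 *        dev = ib_alloc_device(my_dev, ibdev);
                 *        ...
                 *        ret = ib_register_device(&dev->ibdev, "my%d", dma_dev);
                 *        if (ret)
                 *                ib_dealloc_device(&dev->ibdev);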
                 */
                dealloc_fn = device->ops.dealloc_driver;
                device->ops.dealloc_driver = prevent_dealloc_device;
                ib_device_put(device);
                __ib_unregister_device(device);
                device->ops.dealloc_driver = dealloc_fn;
                dev_set_uevent_suppress(&device->dev, false);
                return ret;
        }
        dev_set_uevent_suppress(&device->dev, false);
        /* Mark for userspace that device is ready */
        kobject_uevent(&device->dev.kobj, KOBJ_ADD);
        ib_device_put(device);

        return 0;

dev_cleanup:
        device_del(&device->dev);
cg_cleanup:
        dev_set_uevent_suppress(&device->dev, false);
        ib_device_unregister_rdmacg(device);
        ib_cache_cleanup_one(device);
        return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
        /*
         * We have a registration lock so that all the calls to unregister are
         * fully fenced; once any unregister returns the device is truly
         * unregistered even if multiple callers are unregistering it at the
         * same time. This also interacts with the registration flow and
         * provides sane semantics if register and unregister are racing.
         */
        mutex_lock(&ib_dev->unregistration_lock);
        if (!refcount_read(&ib_dev->refcount))
                goto out;

        disable_device(ib_dev);

        /* Expedite removing unregistered pointers from the hash table */
        free_netdevs(ib_dev);

        ib_device_unregister_sysfs(ib_dev);
        device_del(&ib_dev->dev);
        ib_device_unregister_rdmacg(ib_dev);
        ib_cache_cleanup_one(ib_dev);

        /*
         * Drivers using the new flow may not call ib_dealloc_device except
         * in error unwind prior to registration success.
         */
        if (ib_dev->ops.dealloc_driver &&
            ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
                WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
                ib_dealloc_device(ib_dev);
        }
out:
        mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
        get_device(&ib_dev->dev);
        __ib_unregister_device(ib_dev);
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the dealloc_driver callback to clean up
 * their resources associated with the device and dealloc it.
1487 */ 1488void ib_unregister_device_and_put(struct ib_device *ib_dev) 1489{ 1490 WARN_ON(!ib_dev->ops.dealloc_driver); 1491 get_device(&ib_dev->dev); 1492 ib_device_put(ib_dev); 1493 __ib_unregister_device(ib_dev); 1494 put_device(&ib_dev->dev); 1495} 1496EXPORT_SYMBOL(ib_unregister_device_and_put); 1497 1498/** 1499 * ib_unregister_driver - Unregister all IB devices for a driver 1500 * @driver_id: The driver to unregister 1501 * 1502 * This implements a fence for device unregistration. It only returns once all 1503 * devices associated with the driver_id have fully completed their 1504 * unregistration and returned from ib_unregister_device*(). 1505 * 1506 * If device's are not yet unregistered it goes ahead and starts unregistering 1507 * them. 1508 * 1509 * This does not block creation of new devices with the given driver_id, that 1510 * is the responsibility of the caller. 1511 */ 1512void ib_unregister_driver(enum rdma_driver_id driver_id) 1513{ 1514 struct ib_device *ib_dev; 1515 unsigned long index; 1516 1517 down_read(&devices_rwsem); 1518 xa_for_each (&devices, index, ib_dev) { 1519 if (ib_dev->ops.driver_id != driver_id) 1520 continue; 1521 1522 get_device(&ib_dev->dev); 1523 up_read(&devices_rwsem); 1524 1525 WARN_ON(!ib_dev->ops.dealloc_driver); 1526 __ib_unregister_device(ib_dev); 1527 1528 put_device(&ib_dev->dev); 1529 down_read(&devices_rwsem); 1530 } 1531 up_read(&devices_rwsem); 1532} 1533EXPORT_SYMBOL(ib_unregister_driver); 1534 1535static void ib_unregister_work(struct work_struct *work) 1536{ 1537 struct ib_device *ib_dev = 1538 container_of(work, struct ib_device, unregistration_work); 1539 1540 __ib_unregister_device(ib_dev); 1541 put_device(&ib_dev->dev); 1542} 1543 1544/** 1545 * ib_unregister_device_queued - Unregister a device using a work queue 1546 * @ib_dev: The device to unregister 1547 * 1548 * This schedules an asynchronous unregistration using a WQ for the device. A 1549 * driver should use this to avoid holding locks while doing unregistration, 1550 * such as holding the RTNL lock. 1551 * 1552 * Drivers using this API must use ib_unregister_driver before module unload 1553 * to ensure that all scheduled unregistrations have completed. 1554 */ 1555void ib_unregister_device_queued(struct ib_device *ib_dev) 1556{ 1557 WARN_ON(!refcount_read(&ib_dev->refcount)); 1558 WARN_ON(!ib_dev->ops.dealloc_driver); 1559 get_device(&ib_dev->dev); 1560 if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) 1561 put_device(&ib_dev->dev); 1562} 1563EXPORT_SYMBOL(ib_unregister_device_queued); 1564 1565/* 1566 * The caller must pass in a device that has the kref held and the refcount 1567 * released. If the device is in cur_net and still registered then it is moved 1568 * into net. 1569 */ 1570static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1571 struct net *net) 1572{ 1573 int ret2 = -EINVAL; 1574 int ret; 1575 1576 mutex_lock(&device->unregistration_lock); 1577 1578 /* 1579 * If a device not under ib_device_get() or if the unregistration_lock 1580 * is not held, the namespace can be changed, or it can be unregistered. 1581 * Check again under the lock. 1582 */ 1583 if (refcount_read(&device->refcount) == 0 || 1584 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1585 ret = -ENODEV; 1586 goto out; 1587 } 1588 1589 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1590 disable_device(device); 1591 1592 /* 1593 * At this point no one can be using the device, so it is safe to 1594 * change the namespace. 
1595 */ 1596 write_pnet(&device->coredev.rdma_net, net); 1597 1598 down_read(&devices_rwsem); 1599 /* 1600 * Currently rdma devices are system wide unique. So the device name 1601 * is guaranteed free in the new namespace. Publish the new namespace 1602 * at the sysfs level. 1603 */ 1604 ret = device_rename(&device->dev, dev_name(&device->dev)); 1605 up_read(&devices_rwsem); 1606 if (ret) { 1607 dev_warn(&device->dev, 1608 "%s: Couldn't rename device after namespace change\n", 1609 __func__); 1610 /* Try and put things back and re-enable the device */ 1611 write_pnet(&device->coredev.rdma_net, cur_net); 1612 } 1613 1614 ret2 = enable_device_and_get(device); 1615 if (ret2) { 1616 /* 1617 * This shouldn't really happen, but if it does, let the user 1618 * retry at later point. So don't disable the device. 1619 */ 1620 dev_warn(&device->dev, 1621 "%s: Couldn't re-enable device after namespace change\n", 1622 __func__); 1623 } 1624 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1625 1626 ib_device_put(device); 1627out: 1628 mutex_unlock(&device->unregistration_lock); 1629 if (ret) 1630 return ret; 1631 return ret2; 1632} 1633 1634int ib_device_set_netns_put(struct sk_buff *skb, 1635 struct ib_device *dev, u32 ns_fd) 1636{ 1637 struct net *net; 1638 int ret; 1639 1640 net = get_net_ns_by_fd(ns_fd); 1641 if (IS_ERR(net)) { 1642 ret = PTR_ERR(net); 1643 goto net_err; 1644 } 1645 1646 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1647 ret = -EPERM; 1648 goto ns_err; 1649 } 1650 1651 /* 1652 * Currently supported only for those providers which support 1653 * disassociation and don't do port specific sysfs init. Once a 1654 * port_cleanup infrastructure is implemented, this limitation will be 1655 * removed. 1656 */ 1657 if (!dev->ops.disassociate_ucontext || dev->ops.init_port || 1658 ib_devices_shared_netns) { 1659 ret = -EOPNOTSUPP; 1660 goto ns_err; 1661 } 1662 1663 get_device(&dev->dev); 1664 ib_device_put(dev); 1665 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1666 put_device(&dev->dev); 1667 1668 put_net(net); 1669 return ret; 1670 1671ns_err: 1672 put_net(net); 1673net_err: 1674 ib_device_put(dev); 1675 return ret; 1676} 1677 1678static struct pernet_operations rdma_dev_net_ops = { 1679 .init = rdma_dev_init_net, 1680 .exit = rdma_dev_exit_net, 1681 .id = &rdma_dev_net_id, 1682 .size = sizeof(struct rdma_dev_net), 1683}; 1684 1685static int assign_client_id(struct ib_client *client) 1686{ 1687 int ret; 1688 1689 down_write(&clients_rwsem); 1690 /* 1691 * The add/remove callbacks must be called in FIFO/LIFO order. To 1692 * achieve this we assign client_ids so they are sorted in 1693 * registration order. 1694 */ 1695 client->client_id = highest_client_id; 1696 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1697 if (ret) 1698 goto out; 1699 1700 highest_client_id++; 1701 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1702 1703out: 1704 up_write(&clients_rwsem); 1705 return ret; 1706} 1707 1708static void remove_client_id(struct ib_client *client) 1709{ 1710 down_write(&clients_rwsem); 1711 xa_erase(&clients, client->client_id); 1712 for (; highest_client_id; highest_client_id--) 1713 if (xa_load(&clients, highest_client_id - 1)) 1714 break; 1715 up_write(&clients_rwsem); 1716} 1717 1718/** 1719 * ib_register_client - Register an IB client 1720 * @client:Client to register 1721 * 1722 * Upper level users of the IB drivers can use ib_register_client() to 1723 * register callbacks for IB device addition and removal. 
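 *
 * For example, a client is typically declared and registered along these
 * lines (hypothetical names, sketch only):
 *
 *        static struct ib_client my_client = {
 *                .name   = "my_client",
 *                .add    = my_client_add,
 *                .remove = my_client_remove,
 *        };
 *
 *        ret = ib_register_client(&my_client);
 *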
When an IB 1724 * device is added, each registered client's add method will be called 1725 * (in the order the clients were registered), and when a device is 1726 * removed, each client's remove method will be called (in the reverse 1727 * order that clients were registered). In addition, when 1728 * ib_register_client() is called, the client will receive an add 1729 * callback for all devices already registered. 1730 */ 1731int ib_register_client(struct ib_client *client) 1732{ 1733 struct ib_device *device; 1734 unsigned long index; 1735 int ret; 1736 1737 refcount_set(&client->uses, 1); 1738 init_completion(&client->uses_zero); 1739 ret = assign_client_id(client); 1740 if (ret) 1741 return ret; 1742 1743 down_read(&devices_rwsem); 1744 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1745 ret = add_client_context(device, client); 1746 if (ret) { 1747 up_read(&devices_rwsem); 1748 ib_unregister_client(client); 1749 return ret; 1750 } 1751 } 1752 up_read(&devices_rwsem); 1753 return 0; 1754} 1755EXPORT_SYMBOL(ib_register_client); 1756 1757/** 1758 * ib_unregister_client - Unregister an IB client 1759 * @client:Client to unregister 1760 * 1761 * Upper level users use ib_unregister_client() to remove their client 1762 * registration. When ib_unregister_client() is called, the client 1763 * will receive a remove callback for each IB device still registered. 1764 * 1765 * This is a full fence, once it returns no client callbacks will be called, 1766 * or are running in another thread. 1767 */ 1768void ib_unregister_client(struct ib_client *client) 1769{ 1770 struct ib_device *device; 1771 unsigned long index; 1772 1773 down_write(&clients_rwsem); 1774 ib_client_put(client); 1775 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1776 up_write(&clients_rwsem); 1777 1778 /* We do not want to have locks while calling client->remove() */ 1779 rcu_read_lock(); 1780 xa_for_each (&devices, index, device) { 1781 if (!ib_device_try_get(device)) 1782 continue; 1783 rcu_read_unlock(); 1784 1785 remove_client_context(device, client->client_id); 1786 1787 ib_device_put(device); 1788 rcu_read_lock(); 1789 } 1790 rcu_read_unlock(); 1791 1792 /* 1793 * remove_client_context() is not a fence, it can return even though a 1794 * removal is ongoing. Wait until all removals are completed. 
1795 */ 1796 wait_for_completion(&client->uses_zero); 1797 remove_client_id(client); 1798} 1799EXPORT_SYMBOL(ib_unregister_client); 1800 1801static int __ib_get_global_client_nl_info(const char *client_name, 1802 struct ib_client_nl_info *res) 1803{ 1804 struct ib_client *client; 1805 unsigned long index; 1806 int ret = -ENOENT; 1807 1808 down_read(&clients_rwsem); 1809 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1810 if (strcmp(client->name, client_name) != 0) 1811 continue; 1812 if (!client->get_global_nl_info) { 1813 ret = -EOPNOTSUPP; 1814 break; 1815 } 1816 ret = client->get_global_nl_info(res); 1817 if (WARN_ON(ret == -ENOENT)) 1818 ret = -EINVAL; 1819 if (!ret && res->cdev) 1820 get_device(res->cdev); 1821 break; 1822 } 1823 up_read(&clients_rwsem); 1824 return ret; 1825} 1826 1827static int __ib_get_client_nl_info(struct ib_device *ibdev, 1828 const char *client_name, 1829 struct ib_client_nl_info *res) 1830{ 1831 unsigned long index; 1832 void *client_data; 1833 int ret = -ENOENT; 1834 1835 down_read(&ibdev->client_data_rwsem); 1836 xan_for_each_marked (&ibdev->client_data, index, client_data, 1837 CLIENT_DATA_REGISTERED) { 1838 struct ib_client *client = xa_load(&clients, index); 1839 1840 if (!client || strcmp(client->name, client_name) != 0) 1841 continue; 1842 if (!client->get_nl_info) { 1843 ret = -EOPNOTSUPP; 1844 break; 1845 } 1846 ret = client->get_nl_info(ibdev, client_data, res); 1847 if (WARN_ON(ret == -ENOENT)) 1848 ret = -EINVAL; 1849 1850 /* 1851 * The cdev is guaranteed valid as long as we are inside the 1852 * client_data_rwsem as remove_one can't be called. Keep it 1853 * valid for the caller. 1854 */ 1855 if (!ret && res->cdev) 1856 get_device(res->cdev); 1857 break; 1858 } 1859 up_read(&ibdev->client_data_rwsem); 1860 1861 return ret; 1862} 1863 1864/** 1865 * ib_get_client_nl_info - Fetch the nl_info from a client 1866 * @device - IB device 1867 * @client_name - Name of the client 1868 * @res - Result of the query 1869 */ 1870int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1871 struct ib_client_nl_info *res) 1872{ 1873 int ret; 1874 1875 if (ibdev) 1876 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1877 else 1878 ret = __ib_get_global_client_nl_info(client_name, res); 1879#ifdef CONFIG_MODULES 1880 if (ret == -ENOENT) { 1881 request_module("rdma-client-%s", client_name); 1882 if (ibdev) 1883 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1884 else 1885 ret = __ib_get_global_client_nl_info(client_name, res); 1886 } 1887#endif 1888 if (ret) { 1889 if (ret == -ENOENT) 1890 return -EOPNOTSUPP; 1891 return ret; 1892 } 1893 1894 if (WARN_ON(!res->cdev)) 1895 return -EINVAL; 1896 return 0; 1897} 1898 1899/** 1900 * ib_set_client_data - Set IB client context 1901 * @device:Device to set context for 1902 * @client:Client to set context for 1903 * @data:Context to set 1904 * 1905 * ib_set_client_data() sets client context data that can be retrieved with 1906 * ib_get_client_data(). This can only be called while the client is 1907 * registered to the device, once the ib_client remove() callback returns this 1908 * cannot be called. 
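 *
 * Typically a client sets its per-device data from its add callback and
 * frees it in remove, along these lines (a sketch with hypothetical names):
 *
 *        static int my_client_add(struct ib_device *device)
 *        {
 *                struct my_data *md = kzalloc(sizeof(*md), GFP_KERNEL);
 *
 *                if (!md)
 *                        return -ENOMEM;
 *                ib_set_client_data(device, &my_client, md);
 *                return 0;
 *        }
 *
 *        static void my_client_remove(struct ib_device *device, void *client_data)
 *        {
 *                kfree(client_data);
 *        }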
1909 */ 1910void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1911 void *data) 1912{ 1913 void *rc; 1914 1915 if (WARN_ON(IS_ERR(data))) 1916 data = NULL; 1917 1918 rc = xa_store(&device->client_data, client->client_id, data, 1919 GFP_KERNEL); 1920 WARN_ON(xa_is_err(rc)); 1921} 1922EXPORT_SYMBOL(ib_set_client_data); 1923 1924/** 1925 * ib_register_event_handler - Register an IB event handler 1926 * @event_handler:Handler to register 1927 * 1928 * ib_register_event_handler() registers an event handler that will be 1929 * called back when asynchronous IB events occur (as defined in 1930 * chapter 11 of the InfiniBand Architecture Specification). This 1931 * callback occurs in workqueue context. 1932 */ 1933void ib_register_event_handler(struct ib_event_handler *event_handler) 1934{ 1935 down_write(&event_handler->device->event_handler_rwsem); 1936 list_add_tail(&event_handler->list, 1937 &event_handler->device->event_handler_list); 1938 up_write(&event_handler->device->event_handler_rwsem); 1939} 1940EXPORT_SYMBOL(ib_register_event_handler); 1941 1942/** 1943 * ib_unregister_event_handler - Unregister an event handler 1944 * @event_handler:Handler to unregister 1945 * 1946 * Unregister an event handler registered with 1947 * ib_register_event_handler(). 1948 */ 1949void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1950{ 1951 down_write(&event_handler->device->event_handler_rwsem); 1952 list_del(&event_handler->list); 1953 up_write(&event_handler->device->event_handler_rwsem); 1954} 1955EXPORT_SYMBOL(ib_unregister_event_handler); 1956 1957void ib_dispatch_event_clients(struct ib_event *event) 1958{ 1959 struct ib_event_handler *handler; 1960 1961 down_read(&event->device->event_handler_rwsem); 1962 1963 list_for_each_entry(handler, &event->device->event_handler_list, list) 1964 handler->handler(handler, event); 1965 1966 up_read(&event->device->event_handler_rwsem); 1967} 1968 1969static int iw_query_port(struct ib_device *device, 1970 u8 port_num, 1971 struct ib_port_attr *port_attr) 1972{ 1973 struct in_device *inetdev; 1974 struct net_device *netdev; 1975 1976 memset(port_attr, 0, sizeof(*port_attr)); 1977 1978 netdev = ib_device_get_netdev(device, port_num); 1979 if (!netdev) 1980 return -ENODEV; 1981 1982 port_attr->max_mtu = IB_MTU_4096; 1983 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 1984 1985 if (!netif_carrier_ok(netdev)) { 1986 port_attr->state = IB_PORT_DOWN; 1987 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 1988 } else { 1989 rcu_read_lock(); 1990 inetdev = __in_dev_get_rcu(netdev); 1991 1992 if (inetdev && inetdev->ifa_list) { 1993 port_attr->state = IB_PORT_ACTIVE; 1994 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 1995 } else { 1996 port_attr->state = IB_PORT_INIT; 1997 port_attr->phys_state = 1998 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 1999 } 2000 2001 rcu_read_unlock(); 2002 } 2003 2004 dev_put(netdev); 2005 return device->ops.query_port(device, port_num, port_attr); 2006} 2007 2008static int __ib_query_port(struct ib_device *device, 2009 u8 port_num, 2010 struct ib_port_attr *port_attr) 2011{ 2012 union ib_gid gid = {}; 2013 int err; 2014 2015 memset(port_attr, 0, sizeof(*port_attr)); 2016 2017 err = device->ops.query_port(device, port_num, port_attr); 2018 if (err || port_attr->subnet_prefix) 2019 return err; 2020 2021 if (rdma_port_get_link_layer(device, port_num) != 2022 IB_LINK_LAYER_INFINIBAND) 2023 return 0; 2024 2025 err = device->ops.query_gid(device, port_num, 0, &gid); 2026 if 
(err)
2027 return err;
2028
2029 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
2030 return 0;
2031}
2032
2033/**
2034 * ib_query_port - Query IB port attributes
2035 * @device:Device to query
2036 * @port_num:Port number to query
2037 * @port_attr:Port attributes
2038 *
2039 * ib_query_port() returns the attributes of a port through the
2040 * @port_attr pointer.
2041 */
2042int ib_query_port(struct ib_device *device,
2043 u8 port_num,
2044 struct ib_port_attr *port_attr)
2045{
2046 if (!rdma_is_port_valid(device, port_num))
2047 return -EINVAL;
2048
2049 if (rdma_protocol_iwarp(device, port_num))
2050 return iw_query_port(device, port_num, port_attr);
2051 else
2052 return __ib_query_port(device, port_num, port_attr);
2053}
2054EXPORT_SYMBOL(ib_query_port);
2055
2056static void add_ndev_hash(struct ib_port_data *pdata)
2057{
2058 unsigned long flags;
2059
2060 might_sleep();
2061
2062 spin_lock_irqsave(&ndev_hash_lock, flags);
2063 if (hash_hashed(&pdata->ndev_hash_link)) {
2064 hash_del_rcu(&pdata->ndev_hash_link);
2065 spin_unlock_irqrestore(&ndev_hash_lock, flags);
2066 /*
2067 * We cannot do hash_add_rcu after a hash_del_rcu until a
2068 * grace period has elapsed
2069 */
2070 synchronize_rcu();
2071 spin_lock_irqsave(&ndev_hash_lock, flags);
2072 }
2073 if (pdata->netdev)
2074 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
2075 (uintptr_t)pdata->netdev);
2076 spin_unlock_irqrestore(&ndev_hash_lock, flags);
2077}
2078
2079/**
2080 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
2081 * @ib_dev: Device to modify
2082 * @ndev: net_device to affiliate, may be NULL
2083 * @port: IB port the net_device is connected to
2084 *
2085 * Drivers should use this to link the ib_device to a netdev so the netdev
2086 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
2087 * affiliated with any port.
2088 *
2089 * The caller must ensure that the given ndev is not unregistered or
2090 * unregistering, and that either the ib_device is unregistered or
2091 * ib_device_set_netdev() is called with NULL when the ndev sends a
2092 * NETDEV_UNREGISTER event.
2093 */
2094int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
2095 unsigned int port)
2096{
2097 struct net_device *old_ndev;
2098 struct ib_port_data *pdata;
2099 unsigned long flags;
2100 int ret;
2101
2102 /*
2103 * Drivers wish to call this before ib_register_device(), so we have to
2104 * set up the port data early.
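	 *
	 * For instance (illustrative only, hypothetical "foo" driver), a
	 * driver would typically do this for each port before calling
	 * ib_register_device():
	 *
	 *	ret = ib_device_set_netdev(&foo->ib_dev, foo->netdev, 1);
	 *	if (ret)
	 *		return ret;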
2105 */ 2106 ret = alloc_port_data(ib_dev); 2107 if (ret) 2108 return ret; 2109 2110 if (!rdma_is_port_valid(ib_dev, port)) 2111 return -EINVAL; 2112 2113 pdata = &ib_dev->port_data[port]; 2114 spin_lock_irqsave(&pdata->netdev_lock, flags); 2115 old_ndev = rcu_dereference_protected( 2116 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2117 if (old_ndev == ndev) { 2118 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2119 return 0; 2120 } 2121 2122 if (ndev) 2123 dev_hold(ndev); 2124 rcu_assign_pointer(pdata->netdev, ndev); 2125 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2126 2127 add_ndev_hash(pdata); 2128 if (old_ndev) 2129 dev_put(old_ndev); 2130 2131 return 0; 2132} 2133EXPORT_SYMBOL(ib_device_set_netdev); 2134 2135static void free_netdevs(struct ib_device *ib_dev) 2136{ 2137 unsigned long flags; 2138 unsigned int port; 2139 2140 if (!ib_dev->port_data) 2141 return; 2142 2143 rdma_for_each_port (ib_dev, port) { 2144 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2145 struct net_device *ndev; 2146 2147 spin_lock_irqsave(&pdata->netdev_lock, flags); 2148 ndev = rcu_dereference_protected( 2149 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2150 if (ndev) { 2151 spin_lock(&ndev_hash_lock); 2152 hash_del_rcu(&pdata->ndev_hash_link); 2153 spin_unlock(&ndev_hash_lock); 2154 2155 /* 2156 * If this is the last dev_put there is still a 2157 * synchronize_rcu before the netdev is kfreed, so we 2158 * can continue to rely on unlocked pointer 2159 * comparisons after the put 2160 */ 2161 rcu_assign_pointer(pdata->netdev, NULL); 2162 dev_put(ndev); 2163 } 2164 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2165 } 2166} 2167 2168struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2169 unsigned int port) 2170{ 2171 struct ib_port_data *pdata; 2172 struct net_device *res; 2173 2174 if (!rdma_is_port_valid(ib_dev, port)) 2175 return NULL; 2176 2177 pdata = &ib_dev->port_data[port]; 2178 2179 /* 2180 * New drivers should use ib_device_set_netdev() not the legacy 2181 * get_netdev(). 2182 */ 2183 if (ib_dev->ops.get_netdev) 2184 res = ib_dev->ops.get_netdev(ib_dev, port); 2185 else { 2186 spin_lock(&pdata->netdev_lock); 2187 res = rcu_dereference_protected( 2188 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2189 if (res) 2190 dev_hold(res); 2191 spin_unlock(&pdata->netdev_lock); 2192 } 2193 2194 /* 2195 * If we are starting to unregister expedite things by preventing 2196 * propagation of an unregistering netdev. 2197 */ 2198 if (res && res->reg_state != NETREG_REGISTERED) { 2199 dev_put(res); 2200 return NULL; 2201 } 2202 2203 return res; 2204} 2205 2206/** 2207 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2208 * @ndev: netdev to locate 2209 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2210 * 2211 * Find and hold an ib_device that is associated with a netdev via 2212 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2213 * returned pointer. 
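 *
 * Illustrative use by a hypothetical caller that already holds a valid
 * netdev reference:
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (ibdev) {
 *		... use ibdev ...
 *		ib_device_put(ibdev);
 *	}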
2214 */
2215struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
2216 enum rdma_driver_id driver_id)
2217{
2218 struct ib_device *res = NULL;
2219 struct ib_port_data *cur;
2220
2221 rcu_read_lock();
2222 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
2223 (uintptr_t)ndev) {
2224 if (rcu_access_pointer(cur->netdev) == ndev &&
2225 (driver_id == RDMA_DRIVER_UNKNOWN ||
2226 cur->ib_dev->ops.driver_id == driver_id) &&
2227 ib_device_try_get(cur->ib_dev)) {
2228 res = cur->ib_dev;
2229 break;
2230 }
2231 }
2232 rcu_read_unlock();
2233
2234 return res;
2235}
2236EXPORT_SYMBOL(ib_device_get_by_netdev);
2237
2238/**
2239 * ib_enum_roce_netdev - enumerate all RoCE ports
2240 * @ib_dev: IB device we want to query
2241 * @filter: Should we call the callback?
2242 * @filter_cookie: Cookie passed to filter
2243 * @cb: Callback to call for each found RoCE port
2244 * @cookie: Cookie passed back to the callback
2245 *
2246 * Enumerates all of the physical RoCE ports of ib_dev
2247 * that are associated with a netdevice and calls cb() on each
2248 * port for which filter() returns a non-zero value.
2249 */
2250void ib_enum_roce_netdev(struct ib_device *ib_dev,
2251 roce_netdev_filter filter,
2252 void *filter_cookie,
2253 roce_netdev_callback cb,
2254 void *cookie)
2255{
2256 unsigned int port;
2257
2258 rdma_for_each_port (ib_dev, port)
2259 if (rdma_protocol_roce(ib_dev, port)) {
2260 struct net_device *idev =
2261 ib_device_get_netdev(ib_dev, port);
2262
2263 if (filter(ib_dev, port, idev, filter_cookie))
2264 cb(ib_dev, port, idev, cookie);
2265
2266 if (idev)
2267 dev_put(idev);
2268 }
2269}
2270
2271/**
2272 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
2273 * @filter: Should we call the callback?
2274 * @filter_cookie: Cookie passed to filter
2275 * @cb: Callback to call for each found RoCE port
2276 * @cookie: Cookie passed back to the callback
2277 *
2278 * Enumerates the physical ports of all RoCE devices that are
2279 * associated with a netdevice and calls cb() on each port for
2280 * which filter() returns a non-zero value.
2281 */
2282void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
2283 void *filter_cookie,
2284 roce_netdev_callback cb,
2285 void *cookie)
2286{
2287 struct ib_device *dev;
2288 unsigned long index;
2289
2290 down_read(&devices_rwsem);
2291 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
2292 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
2293 up_read(&devices_rwsem);
2294}
2295
2296/**
2297 * ib_enum_all_devs - enumerate all ib_devices
2298 * @nldev_cb: Callback to call for each found ib_device
2299 *
2300 * Enumerates all registered ib_devices accessible from @skb's net namespace and calls nldev_cb() on each device, passing @skb and @cb through to it.
2301 */
2302int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
2303 struct netlink_callback *cb)
2304{
2305 unsigned long index;
2306 struct ib_device *dev;
2307 unsigned int idx = 0;
2308 int ret = 0;
2309
2310 down_read(&devices_rwsem);
2311 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
2312 if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
2313 continue;
2314
2315 ret = nldev_cb(dev, skb, cb, idx);
2316 if (ret)
2317 break;
2318 idx++;
2319 }
2320 up_read(&devices_rwsem);
2321 return ret;
2322}
2323
2324/**
2325 * ib_query_pkey - Get P_Key table entry
2326 * @device:Device to query
2327 * @port_num:Port number to query
2328 * @index:P_Key table index to query
2329 * @pkey:Returned P_Key
2330 *
2331 * ib_query_pkey() fetches the specified P_Key table entry.
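 *
 * For example (illustrative only), reading the P_Key at table index 0 of
 * port 1:
 *
 *	u16 pkey;
 *	int ret = ib_query_pkey(device, 1, 0, &pkey);
 *
 * On success, @pkey holds the 16-bit P_Key value, including the membership
 * bit in bit 15.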
2332 */
2333int ib_query_pkey(struct ib_device *device,
2334 u8 port_num, u16 index, u16 *pkey)
2335{
2336 if (!rdma_is_port_valid(device, port_num))
2337 return -EINVAL;
2338
2339 if (!device->ops.query_pkey)
2340 return -EOPNOTSUPP;
2341
2342 return device->ops.query_pkey(device, port_num, index, pkey);
2343}
2344EXPORT_SYMBOL(ib_query_pkey);
2345
2346/**
2347 * ib_modify_device - Change IB device attributes
2348 * @device:Device to modify
2349 * @device_modify_mask:Mask of attributes to change
2350 * @device_modify:New attribute values
2351 *
2352 * ib_modify_device() changes a device's attributes as specified by
2353 * the @device_modify_mask and @device_modify structure.
2354 */
2355int ib_modify_device(struct ib_device *device,
2356 int device_modify_mask,
2357 struct ib_device_modify *device_modify)
2358{
2359 if (!device->ops.modify_device)
2360 return -EOPNOTSUPP;
2361
2362 return device->ops.modify_device(device, device_modify_mask,
2363 device_modify);
2364}
2365EXPORT_SYMBOL(ib_modify_device);
2366
2367/**
2368 * ib_modify_port - Modifies the attributes for the specified port.
2369 * @device: The device to modify.
2370 * @port_num: The number of the port to modify.
2371 * @port_modify_mask: Mask used to specify which attributes of the port
2372 * to change.
2373 * @port_modify: New attribute values for the port.
2374 *
2375 * ib_modify_port() changes a port's attributes as specified by the
2376 * @port_modify_mask and @port_modify structure.
2377 */
2378int ib_modify_port(struct ib_device *device,
2379 u8 port_num, int port_modify_mask,
2380 struct ib_port_modify *port_modify)
2381{
2382 int rc;
2383
2384 if (!rdma_is_port_valid(device, port_num))
2385 return -EINVAL;
2386
2387 if (device->ops.modify_port)
2388 rc = device->ops.modify_port(device, port_num,
2389 port_modify_mask,
2390 port_modify);
2391 else if (rdma_protocol_roce(device, port_num) &&
2392 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
2393 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
2394 rc = 0;
2395 else
2396 rc = -EOPNOTSUPP;
2397 return rc;
2398}
2399EXPORT_SYMBOL(ib_modify_port);
2400
2401/**
2402 * ib_find_gid - Returns the port number and GID table index where
2403 * a specified GID value occurs. The search is limited to the IB link layer.
2404 * @device: The device to query.
2405 * @gid: The GID value to search for.
2406 * @port_num: The port number of the device where the GID value was found.
2407 * @index: The index into the GID table where the GID was found. This
2408 * parameter may be NULL.
2409 */
2410int ib_find_gid(struct ib_device *device, union ib_gid *gid,
2411 u8 *port_num, u16 *index)
2412{
2413 union ib_gid tmp_gid;
2414 unsigned int port;
2415 int ret, i;
2416
2417 rdma_for_each_port (device, port) {
2418 if (!rdma_protocol_ib(device, port))
2419 continue;
2420
2421 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
2422 ++i) {
2423 ret = rdma_query_gid(device, port, i, &tmp_gid);
2424 if (ret)
2425 continue;
2426
2427 if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
2428 *port_num = port;
2429 if (index)
2430 *index = i;
2431 return 0;
2432 }
2433 }
2434 }
2435
2436 return -ENOENT;
2437}
2438EXPORT_SYMBOL(ib_find_gid);
2439
2440/**
2441 * ib_find_pkey - Returns the PKey table index where a specified
2442 * PKey value occurs.
2443 * @device: The device to query.
2444 * @port_num: The port number of the device to search for the PKey.
2445 * @pkey: The PKey value to search for.
2446 * @index: The index into the PKey table where the PKey was found.
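 *
 * Illustrative lookup of the default partition key on port 1 (a
 * full-member 0xffff entry is preferred over its limited-member 0x7fff
 * form):
 *
 *	u16 index;
 *	int ret = ib_find_pkey(device, 1, 0xffff, &index);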
2447 */
2448int ib_find_pkey(struct ib_device *device,
2449 u8 port_num, u16 pkey, u16 *index)
2450{
2451 int ret, i;
2452 u16 tmp_pkey;
2453 int partial_ix = -1;
2454
2455 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
2456 ++i) {
2457 ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
2458 if (ret)
2459 return ret;
2460 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
2461 /* if there is a full-member pkey, take it */
2462 if (tmp_pkey & 0x8000) {
2463 *index = i;
2464 return 0;
2465 }
2466 if (partial_ix < 0)
2467 partial_ix = i;
2468 }
2469 }
2470
2471 /* no full member; if a limited member exists, take it */
2472 if (partial_ix >= 0) {
2473 *index = partial_ix;
2474 return 0;
2475 }
2476 return -ENOENT;
2477}
2478EXPORT_SYMBOL(ib_find_pkey);
2479
2480/**
2481 * ib_get_net_dev_by_params() - Return the appropriate net_dev
2482 * for a received CM request
2483 * @dev: An RDMA device on which the request has been received.
2484 * @port: Port number on the RDMA device.
2485 * @pkey: The Pkey the request came on.
2486 * @gid: A GID that the net_dev uses to communicate.
2487 * @addr: Contains the IP address that the request specified as its
2488 * destination.
2489 *
2490 */
2491struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
2492 u8 port,
2493 u16 pkey,
2494 const union ib_gid *gid,
2495 const struct sockaddr *addr)
2496{
2497 struct net_device *net_dev = NULL;
2498 unsigned long index;
2499 void *client_data;
2500
2501 if (!rdma_protocol_ib(dev, port))
2502 return NULL;
2503
2504 /*
2505 * Holding the read side guarantees that the client will not become
2506 * unregistered while we are calling get_net_dev_by_params()
2507 */
2508 down_read(&dev->client_data_rwsem);
2509 xan_for_each_marked (&dev->client_data, index, client_data,
2510 CLIENT_DATA_REGISTERED) {
2511 struct ib_client *client = xa_load(&clients, index);
2512
2513 if (!client || !client->get_net_dev_by_params)
2514 continue;
2515
2516 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
2517 addr, client_data);
2518 if (net_dev)
2519 break;
2520 }
2521 up_read(&dev->client_data_rwsem);
2522
2523 return net_dev;
2524}
2525EXPORT_SYMBOL(ib_get_net_dev_by_params);
2526
2527void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
2528{
2529 struct ib_device_ops *dev_ops = &dev->ops;
2530#define SET_DEVICE_OP(ptr, name) \
2531 do { \
2532 if (ops->name) \
2533 if (!((ptr)->name)) \
2534 (ptr)->name = ops->name; \
2535 } while (0)
2536
2537#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
2538
2539 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
2540 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
2541 dev_ops->driver_id != ops->driver_id);
2542 dev_ops->driver_id = ops->driver_id;
2543 }
2544 if (ops->owner) {
2545 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
2546 dev_ops->owner = ops->owner;
2547 }
2548 if (ops->uverbs_abi_ver)
2549 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
2550
2551 dev_ops->uverbs_no_driver_id_binding |=
2552 ops->uverbs_no_driver_id_binding;
2553
2554 SET_DEVICE_OP(dev_ops, add_gid);
2555 SET_DEVICE_OP(dev_ops, advise_mr);
2556 SET_DEVICE_OP(dev_ops, alloc_dm);
2557 SET_DEVICE_OP(dev_ops, alloc_hw_stats);
2558 SET_DEVICE_OP(dev_ops, alloc_mr);
2559 SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
2560 SET_DEVICE_OP(dev_ops, alloc_mw);
2561 SET_DEVICE_OP(dev_ops, alloc_pd);
2562 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
2563 SET_DEVICE_OP(dev_ops, alloc_ucontext);
2564 SET_DEVICE_OP(dev_ops, alloc_xrcd);
2565
SET_DEVICE_OP(dev_ops, attach_mcast); 2566 SET_DEVICE_OP(dev_ops, check_mr_status); 2567 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2568 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2569 SET_DEVICE_OP(dev_ops, counter_dealloc); 2570 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2571 SET_DEVICE_OP(dev_ops, counter_update_stats); 2572 SET_DEVICE_OP(dev_ops, create_ah); 2573 SET_DEVICE_OP(dev_ops, create_counters); 2574 SET_DEVICE_OP(dev_ops, create_cq); 2575 SET_DEVICE_OP(dev_ops, create_flow); 2576 SET_DEVICE_OP(dev_ops, create_flow_action_esp); 2577 SET_DEVICE_OP(dev_ops, create_qp); 2578 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2579 SET_DEVICE_OP(dev_ops, create_srq); 2580 SET_DEVICE_OP(dev_ops, create_wq); 2581 SET_DEVICE_OP(dev_ops, dealloc_dm); 2582 SET_DEVICE_OP(dev_ops, dealloc_driver); 2583 SET_DEVICE_OP(dev_ops, dealloc_mw); 2584 SET_DEVICE_OP(dev_ops, dealloc_pd); 2585 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2586 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2587 SET_DEVICE_OP(dev_ops, del_gid); 2588 SET_DEVICE_OP(dev_ops, dereg_mr); 2589 SET_DEVICE_OP(dev_ops, destroy_ah); 2590 SET_DEVICE_OP(dev_ops, destroy_counters); 2591 SET_DEVICE_OP(dev_ops, destroy_cq); 2592 SET_DEVICE_OP(dev_ops, destroy_flow); 2593 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2594 SET_DEVICE_OP(dev_ops, destroy_qp); 2595 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2596 SET_DEVICE_OP(dev_ops, destroy_srq); 2597 SET_DEVICE_OP(dev_ops, destroy_wq); 2598 SET_DEVICE_OP(dev_ops, detach_mcast); 2599 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2600 SET_DEVICE_OP(dev_ops, drain_rq); 2601 SET_DEVICE_OP(dev_ops, drain_sq); 2602 SET_DEVICE_OP(dev_ops, enable_driver); 2603 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2604 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2605 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2606 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2607 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2608 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2609 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2610 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2611 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2612 SET_DEVICE_OP(dev_ops, get_dma_mr); 2613 SET_DEVICE_OP(dev_ops, get_hw_stats); 2614 SET_DEVICE_OP(dev_ops, get_link_layer); 2615 SET_DEVICE_OP(dev_ops, get_netdev); 2616 SET_DEVICE_OP(dev_ops, get_port_immutable); 2617 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2618 SET_DEVICE_OP(dev_ops, get_vf_config); 2619 SET_DEVICE_OP(dev_ops, get_vf_guid); 2620 SET_DEVICE_OP(dev_ops, get_vf_stats); 2621 SET_DEVICE_OP(dev_ops, init_port); 2622 SET_DEVICE_OP(dev_ops, iw_accept); 2623 SET_DEVICE_OP(dev_ops, iw_add_ref); 2624 SET_DEVICE_OP(dev_ops, iw_connect); 2625 SET_DEVICE_OP(dev_ops, iw_create_listen); 2626 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2627 SET_DEVICE_OP(dev_ops, iw_get_qp); 2628 SET_DEVICE_OP(dev_ops, iw_reject); 2629 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2630 SET_DEVICE_OP(dev_ops, map_mr_sg); 2631 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2632 SET_DEVICE_OP(dev_ops, mmap); 2633 SET_DEVICE_OP(dev_ops, mmap_free); 2634 SET_DEVICE_OP(dev_ops, modify_ah); 2635 SET_DEVICE_OP(dev_ops, modify_cq); 2636 SET_DEVICE_OP(dev_ops, modify_device); 2637 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2638 SET_DEVICE_OP(dev_ops, modify_port); 2639 SET_DEVICE_OP(dev_ops, modify_qp); 2640 SET_DEVICE_OP(dev_ops, modify_srq); 2641 SET_DEVICE_OP(dev_ops, modify_wq); 2642 SET_DEVICE_OP(dev_ops, peek_cq); 2643 SET_DEVICE_OP(dev_ops, poll_cq); 2644 SET_DEVICE_OP(dev_ops, post_recv); 2645 SET_DEVICE_OP(dev_ops, post_send); 2646 
SET_DEVICE_OP(dev_ops, post_srq_recv); 2647 SET_DEVICE_OP(dev_ops, process_mad); 2648 SET_DEVICE_OP(dev_ops, query_ah); 2649 SET_DEVICE_OP(dev_ops, query_device); 2650 SET_DEVICE_OP(dev_ops, query_gid); 2651 SET_DEVICE_OP(dev_ops, query_pkey); 2652 SET_DEVICE_OP(dev_ops, query_port); 2653 SET_DEVICE_OP(dev_ops, query_qp); 2654 SET_DEVICE_OP(dev_ops, query_srq); 2655 SET_DEVICE_OP(dev_ops, query_ucontext); 2656 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2657 SET_DEVICE_OP(dev_ops, read_counters); 2658 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2659 SET_DEVICE_OP(dev_ops, reg_user_mr); 2660 SET_DEVICE_OP(dev_ops, req_ncomp_notif); 2661 SET_DEVICE_OP(dev_ops, req_notify_cq); 2662 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2663 SET_DEVICE_OP(dev_ops, resize_cq); 2664 SET_DEVICE_OP(dev_ops, set_vf_guid); 2665 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2666 2667 SET_OBJ_SIZE(dev_ops, ib_ah); 2668 SET_OBJ_SIZE(dev_ops, ib_counters); 2669 SET_OBJ_SIZE(dev_ops, ib_cq); 2670 SET_OBJ_SIZE(dev_ops, ib_mw); 2671 SET_OBJ_SIZE(dev_ops, ib_pd); 2672 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2673 SET_OBJ_SIZE(dev_ops, ib_srq); 2674 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2675 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2676} 2677EXPORT_SYMBOL(ib_set_device_ops); 2678 2679#ifdef CONFIG_INFINIBAND_VIRT_DMA 2680int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2681{ 2682 struct scatterlist *s; 2683 int i; 2684 2685 for_each_sg(sg, s, nents, i) { 2686 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2687 sg_dma_len(s) = s->length; 2688 } 2689 return nents; 2690} 2691EXPORT_SYMBOL(ib_dma_virt_map_sg); 2692#endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2693 2694static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2695 [RDMA_NL_LS_OP_RESOLVE] = { 2696 .doit = ib_nl_handle_resolve_resp, 2697 .flags = RDMA_NL_ADMIN_PERM, 2698 }, 2699 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2700 .doit = ib_nl_handle_set_timeout, 2701 .flags = RDMA_NL_ADMIN_PERM, 2702 }, 2703 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2704 .doit = ib_nl_handle_ip_res_resp, 2705 .flags = RDMA_NL_ADMIN_PERM, 2706 }, 2707}; 2708 2709static int __init ib_core_init(void) 2710{ 2711 int ret; 2712 2713 ib_wq = alloc_workqueue("infiniband", 0, 0); 2714 if (!ib_wq) 2715 return -ENOMEM; 2716 2717 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2718 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2719 if (!ib_comp_wq) { 2720 ret = -ENOMEM; 2721 goto err; 2722 } 2723 2724 ib_comp_unbound_wq = 2725 alloc_workqueue("ib-comp-unb-wq", 2726 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2727 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2728 if (!ib_comp_unbound_wq) { 2729 ret = -ENOMEM; 2730 goto err_comp; 2731 } 2732 2733 ret = class_register(&ib_class); 2734 if (ret) { 2735 pr_warn("Couldn't create InfiniBand device class\n"); 2736 goto err_comp_unbound; 2737 } 2738 2739 rdma_nl_init(); 2740 2741 ret = addr_init(); 2742 if (ret) { 2743 pr_warn("Couldn't init IB address resolution\n"); 2744 goto err_ibnl; 2745 } 2746 2747 ret = ib_mad_init(); 2748 if (ret) { 2749 pr_warn("Couldn't init IB MAD\n"); 2750 goto err_addr; 2751 } 2752 2753 ret = ib_sa_init(); 2754 if (ret) { 2755 pr_warn("Couldn't init SA\n"); 2756 goto err_mad; 2757 } 2758 2759 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2760 if (ret) { 2761 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2762 goto err_sa; 2763 } 2764 2765 ret = register_pernet_device(&rdma_dev_net_ops); 2766 if (ret) { 2767 pr_warn("Couldn't init compat dev. 
ret %d\n", ret); 2768 goto err_compat; 2769 } 2770 2771 nldev_init(); 2772 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2773 ret = roce_gid_mgmt_init(); 2774 if (ret) { 2775 pr_warn("Couldn't init RoCE GID management\n"); 2776 goto err_parent; 2777 } 2778 2779 return 0; 2780 2781err_parent: 2782 rdma_nl_unregister(RDMA_NL_LS); 2783 nldev_exit(); 2784 unregister_pernet_device(&rdma_dev_net_ops); 2785err_compat: 2786 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2787err_sa: 2788 ib_sa_cleanup(); 2789err_mad: 2790 ib_mad_cleanup(); 2791err_addr: 2792 addr_cleanup(); 2793err_ibnl: 2794 class_unregister(&ib_class); 2795err_comp_unbound: 2796 destroy_workqueue(ib_comp_unbound_wq); 2797err_comp: 2798 destroy_workqueue(ib_comp_wq); 2799err: 2800 destroy_workqueue(ib_wq); 2801 return ret; 2802} 2803 2804static void __exit ib_core_cleanup(void) 2805{ 2806 roce_gid_mgmt_cleanup(); 2807 rdma_nl_unregister(RDMA_NL_LS); 2808 nldev_exit(); 2809 unregister_pernet_device(&rdma_dev_net_ops); 2810 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2811 ib_sa_cleanup(); 2812 ib_mad_cleanup(); 2813 addr_cleanup(); 2814 rdma_nl_exit(); 2815 class_unregister(&ib_class); 2816 destroy_workqueue(ib_comp_unbound_wq); 2817 destroy_workqueue(ib_comp_wq); 2818 /* Make sure that any pending umem accounting work is done. */ 2819 destroy_workqueue(ib_wq); 2820 flush_workqueue(system_unbound_wq); 2821 WARN_ON(!xa_empty(&clients)); 2822 WARN_ON(!xa_empty(&devices)); 2823} 2824 2825MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2826 2827/* ib core relies on netdev stack to first register net_ns_type_operations 2828 * ns kobject type before ib_core initialization. 2829 */ 2830fs_initcall(ib_core_init); 2831module_exit(ib_core_cleanup); 2832