1// SPDX-License-Identifier: GPL-2.0-or-later 2/* 3 * Network block device - make block devices work over TCP 4 * 5 * Note that you can not swap over this thing, yet. Seems to work but 6 * deadlocks sometimes - you can not swap over TCP in general. 7 * 8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz> 9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com> 10 * 11 * (part of code stolen from loop.c) 12 */ 13 14#include <linux/major.h> 15 16#include <linux/blkdev.h> 17#include <linux/module.h> 18#include <linux/init.h> 19#include <linux/sched.h> 20#include <linux/sched/mm.h> 21#include <linux/fs.h> 22#include <linux/bio.h> 23#include <linux/stat.h> 24#include <linux/errno.h> 25#include <linux/file.h> 26#include <linux/ioctl.h> 27#include <linux/mutex.h> 28#include <linux/compiler.h> 29#include <linux/completion.h> 30#include <linux/err.h> 31#include <linux/kernel.h> 32#include <linux/slab.h> 33#include <net/sock.h> 34#include <linux/net.h> 35#include <linux/kthread.h> 36#include <linux/types.h> 37#include <linux/debugfs.h> 38#include <linux/blk-mq.h> 39 40#include <linux/uaccess.h> 41#include <asm/types.h> 42 43#include <linux/nbd.h> 44#include <linux/nbd-netlink.h> 45#include <net/genetlink.h> 46 47#define CREATE_TRACE_POINTS 48#include <trace/events/nbd.h> 49 50static DEFINE_IDR(nbd_index_idr); 51static DEFINE_MUTEX(nbd_index_mutex); 52static int nbd_total_devices = 0; 53 54struct nbd_sock { 55 struct socket *sock; 56 struct mutex tx_lock; 57 struct request *pending; 58 int sent; 59 bool dead; 60 int fallback_index; 61 int cookie; 62}; 63 64struct recv_thread_args { 65 struct work_struct work; 66 struct nbd_device *nbd; 67 int index; 68}; 69 70struct link_dead_args { 71 struct work_struct work; 72 int index; 73}; 74 75#define NBD_RT_TIMEDOUT 0 76#define NBD_RT_DISCONNECT_REQUESTED 1 77#define NBD_RT_DISCONNECTED 2 78#define NBD_RT_HAS_PID_FILE 3 79#define NBD_RT_HAS_CONFIG_REF 4 80#define NBD_RT_BOUND 5 81#define NBD_RT_DISCONNECT_ON_CLOSE 6 82 83#define NBD_DESTROY_ON_DISCONNECT 0 84#define NBD_DISCONNECT_REQUESTED 1 85 86struct nbd_config { 87 u32 flags; 88 unsigned long runtime_flags; 89 u64 dead_conn_timeout; 90 91 struct nbd_sock **socks; 92 int num_connections; 93 atomic_t live_connections; 94 wait_queue_head_t conn_wait; 95 96 atomic_t recv_threads; 97 wait_queue_head_t recv_wq; 98 loff_t blksize; 99 loff_t bytesize; 100#if IS_ENABLED(CONFIG_DEBUG_FS) 101 struct dentry *dbg_dir; 102#endif 103}; 104 105struct nbd_device { 106 struct blk_mq_tag_set tag_set; 107 108 int index; 109 refcount_t config_refs; 110 refcount_t refs; 111 struct nbd_config *config; 112 struct mutex config_lock; 113 struct gendisk *disk; 114 struct workqueue_struct *recv_workq; 115 116 struct list_head list; 117 struct task_struct *task_setup; 118 119 struct completion *destroy_complete; 120 unsigned long flags; 121 pid_t pid; /* pid of nbd-client, if attached */ 122}; 123 124#define NBD_CMD_REQUEUED 1 125/* 126 * This flag will be set if nbd_queue_rq() succeed, and will be checked and 127 * cleared in completion. Both setting and clearing of the flag are protected 128 * by cmd->lock. 
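 * The flag is set in nbd_handle_cmd() once nbd_send_cmd() has sent the
 * request, and cleared (under cmd->lock) by whichever path completes the
 * command first: nbd_handle_reply(), nbd_xmit_timeout() or nbd_clear_req().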
129 */ 130#define NBD_CMD_INFLIGHT 2 131 132struct nbd_cmd { 133 struct nbd_device *nbd; 134 struct mutex lock; 135 int index; 136 int cookie; 137 int retries; 138 blk_status_t status; 139 unsigned long flags; 140 u32 cmd_cookie; 141}; 142 143#if IS_ENABLED(CONFIG_DEBUG_FS) 144static struct dentry *nbd_dbg_dir; 145#endif 146 147#define nbd_name(nbd) ((nbd)->disk->disk_name) 148 149#define NBD_MAGIC 0x68797548 150 151#define NBD_DEF_BLKSIZE 1024 152 153static unsigned int nbds_max = 16; 154static int max_part = 16; 155static int part_shift; 156 157static int nbd_dev_dbg_init(struct nbd_device *nbd); 158static void nbd_dev_dbg_close(struct nbd_device *nbd); 159static void nbd_config_put(struct nbd_device *nbd); 160static void nbd_connect_reply(struct genl_info *info, int index); 161static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info); 162static void nbd_dead_link_work(struct work_struct *work); 163static void nbd_disconnect_and_put(struct nbd_device *nbd); 164 165static inline struct device *nbd_to_dev(struct nbd_device *nbd) 166{ 167 return disk_to_dev(nbd->disk); 168} 169 170static void nbd_requeue_cmd(struct nbd_cmd *cmd) 171{ 172 struct request *req = blk_mq_rq_from_pdu(cmd); 173 174 if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) 175 blk_mq_requeue_request(req, true); 176} 177 178#define NBD_COOKIE_BITS 32 179 180static u64 nbd_cmd_handle(struct nbd_cmd *cmd) 181{ 182 struct request *req = blk_mq_rq_from_pdu(cmd); 183 u32 tag = blk_mq_unique_tag(req); 184 u64 cookie = cmd->cmd_cookie; 185 186 return (cookie << NBD_COOKIE_BITS) | tag; 187} 188 189static u32 nbd_handle_to_tag(u64 handle) 190{ 191 return (u32)handle; 192} 193 194static u32 nbd_handle_to_cookie(u64 handle) 195{ 196 return (u32)(handle >> NBD_COOKIE_BITS); 197} 198 199static const char *nbdcmd_to_ascii(int cmd) 200{ 201 switch (cmd) { 202 case NBD_CMD_READ: return "read"; 203 case NBD_CMD_WRITE: return "write"; 204 case NBD_CMD_DISC: return "disconnect"; 205 case NBD_CMD_FLUSH: return "flush"; 206 case NBD_CMD_TRIM: return "trim/discard"; 207 } 208 return "invalid"; 209} 210 211static ssize_t pid_show(struct device *dev, 212 struct device_attribute *attr, char *buf) 213{ 214 struct gendisk *disk = dev_to_disk(dev); 215 struct nbd_device *nbd = (struct nbd_device *)disk->private_data; 216 217 return sprintf(buf, "%d\n", nbd->pid); 218} 219 220static const struct device_attribute pid_attr = { 221 .attr = { .name = "pid", .mode = 0444}, 222 .show = pid_show, 223}; 224 225static void nbd_dev_remove(struct nbd_device *nbd) 226{ 227 struct gendisk *disk = nbd->disk; 228 struct request_queue *q; 229 230 if (disk) { 231 q = disk->queue; 232 del_gendisk(disk); 233 blk_cleanup_queue(q); 234 blk_mq_free_tag_set(&nbd->tag_set); 235 disk->private_data = NULL; 236 put_disk(disk); 237 } 238 239 /* 240 * Place this in the last just before the nbd is freed to 241 * make sure that the disk and the related kobject are also 242 * totally removed to avoid duplicate creation of the same 243 * one. 
244 */ 245 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) 246 complete(nbd->destroy_complete); 247 248 kfree(nbd); 249} 250 251static void nbd_put(struct nbd_device *nbd) 252{ 253 if (refcount_dec_and_mutex_lock(&nbd->refs, 254 &nbd_index_mutex)) { 255 idr_remove(&nbd_index_idr, nbd->index); 256 nbd_dev_remove(nbd); 257 mutex_unlock(&nbd_index_mutex); 258 } 259} 260 261static int nbd_disconnected(struct nbd_config *config) 262{ 263 return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || 264 test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); 265} 266 267static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, 268 int notify) 269{ 270 if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) { 271 struct link_dead_args *args; 272 args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO); 273 if (args) { 274 INIT_WORK(&args->work, nbd_dead_link_work); 275 args->index = nbd->index; 276 queue_work(system_wq, &args->work); 277 } 278 } 279 if (!nsock->dead) { 280 kernel_sock_shutdown(nsock->sock, SHUT_RDWR); 281 if (atomic_dec_return(&nbd->config->live_connections) == 0) { 282 if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, 283 &nbd->config->runtime_flags)) { 284 set_bit(NBD_RT_DISCONNECTED, 285 &nbd->config->runtime_flags); 286 dev_info(nbd_to_dev(nbd), 287 "Disconnected due to user request.\n"); 288 } 289 } 290 } 291 nsock->dead = true; 292 nsock->pending = NULL; 293 nsock->sent = 0; 294} 295 296static void nbd_size_clear(struct nbd_device *nbd) 297{ 298 if (nbd->config->bytesize) { 299 set_capacity(nbd->disk, 0); 300 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); 301 } 302} 303 304static void nbd_size_update(struct nbd_device *nbd, bool start) 305{ 306 struct nbd_config *config = nbd->config; 307 struct block_device *bdev = bdget_disk(nbd->disk, 0); 308 sector_t nr_sectors = config->bytesize >> 9; 309 310 if (config->flags & NBD_FLAG_SEND_TRIM) { 311 nbd->disk->queue->limits.discard_granularity = config->blksize; 312 nbd->disk->queue->limits.discard_alignment = config->blksize; 313 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); 314 } 315 blk_queue_logical_block_size(nbd->disk->queue, config->blksize); 316 blk_queue_physical_block_size(nbd->disk->queue, config->blksize); 317 set_capacity(nbd->disk, nr_sectors); 318 if (bdev) { 319 if (bdev->bd_disk) { 320 bd_set_nr_sectors(bdev, nr_sectors); 321 if (start) 322 set_blocksize(bdev, config->blksize); 323 } else 324 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 325 bdput(bdev); 326 } 327 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); 328} 329 330static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, 331 loff_t nr_blocks) 332{ 333 struct nbd_config *config = nbd->config; 334 config->blksize = blocksize; 335 config->bytesize = blocksize * nr_blocks; 336 if (nbd->pid) 337 nbd_size_update(nbd, false); 338} 339 340static void nbd_complete_rq(struct request *req) 341{ 342 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 343 344 dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req, 345 cmd->status ? 
"failed" : "done"); 346 347 blk_mq_end_request(req, cmd->status); 348} 349 350/* 351 * Forcibly shutdown the socket causing all listeners to error 352 */ 353static void sock_shutdown(struct nbd_device *nbd) 354{ 355 struct nbd_config *config = nbd->config; 356 int i; 357 358 if (config->num_connections == 0) 359 return; 360 if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) 361 return; 362 363 for (i = 0; i < config->num_connections; i++) { 364 struct nbd_sock *nsock = config->socks[i]; 365 mutex_lock(&nsock->tx_lock); 366 nbd_mark_nsock_dead(nbd, nsock, 0); 367 mutex_unlock(&nsock->tx_lock); 368 } 369 dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); 370} 371 372static u32 req_to_nbd_cmd_type(struct request *req) 373{ 374 switch (req_op(req)) { 375 case REQ_OP_DISCARD: 376 return NBD_CMD_TRIM; 377 case REQ_OP_FLUSH: 378 return NBD_CMD_FLUSH; 379 case REQ_OP_WRITE: 380 return NBD_CMD_WRITE; 381 case REQ_OP_READ: 382 return NBD_CMD_READ; 383 default: 384 return U32_MAX; 385 } 386} 387 388static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, 389 bool reserved) 390{ 391 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 392 struct nbd_device *nbd = cmd->nbd; 393 struct nbd_config *config; 394 395 if (!mutex_trylock(&cmd->lock)) 396 return BLK_EH_RESET_TIMER; 397 398 if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { 399 mutex_unlock(&cmd->lock); 400 return BLK_EH_DONE; 401 } 402 403 if (!refcount_inc_not_zero(&nbd->config_refs)) { 404 cmd->status = BLK_STS_TIMEOUT; 405 mutex_unlock(&cmd->lock); 406 goto done; 407 } 408 config = nbd->config; 409 410 if (config->num_connections > 1 || 411 (config->num_connections == 1 && nbd->tag_set.timeout)) { 412 dev_err_ratelimited(nbd_to_dev(nbd), 413 "Connection timed out, retrying (%d/%d alive)\n", 414 atomic_read(&config->live_connections), 415 config->num_connections); 416 /* 417 * Hooray we have more connections, requeue this IO, the submit 418 * path will put it on a real connection. Or if only one 419 * connection is configured, the submit path will wait util 420 * a new connection is reconfigured or util dead timeout. 421 */ 422 if (config->socks) { 423 if (cmd->index < config->num_connections) { 424 struct nbd_sock *nsock = 425 config->socks[cmd->index]; 426 mutex_lock(&nsock->tx_lock); 427 /* We can have multiple outstanding requests, so 428 * we don't want to mark the nsock dead if we've 429 * already reconnected with a new socket, so 430 * only mark it dead if its the same socket we 431 * were sent out on. 432 */ 433 if (cmd->cookie == nsock->cookie) 434 nbd_mark_nsock_dead(nbd, nsock, 1); 435 mutex_unlock(&nsock->tx_lock); 436 } 437 mutex_unlock(&cmd->lock); 438 nbd_requeue_cmd(cmd); 439 nbd_config_put(nbd); 440 return BLK_EH_DONE; 441 } 442 } 443 444 if (!nbd->tag_set.timeout) { 445 /* 446 * Userspace sets timeout=0 to disable socket disconnection, 447 * so just warn and reset the timer. 448 */ 449 struct nbd_sock *nsock = config->socks[cmd->index]; 450 cmd->retries++; 451 dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). 
Runtime %u seconds\n",
			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
			(unsigned long long)blk_rq_pos(req) << 9,
			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);

		mutex_lock(&nsock->tx_lock);
		if (cmd->cookie != nsock->cookie) {
			nbd_requeue_cmd(cmd);
			mutex_unlock(&nsock->tx_lock);
			mutex_unlock(&cmd->lock);
			nbd_config_put(nbd);
			return BLK_EH_DONE;
		}
		mutex_unlock(&nsock->tx_lock);
		mutex_unlock(&cmd->lock);
		nbd_config_put(nbd);
		return BLK_EH_RESET_TIMER;
	}

	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
	cmd->status = BLK_STS_IOERR;
	mutex_unlock(&cmd->lock);
	sock_shutdown(nbd);
	nbd_config_put(nbd);
done:
	blk_mq_complete_request(req);
	return BLK_EH_DONE;
}

/*
 * Send or receive packet. Return a positive value on success and a
 * negative value on failure, and never return 0.
 */
static int sock_xmit(struct nbd_device *nbd, int index, int send,
		     struct iov_iter *iter, int msg_flags, int *sent)
{
	struct nbd_config *config = nbd->config;
	struct socket *sock = config->socks[index]->sock;
	int result;
	struct msghdr msg;
	unsigned int noreclaim_flag;

	if (unlikely(!sock)) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	msg.msg_iter = *iter;

	noreclaim_flag = memalloc_noreclaim_save();
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = sock_sendmsg(sock, &msg);
		else
			result = sock_recvmsg(sock, &msg, msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		if (sent)
			*sent += result;
	} while (msg_data_left(&msg));

	memalloc_noreclaim_restore(noreclaim_flag);

	return result;
}

/*
 * Different settings for sk->sk_sndtimeo can result in different return values
 * if there is a signal pending when we enter sendmsg, because reasons?
534 */ 535static inline int was_interrupted(int result) 536{ 537 return result == -ERESTARTSYS || result == -EINTR; 538} 539 540/* always call with the tx_lock held */ 541static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) 542{ 543 struct request *req = blk_mq_rq_from_pdu(cmd); 544 struct nbd_config *config = nbd->config; 545 struct nbd_sock *nsock = config->socks[index]; 546 int result; 547 struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)}; 548 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; 549 struct iov_iter from; 550 unsigned long size = blk_rq_bytes(req); 551 struct bio *bio; 552 u64 handle; 553 u32 type; 554 u32 nbd_cmd_flags = 0; 555 int sent = nsock->sent, skip = 0; 556 557 iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); 558 559 type = req_to_nbd_cmd_type(req); 560 if (type == U32_MAX) 561 return -EIO; 562 563 if (rq_data_dir(req) == WRITE && 564 (config->flags & NBD_FLAG_READ_ONLY)) { 565 dev_err_ratelimited(disk_to_dev(nbd->disk), 566 "Write on read-only\n"); 567 return -EIO; 568 } 569 570 if (req->cmd_flags & REQ_FUA) 571 nbd_cmd_flags |= NBD_CMD_FLAG_FUA; 572 573 /* We did a partial send previously, and we at least sent the whole 574 * request struct, so just go and send the rest of the pages in the 575 * request. 576 */ 577 if (sent) { 578 if (sent >= sizeof(request)) { 579 skip = sent - sizeof(request); 580 581 /* initialize handle for tracing purposes */ 582 handle = nbd_cmd_handle(cmd); 583 584 goto send_pages; 585 } 586 iov_iter_advance(&from, sent); 587 } else { 588 cmd->cmd_cookie++; 589 } 590 cmd->index = index; 591 cmd->cookie = nsock->cookie; 592 cmd->retries = 0; 593 request.type = htonl(type | nbd_cmd_flags); 594 if (type != NBD_CMD_FLUSH) { 595 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); 596 request.len = htonl(size); 597 } 598 handle = nbd_cmd_handle(cmd); 599 memcpy(request.handle, &handle, sizeof(handle)); 600 601 trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); 602 603 dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", 604 req, nbdcmd_to_ascii(type), 605 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); 606 result = sock_xmit(nbd, index, 1, &from, 607 (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent); 608 trace_nbd_header_sent(req, handle); 609 if (result < 0) { 610 if (was_interrupted(result)) { 611 /* If we havne't sent anything we can just return BUSY, 612 * however if we have sent something we need to make 613 * sure we only allow this req to be sent until we are 614 * completely done. 615 */ 616 if (sent) { 617 nsock->pending = req; 618 nsock->sent = sent; 619 } 620 set_bit(NBD_CMD_REQUEUED, &cmd->flags); 621 return BLK_STS_RESOURCE; 622 } 623 dev_err_ratelimited(disk_to_dev(nbd->disk), 624 "Send control failed (result %d)\n", result); 625 return -EAGAIN; 626 } 627send_pages: 628 if (type != NBD_CMD_WRITE) 629 goto out; 630 631 bio = req->bio; 632 while (bio) { 633 struct bio *next = bio->bi_next; 634 struct bvec_iter iter; 635 struct bio_vec bvec; 636 637 bio_for_each_segment(bvec, bio, iter) { 638 bool is_last = !next && bio_iter_last(bvec, iter); 639 int flags = is_last ? 
0 : MSG_MORE; 640 641 dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", 642 req, bvec.bv_len); 643 iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len); 644 if (skip) { 645 if (skip >= iov_iter_count(&from)) { 646 skip -= iov_iter_count(&from); 647 continue; 648 } 649 iov_iter_advance(&from, skip); 650 skip = 0; 651 } 652 result = sock_xmit(nbd, index, 1, &from, flags, &sent); 653 if (result < 0) { 654 if (was_interrupted(result)) { 655 /* We've already sent the header, we 656 * have no choice but to set pending and 657 * return BUSY. 658 */ 659 nsock->pending = req; 660 nsock->sent = sent; 661 set_bit(NBD_CMD_REQUEUED, &cmd->flags); 662 return BLK_STS_RESOURCE; 663 } 664 dev_err(disk_to_dev(nbd->disk), 665 "Send data failed (result %d)\n", 666 result); 667 return -EAGAIN; 668 } 669 /* 670 * The completion might already have come in, 671 * so break for the last one instead of letting 672 * the iterator do it. This prevents use-after-free 673 * of the bio. 674 */ 675 if (is_last) 676 break; 677 } 678 bio = next; 679 } 680out: 681 trace_nbd_payload_sent(req, handle); 682 nsock->pending = NULL; 683 nsock->sent = 0; 684 return 0; 685} 686 687static int nbd_read_reply(struct nbd_device *nbd, int index, 688 struct nbd_reply *reply) 689{ 690 struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; 691 struct iov_iter to; 692 int result; 693 694 reply->magic = 0; 695 iov_iter_kvec(&to, READ, &iov, 1, sizeof(*reply)); 696 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); 697 if (result < 0) { 698 if (!nbd_disconnected(nbd->config)) 699 dev_err(disk_to_dev(nbd->disk), 700 "Receive control failed (result %d)\n", result); 701 return result; 702 } 703 704 if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { 705 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", 706 (unsigned long)ntohl(reply->magic)); 707 return -EPROTO; 708 } 709 710 return 0; 711} 712 713/* NULL returned = something went wrong, inform userspace */ 714static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, 715 struct nbd_reply *reply) 716{ 717 int result; 718 struct nbd_cmd *cmd; 719 struct request *req = NULL; 720 u64 handle; 721 u16 hwq; 722 u32 tag; 723 int ret = 0; 724 725 memcpy(&handle, reply->handle, sizeof(handle)); 726 tag = nbd_handle_to_tag(handle); 727 hwq = blk_mq_unique_tag_to_hwq(tag); 728 if (hwq < nbd->tag_set.nr_hw_queues) 729 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], 730 blk_mq_unique_tag_to_tag(tag)); 731 if (!req || !blk_mq_request_started(req)) { 732 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", 733 tag, req); 734 return ERR_PTR(-ENOENT); 735 } 736 trace_nbd_header_received(req, handle); 737 cmd = blk_mq_rq_to_pdu(req); 738 739 mutex_lock(&cmd->lock); 740 if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { 741 dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", 742 tag, cmd->status, cmd->flags); 743 ret = -ENOENT; 744 goto out; 745 } 746 if (cmd->index != index) { 747 dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", 748 tag, index, cmd->index); 749 } 750 if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { 751 dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", 752 req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); 753 ret = -ENOENT; 754 goto out; 755 } 756 if (cmd->status != BLK_STS_OK) { 757 dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n", 758 req); 759 ret = -ENOENT; 760 goto out; 761 } 762 if 
(test_bit(NBD_CMD_REQUEUED, &cmd->flags)) { 763 dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n", 764 req); 765 ret = -ENOENT; 766 goto out; 767 } 768 if (ntohl(reply->error)) { 769 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", 770 ntohl(reply->error)); 771 cmd->status = BLK_STS_IOERR; 772 goto out; 773 } 774 775 dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); 776 if (rq_data_dir(req) != WRITE) { 777 struct req_iterator iter; 778 struct bio_vec bvec; 779 struct iov_iter to; 780 781 rq_for_each_segment(bvec, req, iter) { 782 iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); 783 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); 784 if (result < 0) { 785 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", 786 result); 787 /* 788 * If we've disconnected, we need to make sure we 789 * complete this request, otherwise error out 790 * and let the timeout stuff handle resubmitting 791 * this request onto another connection. 792 */ 793 if (nbd_disconnected(nbd->config)) { 794 cmd->status = BLK_STS_IOERR; 795 goto out; 796 } 797 ret = -EIO; 798 goto out; 799 } 800 dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", 801 req, bvec.bv_len); 802 } 803 } 804out: 805 trace_nbd_payload_received(req, handle); 806 mutex_unlock(&cmd->lock); 807 return ret ? ERR_PTR(ret) : cmd; 808} 809 810static void recv_work(struct work_struct *work) 811{ 812 struct recv_thread_args *args = container_of(work, 813 struct recv_thread_args, 814 work); 815 struct nbd_device *nbd = args->nbd; 816 struct nbd_config *config = nbd->config; 817 struct request_queue *q = nbd->disk->queue; 818 struct nbd_sock *nsock; 819 struct nbd_cmd *cmd; 820 struct request *rq; 821 822 while (1) { 823 struct nbd_reply reply; 824 825 if (nbd_read_reply(nbd, args->index, &reply)) 826 break; 827 828 /* 829 * Grab .q_usage_counter so request pool won't go away, then no 830 * request use-after-free is possible during nbd_handle_reply(). 831 * If queue is frozen, there won't be any inflight requests, we 832 * needn't to handle the incoming garbage message. 
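		 * The reference is dropped again as soon as the reply has
		 * been matched to a request and completed below.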
833 */ 834 if (!percpu_ref_tryget(&q->q_usage_counter)) { 835 dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", 836 __func__); 837 break; 838 } 839 840 cmd = nbd_handle_reply(nbd, args->index, &reply); 841 if (IS_ERR(cmd)) { 842 percpu_ref_put(&q->q_usage_counter); 843 break; 844 } 845 846 rq = blk_mq_rq_from_pdu(cmd); 847 if (likely(!blk_should_fake_timeout(rq->q))) 848 blk_mq_complete_request(rq); 849 percpu_ref_put(&q->q_usage_counter); 850 } 851 852 nsock = config->socks[args->index]; 853 mutex_lock(&nsock->tx_lock); 854 nbd_mark_nsock_dead(nbd, nsock, 1); 855 mutex_unlock(&nsock->tx_lock); 856 857 nbd_config_put(nbd); 858 atomic_dec(&config->recv_threads); 859 wake_up(&config->recv_wq); 860 kfree(args); 861} 862 863static bool nbd_clear_req(struct request *req, void *data, bool reserved) 864{ 865 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 866 867 /* don't abort one completed request */ 868 if (blk_mq_request_completed(req)) 869 return true; 870 871 mutex_lock(&cmd->lock); 872 if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { 873 mutex_unlock(&cmd->lock); 874 return true; 875 } 876 cmd->status = BLK_STS_IOERR; 877 mutex_unlock(&cmd->lock); 878 879 blk_mq_complete_request(req); 880 return true; 881} 882 883static void nbd_clear_que(struct nbd_device *nbd) 884{ 885 blk_mq_quiesce_queue(nbd->disk->queue); 886 blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); 887 blk_mq_unquiesce_queue(nbd->disk->queue); 888 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); 889} 890 891static int find_fallback(struct nbd_device *nbd, int index) 892{ 893 struct nbd_config *config = nbd->config; 894 int new_index = -1; 895 struct nbd_sock *nsock = config->socks[index]; 896 int fallback = nsock->fallback_index; 897 898 if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) 899 return new_index; 900 901 if (config->num_connections <= 1) { 902 dev_err_ratelimited(disk_to_dev(nbd->disk), 903 "Dead connection, failed to find a fallback\n"); 904 return new_index; 905 } 906 907 if (fallback >= 0 && fallback < config->num_connections && 908 !config->socks[fallback]->dead) 909 return fallback; 910 911 if (nsock->fallback_index < 0 || 912 nsock->fallback_index >= config->num_connections || 913 config->socks[nsock->fallback_index]->dead) { 914 int i; 915 for (i = 0; i < config->num_connections; i++) { 916 if (i == index) 917 continue; 918 if (!config->socks[i]->dead) { 919 new_index = i; 920 break; 921 } 922 } 923 nsock->fallback_index = new_index; 924 if (new_index < 0) { 925 dev_err_ratelimited(disk_to_dev(nbd->disk), 926 "Dead connection, failed to find a fallback\n"); 927 return new_index; 928 } 929 } 930 new_index = nsock->fallback_index; 931 return new_index; 932} 933 934static int wait_for_reconnect(struct nbd_device *nbd) 935{ 936 struct nbd_config *config = nbd->config; 937 if (!config->dead_conn_timeout) 938 return 0; 939 940 if (!wait_event_timeout(config->conn_wait, 941 test_bit(NBD_RT_DISCONNECTED, 942 &config->runtime_flags) || 943 atomic_read(&config->live_connections) > 0, 944 config->dead_conn_timeout)) 945 return 0; 946 947 return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); 948} 949 950static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) 951{ 952 struct request *req = blk_mq_rq_from_pdu(cmd); 953 struct nbd_device *nbd = cmd->nbd; 954 struct nbd_config *config; 955 struct nbd_sock *nsock; 956 int ret; 957 958 if (!refcount_inc_not_zero(&nbd->config_refs)) { 959 dev_err_ratelimited(disk_to_dev(nbd->disk), 960 "Socks array is empty\n"); 961 return 
-EINVAL;
	}
	config = nbd->config;

	if (index >= config->num_connections) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Attempted send on invalid socket\n");
		nbd_config_put(nbd);
		return -EINVAL;
	}
	cmd->status = BLK_STS_OK;
again:
	nsock = config->socks[index];
	mutex_lock(&nsock->tx_lock);
	if (nsock->dead) {
		int old_index = index;
		index = find_fallback(nbd, index);
		mutex_unlock(&nsock->tx_lock);
		if (index < 0) {
			if (wait_for_reconnect(nbd)) {
				index = old_index;
				goto again;
			}
			/* All the sockets should already be down at this point,
			 * we just want to make sure that DISCONNECTED is set so
			 * any requests that come in that were queued waiting
			 * for the reconnect timer don't trigger the timer again
			 * and instead just error out.
			 */
			sock_shutdown(nbd);
			nbd_config_put(nbd);
			return -EIO;
		}
		goto again;
	}

	/* Handle the case that we have a pending request that was partially
	 * transmitted that _has_ to be serviced first. We need to call requeue
	 * here so that it gets put _after_ the request that is already on the
	 * dispatch list.
	 */
	blk_mq_start_request(req);
	if (unlikely(nsock->pending && nsock->pending != req)) {
		nbd_requeue_cmd(cmd);
		ret = 0;
		goto out;
	}
	/*
	 * Some failures are related to the link going down, so anything that
	 * returns EAGAIN can be retried on a different socket.
	 */
	ret = nbd_send_cmd(nbd, cmd, index);
	/*
	 * Access to this flag is protected by cmd->lock, so it is safe to set
	 * the flag once nbd_send_cmd() has successfully sent the request to
	 * the server.
	 */
	if (!ret)
		__set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
	else if (ret == -EAGAIN) {
		dev_err_ratelimited(disk_to_dev(nbd->disk),
				    "Request send failed, requeueing\n");
		nbd_mark_nsock_dead(nbd, nsock, 1);
		nbd_requeue_cmd(cmd);
		ret = 0;
	}
out:
	mutex_unlock(&nsock->tx_lock);
	nbd_config_put(nbd);
	return ret;
}

static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
	int ret;

	/*
	 * Since we look at the bios to send the request over the network we
	 * need to make sure the completion work doesn't mark this request done
	 * before we are done doing our send. This keeps us from dereferencing
	 * freed data if we have particularly fast completions (i.e. we get the
	 * completion before we exit sock_xmit on the last bvec) or in the case
	 * that the server is misbehaving (or there was an error) before we're
	 * done sending everything over the wire.
	 */
	mutex_lock(&cmd->lock);
	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);

	/* We can be called directly from the user space process, which means we
	 * could possibly have signals pending so our sendmsg will fail. In
	 * this case we need to return that we are busy, otherwise error out as
	 * appropriate.
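	 * nbd_handle_cmd() returns 0 when the command was sent or requeued
	 * and a negative errno on a hard failure; the translation to BLK_STS_*
	 * values happens right below.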
1054 */ 1055 ret = nbd_handle_cmd(cmd, hctx->queue_num); 1056 if (ret < 0) 1057 ret = BLK_STS_IOERR; 1058 else if (!ret) 1059 ret = BLK_STS_OK; 1060 mutex_unlock(&cmd->lock); 1061 1062 return ret; 1063} 1064 1065static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, 1066 int *err) 1067{ 1068 struct socket *sock; 1069 1070 *err = 0; 1071 sock = sockfd_lookup(fd, err); 1072 if (!sock) 1073 return NULL; 1074 1075 if (sock->ops->shutdown == sock_no_shutdown) { 1076 dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); 1077 *err = -EINVAL; 1078 sockfd_put(sock); 1079 return NULL; 1080 } 1081 1082 return sock; 1083} 1084 1085static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, 1086 bool netlink) 1087{ 1088 struct nbd_config *config = nbd->config; 1089 struct socket *sock; 1090 struct nbd_sock **socks; 1091 struct nbd_sock *nsock; 1092 int err; 1093 1094 sock = nbd_get_socket(nbd, arg, &err); 1095 if (!sock) 1096 return err; 1097 1098 /* 1099 * We need to make sure we don't get any errant requests while we're 1100 * reallocating the ->socks array. 1101 */ 1102 blk_mq_freeze_queue(nbd->disk->queue); 1103 1104 if (!netlink && !nbd->task_setup && 1105 !test_bit(NBD_RT_BOUND, &config->runtime_flags)) 1106 nbd->task_setup = current; 1107 1108 if (!netlink && 1109 (nbd->task_setup != current || 1110 test_bit(NBD_RT_BOUND, &config->runtime_flags))) { 1111 dev_err(disk_to_dev(nbd->disk), 1112 "Device being setup by another task"); 1113 err = -EBUSY; 1114 goto put_socket; 1115 } 1116 1117 nsock = kzalloc(sizeof(*nsock), GFP_KERNEL); 1118 if (!nsock) { 1119 err = -ENOMEM; 1120 goto put_socket; 1121 } 1122 1123 socks = krealloc(config->socks, (config->num_connections + 1) * 1124 sizeof(struct nbd_sock *), GFP_KERNEL); 1125 if (!socks) { 1126 kfree(nsock); 1127 err = -ENOMEM; 1128 goto put_socket; 1129 } 1130 1131 config->socks = socks; 1132 1133 nsock->fallback_index = -1; 1134 nsock->dead = false; 1135 mutex_init(&nsock->tx_lock); 1136 nsock->sock = sock; 1137 nsock->pending = NULL; 1138 nsock->sent = 0; 1139 nsock->cookie = 0; 1140 socks[config->num_connections++] = nsock; 1141 atomic_inc(&config->live_connections); 1142 blk_mq_unfreeze_queue(nbd->disk->queue); 1143 1144 return 0; 1145 1146put_socket: 1147 blk_mq_unfreeze_queue(nbd->disk->queue); 1148 sockfd_put(sock); 1149 return err; 1150} 1151 1152static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) 1153{ 1154 struct nbd_config *config = nbd->config; 1155 struct socket *sock, *old; 1156 struct recv_thread_args *args; 1157 int i; 1158 int err; 1159 1160 sock = nbd_get_socket(nbd, arg, &err); 1161 if (!sock) 1162 return err; 1163 1164 args = kzalloc(sizeof(*args), GFP_KERNEL); 1165 if (!args) { 1166 sockfd_put(sock); 1167 return -ENOMEM; 1168 } 1169 1170 for (i = 0; i < config->num_connections; i++) { 1171 struct nbd_sock *nsock = config->socks[i]; 1172 1173 if (!nsock->dead) 1174 continue; 1175 1176 mutex_lock(&nsock->tx_lock); 1177 if (!nsock->dead) { 1178 mutex_unlock(&nsock->tx_lock); 1179 continue; 1180 } 1181 sk_set_memalloc(sock->sk); 1182 if (nbd->tag_set.timeout) 1183 sock->sk->sk_sndtimeo = nbd->tag_set.timeout; 1184 atomic_inc(&config->recv_threads); 1185 refcount_inc(&nbd->config_refs); 1186 old = nsock->sock; 1187 nsock->fallback_index = -1; 1188 nsock->sock = sock; 1189 nsock->dead = false; 1190 INIT_WORK(&args->work, recv_work); 1191 args->index = i; 1192 args->nbd = nbd; 1193 nsock->cookie++; 1194 mutex_unlock(&nsock->tx_lock); 1195 
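		/* The old socket is released only after the slot has been
		 * switched to the new one and tx_lock has been dropped.
		 */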
sockfd_put(old); 1196 1197 clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); 1198 1199 /* We take the tx_mutex in an error path in the recv_work, so we 1200 * need to queue_work outside of the tx_mutex. 1201 */ 1202 queue_work(nbd->recv_workq, &args->work); 1203 1204 atomic_inc(&config->live_connections); 1205 wake_up(&config->conn_wait); 1206 return 0; 1207 } 1208 sockfd_put(sock); 1209 kfree(args); 1210 return -ENOSPC; 1211} 1212 1213static void nbd_bdev_reset(struct block_device *bdev) 1214{ 1215 if (bdev->bd_openers > 1) 1216 return; 1217 bd_set_nr_sectors(bdev, 0); 1218} 1219 1220static void nbd_parse_flags(struct nbd_device *nbd) 1221{ 1222 struct nbd_config *config = nbd->config; 1223 if (config->flags & NBD_FLAG_READ_ONLY) 1224 set_disk_ro(nbd->disk, true); 1225 else 1226 set_disk_ro(nbd->disk, false); 1227 if (config->flags & NBD_FLAG_SEND_TRIM) 1228 blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue); 1229 if (config->flags & NBD_FLAG_SEND_FLUSH) { 1230 if (config->flags & NBD_FLAG_SEND_FUA) 1231 blk_queue_write_cache(nbd->disk->queue, true, true); 1232 else 1233 blk_queue_write_cache(nbd->disk->queue, true, false); 1234 } 1235 else 1236 blk_queue_write_cache(nbd->disk->queue, false, false); 1237} 1238 1239static void send_disconnects(struct nbd_device *nbd) 1240{ 1241 struct nbd_config *config = nbd->config; 1242 struct nbd_request request = { 1243 .magic = htonl(NBD_REQUEST_MAGIC), 1244 .type = htonl(NBD_CMD_DISC), 1245 }; 1246 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; 1247 struct iov_iter from; 1248 int i, ret; 1249 1250 for (i = 0; i < config->num_connections; i++) { 1251 struct nbd_sock *nsock = config->socks[i]; 1252 1253 iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); 1254 mutex_lock(&nsock->tx_lock); 1255 ret = sock_xmit(nbd, i, 1, &from, 0, NULL); 1256 if (ret < 0) 1257 dev_err(disk_to_dev(nbd->disk), 1258 "Send disconnect failed %d\n", ret); 1259 mutex_unlock(&nsock->tx_lock); 1260 } 1261} 1262 1263static int nbd_disconnect(struct nbd_device *nbd) 1264{ 1265 struct nbd_config *config = nbd->config; 1266 1267 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); 1268 set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); 1269 set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); 1270 send_disconnects(nbd); 1271 return 0; 1272} 1273 1274static void nbd_clear_sock(struct nbd_device *nbd) 1275{ 1276 sock_shutdown(nbd); 1277 nbd_clear_que(nbd); 1278 nbd->task_setup = NULL; 1279} 1280 1281static void nbd_config_put(struct nbd_device *nbd) 1282{ 1283 if (refcount_dec_and_mutex_lock(&nbd->config_refs, 1284 &nbd->config_lock)) { 1285 struct nbd_config *config = nbd->config; 1286 nbd_dev_dbg_close(nbd); 1287 nbd_size_clear(nbd); 1288 if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, 1289 &config->runtime_flags)) 1290 device_remove_file(disk_to_dev(nbd->disk), &pid_attr); 1291 nbd->pid = 0; 1292 nbd_clear_sock(nbd); 1293 if (config->num_connections) { 1294 int i; 1295 for (i = 0; i < config->num_connections; i++) { 1296 sockfd_put(config->socks[i]->sock); 1297 kfree(config->socks[i]); 1298 } 1299 kfree(config->socks); 1300 } 1301 kfree(nbd->config); 1302 nbd->config = NULL; 1303 1304 if (nbd->recv_workq) 1305 destroy_workqueue(nbd->recv_workq); 1306 nbd->recv_workq = NULL; 1307 1308 nbd->tag_set.timeout = 0; 1309 nbd->disk->queue->limits.discard_granularity = 0; 1310 nbd->disk->queue->limits.discard_alignment = 0; 1311 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); 1312 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue); 
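		/*
		 * Everything the config owned has been torn down; all that is
		 * left is to drop the config lock and then the device and
		 * module references.
		 */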

		mutex_unlock(&nbd->config_lock);
		nbd_put(nbd);
		module_put(THIS_MODULE);
	}
}

static int nbd_start_device(struct nbd_device *nbd)
{
	struct nbd_config *config = nbd->config;
	int num_connections = config->num_connections;
	int error = 0, i;

	if (nbd->pid)
		return -EBUSY;
	if (!config->socks)
		return -EINVAL;
	if (num_connections > 1 &&
	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
		return -EINVAL;
	}

	nbd->recv_workq = alloc_workqueue("knbd%d-recv",
					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
					  WQ_UNBOUND, 0, nbd->index);
	if (!nbd->recv_workq) {
		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
		return -ENOMEM;
	}

	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
	nbd->pid = task_pid_nr(current);

	nbd_parse_flags(nbd);

	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (error) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
		return error;
	}
	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);

	nbd_dev_dbg_init(nbd);
	for (i = 0; i < num_connections; i++) {
		struct recv_thread_args *args;

		args = kzalloc(sizeof(*args), GFP_KERNEL);
		if (!args) {
			sock_shutdown(nbd);
			/*
			 * If we have m connections (m > 2) and the first n
			 * (1 < n < m) kzalloc() calls succeeded but allocation
			 * n + 1 failed, n recv works are already running.
			 * Flush the workqueue here so those works cannot drop
			 * the last config ref and try to destroy the
			 * workqueue from inside the workqueue.
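			 * The caller still holds a config reference of its
			 * own here, so the references dropped by the flushed
			 * works can never be the last one.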
1370 */ 1371 if (i) 1372 flush_workqueue(nbd->recv_workq); 1373 return -ENOMEM; 1374 } 1375 sk_set_memalloc(config->socks[i]->sock->sk); 1376 if (nbd->tag_set.timeout) 1377 config->socks[i]->sock->sk->sk_sndtimeo = 1378 nbd->tag_set.timeout; 1379 atomic_inc(&config->recv_threads); 1380 refcount_inc(&nbd->config_refs); 1381 INIT_WORK(&args->work, recv_work); 1382 args->nbd = nbd; 1383 args->index = i; 1384 queue_work(nbd->recv_workq, &args->work); 1385 } 1386 nbd_size_update(nbd, true); 1387 return error; 1388} 1389 1390static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) 1391{ 1392 struct nbd_config *config = nbd->config; 1393 int ret; 1394 1395 ret = nbd_start_device(nbd); 1396 if (ret) 1397 return ret; 1398 1399 if (max_part) 1400 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 1401 mutex_unlock(&nbd->config_lock); 1402 ret = wait_event_interruptible(config->recv_wq, 1403 atomic_read(&config->recv_threads) == 0); 1404 if (ret) { 1405 sock_shutdown(nbd); 1406 nbd_clear_que(nbd); 1407 } 1408 1409 flush_workqueue(nbd->recv_workq); 1410 mutex_lock(&nbd->config_lock); 1411 nbd_bdev_reset(bdev); 1412 /* user requested, ignore socket errors */ 1413 if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) 1414 ret = 0; 1415 if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) 1416 ret = -ETIMEDOUT; 1417 return ret; 1418} 1419 1420static void nbd_clear_sock_ioctl(struct nbd_device *nbd, 1421 struct block_device *bdev) 1422{ 1423 nbd_clear_sock(nbd); 1424 __invalidate_device(bdev, true); 1425 nbd_bdev_reset(bdev); 1426 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, 1427 &nbd->config->runtime_flags)) 1428 nbd_config_put(nbd); 1429} 1430 1431static bool nbd_is_valid_blksize(unsigned long blksize) 1432{ 1433 if (!blksize || !is_power_of_2(blksize) || blksize < 512 || 1434 blksize > PAGE_SIZE) 1435 return false; 1436 return true; 1437} 1438 1439static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) 1440{ 1441 nbd->tag_set.timeout = timeout * HZ; 1442 if (timeout) 1443 blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); 1444 else 1445 blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ); 1446} 1447 1448/* Must be called with config_lock held */ 1449static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, 1450 unsigned int cmd, unsigned long arg) 1451{ 1452 struct nbd_config *config = nbd->config; 1453 loff_t bytesize; 1454 1455 switch (cmd) { 1456 case NBD_DISCONNECT: 1457 return nbd_disconnect(nbd); 1458 case NBD_CLEAR_SOCK: 1459 nbd_clear_sock_ioctl(nbd, bdev); 1460 return 0; 1461 case NBD_SET_SOCK: 1462 return nbd_add_socket(nbd, arg, false); 1463 case NBD_SET_BLKSIZE: 1464 if (!arg) 1465 arg = NBD_DEF_BLKSIZE; 1466 if (!nbd_is_valid_blksize(arg)) 1467 return -EINVAL; 1468 nbd_size_set(nbd, arg, 1469 div_s64(config->bytesize, arg)); 1470 return 0; 1471 case NBD_SET_SIZE: 1472 nbd_size_set(nbd, config->blksize, 1473 div_s64(arg, config->blksize)); 1474 return 0; 1475 case NBD_SET_SIZE_BLOCKS: 1476 if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) 1477 return -EINVAL; 1478 nbd_size_set(nbd, config->blksize, arg); 1479 return 0; 1480 case NBD_SET_TIMEOUT: 1481 nbd_set_cmd_timeout(nbd, arg); 1482 return 0; 1483 1484 case NBD_SET_FLAGS: 1485 config->flags = arg; 1486 return 0; 1487 case NBD_DO_IT: 1488 return nbd_start_device_ioctl(nbd, bdev); 1489 case NBD_CLEAR_QUE: 1490 /* 1491 * This is for compatibility only. The queue is always cleared 1492 * by NBD_DO_IT or NBD_CLEAR_SOCK. 
1493 */ 1494 return 0; 1495 case NBD_PRINT_DEBUG: 1496 /* 1497 * For compatibility only, we no longer keep a list of 1498 * outstanding requests. 1499 */ 1500 return 0; 1501 } 1502 return -ENOTTY; 1503} 1504 1505static int nbd_ioctl(struct block_device *bdev, fmode_t mode, 1506 unsigned int cmd, unsigned long arg) 1507{ 1508 struct nbd_device *nbd = bdev->bd_disk->private_data; 1509 struct nbd_config *config = nbd->config; 1510 int error = -EINVAL; 1511 1512 if (!capable(CAP_SYS_ADMIN)) 1513 return -EPERM; 1514 1515 /* The block layer will pass back some non-nbd ioctls in case we have 1516 * special handling for them, but we don't so just return an error. 1517 */ 1518 if (_IOC_TYPE(cmd) != 0xab) 1519 return -EINVAL; 1520 1521 mutex_lock(&nbd->config_lock); 1522 1523 /* Don't allow ioctl operations on a nbd device that was created with 1524 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. 1525 */ 1526 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || 1527 (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) 1528 error = __nbd_ioctl(bdev, nbd, cmd, arg); 1529 else 1530 dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n"); 1531 mutex_unlock(&nbd->config_lock); 1532 return error; 1533} 1534 1535static struct nbd_config *nbd_alloc_config(void) 1536{ 1537 struct nbd_config *config; 1538 1539 if (!try_module_get(THIS_MODULE)) 1540 return ERR_PTR(-ENODEV); 1541 1542 config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); 1543 if (!config) { 1544 module_put(THIS_MODULE); 1545 return ERR_PTR(-ENOMEM); 1546 } 1547 1548 atomic_set(&config->recv_threads, 0); 1549 init_waitqueue_head(&config->recv_wq); 1550 init_waitqueue_head(&config->conn_wait); 1551 config->blksize = NBD_DEF_BLKSIZE; 1552 atomic_set(&config->live_connections, 0); 1553 return config; 1554} 1555 1556static int nbd_open(struct block_device *bdev, fmode_t mode) 1557{ 1558 struct nbd_device *nbd; 1559 int ret = 0; 1560 1561 mutex_lock(&nbd_index_mutex); 1562 nbd = bdev->bd_disk->private_data; 1563 if (!nbd) { 1564 ret = -ENXIO; 1565 goto out; 1566 } 1567 if (!refcount_inc_not_zero(&nbd->refs)) { 1568 ret = -ENXIO; 1569 goto out; 1570 } 1571 if (!refcount_inc_not_zero(&nbd->config_refs)) { 1572 struct nbd_config *config; 1573 1574 mutex_lock(&nbd->config_lock); 1575 if (refcount_inc_not_zero(&nbd->config_refs)) { 1576 mutex_unlock(&nbd->config_lock); 1577 goto out; 1578 } 1579 config = nbd_alloc_config(); 1580 if (IS_ERR(config)) { 1581 ret = PTR_ERR(config); 1582 mutex_unlock(&nbd->config_lock); 1583 goto out; 1584 } 1585 nbd->config = config; 1586 refcount_set(&nbd->config_refs, 1); 1587 refcount_inc(&nbd->refs); 1588 mutex_unlock(&nbd->config_lock); 1589 set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1590 } else if (nbd_disconnected(nbd->config)) { 1591 set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1592 } 1593out: 1594 mutex_unlock(&nbd_index_mutex); 1595 return ret; 1596} 1597 1598static void nbd_release(struct gendisk *disk, fmode_t mode) 1599{ 1600 struct nbd_device *nbd = disk->private_data; 1601 struct block_device *bdev = bdget_disk(disk, 0); 1602 1603 if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && 1604 bdev->bd_openers == 0) 1605 nbd_disconnect_and_put(nbd); 1606 bdput(bdev); 1607 1608 nbd_config_put(nbd); 1609 nbd_put(nbd); 1610} 1611 1612static const struct block_device_operations nbd_fops = 1613{ 1614 .owner = THIS_MODULE, 1615 .open = nbd_open, 1616 .release = nbd_release, 1617 .ioctl = nbd_ioctl, 1618 .compat_ioctl = nbd_ioctl, 1619}; 
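
/*
 * Illustrative only (not part of the driver): a minimal sketch of how a
 * user-space client is expected to drive the ioctl interface above.  The
 * socket is assumed to have completed the NBD handshake with the server
 * already; error handling is omitted.
 *
 *	int nbd = open("/dev/nbd0", O_RDWR);
 *
 *	ioctl(nbd, NBD_SET_BLKSIZE, 4096);
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd, NBD_SET_SOCK, sock_fd);
 *	ioctl(nbd, NBD_DO_IT);		(blocks until disconnect or timeout)
 *	ioctl(nbd, NBD_CLEAR_SOCK);
 *
 * NBD_DO_IT ends up in nbd_start_device_ioctl(), which starts the recv
 * work(s) and sleeps until all of them have exited.
 */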
1620 1621#if IS_ENABLED(CONFIG_DEBUG_FS) 1622 1623static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) 1624{ 1625 struct nbd_device *nbd = s->private; 1626 1627 if (nbd->pid) 1628 seq_printf(s, "recv: %d\n", nbd->pid); 1629 1630 return 0; 1631} 1632 1633static int nbd_dbg_tasks_open(struct inode *inode, struct file *file) 1634{ 1635 return single_open(file, nbd_dbg_tasks_show, inode->i_private); 1636} 1637 1638static const struct file_operations nbd_dbg_tasks_ops = { 1639 .open = nbd_dbg_tasks_open, 1640 .read = seq_read, 1641 .llseek = seq_lseek, 1642 .release = single_release, 1643}; 1644 1645static int nbd_dbg_flags_show(struct seq_file *s, void *unused) 1646{ 1647 struct nbd_device *nbd = s->private; 1648 u32 flags = nbd->config->flags; 1649 1650 seq_printf(s, "Hex: 0x%08x\n\n", flags); 1651 1652 seq_puts(s, "Known flags:\n"); 1653 1654 if (flags & NBD_FLAG_HAS_FLAGS) 1655 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n"); 1656 if (flags & NBD_FLAG_READ_ONLY) 1657 seq_puts(s, "NBD_FLAG_READ_ONLY\n"); 1658 if (flags & NBD_FLAG_SEND_FLUSH) 1659 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); 1660 if (flags & NBD_FLAG_SEND_FUA) 1661 seq_puts(s, "NBD_FLAG_SEND_FUA\n"); 1662 if (flags & NBD_FLAG_SEND_TRIM) 1663 seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); 1664 1665 return 0; 1666} 1667 1668static int nbd_dbg_flags_open(struct inode *inode, struct file *file) 1669{ 1670 return single_open(file, nbd_dbg_flags_show, inode->i_private); 1671} 1672 1673static const struct file_operations nbd_dbg_flags_ops = { 1674 .open = nbd_dbg_flags_open, 1675 .read = seq_read, 1676 .llseek = seq_lseek, 1677 .release = single_release, 1678}; 1679 1680static int nbd_dev_dbg_init(struct nbd_device *nbd) 1681{ 1682 struct dentry *dir; 1683 struct nbd_config *config = nbd->config; 1684 1685 if (!nbd_dbg_dir) 1686 return -EIO; 1687 1688 dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); 1689 if (IS_ERR(dir)) { 1690 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n", 1691 nbd_name(nbd)); 1692 return -EIO; 1693 } 1694 config->dbg_dir = dir; 1695 1696 debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); 1697 debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize); 1698 debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); 1699 debugfs_create_u64("blocksize", 0444, dir, &config->blksize); 1700 debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); 1701 1702 return 0; 1703} 1704 1705static void nbd_dev_dbg_close(struct nbd_device *nbd) 1706{ 1707 debugfs_remove_recursive(nbd->config->dbg_dir); 1708} 1709 1710static int nbd_dbg_init(void) 1711{ 1712 struct dentry *dbg_dir; 1713 1714 dbg_dir = debugfs_create_dir("nbd", NULL); 1715 if (IS_ERR(dbg_dir)) 1716 return -EIO; 1717 1718 nbd_dbg_dir = dbg_dir; 1719 1720 return 0; 1721} 1722 1723static void nbd_dbg_close(void) 1724{ 1725 debugfs_remove_recursive(nbd_dbg_dir); 1726} 1727 1728#else /* IS_ENABLED(CONFIG_DEBUG_FS) */ 1729 1730static int nbd_dev_dbg_init(struct nbd_device *nbd) 1731{ 1732 return 0; 1733} 1734 1735static void nbd_dev_dbg_close(struct nbd_device *nbd) 1736{ 1737} 1738 1739static int nbd_dbg_init(void) 1740{ 1741 return 0; 1742} 1743 1744static void nbd_dbg_close(void) 1745{ 1746} 1747 1748#endif 1749 1750static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 1751 unsigned int hctx_idx, unsigned int numa_node) 1752{ 1753 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); 1754 cmd->nbd = set->driver_data; 1755 cmd->flags = 0; 1756 mutex_init(&cmd->lock); 1757 return 0; 1758} 1759 1760static const struct 
blk_mq_ops nbd_mq_ops = { 1761 .queue_rq = nbd_queue_rq, 1762 .complete = nbd_complete_rq, 1763 .init_request = nbd_init_request, 1764 .timeout = nbd_xmit_timeout, 1765}; 1766 1767static int nbd_dev_add(int index) 1768{ 1769 struct nbd_device *nbd; 1770 struct gendisk *disk; 1771 struct request_queue *q; 1772 int err = -ENOMEM; 1773 1774 nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); 1775 if (!nbd) 1776 goto out; 1777 1778 disk = alloc_disk(1 << part_shift); 1779 if (!disk) 1780 goto out_free_nbd; 1781 1782 if (index >= 0) { 1783 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, 1784 GFP_KERNEL); 1785 if (err == -ENOSPC) 1786 err = -EEXIST; 1787 } else { 1788 err = idr_alloc(&nbd_index_idr, nbd, 0, 1789 (MINORMASK >> part_shift) + 1, GFP_KERNEL); 1790 if (err >= 0) 1791 index = err; 1792 } 1793 if (err < 0) 1794 goto out_free_disk; 1795 1796 nbd->index = index; 1797 nbd->disk = disk; 1798 nbd->tag_set.ops = &nbd_mq_ops; 1799 nbd->tag_set.nr_hw_queues = 1; 1800 nbd->tag_set.queue_depth = 128; 1801 nbd->tag_set.numa_node = NUMA_NO_NODE; 1802 nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); 1803 nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | 1804 BLK_MQ_F_BLOCKING; 1805 nbd->tag_set.driver_data = nbd; 1806 nbd->destroy_complete = NULL; 1807 1808 err = blk_mq_alloc_tag_set(&nbd->tag_set); 1809 if (err) 1810 goto out_free_idr; 1811 1812 q = blk_mq_init_queue(&nbd->tag_set); 1813 if (IS_ERR(q)) { 1814 err = PTR_ERR(q); 1815 goto out_free_tags; 1816 } 1817 disk->queue = q; 1818 1819 /* 1820 * Tell the block layer that we are not a rotational device 1821 */ 1822 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 1823 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); 1824 disk->queue->limits.discard_granularity = 0; 1825 disk->queue->limits.discard_alignment = 0; 1826 blk_queue_max_discard_sectors(disk->queue, 0); 1827 blk_queue_max_segment_size(disk->queue, UINT_MAX); 1828 blk_queue_max_segments(disk->queue, USHRT_MAX); 1829 blk_queue_max_hw_sectors(disk->queue, 65536); 1830 disk->queue->limits.max_sectors = 256; 1831 1832 mutex_init(&nbd->config_lock); 1833 refcount_set(&nbd->config_refs, 0); 1834 refcount_set(&nbd->refs, 1); 1835 INIT_LIST_HEAD(&nbd->list); 1836 disk->major = NBD_MAJOR; 1837 disk->first_minor = index << part_shift; 1838 disk->fops = &nbd_fops; 1839 disk->private_data = nbd; 1840 sprintf(disk->disk_name, "nbd%d", index); 1841 add_disk(disk); 1842 nbd_total_devices++; 1843 return index; 1844 1845out_free_tags: 1846 blk_mq_free_tag_set(&nbd->tag_set); 1847out_free_idr: 1848 idr_remove(&nbd_index_idr, index); 1849out_free_disk: 1850 put_disk(disk); 1851out_free_nbd: 1852 kfree(nbd); 1853out: 1854 return err; 1855} 1856 1857static int find_free_cb(int id, void *ptr, void *data) 1858{ 1859 struct nbd_device *nbd = ptr; 1860 struct nbd_device **found = data; 1861 1862 if (!refcount_read(&nbd->config_refs)) { 1863 *found = nbd; 1864 return 1; 1865 } 1866 return 0; 1867} 1868 1869/* Netlink interface. 
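 *
 * The generic-netlink interface mirrors the ioctl path above: connect,
 * disconnect, reconfigure and status requests are handled by
 * nbd_genl_connect(), nbd_genl_disconnect(), nbd_genl_reconfigure() and
 * nbd_genl_status() below, using the attribute policies that follow.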
*/ 1870static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { 1871 [NBD_ATTR_INDEX] = { .type = NLA_U32 }, 1872 [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 }, 1873 [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 }, 1874 [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 }, 1875 [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 }, 1876 [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 }, 1877 [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED}, 1878 [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 }, 1879 [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED}, 1880}; 1881 1882static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = { 1883 [NBD_SOCK_FD] = { .type = NLA_U32 }, 1884}; 1885 1886/* We don't use this right now since we don't parse the incoming list, but we 1887 * still want it here so userspace knows what to expect. 1888 */ 1889static const struct nla_policy __attribute__((unused)) 1890nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { 1891 [NBD_DEVICE_INDEX] = { .type = NLA_U32 }, 1892 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, 1893}; 1894 1895static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) 1896{ 1897 struct nbd_config *config = nbd->config; 1898 u64 bsize = config->blksize; 1899 u64 bytes = config->bytesize; 1900 1901 if (info->attrs[NBD_ATTR_SIZE_BYTES]) 1902 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); 1903 1904 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { 1905 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); 1906 if (!bsize) 1907 bsize = NBD_DEF_BLKSIZE; 1908 if (!nbd_is_valid_blksize(bsize)) { 1909 printk(KERN_ERR "Invalid block size %llu\n", bsize); 1910 return -EINVAL; 1911 } 1912 } 1913 1914 if (bytes != config->bytesize || bsize != config->blksize) 1915 nbd_size_set(nbd, bsize, div64_u64(bytes, bsize)); 1916 return 0; 1917} 1918 1919static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) 1920{ 1921 DECLARE_COMPLETION_ONSTACK(destroy_complete); 1922 struct nbd_device *nbd = NULL; 1923 struct nbd_config *config; 1924 int index = -1; 1925 int ret; 1926 bool put_dev = false; 1927 1928 if (!netlink_capable(skb, CAP_SYS_ADMIN)) 1929 return -EPERM; 1930 1931 if (info->attrs[NBD_ATTR_INDEX]) { 1932 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); 1933 1934 /* 1935 * Too big first_minor can cause duplicate creation of 1936 * sysfs files/links, since index << part_shift might overflow, or 1937 * MKDEV() expect that the max bits of first_minor is 20. 
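		 * so the index is range-checked against
		 * MINORMASK >> part_shift below before any device is created.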
1938 */ 1939 if (index < 0 || index > MINORMASK >> part_shift) { 1940 printk(KERN_ERR "nbd: illegal input index %d\n", index); 1941 return -EINVAL; 1942 } 1943 } 1944 if (!info->attrs[NBD_ATTR_SOCKETS]) { 1945 printk(KERN_ERR "nbd: must specify at least one socket\n"); 1946 return -EINVAL; 1947 } 1948 if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { 1949 printk(KERN_ERR "nbd: must specify a size in bytes for the device\n"); 1950 return -EINVAL; 1951 } 1952again: 1953 mutex_lock(&nbd_index_mutex); 1954 if (index == -1) { 1955 ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); 1956 if (ret == 0) { 1957 int new_index; 1958 new_index = nbd_dev_add(-1); 1959 if (new_index < 0) { 1960 mutex_unlock(&nbd_index_mutex); 1961 printk(KERN_ERR "nbd: failed to add new device\n"); 1962 return new_index; 1963 } 1964 nbd = idr_find(&nbd_index_idr, new_index); 1965 } 1966 } else { 1967 nbd = idr_find(&nbd_index_idr, index); 1968 if (!nbd) { 1969 ret = nbd_dev_add(index); 1970 if (ret < 0) { 1971 mutex_unlock(&nbd_index_mutex); 1972 printk(KERN_ERR "nbd: failed to add new device\n"); 1973 return ret; 1974 } 1975 nbd = idr_find(&nbd_index_idr, index); 1976 } 1977 } 1978 if (!nbd) { 1979 printk(KERN_ERR "nbd: couldn't find device at index %d\n", 1980 index); 1981 mutex_unlock(&nbd_index_mutex); 1982 return -EINVAL; 1983 } 1984 1985 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && 1986 test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { 1987 nbd->destroy_complete = &destroy_complete; 1988 mutex_unlock(&nbd_index_mutex); 1989 1990 /* Wait untill the the nbd stuff is totally destroyed */ 1991 wait_for_completion(&destroy_complete); 1992 goto again; 1993 } 1994 1995 if (!refcount_inc_not_zero(&nbd->refs)) { 1996 mutex_unlock(&nbd_index_mutex); 1997 if (index == -1) 1998 goto again; 1999 printk(KERN_ERR "nbd: device at index %d is going down\n", 2000 index); 2001 return -EINVAL; 2002 } 2003 mutex_unlock(&nbd_index_mutex); 2004 2005 mutex_lock(&nbd->config_lock); 2006 if (refcount_read(&nbd->config_refs)) { 2007 mutex_unlock(&nbd->config_lock); 2008 nbd_put(nbd); 2009 if (index == -1) 2010 goto again; 2011 printk(KERN_ERR "nbd: nbd%d already in use\n", index); 2012 return -EBUSY; 2013 } 2014 if (WARN_ON(nbd->config)) { 2015 mutex_unlock(&nbd->config_lock); 2016 nbd_put(nbd); 2017 return -EINVAL; 2018 } 2019 config = nbd_alloc_config(); 2020 if (IS_ERR(config)) { 2021 mutex_unlock(&nbd->config_lock); 2022 nbd_put(nbd); 2023 printk(KERN_ERR "nbd: couldn't allocate config\n"); 2024 return PTR_ERR(config); 2025 } 2026 nbd->config = config; 2027 refcount_set(&nbd->config_refs, 1); 2028 set_bit(NBD_RT_BOUND, &config->runtime_flags); 2029 2030 ret = nbd_genl_size_set(info, nbd); 2031 if (ret) 2032 goto out; 2033 2034 if (info->attrs[NBD_ATTR_TIMEOUT]) 2035 nbd_set_cmd_timeout(nbd, 2036 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); 2037 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { 2038 config->dead_conn_timeout = 2039 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); 2040 config->dead_conn_timeout *= HZ; 2041 } 2042 if (info->attrs[NBD_ATTR_SERVER_FLAGS]) 2043 config->flags = 2044 nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]); 2045 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { 2046 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); 2047 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { 2048 /* 2049 * We have 1 ref to keep the device around, and then 1 2050 * ref for our current operation here, which will be 2051 * inherited by the config. 
                         * If we already have DESTROY_ON_DISCONNECT set then
                         * we know we don't have that extra ref already held,
                         * so we don't need the put_dev.
                         */
                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
                                              &nbd->flags))
                                put_dev = true;
                } else {
                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
                                               &nbd->flags))
                                refcount_inc(&nbd->refs);
                }
                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
                                &config->runtime_flags);
                }
        }

        if (info->attrs[NBD_ATTR_SOCKETS]) {
                struct nlattr *attr;
                int rem, fd;

                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
                                    rem) {
                        struct nlattr *socks[NBD_SOCK_MAX+1];

                        if (nla_type(attr) != NBD_SOCK_ITEM) {
                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
                                ret = -EINVAL;
                                goto out;
                        }
                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
                                                          attr,
                                                          nbd_sock_policy,
                                                          info->extack);
                        if (ret != 0) {
                                printk(KERN_ERR "nbd: error processing sock list\n");
                                ret = -EINVAL;
                                goto out;
                        }
                        if (!socks[NBD_SOCK_FD])
                                continue;
                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
                        ret = nbd_add_socket(nbd, fd, true);
                        if (ret)
                                goto out;
                }
        }
        ret = nbd_start_device(nbd);
out:
        mutex_unlock(&nbd->config_lock);
        if (!ret) {
                set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
                refcount_inc(&nbd->config_refs);
                nbd_connect_reply(info, nbd->index);
        }
        nbd_config_put(nbd);
        if (put_dev)
                nbd_put(nbd);
        return ret;
}
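
/*
 * Rough sketch of the netlink payload nbd_genl_connect() expects from
 * userspace (illustrative only; see nbd_attr_policy and nbd_sock_policy
 * above for the authoritative attribute types):
 *
 *   NBD_CMD_CONNECT
 *     NBD_ATTR_INDEX             (u32, optional; a free index is picked
 *                                 when it is absent)
 *     NBD_ATTR_SIZE_BYTES        (u64, required)
 *     NBD_ATTR_BLOCK_SIZE_BYTES  (u64, optional)
 *     NBD_ATTR_TIMEOUT           (u64, optional)
 *     NBD_ATTR_DEAD_CONN_TIMEOUT (u64, optional, in seconds)
 *     NBD_ATTR_SERVER_FLAGS      (u64, optional)
 *     NBD_ATTR_CLIENT_FLAGS      (u64, optional)
 *     NBD_ATTR_SOCKETS           (nested, required)
 *       NBD_SOCK_ITEM
 *         NBD_SOCK_FD            (u32, socket file descriptor)
 *       NBD_SOCK_ITEM
 *         NBD_SOCK_FD
 *
 * On success the reply carries the NBD_ATTR_INDEX of the configured
 * device (see nbd_connect_reply() below).
 */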

static void nbd_disconnect_and_put(struct nbd_device *nbd)
{
        mutex_lock(&nbd->config_lock);
        nbd_disconnect(nbd);
        sock_shutdown(nbd);
        wake_up(&nbd->config->conn_wait);
        /*
         * Make sure the recv thread has finished, so it does not drop the
         * last config ref and try to destroy the workqueue from inside the
         * work queue. This also ensures that we can safely call
         * nbd_clear_que() to cancel the inflight I/Os.
         */
        if (nbd->recv_workq)
                flush_workqueue(nbd->recv_workq);
        nbd_clear_que(nbd);
        nbd->task_setup = NULL;
        mutex_unlock(&nbd->config_lock);

        if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
                               &nbd->config->runtime_flags))
                nbd_config_put(nbd);
}

static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
{
        struct nbd_device *nbd;
        int index;

        if (!netlink_capable(skb, CAP_SYS_ADMIN))
                return -EPERM;

        if (!info->attrs[NBD_ATTR_INDEX]) {
                printk(KERN_ERR "nbd: must specify an index to disconnect\n");
                return -EINVAL;
        }
        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
        mutex_lock(&nbd_index_mutex);
        nbd = idr_find(&nbd_index_idr, index);
        if (!nbd) {
                mutex_unlock(&nbd_index_mutex);
                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
                       index);
                return -EINVAL;
        }
        if (!refcount_inc_not_zero(&nbd->refs)) {
                mutex_unlock(&nbd_index_mutex);
                printk(KERN_ERR "nbd: device at index %d is going down\n",
                       index);
                return -EINVAL;
        }
        mutex_unlock(&nbd_index_mutex);
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                nbd_put(nbd);
                return 0;
        }
        nbd_disconnect_and_put(nbd);
        nbd_config_put(nbd);
        nbd_put(nbd);
        return 0;
}

static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
{
        struct nbd_device *nbd = NULL;
        struct nbd_config *config;
        int index;
        int ret = 0;
        bool put_dev = false;

        if (!netlink_capable(skb, CAP_SYS_ADMIN))
                return -EPERM;

        if (!info->attrs[NBD_ATTR_INDEX]) {
                printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
                return -EINVAL;
        }
        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
        mutex_lock(&nbd_index_mutex);
        nbd = idr_find(&nbd_index_idr, index);
        if (!nbd) {
                mutex_unlock(&nbd_index_mutex);
                printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
                       index);
                return -EINVAL;
        }
        if (!refcount_inc_not_zero(&nbd->refs)) {
                mutex_unlock(&nbd_index_mutex);
                printk(KERN_ERR "nbd: device at index %d is going down\n",
                       index);
                return -EINVAL;
        }
        mutex_unlock(&nbd_index_mutex);

        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                dev_err(nbd_to_dev(nbd),
                        "not configured, cannot reconfigure\n");
                nbd_put(nbd);
                return -EINVAL;
        }

        mutex_lock(&nbd->config_lock);
        config = nbd->config;
        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
            !nbd->pid) {
                dev_err(nbd_to_dev(nbd),
                        "not configured, cannot reconfigure\n");
                ret = -EINVAL;
                goto out;
        }

        ret = nbd_genl_size_set(info, nbd);
        if (ret)
                goto out;

        if (info->attrs[NBD_ATTR_TIMEOUT])
                nbd_set_cmd_timeout(nbd,
                                    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
                config->dead_conn_timeout =
                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
                config->dead_conn_timeout *= HZ;
        }
        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
                                              &nbd->flags))
                                put_dev = true;
                } else {
                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
                                               &nbd->flags))
                                refcount_inc(&nbd->refs);
                }

                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
                                &config->runtime_flags);
                } else {
                        clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
                                  &config->runtime_flags);
                }
        }

        if (info->attrs[NBD_ATTR_SOCKETS]) {
                struct nlattr *attr;
                int rem, fd;

                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
                                    rem) {
                        struct nlattr *socks[NBD_SOCK_MAX+1];

                        if (nla_type(attr) != NBD_SOCK_ITEM) {
                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
                                ret = -EINVAL;
                                goto out;
                        }
                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
                                                          attr,
                                                          nbd_sock_policy,
                                                          info->extack);
                        if (ret != 0) {
                                printk(KERN_ERR "nbd: error processing sock list\n");
                                ret = -EINVAL;
                                goto out;
                        }
                        if (!socks[NBD_SOCK_FD])
                                continue;
                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
                        ret = nbd_reconnect_socket(nbd, fd);
                        if (ret) {
                                if (ret == -ENOSPC)
                                        ret = 0;
                                goto out;
                        }
                        dev_info(nbd_to_dev(nbd), "reconnected socket\n");
                }
        }
out:
        mutex_unlock(&nbd->config_lock);
        nbd_config_put(nbd);
        nbd_put(nbd);
        if (put_dev)
                nbd_put(nbd);
        return ret;
}

static const struct genl_small_ops nbd_connect_genl_ops[] = {
        {
                .cmd = NBD_CMD_CONNECT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nbd_genl_connect,
        },
        {
                .cmd = NBD_CMD_DISCONNECT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nbd_genl_disconnect,
        },
        {
                .cmd = NBD_CMD_RECONFIGURE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nbd_genl_reconfigure,
        },
        {
                .cmd = NBD_CMD_STATUS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nbd_genl_status,
        },
};

static const struct genl_multicast_group nbd_mcast_grps[] = {
        { .name = NBD_GENL_MCAST_GROUP_NAME, },
};

static struct genl_family nbd_genl_family __ro_after_init = {
        .hdrsize = 0,
        .name = NBD_GENL_FAMILY_NAME,
        .version = NBD_GENL_VERSION,
        .module = THIS_MODULE,
        .small_ops = nbd_connect_genl_ops,
        .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
        .maxattr = NBD_ATTR_MAX,
        .policy = nbd_attr_policy,
        .mcgrps = nbd_mcast_grps,
        .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
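
/*
 * Rough sketch of the reply built by nbd_genl_status() below (illustrative
 * only): one NBD_DEVICE_ITEM per device, either for the single requested
 * index or for every registered device when NBD_ATTR_INDEX is omitted.
 *
 *   NBD_CMD_STATUS (reply)
 *     NBD_ATTR_DEVICE_LIST      (nested)
 *       NBD_DEVICE_ITEM
 *         NBD_DEVICE_INDEX      (u32)
 *         NBD_DEVICE_CONNECTED  (u8, 1 if a config is currently attached)
 *       NBD_DEVICE_ITEM
 *         ...
 */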

static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
        struct nlattr *dev_opt;
        u8 connected = 0;
        int ret;

        /* This is a little racy, but for status it's ok. The
         * reason we don't take a ref here is because we can't
         * take a ref in the index == -1 case as we would need
         * to put under the nbd_index_mutex, which could
         * deadlock if we are configured to remove ourselves
         * once we're disconnected.
         */
        if (refcount_read(&nbd->config_refs))
                connected = 1;
        dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
        if (!dev_opt)
                return -EMSGSIZE;
        ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
        if (ret)
                return -EMSGSIZE;
        ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
                         connected);
        if (ret)
                return -EMSGSIZE;
        nla_nest_end(reply, dev_opt);
        return 0;
}

static int status_cb(int id, void *ptr, void *data)
{
        struct nbd_device *nbd = ptr;
        return populate_nbd_status(nbd, (struct sk_buff *)data);
}

static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *dev_list;
        struct sk_buff *reply;
        void *reply_head;
        size_t msg_size;
        int index = -1;
        int ret = -ENOMEM;

        if (info->attrs[NBD_ATTR_INDEX])
                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);

        mutex_lock(&nbd_index_mutex);

        msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
                                  nla_attr_size(sizeof(u8)));
        msg_size *= (index == -1) ? nbd_total_devices : 1;

        reply = genlmsg_new(msg_size, GFP_KERNEL);
        if (!reply)
                goto out;
        reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
                                       NBD_CMD_STATUS);
        if (!reply_head) {
                nlmsg_free(reply);
                goto out;
        }

        dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
        if (!dev_list) {
                nlmsg_free(reply);
                ret = -EMSGSIZE;
                goto out;
        }

        if (index == -1) {
                ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
                if (ret) {
                        nlmsg_free(reply);
                        goto out;
                }
        } else {
                struct nbd_device *nbd;
                nbd = idr_find(&nbd_index_idr, index);
                if (nbd) {
                        ret = populate_nbd_status(nbd, reply);
                        if (ret) {
                                nlmsg_free(reply);
                                goto out;
                        }
                }
        }
        nla_nest_end(reply, dev_list);
        genlmsg_end(reply, reply_head);
        ret = genlmsg_reply(reply, info);
out:
        mutex_unlock(&nbd_index_mutex);
        return ret;
}

static void nbd_connect_reply(struct genl_info *info, int index)
{
        struct sk_buff *skb;
        void *msg_head;
        int ret;

        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
        if (!skb)
                return;
        msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
                                     NBD_CMD_CONNECT);
        if (!msg_head) {
                nlmsg_free(skb);
                return;
        }
        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
        if (ret) {
                nlmsg_free(skb);
                return;
        }
        genlmsg_end(skb, msg_head);
        genlmsg_reply(skb, info);
}

static void nbd_mcast_index(int index)
{
        struct sk_buff *skb;
        void *msg_head;
        int ret;

        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
        if (!skb)
                return;
        msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
                               NBD_CMD_LINK_DEAD);
        if (!msg_head) {
                nlmsg_free(skb);
                return;
        }
        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
        if (ret) {
                nlmsg_free(skb);
                return;
        }
        genlmsg_end(skb, msg_head);
        genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
}

static void nbd_dead_link_work(struct work_struct *work)
{
        struct link_dead_args *args = container_of(work, struct link_dead_args,
                                                   work);
        nbd_mcast_index(args->index);
        kfree(args);
}
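
/*
 * Note (illustrative): nbd_mcast_index() above sends an NBD_CMD_LINK_DEAD
 * notification carrying NBD_ATTR_INDEX to the NBD_GENL_MCAST_GROUP_NAME
 * multicast group registered in nbd_genl_family, so a userspace client
 * that subscribes to that group can learn which device lost a connection
 * and may supply a replacement socket via NBD_CMD_RECONFIGURE.
 */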

static int __init nbd_init(void)
{
        int i;

        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

        if (max_part < 0) {
                printk(KERN_ERR "nbd: max_part must be >= 0\n");
                return -EINVAL;
        }

        part_shift = 0;
        if (max_part > 0) {
                part_shift = fls(max_part);

                /*
                 * Adjust max_part according to part_shift as it is exported
                 * to user space so that users can know the max number of
                 * partitions the kernel should be able to manage.
                 *
                 * Note that -1 is required because partition 0 is reserved
                 * for the whole disk.
                 */
                max_part = (1UL << part_shift) - 1;
        }

        if ((1UL << part_shift) > DISK_MAX_PARTS)
                return -EINVAL;

        if (nbds_max > 1UL << (MINORBITS - part_shift))
                return -EINVAL;

        if (register_blkdev(NBD_MAJOR, "nbd"))
                return -EIO;

        if (genl_register_family(&nbd_genl_family)) {
                unregister_blkdev(NBD_MAJOR, "nbd");
                return -EINVAL;
        }
        nbd_dbg_init();

        mutex_lock(&nbd_index_mutex);
        for (i = 0; i < nbds_max; i++)
                nbd_dev_add(i);
        mutex_unlock(&nbd_index_mutex);
        return 0;
}

static int nbd_exit_cb(int id, void *ptr, void *data)
{
        struct list_head *list = (struct list_head *)data;
        struct nbd_device *nbd = ptr;

        list_add_tail(&nbd->list, list);
        return 0;
}

static void __exit nbd_cleanup(void)
{
        struct nbd_device *nbd;
        LIST_HEAD(del_list);

        /*
         * Unregister netlink interface prior to waiting
         * for the completion of netlink commands.
         */
        genl_unregister_family(&nbd_genl_family);

        nbd_dbg_close();

        mutex_lock(&nbd_index_mutex);
        idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
        mutex_unlock(&nbd_index_mutex);

        while (!list_empty(&del_list)) {
                nbd = list_first_entry(&del_list, struct nbd_device, list);
                list_del_init(&nbd->list);
                if (refcount_read(&nbd->config_refs))
                        printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n",
                               refcount_read(&nbd->config_refs));
                if (refcount_read(&nbd->refs) != 1)
                        printk(KERN_ERR "nbd: possibly leaking a device\n");
                nbd_put(nbd);
        }

        idr_destroy(&nbd_index_idr);
        unregister_blkdev(NBD_MAJOR, "nbd");
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
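
/*
 * Illustrative example of the minor-number math in nbd_init() above (not a
 * statement of additional behaviour): with the default max_part = 16,
 * part_shift becomes fls(16) = 5, max_part is rewritten to (1 << 5) - 1 = 31,
 * and each device consumes 1 << 5 = 32 minors, so nbds_max may be at most
 * 1 << (MINORBITS - 5) = 32768.  A load such as
 * "modprobe nbd nbds_max=4 max_part=8" (hypothetical values) would create
 * /dev/nbd0 through /dev/nbd3, each able to carry up to 15 partitions.
 */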