// SPDX-License-Identifier: GPL-2.0-only
/*
 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
 * Shaohua Li <shli@fb.com>
 */
#include <linux/module.h>

#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/init.h>
#include "null_blk.h"

#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK		(PAGE_SECTORS - 1)

#define FREE_BATCH		16

#define TICKS_PER_SEC		50ULL
#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static DECLARE_FAULT_ATTR(null_timeout_attr);
static DECLARE_FAULT_ATTR(null_requeue_attr);
static DECLARE_FAULT_ATTR(null_init_hctx_attr);
#endif

static inline u64 mb_per_tick(int mbps)
{
	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
}

/*
 * Status flags for nullb_device.
 *
 * CONFIGURED:	Device has been configured and turned on. Cannot reconfigure.
 * UP:		Device is currently on and visible in userspace.
 * THROTTLED:	Device is being throttled.
 * CACHE:	Device is using a write-back cache.
 */
enum nullb_device_flags {
	NULLB_DEV_FL_CONFIGURED	= 0,
	NULLB_DEV_FL_UP		= 1,
	NULLB_DEV_FL_THROTTLED	= 2,
	NULLB_DEV_FL_CACHE	= 3,
};

#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)
/*
 * nullb_page is a page in memory for nullb devices.
 *
 * @page:	The page holding the data.
 * @bitmap:	The bitmap represents which sector in the page has data.
 *		Each bit represents one block size. For example, sector 8
 *		will use the 7th bit
 * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
 * page is being flushed to storage. FREE means the cache page is freed and
 * should be skipped from flushing to storage. Please see
 * null_make_cache_space
 */
struct nullb_page {
	struct page *page;
	DECLARE_BITMAP(bitmap, MAP_SZ);
};
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
#define NULLB_PAGE_FREE (MAP_SZ - 2)

static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
static struct blk_mq_tag_set tag_set;

enum {
	NULL_IRQ_NONE		= 0,
	NULL_IRQ_SOFTIRQ	= 1,
	NULL_IRQ_TIMER		= 2,
};

enum {
	NULL_Q_BIO		= 0,
	NULL_Q_RQ		= 1,
	NULL_Q_MQ		= 2,
};

static int g_no_sched;
module_param_named(no_sched, g_no_sched, int, 0444);
MODULE_PARM_DESC(no_sched, "No io scheduler");

static int g_submit_queues = 1;
module_param_named(submit_queues, g_submit_queues, int, 0444);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, 0444);
MODULE_PARM_DESC(home_node, "Home node for the device");

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
/*
 * For more details about fault injection, please refer to
 * Documentation/fault-injection/fault-injection.rst.
 */
static char g_timeout_str[80];
module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");

static char g_requeue_str[80];
module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");

static char g_init_hctx_str[80];
module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif

static int g_queue_mode = NULL_Q_MQ;

static int null_param_store_val(const char *str, int *val, int min, int max)
{
	int ret, new_val;

	ret = kstrtoint(str, 10, &new_val);
	if (ret)
		return -EINVAL;

	if (new_val < min || new_val > max)
		return -EINVAL;

	*val = new_val;
	return 0;
}

static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
	return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}

static const struct kernel_param_ops null_queue_mode_param_ops = {
	.set	= null_set_queue_mode,
	.get	= param_get_int,
};

device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");

static int g_gb = 250;
module_param_named(gb, g_gb, int, 0444);
MODULE_PARM_DESC(gb, "Size in GB");

static int g_bs = 512;
module_param_named(bs, g_bs, int, 0444);
MODULE_PARM_DESC(bs, "Block size (in bytes)");

static unsigned int nr_devices = 1;
module_param(nr_devices, uint, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");

static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");

static bool shared_tags;
module_param(shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");

static bool g_shared_tag_bitmap;
module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");

static int g_irqmode = NULL_IRQ_SOFTIRQ;

static int null_set_irqmode(const char *str, const struct kernel_param *kp)
{
	return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
					NULL_IRQ_TIMER);
}

static const struct kernel_param_ops null_irqmode_param_ops = {
	.set	= null_set_irqmode,
	.get	= param_get_int,
};

device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");

static unsigned long g_completion_nsec = 10000;
module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");

static int g_hw_queue_depth = 64;
module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");

static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");

static bool g_zoned;
module_param_named(zoned, g_zoned, bool, S_IRUGO);
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");

static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");

static unsigned long g_zone_capacity;
module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");

static unsigned int g_zone_nr_conv;
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");

static unsigned int g_zone_max_open;
module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");

static unsigned int g_zone_max_active;
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");

static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
static int null_add_dev(struct nullb_device *dev);
static void null_free_device_storage(struct nullb_device *dev, bool is_cache);

static inline struct nullb_device *to_nullb_device(struct config_item *item)
{
	return item ? container_of(item, struct nullb_device, item) : NULL;
}

static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
	char *page)
{
	return snprintf(page, PAGE_SIZE, "%lu\n", val);
}

static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
{
	return snprintf(page, PAGE_SIZE, "%u\n", val);
}

static ssize_t nullb_device_uint_attr_store(unsigned int *val,
	const char *page, size_t count)
{
	unsigned int tmp;
	int result;

	result = kstrtouint(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
	const char *page, size_t count)
{
	int result;
	unsigned long tmp;

	result = kstrtoul(page, 0, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
	size_t count)
{
	bool tmp;
	int result;

	result = kstrtobool(page, &tmp);
	if (result < 0)
		return result;

	*val = tmp;
	return count;
}

/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)				\
static ssize_t								\
nullb_device_##NAME##_show(struct config_item *item, char *page)	\
{									\
	return nullb_device_##TYPE##_attr_show(				\
				to_nullb_device(item)->NAME, page);	\
}									\
static ssize_t								\
nullb_device_##NAME##_store(struct config_item *item, const char *page, \
			    size_t count)				\
{									\
	int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
	struct nullb_device *dev = to_nullb_device(item);		\
	TYPE new_value = 0;						\
	int ret;							\
									\
	ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
	if (ret < 0)							\
		return ret;						\
	if (apply_fn)							\
		ret = apply_fn(dev, new_value);				\
	else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))	\
		ret = -EBUSY;						\
	if (ret < 0)							\
		return ret;						\
	dev->NAME = new_value;						\
	return count;							\
}									\
CONFIGFS_ATTR(nullb_device_, NAME);

static int nullb_apply_submit_queues(struct nullb_device *dev,
				     unsigned int submit_queues)
{
	struct nullb *nullb = dev->nullb;
	struct blk_mq_tag_set *set;

	if (!nullb)
		return 0;

	/*
	 * Make sure that null_init_hctx() does not access nullb->queues[] past
	 * the end of that array.
	 */
	if (submit_queues > nr_cpu_ids)
		return -EINVAL;
	set = nullb->tag_set;
	blk_mq_update_nr_hw_queues(set, submit_queues);
	return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM;
}

NULLB_DEVICE_ATTR(size, ulong, NULL);
NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
NULLB_DEVICE_ATTR(home_node, uint, NULL);
NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
NULLB_DEVICE_ATTR(blocksize, uint, NULL);
NULLB_DEVICE_ATTR(irqmode, uint, NULL);
NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
NULLB_DEVICE_ATTR(index, uint, NULL);
NULLB_DEVICE_ATTR(blocking, bool, NULL);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
NULLB_DEVICE_ATTR(discard, bool, NULL);
NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);

static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
	return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
}

static ssize_t nullb_device_power_store(struct config_item *item,
				     const char *page, size_t count)
{
	struct nullb_device *dev = to_nullb_device(item);
	bool newp = false;
	ssize_t ret;

	ret = nullb_device_bool_attr_store(&newp, page, count);
	if (ret < 0)
		return ret;

	if (!dev->power && newp) {
		if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
			return count;
		if (null_add_dev(dev)) {
			clear_bit(NULLB_DEV_FL_UP, &dev->flags);
			return -ENOMEM;
		}

		set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
		dev->power = newp;
	} else if (dev->power && !newp) {
		if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
			mutex_lock(&lock);
			dev->power = newp;
			null_del_dev(dev->nullb);
			mutex_unlock(&lock);
		}
		clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
	}

	return count;
}

CONFIGFS_ATTR(nullb_device_, power);

static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
{
	struct nullb_device *t_dev = to_nullb_device(item);

	return badblocks_show(&t_dev->badblocks, page, 0);
}

static ssize_t nullb_device_badblocks_store(struct config_item *item,
				     const char *page, size_t count)
{
	struct nullb_device *t_dev = to_nullb_device(item);
	char *orig, *buf, *tmp;
	u64 start, end;
	int ret;

	orig = kstrndup(page, count, GFP_KERNEL);
	if (!orig)
		return -ENOMEM;

	buf = strstrip(orig);

	ret = -EINVAL;
	if (buf[0] != '+' && buf[0] != '-')
		goto out;
	tmp = strchr(&buf[1], '-');
	if (!tmp)
		goto out;
	*tmp = '\0';
	ret = kstrtoull(buf + 1, 0, &start);
	if (ret)
		goto out;
	ret = kstrtoull(tmp + 1, 0, &end);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (start > end)
		goto out;
	/* enable badblocks */
	cmpxchg(&t_dev->badblocks.shift, -1, 0);
	if (buf[0] == '+')
		ret = badblocks_set(&t_dev->badblocks, start,
			end - start + 1, 1);
	else
		ret = badblocks_clear(&t_dev->badblocks, start,
			end - start + 1);
	if (ret == 0)
		ret = count;
out:
	kfree(orig);
	return ret;
}
CONFIGFS_ATTR(nullb_device_, badblocks);

static struct configfs_attribute *nullb_device_attrs[] = {
	&nullb_device_attr_size,
	&nullb_device_attr_completion_nsec,
	&nullb_device_attr_submit_queues,
	&nullb_device_attr_home_node,
	&nullb_device_attr_queue_mode,
	&nullb_device_attr_blocksize,
	&nullb_device_attr_irqmode,
	&nullb_device_attr_hw_queue_depth,
	&nullb_device_attr_index,
	&nullb_device_attr_blocking,
	&nullb_device_attr_use_per_node_hctx,
	&nullb_device_attr_power,
	&nullb_device_attr_memory_backed,
	&nullb_device_attr_discard,
	&nullb_device_attr_mbps,
	&nullb_device_attr_cache_size,
	&nullb_device_attr_badblocks,
	&nullb_device_attr_zoned,
	&nullb_device_attr_zone_size,
	&nullb_device_attr_zone_capacity,
	&nullb_device_attr_zone_nr_conv,
	&nullb_device_attr_zone_max_open,
	&nullb_device_attr_zone_max_active,
	NULL,
};

static void nullb_device_release(struct config_item *item)
{
	struct nullb_device *dev = to_nullb_device(item);

	null_free_device_storage(dev, false);
	null_free_dev(dev);
}

static struct configfs_item_operations nullb_device_ops = {
	.release	= nullb_device_release,
};

static const struct config_item_type nullb_device_type = {
	.ct_item_ops	= &nullb_device_ops,
	.ct_attrs	= nullb_device_attrs,
	.ct_owner	= THIS_MODULE,
};

static struct
config_item *nullb_group_make_item(struct config_group *group, const char *name)
{
	struct nullb_device *dev;

	dev = null_alloc_dev();
	if (!dev)
		return ERR_PTR(-ENOMEM);

	config_item_init_type_name(&dev->item, name, &nullb_device_type);

	return &dev->item;
}

static void
nullb_group_drop_item(struct config_group *group, struct config_item *item)
{
	struct nullb_device *dev = to_nullb_device(item);

	if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
		mutex_lock(&lock);
		dev->power = false;
		null_del_dev(dev->nullb);
		mutex_unlock(&lock);
	}

	config_item_put(item);
}

static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
	return snprintf(page, PAGE_SIZE,
			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
}

CONFIGFS_ATTR_RO(memb_group_, features);

static struct configfs_attribute *nullb_group_attrs[] = {
	&memb_group_attr_features,
	NULL,
};

static struct configfs_group_operations nullb_group_ops = {
	.make_item	= nullb_group_make_item,
	.drop_item	= nullb_group_drop_item,
};

static const struct config_item_type nullb_group_type = {
	.ct_group_ops	= &nullb_group_ops,
	.ct_attrs	= nullb_group_attrs,
	.ct_owner	= THIS_MODULE,
};

static struct configfs_subsystem nullb_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf = "nullb",
			.ci_type = &nullb_group_type,
		},
	},
};

static inline int null_cache_active(struct nullb *nullb)
{
	return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
}

static struct nullb_device *null_alloc_dev(void)
{
	struct nullb_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;
	INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
	INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
	if (badblocks_init(&dev->badblocks, 0)) {
		kfree(dev);
		return NULL;
	}

	dev->size = g_gb * 1024;
	dev->completion_nsec = g_completion_nsec;
	dev->submit_queues = g_submit_queues;
	dev->home_node = g_home_node;
	dev->queue_mode = g_queue_mode;
	dev->blocksize = g_bs;
	dev->irqmode = g_irqmode;
	dev->hw_queue_depth = g_hw_queue_depth;
	dev->blocking = g_blocking;
	dev->use_per_node_hctx = g_use_per_node_hctx;
	dev->zoned = g_zoned;
	dev->zone_size = g_zone_size;
	dev->zone_capacity = g_zone_capacity;
	dev->zone_nr_conv = g_zone_nr_conv;
	dev->zone_max_open = g_zone_max_open;
	dev->zone_max_active = g_zone_max_active;
	return dev;
}

static void null_free_dev(struct nullb_device *dev)
{
	if (!dev)
		return;

	null_free_zoned_dev(dev);
	badblocks_exit(&dev->badblocks);
	kfree(dev);
}

static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
	clear_bit_unlock(tag, nq->tag_map);

	if (waitqueue_active(&nq->wait))
		wake_up(&nq->wait);
}

static unsigned int get_tag(struct nullb_queue *nq)
{
	unsigned int tag;

	do {
		tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
		if (tag >= nq->queue_depth)
			return -1U;
	} while (test_and_set_bit_lock(tag, nq->tag_map));

	return tag;
}

static void free_cmd(struct nullb_cmd *cmd)
{
	put_tag(cmd->nq, cmd->tag);
}

static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);

static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
{
	struct nullb_cmd *cmd;
	unsigned int tag;

	tag = get_tag(nq);
	if (tag != -1U) {
		cmd = &nq->cmds[tag];
		cmd->tag = tag;
		cmd->error = BLK_STS_OK;
		cmd->nq = nq;
		if (nq->dev->irqmode == NULL_IRQ_TIMER) {
			hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
				     HRTIMER_MODE_REL);
			cmd->timer.function = null_cmd_timer_expired;
		}
		return cmd;
	}

	return NULL;
}

static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
{
	struct nullb_cmd *cmd;
	DEFINE_WAIT(wait);

	cmd = __alloc_cmd(nq);
	if (cmd || !can_wait)
		return cmd;

	do {
		prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
		cmd = __alloc_cmd(nq);
		if (cmd)
			break;

		io_schedule();
	} while (1);

	finish_wait(&nq->wait, &wait);
	return cmd;
}

static void end_cmd(struct nullb_cmd *cmd)
{
	int queue_mode = cmd->nq->dev->queue_mode;

	switch (queue_mode) {
	case NULL_Q_MQ:
		blk_mq_end_request(cmd->rq, cmd->error);
		return;
	case NULL_Q_BIO:
		cmd->bio->bi_status = cmd->error;
		bio_endio(cmd->bio);
		break;
	}

	free_cmd(cmd);
}

static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
	end_cmd(container_of(timer, struct nullb_cmd, timer));

	return HRTIMER_NORESTART;
}

static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
	ktime_t kt = cmd->nq->dev->completion_nsec;

	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
}

static void null_complete_rq(struct request *rq)
{
	end_cmd(blk_mq_rq_to_pdu(rq));
}

static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
{
	struct nullb_page *t_page;

	t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
	if (!t_page)
		goto out;

	t_page->page = alloc_pages(gfp_flags, 0);
	if (!t_page->page)
		goto out_freepage;

	memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
	return t_page;
out_freepage:
	kfree(t_page);
out:
	return NULL;
}

static void null_free_page(struct nullb_page *t_page)
{
	__set_bit(NULLB_PAGE_FREE, t_page->bitmap);
	if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
		return;
	__free_page(t_page->page);
	kfree(t_page);
}

static bool null_page_empty(struct nullb_page *page)
{
	int size = MAP_SZ - 2;

	return find_first_bit(page->bitmap, size) == size;
}

static void null_free_sector(struct nullb *nullb, sector_t sector,
	bool is_cache)
{
	unsigned int sector_bit;
	u64 idx;
	struct nullb_page *t_page, *ret;
	struct radix_tree_root *root;

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
	idx = sector >> PAGE_SECTORS_SHIFT;
	sector_bit = (sector & SECTOR_MASK);

	t_page = radix_tree_lookup(root, idx);
	if (t_page) {
		__clear_bit(sector_bit, t_page->bitmap);

		if (null_page_empty(t_page)) {
			ret = radix_tree_delete_item(root, idx, t_page);
			WARN_ON(ret != t_page);
			null_free_page(ret);
			if (is_cache)
				nullb->dev->curr_cache -= PAGE_SIZE;
		}
	}
}

static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
	struct nullb_page *t_page, bool is_cache)
{
	struct radix_tree_root *root;

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;

	if (radix_tree_insert(root, idx, t_page)) {
		null_free_page(t_page);
		t_page = radix_tree_lookup(root, idx);
		WARN_ON(!t_page || t_page->page->index != idx);
	} else if (is_cache)
		nullb->dev->curr_cache += PAGE_SIZE;

	return t_page;
}

static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
{
	unsigned long pos = 0;
	int nr_pages;
	struct nullb_page *ret, *t_pages[FREE_BATCH];
	struct radix_tree_root *root;

	root = is_cache ? &dev->cache : &dev->data;

	do {
		int i;

		nr_pages = radix_tree_gang_lookup(root,
				(void **)t_pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			pos = t_pages[i]->page->index;
			ret = radix_tree_delete_item(root, pos, t_pages[i]);
			WARN_ON(ret != t_pages[i]);
			null_free_page(ret);
		}

		pos++;
	} while (nr_pages == FREE_BATCH);

	if (is_cache)
		dev->curr_cache = 0;
}

static struct nullb_page *__null_lookup_page(struct nullb *nullb,
	sector_t sector, bool for_write, bool is_cache)
{
	unsigned int sector_bit;
	u64 idx;
	struct nullb_page *t_page;
	struct radix_tree_root *root;

	idx = sector >> PAGE_SECTORS_SHIFT;
	sector_bit = (sector & SECTOR_MASK);

	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
	t_page = radix_tree_lookup(root, idx);
	WARN_ON(t_page && t_page->page->index != idx);

	if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
		return t_page;

	return NULL;
}

static struct nullb_page *null_lookup_page(struct nullb *nullb,
	sector_t sector, bool for_write, bool ignore_cache)
{
	struct nullb_page *page = NULL;

	if (!ignore_cache)
		page = __null_lookup_page(nullb, sector, for_write, true);
	if (page)
		return page;
	return __null_lookup_page(nullb, sector, for_write, false);
}

static struct nullb_page *null_insert_page(struct nullb *nullb,
					   sector_t sector, bool ignore_cache)
	__releases(&nullb->lock)
	__acquires(&nullb->lock)
{
	u64 idx;
	struct nullb_page *t_page;

	t_page = null_lookup_page(nullb, sector, true, ignore_cache);
	if (t_page)
		return t_page;

	spin_unlock_irq(&nullb->lock);

	t_page = null_alloc_page(GFP_NOIO);
	if (!t_page)
		goto out_lock;

	if (radix_tree_preload(GFP_NOIO))
		goto out_freepage;

	spin_lock_irq(&nullb->lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	t_page->page->index = idx;
	t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
	radix_tree_preload_end();

	return t_page;
out_freepage:
	null_free_page(t_page);
out_lock:
	spin_lock_irq(&nullb->lock);
	return null_lookup_page(nullb, sector, true, ignore_cache);
}

static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
{
	int i;
	unsigned int offset;
	u64 idx;
	struct nullb_page *t_page, *ret;
	void *dst, *src;

	idx = c_page->page->index;

	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);

	__clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
	if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
		null_free_page(c_page);
		if (t_page && null_page_empty(t_page)) {
			ret = radix_tree_delete_item(&nullb->dev->data,
				idx, t_page);
			null_free_page(t_page);
		}
		return 0;
	}

	if (!t_page)
		return -ENOMEM;

	src = kmap_atomic(c_page->page);
	dst = kmap_atomic(t_page->page);

	for (i = 0; i < PAGE_SECTORS;
			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
		if (test_bit(i, c_page->bitmap)) {
			offset = (i << SECTOR_SHIFT);
			memcpy(dst + offset, src + offset,
				nullb->dev->blocksize);
			__set_bit(i, t_page->bitmap);
		}
	}

	kunmap_atomic(dst);
	kunmap_atomic(src);

	ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
	null_free_page(ret);
	nullb->dev->curr_cache -= PAGE_SIZE;

	return 0;
}

static int null_make_cache_space(struct nullb *nullb, unsigned long n)
{
	int i, err, nr_pages;
	struct nullb_page *c_pages[FREE_BATCH];
	unsigned long flushed = 0, one_round;

again:
	if ((nullb->dev->cache_size * 1024 * 1024) >
	     nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
		return 0;

	nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
			(void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
	/*
	 * nullb_flush_cache_page could unlock before using the c_pages. To
	 * avoid race, we don't allow page free
	 */
	for (i = 0; i < nr_pages; i++) {
		nullb->cache_flush_pos = c_pages[i]->page->index;
		/*
		 * We found the page which is being flushed to disk by other
		 * threads
		 */
		if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
			c_pages[i] = NULL;
		else
			__set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
	}

	one_round = 0;
	for (i = 0; i < nr_pages; i++) {
		if (c_pages[i] == NULL)
			continue;
		err = null_flush_cache_page(nullb, c_pages[i]);
		if (err)
			return err;
		one_round++;
	}
	flushed += one_round << PAGE_SHIFT;

	if (n > flushed) {
		if (nr_pages == 0)
			nullb->cache_flush_pos = 0;
		if (one_round == 0) {
			/* give other threads a chance */
			spin_unlock_irq(&nullb->lock);
			spin_lock_irq(&nullb->lock);
		}
		goto again;
	}
	return 0;
}

static int copy_to_nullb(struct nullb *nullb, struct page *source,
	unsigned int off, sector_t sector, size_t n, bool is_fua)
{
	size_t temp, count = 0;
	unsigned int offset;
	struct nullb_page *t_page;
	void *dst, *src;

	while (count < n) {
		temp = min_t(size_t, nullb->dev->blocksize, n - count);

		if (null_cache_active(nullb) && !is_fua)
			null_make_cache_space(nullb, PAGE_SIZE);

		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
		t_page = null_insert_page(nullb, sector,
			!null_cache_active(nullb) || is_fua);
		if (!t_page)
			return -ENOSPC;

		src = kmap_atomic(source);
		dst = kmap_atomic(t_page->page);
		memcpy(dst + offset, src + off + count, temp);
		kunmap_atomic(dst);
		kunmap_atomic(src);

		__set_bit(sector & SECTOR_MASK, t_page->bitmap);

		if (is_fua)
			null_free_sector(nullb, sector, true);

		count += temp;
		sector += temp >> SECTOR_SHIFT;
	}
	return 0;
}

static int copy_from_nullb(struct nullb *nullb, struct page *dest,
	unsigned int off, sector_t sector, size_t n)
{
	size_t temp, count = 0;
	unsigned int offset;
	struct nullb_page *t_page;
	void *dst, *src;

	while (count < n) {
		temp = min_t(size_t, nullb->dev->blocksize, n - count);

		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
		t_page = null_lookup_page(nullb, sector, false,
			!null_cache_active(nullb));

		dst = kmap_atomic(dest);
		if (!t_page) {
			memset(dst + off + count, 0, temp);
			goto next;
		}
		src = kmap_atomic(t_page->page);
		memcpy(dst + off + count, src + offset, temp);
		kunmap_atomic(src);
next:
		kunmap_atomic(dst);

		count += temp;
		sector += temp >> SECTOR_SHIFT;
	}
	return 0;
}

static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
			       unsigned int len, unsigned int off)
{
	void *dst;

	dst = kmap_atomic(page);
	memset(dst + off, 0xFF, len);
	kunmap_atomic(dst);
}

static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
{
	size_t temp;

	spin_lock_irq(&nullb->lock);
	while (n > 0) {
		temp = min_t(size_t, n, nullb->dev->blocksize);
		null_free_sector(nullb, sector, false);
		if (null_cache_active(nullb))
			null_free_sector(nullb, sector, true);
		sector += temp >> SECTOR_SHIFT;
		n -= temp;
	}
	spin_unlock_irq(&nullb->lock);
}

static int null_handle_flush(struct nullb *nullb)
{
	int err;

	if (!null_cache_active(nullb))
		return 0;

	spin_lock_irq(&nullb->lock);
	while (true) {
		err = null_make_cache_space(nullb,
			nullb->dev->cache_size * 1024 * 1024);
		if (err || nullb->dev->curr_cache == 0)
			break;
	}

	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
	spin_unlock_irq(&nullb->lock);
	return err;
}

static int null_transfer(struct nullb *nullb, struct page *page,
	unsigned int len, unsigned int off, bool is_write, sector_t sector,
	bool is_fua)
{
	struct nullb_device *dev = nullb->dev;
	unsigned int valid_len = len;
	int err = 0;

	if (!is_write) {
		if (dev->zoned)
			valid_len = null_zone_valid_read_len(nullb,
				sector, len);

		if (valid_len) {
			err = copy_from_nullb(nullb, page, off,
				sector, valid_len);
			off += valid_len;
			len -= valid_len;
		}

		if (len)
			nullb_fill_pattern(nullb, page, len, off);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
	}

	return err;
}

static int null_handle_rq(struct nullb_cmd *cmd)
{
	struct request *rq = cmd->rq;
	struct nullb *nullb = cmd->nq->dev->nullb;
	int err;
	unsigned int len;
	sector_t sector;
	struct req_iterator iter;
	struct bio_vec bvec;

	sector = blk_rq_pos(rq);

	if (req_op(rq) == REQ_OP_DISCARD) {
		null_handle_discard(nullb, sector, blk_rq_bytes(rq));
		return 0;
	}

	spin_lock_irq(&nullb->lock);
	rq_for_each_segment(bvec, rq, iter) {
		len = bvec.bv_len;
		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
				     op_is_write(req_op(rq)), sector,
				     rq->cmd_flags & REQ_FUA);
		if (err) {
			spin_unlock_irq(&nullb->lock);
			return err;
		}
		sector += len >> SECTOR_SHIFT;
	}
	spin_unlock_irq(&nullb->lock);

	return 0;
}

static int null_handle_bio(struct nullb_cmd *cmd)
{
	struct bio *bio = cmd->bio;
	struct nullb *nullb = cmd->nq->dev->nullb;
	int err;
	unsigned int len;
	sector_t sector;
	struct bio_vec bvec;
	struct bvec_iter iter;

	sector = bio->bi_iter.bi_sector;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		null_handle_discard(nullb, sector,
			bio_sectors(bio) << SECTOR_SHIFT);
		return 0;
	}

	spin_lock_irq(&nullb->lock);
	bio_for_each_segment(bvec, bio, iter) {
		len = bvec.bv_len;
		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
				     op_is_write(bio_op(bio)), sector,
				     bio->bi_opf & REQ_FUA);
		if (err) {
			spin_unlock_irq(&nullb->lock);
			return err;
		}
		sector += len >> SECTOR_SHIFT;
	}
	spin_unlock_irq(&nullb->lock);
	return 0;
}

static void null_stop_queue(struct nullb *nullb)
{
	struct request_queue *q = nullb->q;

	if (nullb->dev->queue_mode == NULL_Q_MQ)
		blk_mq_stop_hw_queues(q);
}

static void null_restart_queue_async(struct nullb *nullb)
{
	struct request_queue *q = nullb->q;

	if (nullb->dev->queue_mode == NULL_Q_MQ)
		blk_mq_start_stopped_hw_queues(q, true);
}

static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
	struct nullb_device *dev = cmd->nq->dev;
	struct nullb *nullb = dev->nullb;
	blk_status_t sts = BLK_STS_OK;
	struct request *rq = cmd->rq;

	if (!hrtimer_active(&nullb->bw_timer))
		hrtimer_restart(&nullb->bw_timer);

	if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
		null_stop_queue(nullb);
		/* race with timer */
		if (atomic_long_read(&nullb->cur_bytes) > 0)
			null_restart_queue_async(nullb);
		/* requeue request */
		sts = BLK_STS_DEV_RESOURCE;
	}
	return sts;
}

static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
						 sector_t sector,
						 sector_t nr_sectors)
{
	struct badblocks *bb = &cmd->nq->dev->badblocks;
	sector_t first_bad;
	int bad_sectors;

	if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
		return BLK_STS_IOERR;

	return BLK_STS_OK;
}

static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
						     enum req_opf op)
{
	struct nullb_device *dev = cmd->nq->dev;
	int err;

	if (dev->queue_mode == NULL_Q_BIO)
		err = null_handle_bio(cmd);
	else
		err = null_handle_rq(cmd);

	return errno_to_blk_status(err);
}

static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{
	struct nullb_device *dev = cmd->nq->dev;
	struct bio *bio;

	if (dev->memory_backed)
		return;

	if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
		zero_fill_bio(cmd->bio);
	} else if (req_op(cmd->rq) == REQ_OP_READ) {
		__rq_for_each_bio(bio, cmd->rq)
			zero_fill_bio(bio);
	}
}

static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
	/*
	 * Since root privileges are required to configure the null_blk
	 * driver, it is fine that this driver does not initialize the
	 * data buffers of read commands. Zero-initialize these buffers
	 * anyway if KMSAN is enabled to prevent that KMSAN complains
	 * about null_blk not initializing read data buffers.
	 */
	if (IS_ENABLED(CONFIG_KMSAN))
		nullb_zero_read_cmd_buffer(cmd);

	/* Complete IO by inline, softirq or timer */
	switch (cmd->nq->dev->irqmode) {
	case NULL_IRQ_SOFTIRQ:
		switch (cmd->nq->dev->queue_mode) {
		case NULL_Q_MQ:
			blk_mq_complete_request(cmd->rq);
			break;
		case NULL_Q_BIO:
			/*
			 * XXX: no proper submitting cpu information available.
			 */
			end_cmd(cmd);
			break;
		}
		break;
	case NULL_IRQ_NONE:
		end_cmd(cmd);
		break;
	case NULL_IRQ_TIMER:
		null_cmd_end_timer(cmd);
		break;
	}
}

blk_status_t null_process_cmd(struct nullb_cmd *cmd,
			      enum req_opf op, sector_t sector,
			      unsigned int nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;
	blk_status_t ret;

	if (dev->badblocks.shift != -1) {
		ret = null_handle_badblocks(cmd, sector, nr_sectors);
		if (ret != BLK_STS_OK)
			return ret;
	}

	if (dev->memory_backed)
		return null_handle_memory_backed(cmd, op);

	return BLK_STS_OK;
}

static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
				    sector_t nr_sectors, enum req_opf op)
{
	struct nullb_device *dev = cmd->nq->dev;
	struct nullb *nullb = dev->nullb;
	blk_status_t sts;

	if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
		sts = null_handle_throttled(cmd);
		if (sts != BLK_STS_OK)
			return sts;
	}

	if (op == REQ_OP_FLUSH) {
		cmd->error = errno_to_blk_status(null_handle_flush(nullb));
		goto out;
	}

	if (dev->zoned)
		sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
	else
		sts = null_process_cmd(cmd, op, sector, nr_sectors);

	/* Do not overwrite errors (e.g. timeout errors) */
	if (cmd->error == BLK_STS_OK)
		cmd->error = sts;

out:
	nullb_complete_cmd(cmd);
	return BLK_STS_OK;
}

static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
{
	struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
	unsigned int mbps = nullb->dev->mbps;

	if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
		return HRTIMER_NORESTART;

	atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
	null_restart_queue_async(nullb);

	hrtimer_forward_now(&nullb->bw_timer, timer_interval);

	return HRTIMER_RESTART;
}

static void nullb_setup_bwtimer(struct nullb *nullb)
{
	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);

	hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	nullb->bw_timer.function = nullb_bwtimer_fn;
	atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
	hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}

static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
{
	int index = 0;

	if (nullb->nr_queues != 1)
		index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);

	return &nullb->queues[index];
}

static blk_qc_t null_submit_bio(struct bio *bio)
{
	sector_t sector = bio->bi_iter.bi_sector;
	sector_t nr_sectors = bio_sectors(bio);
	struct nullb *nullb = bio->bi_disk->private_data;
	struct nullb_queue *nq = nullb_to_queue(nullb);
	struct nullb_cmd *cmd;

	cmd = alloc_cmd(nq, 1);
	cmd->bio = bio;

	null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
	return BLK_QC_T_NONE;
}

static bool should_timeout_request(struct request *rq)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (g_timeout_str[0])
		return should_fail(&null_timeout_attr, 1);
#endif
	return false;
}

static bool should_requeue_request(struct request *rq)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (g_requeue_str[0])
		return should_fail(&null_requeue_attr, 1);
#endif
	return false;
}

static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

	pr_info("rq %p timed out\n", rq);

	/*
	 * If the device is marked as blocking (i.e. memory backed or zoned
	 * device), the submission path may be blocked waiting for resources
	 * and cause real timeouts. For these real timeouts, the submission
	 * path will complete the request using blk_mq_complete_request().
	 * Only fake timeouts need to execute blk_mq_complete_request() here.
	 */
	cmd->error = BLK_STS_TIMEOUT;
	if (cmd->fake_timeout)
		blk_mq_complete_request(rq);
	return BLK_EH_DONE;
}

static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
	struct nullb_queue *nq = hctx->driver_data;
	sector_t nr_sectors = blk_rq_sectors(bd->rq);
	sector_t sector = blk_rq_pos(bd->rq);

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	if (nq->dev->irqmode == NULL_IRQ_TIMER) {
		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		cmd->timer.function = null_cmd_timer_expired;
	}
	cmd->rq = bd->rq;
	cmd->error = BLK_STS_OK;
	cmd->nq = nq;
	cmd->fake_timeout = should_timeout_request(bd->rq) ||
		blk_should_fake_timeout(bd->rq->q);

	blk_mq_start_request(bd->rq);

	if (should_requeue_request(bd->rq)) {
		/*
		 * Alternate between hitting the core BUSY path, and the
		 * driver driven requeue path
		 */
		nq->requeue_selection++;
		if (nq->requeue_selection & 1)
			return BLK_STS_RESOURCE;
		else {
			blk_mq_requeue_request(bd->rq, true);
			return BLK_STS_OK;
		}
	}
	if (cmd->fake_timeout)
		return BLK_STS_OK;

	return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
}

static void cleanup_queue(struct nullb_queue *nq)
{
	kfree(nq->tag_map);
	kfree(nq->cmds);
}

static void cleanup_queues(struct nullb *nullb)
{
	int i;

	for (i = 0; i < nullb->nr_queues; i++)
		cleanup_queue(&nullb->queues[i]);

	kfree(nullb->queues);
}

static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nullb_queue *nq = hctx->driver_data;
	struct nullb *nullb = nq->dev->nullb;

	nullb->nr_queues--;
}

static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
	init_waitqueue_head(&nq->wait);
	nq->queue_depth = nullb->queue_depth;
	nq->dev = nullb->dev;
}

static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
			  unsigned int hctx_idx)
{
	struct nullb *nullb = hctx->queue->queuedata;
	struct nullb_queue *nq;

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1))
		return -EFAULT;
#endif

	nq = &nullb->queues[hctx_idx];
	hctx->driver_data = nq;
	null_init_queue(nullb, nq);
	nullb->nr_queues++;

	return 0;
}

static const struct blk_mq_ops null_mq_ops = {
	.queue_rq	= null_queue_rq,
	.complete	= null_complete_rq,
	.timeout	= null_timeout_rq,
	.init_hctx	= null_init_hctx,
	.exit_hctx	= null_exit_hctx,
};

static void null_del_dev(struct nullb *nullb)
{
	struct nullb_device *dev;

	if (!nullb)
		return;

	dev = nullb->dev;

	ida_simple_remove(&nullb_indexes, nullb->index);

	list_del_init(&nullb->list);

	del_gendisk(nullb->disk);

	if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
		hrtimer_cancel(&nullb->bw_timer);
		atomic_long_set(&nullb->cur_bytes, LONG_MAX);
		null_restart_queue_async(nullb);
	}

	blk_cleanup_queue(nullb->q);
	if (dev->queue_mode == NULL_Q_MQ &&
	    nullb->tag_set == &nullb->__tag_set)
		blk_mq_free_tag_set(nullb->tag_set);
	put_disk(nullb->disk);
	cleanup_queues(nullb);
	if (null_cache_active(nullb))
		null_free_device_storage(nullb->dev, true);
	kfree(nullb);
	dev->nullb = NULL;
}

static void null_config_discard(struct nullb *nullb)
{
	if (nullb->dev->discard == false)
		return;

	if (nullb->dev->zoned) {
		nullb->dev->discard = false;
		pr_info("discard option is ignored in zoned mode\n");
		return;
	}

	nullb->q->limits.discard_granularity = nullb->dev->blocksize;
	nullb->q->limits.discard_alignment = nullb->dev->blocksize;
	blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q);
}

static const struct block_device_operations null_bio_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= null_submit_bio,
	.report_zones	= null_report_zones,
};

static const struct block_device_operations null_rq_ops = {
	.owner		= THIS_MODULE,
	.report_zones	= null_report_zones,
};

static int setup_commands(struct nullb_queue *nq)
{
	struct nullb_cmd *cmd;
	int i, tag_size;

	nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
	if (!nq->cmds)
		return -ENOMEM;

	tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
	nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
	if (!nq->tag_map) {
		kfree(nq->cmds);
		return -ENOMEM;
	}

	for (i = 0; i < nq->queue_depth; i++) {
		cmd = &nq->cmds[i];
		cmd->tag = -1U;
	}

	return 0;
}

static int setup_queues(struct nullb *nullb)
{
	nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue),
				GFP_KERNEL);
	if (!nullb->queues)
		return -ENOMEM;

	nullb->queue_depth = nullb->dev->hw_queue_depth;

	return 0;
}

static int init_driver_queues(struct nullb *nullb)
{
	struct nullb_queue *nq;
	int i, ret = 0;

	for (i = 0; i < nullb->dev->submit_queues; i++) {
		nq = &nullb->queues[i];

		null_init_queue(nullb, nq);

		ret = setup_commands(nq);
		if (ret)
			return ret;
		nullb->nr_queues++;
	}
	return 0;
}

static int null_gendisk_register(struct nullb *nullb)
{
	sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
	struct gendisk *disk;

	disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
	if (!disk)
		return -ENOMEM;
	set_capacity(disk, size);

	disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
	disk->major = null_major;
	disk->first_minor = nullb->index;
	if (queue_is_mq(nullb->q))
		disk->fops = &null_rq_ops;
	else
		disk->fops = &null_bio_ops;
	disk->private_data = nullb;
	disk->queue = nullb->q;
	strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);

	if (nullb->dev->zoned) {
		int ret = null_register_zoned_dev(nullb);

		if (ret)
			return ret;
	}

	add_disk(disk);
	return 0;
}

static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
	set->ops = &null_mq_ops;
	set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
						g_submit_queues;
	set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
						g_hw_queue_depth;
	set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
	set->cmd_size = sizeof(struct nullb_cmd);
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	if (g_no_sched)
		set->flags |= BLK_MQ_F_NO_SCHED;
	if (g_shared_tag_bitmap)
		set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
	set->driver_data = NULL;

	if ((nullb && nullb->dev->blocking) || g_blocking)
		set->flags |= BLK_MQ_F_BLOCKING;

	return blk_mq_alloc_tag_set(set);
}

static int null_validate_conf(struct nullb_device *dev)
{
	if (dev->queue_mode == NULL_Q_RQ) {
		pr_err("legacy IO path is no longer available\n");
		return -EINVAL;
	}

	if (blk_validate_block_size(dev->blocksize))
		return -EINVAL;

	if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
		if (dev->submit_queues != nr_online_nodes)
			dev->submit_queues = nr_online_nodes;
	} else if (dev->submit_queues > nr_cpu_ids)
		dev->submit_queues = nr_cpu_ids;
	else if (dev->submit_queues == 0)
		dev->submit_queues = 1;

	dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);

	/* Do memory allocation, so set blocking */
	if (dev->memory_backed)
		dev->blocking = true;
	else /* cache is meaningless */
		dev->cache_size = 0;
	dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
						dev->cache_size);
	dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
	/* can not stop a queue */
	if (dev->queue_mode == NULL_Q_BIO)
		dev->mbps = 0;

	if (dev->zoned &&
	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}

	return 0;
}

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static bool __null_setup_fault(struct fault_attr *attr, char *str)
{
	if (!str[0])
		return true;

	if (!setup_fault_attr(attr, str))
		return false;

	attr->verbose = 0;
	return true;
}
#endif

static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
	if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
		return false;
	if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
		return false;
	if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str))
		return false;
#endif
	return true;
}

static int null_add_dev(struct nullb_device *dev)
{
	struct nullb *nullb;
	int rv;

	rv = null_validate_conf(dev);
	if (rv)
		return rv;

	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
	if (!nullb) {
		rv = -ENOMEM;
		goto out;
	}
	nullb->dev = dev;
	dev->nullb = nullb;

	spin_lock_init(&nullb->lock);

	rv = setup_queues(nullb);
	if (rv)
		goto out_free_nullb;

	if (dev->queue_mode == NULL_Q_MQ) {
		if (shared_tags) {
			nullb->tag_set = &tag_set;
			rv = 0;
		} else {
			nullb->tag_set = &nullb->__tag_set;
			rv = null_init_tag_set(nullb, nullb->tag_set);
		}

		if (rv)
			goto out_cleanup_queues;

		if (!null_setup_fault())
			goto out_cleanup_queues;

		nullb->tag_set->timeout = 5 * HZ;
		nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb);
		if (IS_ERR(nullb->q)) {
			rv = -ENOMEM;
			goto out_cleanup_tags;
		}
	} else if (dev->queue_mode == NULL_Q_BIO) {
		nullb->q = blk_alloc_queue(dev->home_node);
		if (!nullb->q) {
			rv = -ENOMEM;
			goto out_cleanup_queues;
		}
		rv = init_driver_queues(nullb);
		if (rv)
			goto out_cleanup_blk_queue;
	}

	if (dev->mbps) {
		set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
		nullb_setup_bwtimer(nullb);
	}

	if (dev->cache_size > 0) {
		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
		blk_queue_write_cache(nullb->q, true, true);
	}

	if (dev->zoned) {
		rv = null_init_zoned_dev(dev, nullb->q);
		if (rv)
			goto out_cleanup_blk_queue;
	}

	nullb->q->queuedata = nullb;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);

	mutex_lock(&lock);
	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
	if (rv < 0) {
		mutex_unlock(&lock);
		goto out_cleanup_zone;
	}
	nullb->index = rv;
	dev->index = rv;
	mutex_unlock(&lock);

	blk_queue_logical_block_size(nullb->q, dev->blocksize);
	blk_queue_physical_block_size(nullb->q, dev->blocksize);

	null_config_discard(nullb);

	sprintf(nullb->disk_name, "nullb%d", nullb->index);

	rv = null_gendisk_register(nullb);
	if (rv)
		goto out_ida_free;

	mutex_lock(&lock);
	list_add_tail(&nullb->list, &nullb_list);
	mutex_unlock(&lock);

	return 0;

out_ida_free:
	ida_free(&nullb_indexes, nullb->index);
out_cleanup_zone:
	null_free_zoned_dev(dev);
out_cleanup_blk_queue:
	blk_cleanup_queue(nullb->q);
out_cleanup_tags:
	if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
		blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
	cleanup_queues(nullb);
out_free_nullb:
	kfree(nullb);
	dev->nullb = NULL;
out:
	return rv;
}

static int __init null_init(void)
{
	int ret = 0;
	unsigned int i;
	struct nullb *nullb;
	struct nullb_device *dev;

	if (g_bs > PAGE_SIZE) {
		pr_warn("invalid block size\n");
		pr_warn("defaults block size to %lu\n", PAGE_SIZE);
		g_bs = PAGE_SIZE;
	}

	if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
		pr_err("invalid home_node value\n");
		g_home_node = NUMA_NO_NODE;
	}

	if (g_queue_mode == NULL_Q_RQ) {
		pr_err("legacy IO path no longer available\n");
		return -EINVAL;
	}
	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
		if (g_submit_queues != nr_online_nodes) {
			pr_warn("submit_queues param is set to %u.\n",
				nr_online_nodes);
			g_submit_queues = nr_online_nodes;
		}
	} else if (g_submit_queues > nr_cpu_ids)
		g_submit_queues = nr_cpu_ids;
	else if (g_submit_queues <= 0)
		g_submit_queues = 1;

	if (g_queue_mode == NULL_Q_MQ && shared_tags) {
		ret = null_init_tag_set(NULL, &tag_set);
		if (ret)
			return ret;
	}

	config_group_init(&nullb_subsys.su_group);
	mutex_init(&nullb_subsys.su_mutex);

	ret = configfs_register_subsystem(&nullb_subsys);
	if (ret)
		goto err_tagset;

	mutex_init(&lock);

	null_major = register_blkdev(0, "nullb");
	if (null_major < 0) {
		ret = null_major;
		goto err_conf;
	}

	for (i = 0; i < nr_devices; i++) {
		dev = null_alloc_dev();
		if (!dev) {
			ret = -ENOMEM;
			goto err_dev;
		}
		ret = null_add_dev(dev);
		if (ret) {
			null_free_dev(dev);
			goto err_dev;
		}
	}

	pr_info("module loaded\n");
	return 0;

err_dev:
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		dev = nullb->dev;
		null_del_dev(nullb);
		null_free_dev(dev);
	}
	unregister_blkdev(null_major, "nullb");
err_conf:
	configfs_unregister_subsystem(&nullb_subsys);
err_tagset:
	if (g_queue_mode == NULL_Q_MQ && shared_tags)
		blk_mq_free_tag_set(&tag_set);
	return ret;
}

static void __exit null_exit(void)
{
	struct nullb *nullb;

	configfs_unregister_subsystem(&nullb_subsys);

	unregister_blkdev(null_major, "nullb");

	mutex_lock(&lock);
	while (!list_empty(&nullb_list)) {
		struct nullb_device *dev;

		nullb = list_entry(nullb_list.next, struct nullb, list);
		dev = nullb->dev;
		null_del_dev(nullb);
		null_free_dev(dev);
	}
	mutex_unlock(&lock);

	if (g_queue_mode == NULL_Q_MQ && shared_tags)
		blk_mq_free_tag_set(&tag_set);
}

module_init(null_init);
module_exit(null_exit);

MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
MODULE_LICENSE("GPL");