// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT)

static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	return sect >> ilog2(dev->zone_size_sects);
}

int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
	sector_t dev_capacity_sects, zone_capacity_sects;
	sector_t sector = 0;
	unsigned int i;

	if (!is_power_of_2(dev->zone_size)) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}
	if (dev->zone_size > dev->size) {
		pr_err("Zone size larger than device capacity\n");
		return -EINVAL;
	}

	if (!dev->zone_capacity)
		dev->zone_capacity = dev->zone_size;

	if (dev->zone_capacity > dev->zone_size) {
		pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
		       dev->zone_capacity, dev->zone_size);
		return -EINVAL;
	}

	zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity);
	dev_capacity_sects = MB_TO_SECTS(dev->size);
	dev->zone_size_sects = MB_TO_SECTS(dev->zone_size);
	dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects);
	if (dev_capacity_sects & (dev->zone_size_sects - 1))
		dev->nr_zones++;

	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
				    GFP_KERNEL | __GFP_ZERO);
	if (!dev->zones)
		return -ENOMEM;
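
	/*
	 * Worked example of the geometry computed above (illustrative values
	 * only): with size = 1024 (MB) and zone_size = 256 (MB), MB_TO_SECTS()
	 * yields dev_capacity_sects = 2097152 and zone_size_sects = 524288
	 * 512-byte sectors, so nr_zones = 4. If the device size is not a
	 * multiple of the zone size, the rounding above adds one extra zone
	 * covering the remaining capacity.
	 */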

	/*
	 * With memory backing, the zone_lock spinlock needs to be temporarily
	 * released to avoid scheduling in atomic context. To guarantee zone
	 * information protection, use a bitmap to lock zones with
	 * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing
	 * implies that the queue is marked with BLK_MQ_F_BLOCKING.
	 */
	spin_lock_init(&dev->zone_lock);
	if (dev->memory_backed) {
		dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL);
		if (!dev->zone_locks) {
			kvfree(dev->zones);
			return -ENOMEM;
		}
	}

	if (dev->zone_nr_conv >= dev->nr_zones) {
		dev->zone_nr_conv = dev->nr_zones - 1;
		pr_info("changed the number of conventional zones to %u\n",
			dev->zone_nr_conv);
	}

	/* Max active zones has to be < nbr of seq zones in order to be enforceable */
	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}

	for (i = 0; i < dev->zone_nr_conv; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = zone->len;
		zone->wp = zone->start + zone->len;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;

		sector += dev->zone_size_sects;
	}

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = zone->wp = sector;
		if (zone->start + dev->zone_size_sects > dev_capacity_sects)
			zone->len = dev_capacity_sects - zone->start;
		else
			zone->len = dev->zone_size_sects;
		zone->capacity =
			min_t(sector_t, zone->len, zone_capacity_sects);
		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone->cond = BLK_ZONE_COND_EMPTY;

		sector += dev->zone_size_sects;
	}

	q->limits.zoned = BLK_ZONED_HM;
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

	return 0;
}

int null_register_zoned_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;
	struct request_queue *q = nullb->q;

	if (queue_is_mq(q)) {
		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

		if (ret)
			return ret;
	} else {
		blk_queue_chunk_sectors(q, dev->zone_size_sects);
		q->nr_zones = blkdev_nr_zones(nullb->disk);
	}

	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
	blk_queue_max_open_zones(q, dev->zone_max_open);
	blk_queue_max_active_zones(q, dev->zone_max_active);

	return 0;
}

void null_free_zoned_dev(struct nullb_device *dev)
{
	bitmap_free(dev->zone_locks);
	kvfree(dev->zones);
	dev->zones = NULL;
}

static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno)
{
	if (dev->memory_backed)
		wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE);
	spin_lock_irq(&dev->zone_lock);
}

static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno)
{
	spin_unlock_irq(&dev->zone_lock);

	if (dev->memory_backed)
		clear_and_wake_up_bit(zno, dev->zone_locks);
}
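
/*
 * Typical use of the locking helpers above (a sketch derived from the callers
 * in this file, not an additional API):
 *
 *	zno = null_zone_no(dev, sector);
 *	null_lock_zone(dev, zno);
 *	... read or update dev->zones[zno] ...
 *	null_unlock_zone(dev, zno);
 *
 * Without memory backing only the zone_lock spinlock is taken; with memory
 * backing the per-zone bitmap bit additionally acts as a sleepable lock so
 * that the spinlock can be dropped around allocations.
 */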

int null_report_zones(struct gendisk *disk, sector_t sector,
		      unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nullb *nullb = disk->private_data;
	struct nullb_device *dev = nullb->dev;
	unsigned int first_zone, i, zno;
	struct blk_zone zone;
	int error;

	first_zone = null_zone_no(dev, sector);
	if (first_zone >= dev->nr_zones)
		return 0;

	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
	trace_nullb_report_zones(nullb, nr_zones);

	zno = first_zone;
	for (i = 0; i < nr_zones; i++, zno++) {
		/*
		 * Stacked DM target drivers will remap the zone information by
		 * modifying the zone information passed to the report callback.
		 * So use a local copy to avoid corruption of the device zone
		 * array.
		 */
		null_lock_zone(dev, zno);
		memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone));
		null_unlock_zone(dev, zno);

		error = cb(&zone, i, data);
		if (error)
			return error;
	}

	return nr_zones;
}

/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
				sector_t sector, unsigned int len)
{
	struct nullb_device *dev = nullb->dev;
	struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)];
	unsigned int nr_sectors = len >> SECTOR_SHIFT;

	/* Read must be below the write pointer position */
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
	    sector + nr_sectors <= zone->wp)
		return len;

	if (sector > zone->wp)
		return 0;

	return (zone->wp - sector) << SECTOR_SHIFT;
}

static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* close operation on closed is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	if (zone->wp == zone->start) {
		zone->cond = BLK_ZONE_COND_EMPTY;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		dev->nr_zones_closed++;
	}

	return BLK_STS_OK;
}

static void null_close_first_imp_zone(struct nullb_device *dev)
{
	unsigned int i;

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
			null_close_zone(dev, &dev->zones[i]);
			return;
		}
	}
}

static blk_status_t null_check_active(struct nullb_device *dev)
{
	if (!dev->zone_max_active)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
			dev->nr_zones_closed < dev->zone_max_active)
		return BLK_STS_OK;

	return BLK_STS_ZONE_ACTIVE_RESOURCE;
}

static blk_status_t null_check_open(struct nullb_device *dev)
{
	if (!dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_imp_open) {
		if (null_check_active(dev) == BLK_STS_OK) {
			null_close_first_imp_zone(dev);
			return BLK_STS_OK;
		}
	}

	return BLK_STS_ZONE_OPEN_RESOURCE;
}
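
/*
 * Illustrative example for null_check_open() above (assumed limits): with
 * zone_max_open = 2, zone_max_active = 0 (unlimited) and two implicitly open
 * zones, opening a third zone first closes one of the implicitly open zones
 * and then succeeds. If both open zones were explicitly opened, nothing can
 * be closed automatically and BLK_STS_ZONE_OPEN_RESOURCE is returned.
 */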

/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_active(dev);
		if (ret != BLK_STS_OK)
			return ret;
		fallthrough;
	case BLK_ZONE_COND_CLOSED:
		return null_check_open(dev);
	default:
		/* Should never be called for other states */
		WARN_ON(1);
		return BLK_STS_IOERR;
	}
}
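
/*
 * Example of the active limit caveat described above (assumed limits): with
 * zone_max_active = 2, one implicitly open zone and one closed zone, writing
 * to an EMPTY zone fails with BLK_STS_ZONE_ACTIVE_RESOURCE even if closing
 * the implicitly open zone would free an open slot, because the closed zone
 * still counts against the active limit.
 */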

static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				    unsigned int nr_sectors, bool append)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	struct blk_zone *zone = &dev->zones[zno];
	blk_status_t ret;

	trace_nullb_zone_op(cmd, zno, zone->cond);

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
		if (append)
			return BLK_STS_IOERR;
		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	}

	null_lock_zone(dev, zno);

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* Cannot write to a full zone */
		ret = BLK_STS_IOERR;
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		break;
	default:
		/* Invalid zone condition */
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	/*
	 * Regular writes must be at the write pointer position.
	 * Zone append writes are automatically issued at the write
	 * pointer and the position returned using the request or BIO
	 * sector.
	 */
	if (append) {
		sector = zone->wp;
		if (cmd->bio)
			cmd->bio->bi_iter.bi_sector = sector;
		else
			cmd->rq->__sector = sector;
	} else if (sector != zone->wp) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->cond == BLK_ZONE_COND_CLOSED) {
		dev->nr_zones_closed--;
		dev->nr_zones_imp_open++;
	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
		dev->nr_zones_imp_open++;
	}
	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	/*
	 * Memory backing allocation may sleep: release the zone_lock spinlock
	 * to avoid scheduling in atomic context. Zone operation atomicity is
	 * still guaranteed through the zone_locks bitmap.
	 */
	if (dev->memory_backed)
		spin_unlock_irq(&dev->zone_lock);
	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	if (dev->memory_backed)
		spin_lock_irq(&dev->zone_lock);

	if (ret != BLK_STS_OK)
		goto unlock;

	zone->wp += nr_sectors;
	if (zone->wp == zone->start + zone->capacity) {
		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
			dev->nr_zones_exp_open--;
		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
			dev->nr_zones_imp_open--;
		zone->cond = BLK_ZONE_COND_FULL;
	}
	ret = BLK_STS_OK;

unlock:
	null_unlock_zone(dev, zno);

	return ret;
}

static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* open operation on exp open is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EXP_OPEN;
	dev->nr_zones_exp_open++;

	return BLK_STS_OK;
}

static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* finish operation on full is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = zone->start + zone->len;

	return BLK_STS_OK;
}

static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		/* reset operation on empty is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;

	return BLK_STS_OK;
}
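
/*
 * Summary of the sequential zone condition transitions implemented by the
 * helpers above (derived from the code in this file, not quoted from a
 * standard):
 *
 *	EMPTY/CLOSED            --write--------> IMP_OPEN, then FULL once the
 *	                                         write pointer reaches capacity
 *	EMPTY/CLOSED/IMP_OPEN   --zone open----> EXP_OPEN
 *	IMP_OPEN/EXP_OPEN       --zone close---> CLOSED, or EMPTY if wp == start
 *	EMPTY/OPEN/CLOSED       --zone finish--> FULL (no-op on FULL)
 *	OPEN/CLOSED/FULL        --zone reset---> EMPTY (no-op on EMPTY)
 *
 * Conventional zones reject all of these operations with BLK_STS_IOERR.
 */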

static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
				   sector_t sector)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zone_no;
	struct blk_zone *zone;
	blk_status_t ret;
	size_t i;

	if (op == REQ_OP_ZONE_RESET_ALL) {
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
			null_lock_zone(dev, i);
			zone = &dev->zones[i];
			if (zone->cond != BLK_ZONE_COND_EMPTY) {
				null_reset_zone(dev, zone);
				trace_nullb_zone_op(cmd, i, zone->cond);
			}
			null_unlock_zone(dev, i);
		}
		return BLK_STS_OK;
	}

	zone_no = null_zone_no(dev, sector);
	zone = &dev->zones[zone_no];

	null_lock_zone(dev, zone_no);

	switch (op) {
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		ret = BLK_STS_NOTSUPP;
		break;
	}

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	null_unlock_zone(dev, zone_no);

	return ret;
}

blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
				    sector_t sector, sector_t nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	blk_status_t sts;

	switch (op) {
	case REQ_OP_WRITE:
		sts = null_zone_write(cmd, sector, nr_sectors, false);
		break;
	case REQ_OP_ZONE_APPEND:
		sts = null_zone_write(cmd, sector, nr_sectors, true);
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		sts = null_zone_mgmt(cmd, op, sector);
		break;
	default:
		null_lock_zone(dev, zno);
		sts = null_process_cmd(cmd, op, sector, nr_sectors);
		null_unlock_zone(dev, zno);
	}

	return sts;
}
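
/*
 * Example of exercising this code from user space (an illustration, assuming
 * the usual null_blk module parameters and the util-linux blkzone tool):
 *
 *	# modprobe null_blk zoned=1 gb=1 zone_size=64 zone_max_open=4
 *	# blkzone report /dev/nullb0
 *	# blkzone reset /dev/nullb0
 *
 * A zone report ends up in null_report_zones(), while writes, zone append
 * and zone management operations are dispatched via null_process_zoned_cmd().
 */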