1/* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7#include "dm.h" 8#include "dm-bio-prison-v2.h" 9#include "dm-bio-record.h" 10#include "dm-cache-metadata.h" 11 12#include <linux/dm-io.h> 13#include <linux/dm-kcopyd.h> 14#include <linux/jiffies.h> 15#include <linux/init.h> 16#include <linux/mempool.h> 17#include <linux/module.h> 18#include <linux/rwsem.h> 19#include <linux/slab.h> 20#include <linux/vmalloc.h> 21 22#define DM_MSG_PREFIX "cache" 23 24DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27/*----------------------------------------------------------------*/ 28 29/* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40/*----------------------------------------------------------------*/ 41 42struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56}; 57 58static void iot_init(struct io_tracker *iot) 59{ 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64} 65 66static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67{ 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72} 73 74static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75{ 76 bool r; 77 78 spin_lock_irq(&iot->lock); 79 r = __iot_idle_for(iot, jifs); 80 spin_unlock_irq(&iot->lock); 81 82 return r; 83} 84 85static void iot_io_begin(struct io_tracker *iot, sector_t len) 86{ 87 spin_lock_irq(&iot->lock); 88 iot->in_flight += len; 89 spin_unlock_irq(&iot->lock); 90} 91 92static void __iot_io_end(struct io_tracker *iot, sector_t len) 93{ 94 if (!len) 95 return; 96 97 iot->in_flight -= len; 98 if (!iot->in_flight) 99 iot->idle_time = jiffies; 100} 101 102static void iot_io_end(struct io_tracker *iot, sector_t len) 103{ 104 unsigned long flags; 105 106 spin_lock_irqsave(&iot->lock, flags); 107 __iot_io_end(iot, len); 108 spin_unlock_irqrestore(&iot->lock, flags); 109} 110 111/*----------------------------------------------------------------*/ 112 113/* 114 * Represents a chunk of future work. 'input' allows continuations to pass 115 * values between themselves, typically error values. 116 */ 117struct continuation { 118 struct work_struct ws; 119 blk_status_t input; 120}; 121 122static inline void init_continuation(struct continuation *k, 123 void (*fn)(struct work_struct *)) 124{ 125 INIT_WORK(&k->ws, fn); 126 k->input = 0; 127} 128 129static inline void queue_continuation(struct workqueue_struct *wq, 130 struct continuation *k) 131{ 132 queue_work(wq, &k->ws); 133} 134 135/*----------------------------------------------------------------*/ 136 137/* 138 * The batcher collects together pieces of work that need a particular 139 * operation to occur before they can proceed (typically a commit). 140 */ 141struct batcher { 142 /* 143 * The operation that everyone is waiting for. 144 */ 145 blk_status_t (*commit_op)(void *context); 146 void *commit_context; 147 148 /* 149 * This is how bios should be issued once the commit op is complete 150 * (accounted_request). 151 */ 152 void (*issue_op)(struct bio *bio, void *context); 153 void *issue_context; 154 155 /* 156 * Queued work gets put on here after commit. 157 */ 158 struct workqueue_struct *wq; 159 160 spinlock_t lock; 161 struct list_head work_items; 162 struct bio_list bios; 163 struct work_struct commit_work; 164 165 bool commit_scheduled; 166}; 167 168static void __commit(struct work_struct *_ws) 169{ 170 struct batcher *b = container_of(_ws, struct batcher, commit_work); 171 blk_status_t r; 172 struct list_head work_items; 173 struct work_struct *ws, *tmp; 174 struct continuation *k; 175 struct bio *bio; 176 struct bio_list bios; 177 178 INIT_LIST_HEAD(&work_items); 179 bio_list_init(&bios); 180 181 /* 182 * We have to grab these before the commit_op to avoid a race 183 * condition. 184 */ 185 spin_lock_irq(&b->lock); 186 list_splice_init(&b->work_items, &work_items); 187 bio_list_merge(&bios, &b->bios); 188 bio_list_init(&b->bios); 189 b->commit_scheduled = false; 190 spin_unlock_irq(&b->lock); 191 192 r = b->commit_op(b->commit_context); 193 194 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 195 k = container_of(ws, struct continuation, ws); 196 k->input = r; 197 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 198 queue_work(b->wq, ws); 199 } 200 201 while ((bio = bio_list_pop(&bios))) { 202 if (r) { 203 bio->bi_status = r; 204 bio_endio(bio); 205 } else 206 b->issue_op(bio, b->issue_context); 207 } 208} 209 210static void batcher_init(struct batcher *b, 211 blk_status_t (*commit_op)(void *), 212 void *commit_context, 213 void (*issue_op)(struct bio *bio, void *), 214 void *issue_context, 215 struct workqueue_struct *wq) 216{ 217 b->commit_op = commit_op; 218 b->commit_context = commit_context; 219 b->issue_op = issue_op; 220 b->issue_context = issue_context; 221 b->wq = wq; 222 223 spin_lock_init(&b->lock); 224 INIT_LIST_HEAD(&b->work_items); 225 bio_list_init(&b->bios); 226 INIT_WORK(&b->commit_work, __commit); 227 b->commit_scheduled = false; 228} 229 230static void async_commit(struct batcher *b) 231{ 232 queue_work(b->wq, &b->commit_work); 233} 234 235static void continue_after_commit(struct batcher *b, struct continuation *k) 236{ 237 bool commit_scheduled; 238 239 spin_lock_irq(&b->lock); 240 commit_scheduled = b->commit_scheduled; 241 list_add_tail(&k->ws.entry, &b->work_items); 242 spin_unlock_irq(&b->lock); 243 244 if (commit_scheduled) 245 async_commit(b); 246} 247 248/* 249 * Bios are errored if commit failed. 250 */ 251static void issue_after_commit(struct batcher *b, struct bio *bio) 252{ 253 bool commit_scheduled; 254 255 spin_lock_irq(&b->lock); 256 commit_scheduled = b->commit_scheduled; 257 bio_list_add(&b->bios, bio); 258 spin_unlock_irq(&b->lock); 259 260 if (commit_scheduled) 261 async_commit(b); 262} 263 264/* 265 * Call this if some urgent work is waiting for the commit to complete. 266 */ 267static void schedule_commit(struct batcher *b) 268{ 269 bool immediate; 270 271 spin_lock_irq(&b->lock); 272 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 273 b->commit_scheduled = true; 274 spin_unlock_irq(&b->lock); 275 276 if (immediate) 277 async_commit(b); 278} 279 280/* 281 * There are a couple of places where we let a bio run, but want to do some 282 * work before calling its endio function. We do this by temporarily 283 * changing the endio fn. 284 */ 285struct dm_hook_info { 286 bio_end_io_t *bi_end_io; 287}; 288 289static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 290 bio_end_io_t *bi_end_io, void *bi_private) 291{ 292 h->bi_end_io = bio->bi_end_io; 293 294 bio->bi_end_io = bi_end_io; 295 bio->bi_private = bi_private; 296} 297 298static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 299{ 300 bio->bi_end_io = h->bi_end_io; 301} 302 303/*----------------------------------------------------------------*/ 304 305#define MIGRATION_POOL_SIZE 128 306#define COMMIT_PERIOD HZ 307#define MIGRATION_COUNT_WINDOW 10 308 309/* 310 * The block size of the device holding cache data must be 311 * between 32KB and 1GB. 312 */ 313#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 314#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 315 316enum cache_metadata_mode { 317 CM_WRITE, /* metadata may be changed */ 318 CM_READ_ONLY, /* metadata may not be changed */ 319 CM_FAIL 320}; 321 322enum cache_io_mode { 323 /* 324 * Data is written to cached blocks only. These blocks are marked 325 * dirty. If you lose the cache device you will lose data. 326 * Potential performance increase for both reads and writes. 327 */ 328 CM_IO_WRITEBACK, 329 330 /* 331 * Data is written to both cache and origin. Blocks are never 332 * dirty. Potential performance benfit for reads only. 333 */ 334 CM_IO_WRITETHROUGH, 335 336 /* 337 * A degraded mode useful for various cache coherency situations 338 * (eg, rolling back snapshots). Reads and writes always go to the 339 * origin. If a write goes to a cached oblock, then the cache 340 * block is invalidated. 341 */ 342 CM_IO_PASSTHROUGH 343}; 344 345struct cache_features { 346 enum cache_metadata_mode mode; 347 enum cache_io_mode io_mode; 348 unsigned metadata_version; 349 bool discard_passdown:1; 350}; 351 352struct cache_stats { 353 atomic_t read_hit; 354 atomic_t read_miss; 355 atomic_t write_hit; 356 atomic_t write_miss; 357 atomic_t demotion; 358 atomic_t promotion; 359 atomic_t writeback; 360 atomic_t copies_avoided; 361 atomic_t cache_cell_clash; 362 atomic_t commit_count; 363 atomic_t discard_count; 364}; 365 366struct cache { 367 struct dm_target *ti; 368 spinlock_t lock; 369 370 /* 371 * Fields for converting from sectors to blocks. 372 */ 373 int sectors_per_block_shift; 374 sector_t sectors_per_block; 375 376 struct dm_cache_metadata *cmd; 377 378 /* 379 * Metadata is written to this device. 380 */ 381 struct dm_dev *metadata_dev; 382 383 /* 384 * The slower of the two data devices. Typically a spindle. 385 */ 386 struct dm_dev *origin_dev; 387 388 /* 389 * The faster of the two data devices. Typically an SSD. 390 */ 391 struct dm_dev *cache_dev; 392 393 /* 394 * Size of the origin device in _complete_ blocks and native sectors. 395 */ 396 dm_oblock_t origin_blocks; 397 sector_t origin_sectors; 398 399 /* 400 * Size of the cache device in blocks. 401 */ 402 dm_cblock_t cache_size; 403 404 /* 405 * Invalidation fields. 406 */ 407 spinlock_t invalidation_lock; 408 struct list_head invalidation_requests; 409 410 sector_t migration_threshold; 411 wait_queue_head_t migration_wait; 412 atomic_t nr_allocated_migrations; 413 414 /* 415 * The number of in flight migrations that are performing 416 * background io. eg, promotion, writeback. 417 */ 418 atomic_t nr_io_migrations; 419 420 struct bio_list deferred_bios; 421 422 struct rw_semaphore quiesce_lock; 423 424 /* 425 * origin_blocks entries, discarded if set. 426 */ 427 dm_dblock_t discard_nr_blocks; 428 unsigned long *discard_bitset; 429 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 430 431 /* 432 * Rather than reconstructing the table line for the status we just 433 * save it and regurgitate. 434 */ 435 unsigned nr_ctr_args; 436 const char **ctr_args; 437 438 struct dm_kcopyd_client *copier; 439 struct work_struct deferred_bio_worker; 440 struct work_struct migration_worker; 441 struct workqueue_struct *wq; 442 struct delayed_work waker; 443 struct dm_bio_prison_v2 *prison; 444 445 /* 446 * cache_size entries, dirty if set 447 */ 448 unsigned long *dirty_bitset; 449 atomic_t nr_dirty; 450 451 unsigned policy_nr_args; 452 struct dm_cache_policy *policy; 453 454 /* 455 * Cache features such as write-through. 456 */ 457 struct cache_features features; 458 459 struct cache_stats stats; 460 461 bool need_tick_bio:1; 462 bool sized:1; 463 bool invalidate:1; 464 bool commit_requested:1; 465 bool loaded_mappings:1; 466 bool loaded_discards:1; 467 468 struct rw_semaphore background_work_lock; 469 470 struct batcher committer; 471 struct work_struct commit_ws; 472 473 struct io_tracker tracker; 474 475 mempool_t migration_pool; 476 477 struct bio_set bs; 478}; 479 480struct per_bio_data { 481 bool tick:1; 482 unsigned req_nr:2; 483 struct dm_bio_prison_cell_v2 *cell; 484 struct dm_hook_info hook_info; 485 sector_t len; 486}; 487 488struct dm_cache_migration { 489 struct continuation k; 490 struct cache *cache; 491 492 struct policy_work *op; 493 struct bio *overwrite_bio; 494 struct dm_bio_prison_cell_v2 *cell; 495 496 dm_cblock_t invalidate_cblock; 497 dm_oblock_t invalidate_oblock; 498}; 499 500/*----------------------------------------------------------------*/ 501 502static bool writethrough_mode(struct cache *cache) 503{ 504 return cache->features.io_mode == CM_IO_WRITETHROUGH; 505} 506 507static bool writeback_mode(struct cache *cache) 508{ 509 return cache->features.io_mode == CM_IO_WRITEBACK; 510} 511 512static inline bool passthrough_mode(struct cache *cache) 513{ 514 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 515} 516 517/*----------------------------------------------------------------*/ 518 519static void wake_deferred_bio_worker(struct cache *cache) 520{ 521 queue_work(cache->wq, &cache->deferred_bio_worker); 522} 523 524static void wake_migration_worker(struct cache *cache) 525{ 526 if (passthrough_mode(cache)) 527 return; 528 529 queue_work(cache->wq, &cache->migration_worker); 530} 531 532/*----------------------------------------------------------------*/ 533 534static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 535{ 536 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 537} 538 539static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 540{ 541 dm_bio_prison_free_cell_v2(cache->prison, cell); 542} 543 544static struct dm_cache_migration *alloc_migration(struct cache *cache) 545{ 546 struct dm_cache_migration *mg; 547 548 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 549 550 memset(mg, 0, sizeof(*mg)); 551 552 mg->cache = cache; 553 atomic_inc(&cache->nr_allocated_migrations); 554 555 return mg; 556} 557 558static void free_migration(struct dm_cache_migration *mg) 559{ 560 struct cache *cache = mg->cache; 561 562 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 563 wake_up(&cache->migration_wait); 564 565 mempool_free(mg, &cache->migration_pool); 566} 567 568/*----------------------------------------------------------------*/ 569 570static inline dm_oblock_t oblock_succ(dm_oblock_t b) 571{ 572 return to_oblock(from_oblock(b) + 1ull); 573} 574 575static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 576{ 577 key->virtual = 0; 578 key->dev = 0; 579 key->block_begin = from_oblock(begin); 580 key->block_end = from_oblock(end); 581} 582 583/* 584 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 585 * level 1 which prevents *both* READs and WRITEs. 586 */ 587#define WRITE_LOCK_LEVEL 0 588#define READ_WRITE_LOCK_LEVEL 1 589 590static unsigned lock_level(struct bio *bio) 591{ 592 return bio_data_dir(bio) == WRITE ? 593 WRITE_LOCK_LEVEL : 594 READ_WRITE_LOCK_LEVEL; 595} 596 597/*---------------------------------------------------------------- 598 * Per bio data 599 *--------------------------------------------------------------*/ 600 601static struct per_bio_data *get_per_bio_data(struct bio *bio) 602{ 603 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 604 BUG_ON(!pb); 605 return pb; 606} 607 608static struct per_bio_data *init_per_bio_data(struct bio *bio) 609{ 610 struct per_bio_data *pb = get_per_bio_data(bio); 611 612 pb->tick = false; 613 pb->req_nr = dm_bio_get_target_bio_nr(bio); 614 pb->cell = NULL; 615 pb->len = 0; 616 617 return pb; 618} 619 620/*----------------------------------------------------------------*/ 621 622static void defer_bio(struct cache *cache, struct bio *bio) 623{ 624 spin_lock_irq(&cache->lock); 625 bio_list_add(&cache->deferred_bios, bio); 626 spin_unlock_irq(&cache->lock); 627 628 wake_deferred_bio_worker(cache); 629} 630 631static void defer_bios(struct cache *cache, struct bio_list *bios) 632{ 633 spin_lock_irq(&cache->lock); 634 bio_list_merge(&cache->deferred_bios, bios); 635 bio_list_init(bios); 636 spin_unlock_irq(&cache->lock); 637 638 wake_deferred_bio_worker(cache); 639} 640 641/*----------------------------------------------------------------*/ 642 643static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 644{ 645 bool r; 646 struct per_bio_data *pb; 647 struct dm_cell_key_v2 key; 648 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 649 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 650 651 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 652 653 build_key(oblock, end, &key); 654 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 655 if (!r) { 656 /* 657 * Failed to get the lock. 658 */ 659 free_prison_cell(cache, cell_prealloc); 660 return r; 661 } 662 663 if (cell != cell_prealloc) 664 free_prison_cell(cache, cell_prealloc); 665 666 pb = get_per_bio_data(bio); 667 pb->cell = cell; 668 669 return r; 670} 671 672/*----------------------------------------------------------------*/ 673 674static bool is_dirty(struct cache *cache, dm_cblock_t b) 675{ 676 return test_bit(from_cblock(b), cache->dirty_bitset); 677} 678 679static void set_dirty(struct cache *cache, dm_cblock_t cblock) 680{ 681 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 682 atomic_inc(&cache->nr_dirty); 683 policy_set_dirty(cache->policy, cblock); 684 } 685} 686 687/* 688 * These two are called when setting after migrations to force the policy 689 * and dirty bitset to be in sync. 690 */ 691static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 692{ 693 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 694 atomic_inc(&cache->nr_dirty); 695 policy_set_dirty(cache->policy, cblock); 696} 697 698static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 699{ 700 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 701 if (atomic_dec_return(&cache->nr_dirty) == 0) 702 dm_table_event(cache->ti->table); 703 } 704 705 policy_clear_dirty(cache->policy, cblock); 706} 707 708/*----------------------------------------------------------------*/ 709 710static bool block_size_is_power_of_two(struct cache *cache) 711{ 712 return cache->sectors_per_block_shift >= 0; 713} 714 715static dm_block_t block_div(dm_block_t b, uint32_t n) 716{ 717 do_div(b, n); 718 719 return b; 720} 721 722static dm_block_t oblocks_per_dblock(struct cache *cache) 723{ 724 dm_block_t oblocks = cache->discard_block_size; 725 726 if (block_size_is_power_of_two(cache)) 727 oblocks >>= cache->sectors_per_block_shift; 728 else 729 oblocks = block_div(oblocks, cache->sectors_per_block); 730 731 return oblocks; 732} 733 734static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 735{ 736 return to_dblock(block_div(from_oblock(oblock), 737 oblocks_per_dblock(cache))); 738} 739 740static void set_discard(struct cache *cache, dm_dblock_t b) 741{ 742 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 743 atomic_inc(&cache->stats.discard_count); 744 745 spin_lock_irq(&cache->lock); 746 set_bit(from_dblock(b), cache->discard_bitset); 747 spin_unlock_irq(&cache->lock); 748} 749 750static void clear_discard(struct cache *cache, dm_dblock_t b) 751{ 752 spin_lock_irq(&cache->lock); 753 clear_bit(from_dblock(b), cache->discard_bitset); 754 spin_unlock_irq(&cache->lock); 755} 756 757static bool is_discarded(struct cache *cache, dm_dblock_t b) 758{ 759 int r; 760 spin_lock_irq(&cache->lock); 761 r = test_bit(from_dblock(b), cache->discard_bitset); 762 spin_unlock_irq(&cache->lock); 763 764 return r; 765} 766 767static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 768{ 769 int r; 770 spin_lock_irq(&cache->lock); 771 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 772 cache->discard_bitset); 773 spin_unlock_irq(&cache->lock); 774 775 return r; 776} 777 778/*---------------------------------------------------------------- 779 * Remapping 780 *--------------------------------------------------------------*/ 781static void remap_to_origin(struct cache *cache, struct bio *bio) 782{ 783 bio_set_dev(bio, cache->origin_dev->bdev); 784} 785 786static void remap_to_cache(struct cache *cache, struct bio *bio, 787 dm_cblock_t cblock) 788{ 789 sector_t bi_sector = bio->bi_iter.bi_sector; 790 sector_t block = from_cblock(cblock); 791 792 bio_set_dev(bio, cache->cache_dev->bdev); 793 if (!block_size_is_power_of_two(cache)) 794 bio->bi_iter.bi_sector = 795 (block * cache->sectors_per_block) + 796 sector_div(bi_sector, cache->sectors_per_block); 797 else 798 bio->bi_iter.bi_sector = 799 (block << cache->sectors_per_block_shift) | 800 (bi_sector & (cache->sectors_per_block - 1)); 801} 802 803static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 804{ 805 struct per_bio_data *pb; 806 807 spin_lock_irq(&cache->lock); 808 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 809 bio_op(bio) != REQ_OP_DISCARD) { 810 pb = get_per_bio_data(bio); 811 pb->tick = true; 812 cache->need_tick_bio = false; 813 } 814 spin_unlock_irq(&cache->lock); 815} 816 817static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 818 dm_oblock_t oblock, bool bio_has_pbd) 819{ 820 if (bio_has_pbd) 821 check_if_tick_bio_needed(cache, bio); 822 remap_to_origin(cache, bio); 823 if (bio_data_dir(bio) == WRITE) 824 clear_discard(cache, oblock_to_dblock(cache, oblock)); 825} 826 827static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 828 dm_oblock_t oblock) 829{ 830 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 831 __remap_to_origin_clear_discard(cache, bio, oblock, true); 832} 833 834static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 835 dm_oblock_t oblock, dm_cblock_t cblock) 836{ 837 check_if_tick_bio_needed(cache, bio); 838 remap_to_cache(cache, bio, cblock); 839 if (bio_data_dir(bio) == WRITE) { 840 set_dirty(cache, cblock); 841 clear_discard(cache, oblock_to_dblock(cache, oblock)); 842 } 843} 844 845static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 846{ 847 sector_t block_nr = bio->bi_iter.bi_sector; 848 849 if (!block_size_is_power_of_two(cache)) 850 (void) sector_div(block_nr, cache->sectors_per_block); 851 else 852 block_nr >>= cache->sectors_per_block_shift; 853 854 return to_oblock(block_nr); 855} 856 857static bool accountable_bio(struct cache *cache, struct bio *bio) 858{ 859 return bio_op(bio) != REQ_OP_DISCARD; 860} 861 862static void accounted_begin(struct cache *cache, struct bio *bio) 863{ 864 struct per_bio_data *pb; 865 866 if (accountable_bio(cache, bio)) { 867 pb = get_per_bio_data(bio); 868 pb->len = bio_sectors(bio); 869 iot_io_begin(&cache->tracker, pb->len); 870 } 871} 872 873static void accounted_complete(struct cache *cache, struct bio *bio) 874{ 875 struct per_bio_data *pb = get_per_bio_data(bio); 876 877 iot_io_end(&cache->tracker, pb->len); 878} 879 880static void accounted_request(struct cache *cache, struct bio *bio) 881{ 882 accounted_begin(cache, bio); 883 submit_bio_noacct(bio); 884} 885 886static void issue_op(struct bio *bio, void *context) 887{ 888 struct cache *cache = context; 889 accounted_request(cache, bio); 890} 891 892/* 893 * When running in writethrough mode we need to send writes to clean blocks 894 * to both the cache and origin devices. Clone the bio and send them in parallel. 895 */ 896static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 897 dm_oblock_t oblock, dm_cblock_t cblock) 898{ 899 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 900 901 BUG_ON(!origin_bio); 902 903 bio_chain(origin_bio, bio); 904 /* 905 * Passing false to __remap_to_origin_clear_discard() skips 906 * all code that might use per_bio_data (since clone doesn't have it) 907 */ 908 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 909 submit_bio(origin_bio); 910 911 remap_to_cache(cache, bio, cblock); 912} 913 914/*---------------------------------------------------------------- 915 * Failure modes 916 *--------------------------------------------------------------*/ 917static enum cache_metadata_mode get_cache_mode(struct cache *cache) 918{ 919 return cache->features.mode; 920} 921 922static const char *cache_device_name(struct cache *cache) 923{ 924 return dm_table_device_name(cache->ti->table); 925} 926 927static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 928{ 929 const char *descs[] = { 930 "write", 931 "read-only", 932 "fail" 933 }; 934 935 dm_table_event(cache->ti->table); 936 DMINFO("%s: switching cache to %s mode", 937 cache_device_name(cache), descs[(int)mode]); 938} 939 940static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 941{ 942 bool needs_check; 943 enum cache_metadata_mode old_mode = get_cache_mode(cache); 944 945 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 946 DMERR("%s: unable to read needs_check flag, setting failure mode.", 947 cache_device_name(cache)); 948 new_mode = CM_FAIL; 949 } 950 951 if (new_mode == CM_WRITE && needs_check) { 952 DMERR("%s: unable to switch cache to write mode until repaired.", 953 cache_device_name(cache)); 954 if (old_mode != new_mode) 955 new_mode = old_mode; 956 else 957 new_mode = CM_READ_ONLY; 958 } 959 960 /* Never move out of fail mode */ 961 if (old_mode == CM_FAIL) 962 new_mode = CM_FAIL; 963 964 switch (new_mode) { 965 case CM_FAIL: 966 case CM_READ_ONLY: 967 dm_cache_metadata_set_read_only(cache->cmd); 968 break; 969 970 case CM_WRITE: 971 dm_cache_metadata_set_read_write(cache->cmd); 972 break; 973 } 974 975 cache->features.mode = new_mode; 976 977 if (new_mode != old_mode) 978 notify_mode_switch(cache, new_mode); 979} 980 981static void abort_transaction(struct cache *cache) 982{ 983 const char *dev_name = cache_device_name(cache); 984 985 if (get_cache_mode(cache) >= CM_READ_ONLY) 986 return; 987 988 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 989 if (dm_cache_metadata_abort(cache->cmd)) { 990 DMERR("%s: failed to abort metadata transaction", dev_name); 991 set_cache_mode(cache, CM_FAIL); 992 } 993 994 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 995 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 996 set_cache_mode(cache, CM_FAIL); 997 } 998} 999 1000static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1001{ 1002 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1003 cache_device_name(cache), op, r); 1004 abort_transaction(cache); 1005 set_cache_mode(cache, CM_READ_ONLY); 1006} 1007 1008/*----------------------------------------------------------------*/ 1009 1010static void load_stats(struct cache *cache) 1011{ 1012 struct dm_cache_statistics stats; 1013 1014 dm_cache_metadata_get_stats(cache->cmd, &stats); 1015 atomic_set(&cache->stats.read_hit, stats.read_hits); 1016 atomic_set(&cache->stats.read_miss, stats.read_misses); 1017 atomic_set(&cache->stats.write_hit, stats.write_hits); 1018 atomic_set(&cache->stats.write_miss, stats.write_misses); 1019} 1020 1021static void save_stats(struct cache *cache) 1022{ 1023 struct dm_cache_statistics stats; 1024 1025 if (get_cache_mode(cache) >= CM_READ_ONLY) 1026 return; 1027 1028 stats.read_hits = atomic_read(&cache->stats.read_hit); 1029 stats.read_misses = atomic_read(&cache->stats.read_miss); 1030 stats.write_hits = atomic_read(&cache->stats.write_hit); 1031 stats.write_misses = atomic_read(&cache->stats.write_miss); 1032 1033 dm_cache_metadata_set_stats(cache->cmd, &stats); 1034} 1035 1036static void update_stats(struct cache_stats *stats, enum policy_operation op) 1037{ 1038 switch (op) { 1039 case POLICY_PROMOTE: 1040 atomic_inc(&stats->promotion); 1041 break; 1042 1043 case POLICY_DEMOTE: 1044 atomic_inc(&stats->demotion); 1045 break; 1046 1047 case POLICY_WRITEBACK: 1048 atomic_inc(&stats->writeback); 1049 break; 1050 } 1051} 1052 1053/*---------------------------------------------------------------- 1054 * Migration processing 1055 * 1056 * Migration covers moving data from the origin device to the cache, or 1057 * vice versa. 1058 *--------------------------------------------------------------*/ 1059 1060static void inc_io_migrations(struct cache *cache) 1061{ 1062 atomic_inc(&cache->nr_io_migrations); 1063} 1064 1065static void dec_io_migrations(struct cache *cache) 1066{ 1067 atomic_dec(&cache->nr_io_migrations); 1068} 1069 1070static bool discard_or_flush(struct bio *bio) 1071{ 1072 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1073} 1074 1075static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1076 dm_dblock_t *b, dm_dblock_t *e) 1077{ 1078 sector_t sb = bio->bi_iter.bi_sector; 1079 sector_t se = bio_end_sector(bio); 1080 1081 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1082 1083 if (se - sb < cache->discard_block_size) 1084 *e = *b; 1085 else 1086 *e = to_dblock(block_div(se, cache->discard_block_size)); 1087} 1088 1089/*----------------------------------------------------------------*/ 1090 1091static void prevent_background_work(struct cache *cache) 1092{ 1093 lockdep_off(); 1094 down_write(&cache->background_work_lock); 1095 lockdep_on(); 1096} 1097 1098static void allow_background_work(struct cache *cache) 1099{ 1100 lockdep_off(); 1101 up_write(&cache->background_work_lock); 1102 lockdep_on(); 1103} 1104 1105static bool background_work_begin(struct cache *cache) 1106{ 1107 bool r; 1108 1109 lockdep_off(); 1110 r = down_read_trylock(&cache->background_work_lock); 1111 lockdep_on(); 1112 1113 return r; 1114} 1115 1116static void background_work_end(struct cache *cache) 1117{ 1118 lockdep_off(); 1119 up_read(&cache->background_work_lock); 1120 lockdep_on(); 1121} 1122 1123/*----------------------------------------------------------------*/ 1124 1125static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1126{ 1127 return (bio_data_dir(bio) == WRITE) && 1128 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1129} 1130 1131static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1132{ 1133 return writeback_mode(cache) && 1134 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1135} 1136 1137static void quiesce(struct dm_cache_migration *mg, 1138 void (*continuation)(struct work_struct *)) 1139{ 1140 init_continuation(&mg->k, continuation); 1141 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1142} 1143 1144static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1145{ 1146 struct continuation *k = container_of(ws, struct continuation, ws); 1147 return container_of(k, struct dm_cache_migration, k); 1148} 1149 1150static void copy_complete(int read_err, unsigned long write_err, void *context) 1151{ 1152 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1153 1154 if (read_err || write_err) 1155 mg->k.input = BLK_STS_IOERR; 1156 1157 queue_continuation(mg->cache->wq, &mg->k); 1158} 1159 1160static void copy(struct dm_cache_migration *mg, bool promote) 1161{ 1162 struct dm_io_region o_region, c_region; 1163 struct cache *cache = mg->cache; 1164 1165 o_region.bdev = cache->origin_dev->bdev; 1166 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1167 o_region.count = cache->sectors_per_block; 1168 1169 c_region.bdev = cache->cache_dev->bdev; 1170 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1171 c_region.count = cache->sectors_per_block; 1172 1173 if (promote) 1174 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1175 else 1176 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1177} 1178 1179static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1180{ 1181 struct per_bio_data *pb = get_per_bio_data(bio); 1182 1183 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1184 free_prison_cell(cache, pb->cell); 1185 pb->cell = NULL; 1186} 1187 1188static void overwrite_endio(struct bio *bio) 1189{ 1190 struct dm_cache_migration *mg = bio->bi_private; 1191 struct cache *cache = mg->cache; 1192 struct per_bio_data *pb = get_per_bio_data(bio); 1193 1194 dm_unhook_bio(&pb->hook_info, bio); 1195 1196 if (bio->bi_status) 1197 mg->k.input = bio->bi_status; 1198 1199 queue_continuation(cache->wq, &mg->k); 1200} 1201 1202static void overwrite(struct dm_cache_migration *mg, 1203 void (*continuation)(struct work_struct *)) 1204{ 1205 struct bio *bio = mg->overwrite_bio; 1206 struct per_bio_data *pb = get_per_bio_data(bio); 1207 1208 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1209 1210 /* 1211 * The overwrite bio is part of the copy operation, as such it does 1212 * not set/clear discard or dirty flags. 1213 */ 1214 if (mg->op->op == POLICY_PROMOTE) 1215 remap_to_cache(mg->cache, bio, mg->op->cblock); 1216 else 1217 remap_to_origin(mg->cache, bio); 1218 1219 init_continuation(&mg->k, continuation); 1220 accounted_request(mg->cache, bio); 1221} 1222 1223/* 1224 * Migration steps: 1225 * 1226 * 1) exclusive lock preventing WRITEs 1227 * 2) quiesce 1228 * 3) copy or issue overwrite bio 1229 * 4) upgrade to exclusive lock preventing READs and WRITEs 1230 * 5) quiesce 1231 * 6) update metadata and commit 1232 * 7) unlock 1233 */ 1234static void mg_complete(struct dm_cache_migration *mg, bool success) 1235{ 1236 struct bio_list bios; 1237 struct cache *cache = mg->cache; 1238 struct policy_work *op = mg->op; 1239 dm_cblock_t cblock = op->cblock; 1240 1241 if (success) 1242 update_stats(&cache->stats, op->op); 1243 1244 switch (op->op) { 1245 case POLICY_PROMOTE: 1246 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1247 policy_complete_background_work(cache->policy, op, success); 1248 1249 if (mg->overwrite_bio) { 1250 if (success) 1251 force_set_dirty(cache, cblock); 1252 else if (mg->k.input) 1253 mg->overwrite_bio->bi_status = mg->k.input; 1254 else 1255 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1256 bio_endio(mg->overwrite_bio); 1257 } else { 1258 if (success) 1259 force_clear_dirty(cache, cblock); 1260 dec_io_migrations(cache); 1261 } 1262 break; 1263 1264 case POLICY_DEMOTE: 1265 /* 1266 * We clear dirty here to update the nr_dirty counter. 1267 */ 1268 if (success) 1269 force_clear_dirty(cache, cblock); 1270 policy_complete_background_work(cache->policy, op, success); 1271 dec_io_migrations(cache); 1272 break; 1273 1274 case POLICY_WRITEBACK: 1275 if (success) 1276 force_clear_dirty(cache, cblock); 1277 policy_complete_background_work(cache->policy, op, success); 1278 dec_io_migrations(cache); 1279 break; 1280 } 1281 1282 bio_list_init(&bios); 1283 if (mg->cell) { 1284 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1285 free_prison_cell(cache, mg->cell); 1286 } 1287 1288 free_migration(mg); 1289 defer_bios(cache, &bios); 1290 wake_migration_worker(cache); 1291 1292 background_work_end(cache); 1293} 1294 1295static void mg_success(struct work_struct *ws) 1296{ 1297 struct dm_cache_migration *mg = ws_to_mg(ws); 1298 mg_complete(mg, mg->k.input == 0); 1299} 1300 1301static void mg_update_metadata(struct work_struct *ws) 1302{ 1303 int r; 1304 struct dm_cache_migration *mg = ws_to_mg(ws); 1305 struct cache *cache = mg->cache; 1306 struct policy_work *op = mg->op; 1307 1308 switch (op->op) { 1309 case POLICY_PROMOTE: 1310 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1311 if (r) { 1312 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1313 cache_device_name(cache)); 1314 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1315 1316 mg_complete(mg, false); 1317 return; 1318 } 1319 mg_complete(mg, true); 1320 break; 1321 1322 case POLICY_DEMOTE: 1323 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1324 if (r) { 1325 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1326 cache_device_name(cache)); 1327 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1328 1329 mg_complete(mg, false); 1330 return; 1331 } 1332 1333 /* 1334 * It would be nice if we only had to commit when a REQ_FLUSH 1335 * comes through. But there's one scenario that we have to 1336 * look out for: 1337 * 1338 * - vblock x in a cache block 1339 * - domotion occurs 1340 * - cache block gets reallocated and over written 1341 * - crash 1342 * 1343 * When we recover, because there was no commit the cache will 1344 * rollback to having the data for vblock x in the cache block. 1345 * But the cache block has since been overwritten, so it'll end 1346 * up pointing to data that was never in 'x' during the history 1347 * of the device. 1348 * 1349 * To avoid this issue we require a commit as part of the 1350 * demotion operation. 1351 */ 1352 init_continuation(&mg->k, mg_success); 1353 continue_after_commit(&cache->committer, &mg->k); 1354 schedule_commit(&cache->committer); 1355 break; 1356 1357 case POLICY_WRITEBACK: 1358 mg_complete(mg, true); 1359 break; 1360 } 1361} 1362 1363static void mg_update_metadata_after_copy(struct work_struct *ws) 1364{ 1365 struct dm_cache_migration *mg = ws_to_mg(ws); 1366 1367 /* 1368 * Did the copy succeed? 1369 */ 1370 if (mg->k.input) 1371 mg_complete(mg, false); 1372 else 1373 mg_update_metadata(ws); 1374} 1375 1376static void mg_upgrade_lock(struct work_struct *ws) 1377{ 1378 int r; 1379 struct dm_cache_migration *mg = ws_to_mg(ws); 1380 1381 /* 1382 * Did the copy succeed? 1383 */ 1384 if (mg->k.input) 1385 mg_complete(mg, false); 1386 1387 else { 1388 /* 1389 * Now we want the lock to prevent both reads and writes. 1390 */ 1391 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1392 READ_WRITE_LOCK_LEVEL); 1393 if (r < 0) 1394 mg_complete(mg, false); 1395 1396 else if (r) 1397 quiesce(mg, mg_update_metadata); 1398 1399 else 1400 mg_update_metadata(ws); 1401 } 1402} 1403 1404static void mg_full_copy(struct work_struct *ws) 1405{ 1406 struct dm_cache_migration *mg = ws_to_mg(ws); 1407 struct cache *cache = mg->cache; 1408 struct policy_work *op = mg->op; 1409 bool is_policy_promote = (op->op == POLICY_PROMOTE); 1410 1411 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1412 is_discarded_oblock(cache, op->oblock)) { 1413 mg_upgrade_lock(ws); 1414 return; 1415 } 1416 1417 init_continuation(&mg->k, mg_upgrade_lock); 1418 copy(mg, is_policy_promote); 1419} 1420 1421static void mg_copy(struct work_struct *ws) 1422{ 1423 struct dm_cache_migration *mg = ws_to_mg(ws); 1424 1425 if (mg->overwrite_bio) { 1426 /* 1427 * No exclusive lock was held when we last checked if the bio 1428 * was optimisable. So we have to check again in case things 1429 * have changed (eg, the block may no longer be discarded). 1430 */ 1431 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { 1432 /* 1433 * Fallback to a real full copy after doing some tidying up. 1434 */ 1435 bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); 1436 BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ 1437 mg->overwrite_bio = NULL; 1438 inc_io_migrations(mg->cache); 1439 mg_full_copy(ws); 1440 return; 1441 } 1442 1443 /* 1444 * It's safe to do this here, even though it's new data 1445 * because all IO has been locked out of the block. 1446 * 1447 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1448 * so _not_ using mg_upgrade_lock() as continutation. 1449 */ 1450 overwrite(mg, mg_update_metadata_after_copy); 1451 1452 } else 1453 mg_full_copy(ws); 1454} 1455 1456static int mg_lock_writes(struct dm_cache_migration *mg) 1457{ 1458 int r; 1459 struct dm_cell_key_v2 key; 1460 struct cache *cache = mg->cache; 1461 struct dm_bio_prison_cell_v2 *prealloc; 1462 1463 prealloc = alloc_prison_cell(cache); 1464 1465 /* 1466 * Prevent writes to the block, but allow reads to continue. 1467 * Unless we're using an overwrite bio, in which case we lock 1468 * everything. 1469 */ 1470 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1471 r = dm_cell_lock_v2(cache->prison, &key, 1472 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1473 prealloc, &mg->cell); 1474 if (r < 0) { 1475 free_prison_cell(cache, prealloc); 1476 mg_complete(mg, false); 1477 return r; 1478 } 1479 1480 if (mg->cell != prealloc) 1481 free_prison_cell(cache, prealloc); 1482 1483 if (r == 0) 1484 mg_copy(&mg->k.ws); 1485 else 1486 quiesce(mg, mg_copy); 1487 1488 return 0; 1489} 1490 1491static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1492{ 1493 struct dm_cache_migration *mg; 1494 1495 if (!background_work_begin(cache)) { 1496 policy_complete_background_work(cache->policy, op, false); 1497 return -EPERM; 1498 } 1499 1500 mg = alloc_migration(cache); 1501 1502 mg->op = op; 1503 mg->overwrite_bio = bio; 1504 1505 if (!bio) 1506 inc_io_migrations(cache); 1507 1508 return mg_lock_writes(mg); 1509} 1510 1511/*---------------------------------------------------------------- 1512 * invalidation processing 1513 *--------------------------------------------------------------*/ 1514 1515static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1516{ 1517 struct bio_list bios; 1518 struct cache *cache = mg->cache; 1519 1520 bio_list_init(&bios); 1521 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1522 free_prison_cell(cache, mg->cell); 1523 1524 if (!success && mg->overwrite_bio) 1525 bio_io_error(mg->overwrite_bio); 1526 1527 free_migration(mg); 1528 defer_bios(cache, &bios); 1529 1530 background_work_end(cache); 1531} 1532 1533static void invalidate_completed(struct work_struct *ws) 1534{ 1535 struct dm_cache_migration *mg = ws_to_mg(ws); 1536 invalidate_complete(mg, !mg->k.input); 1537} 1538 1539static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1540{ 1541 int r = policy_invalidate_mapping(cache->policy, cblock); 1542 if (!r) { 1543 r = dm_cache_remove_mapping(cache->cmd, cblock); 1544 if (r) { 1545 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1546 cache_device_name(cache)); 1547 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1548 } 1549 1550 } else if (r == -ENODATA) { 1551 /* 1552 * Harmless, already unmapped. 1553 */ 1554 r = 0; 1555 1556 } else 1557 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1558 1559 return r; 1560} 1561 1562static void invalidate_remove(struct work_struct *ws) 1563{ 1564 int r; 1565 struct dm_cache_migration *mg = ws_to_mg(ws); 1566 struct cache *cache = mg->cache; 1567 1568 r = invalidate_cblock(cache, mg->invalidate_cblock); 1569 if (r) { 1570 invalidate_complete(mg, false); 1571 return; 1572 } 1573 1574 init_continuation(&mg->k, invalidate_completed); 1575 continue_after_commit(&cache->committer, &mg->k); 1576 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1577 mg->overwrite_bio = NULL; 1578 schedule_commit(&cache->committer); 1579} 1580 1581static int invalidate_lock(struct dm_cache_migration *mg) 1582{ 1583 int r; 1584 struct dm_cell_key_v2 key; 1585 struct cache *cache = mg->cache; 1586 struct dm_bio_prison_cell_v2 *prealloc; 1587 1588 prealloc = alloc_prison_cell(cache); 1589 1590 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1591 r = dm_cell_lock_v2(cache->prison, &key, 1592 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1593 if (r < 0) { 1594 free_prison_cell(cache, prealloc); 1595 invalidate_complete(mg, false); 1596 return r; 1597 } 1598 1599 if (mg->cell != prealloc) 1600 free_prison_cell(cache, prealloc); 1601 1602 if (r) 1603 quiesce(mg, invalidate_remove); 1604 1605 else { 1606 /* 1607 * We can't call invalidate_remove() directly here because we 1608 * might still be in request context. 1609 */ 1610 init_continuation(&mg->k, invalidate_remove); 1611 queue_work(cache->wq, &mg->k.ws); 1612 } 1613 1614 return 0; 1615} 1616 1617static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1618 dm_oblock_t oblock, struct bio *bio) 1619{ 1620 struct dm_cache_migration *mg; 1621 1622 if (!background_work_begin(cache)) 1623 return -EPERM; 1624 1625 mg = alloc_migration(cache); 1626 1627 mg->overwrite_bio = bio; 1628 mg->invalidate_cblock = cblock; 1629 mg->invalidate_oblock = oblock; 1630 1631 return invalidate_lock(mg); 1632} 1633 1634/*---------------------------------------------------------------- 1635 * bio processing 1636 *--------------------------------------------------------------*/ 1637 1638enum busy { 1639 IDLE, 1640 BUSY 1641}; 1642 1643static enum busy spare_migration_bandwidth(struct cache *cache) 1644{ 1645 bool idle = iot_idle_for(&cache->tracker, HZ); 1646 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1647 cache->sectors_per_block; 1648 1649 if (idle && current_volume <= cache->migration_threshold) 1650 return IDLE; 1651 else 1652 return BUSY; 1653} 1654 1655static void inc_hit_counter(struct cache *cache, struct bio *bio) 1656{ 1657 atomic_inc(bio_data_dir(bio) == READ ? 1658 &cache->stats.read_hit : &cache->stats.write_hit); 1659} 1660 1661static void inc_miss_counter(struct cache *cache, struct bio *bio) 1662{ 1663 atomic_inc(bio_data_dir(bio) == READ ? 1664 &cache->stats.read_miss : &cache->stats.write_miss); 1665} 1666 1667/*----------------------------------------------------------------*/ 1668 1669static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1670 bool *commit_needed) 1671{ 1672 int r, data_dir; 1673 bool rb, background_queued; 1674 dm_cblock_t cblock; 1675 1676 *commit_needed = false; 1677 1678 rb = bio_detain_shared(cache, block, bio); 1679 if (!rb) { 1680 /* 1681 * An exclusive lock is held for this block, so we have to 1682 * wait. We set the commit_needed flag so the current 1683 * transaction will be committed asap, allowing this lock 1684 * to be dropped. 1685 */ 1686 *commit_needed = true; 1687 return DM_MAPIO_SUBMITTED; 1688 } 1689 1690 data_dir = bio_data_dir(bio); 1691 1692 if (optimisable_bio(cache, bio, block)) { 1693 struct policy_work *op = NULL; 1694 1695 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1696 if (unlikely(r && r != -ENOENT)) { 1697 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1698 cache_device_name(cache), r); 1699 bio_io_error(bio); 1700 return DM_MAPIO_SUBMITTED; 1701 } 1702 1703 if (r == -ENOENT && op) { 1704 bio_drop_shared_lock(cache, bio); 1705 BUG_ON(op->op != POLICY_PROMOTE); 1706 mg_start(cache, op, bio); 1707 return DM_MAPIO_SUBMITTED; 1708 } 1709 } else { 1710 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1711 if (unlikely(r && r != -ENOENT)) { 1712 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1713 cache_device_name(cache), r); 1714 bio_io_error(bio); 1715 return DM_MAPIO_SUBMITTED; 1716 } 1717 1718 if (background_queued) 1719 wake_migration_worker(cache); 1720 } 1721 1722 if (r == -ENOENT) { 1723 struct per_bio_data *pb = get_per_bio_data(bio); 1724 1725 /* 1726 * Miss. 1727 */ 1728 inc_miss_counter(cache, bio); 1729 if (pb->req_nr == 0) { 1730 accounted_begin(cache, bio); 1731 remap_to_origin_clear_discard(cache, bio, block); 1732 } else { 1733 /* 1734 * This is a duplicate writethrough io that is no 1735 * longer needed because the block has been demoted. 1736 */ 1737 bio_endio(bio); 1738 return DM_MAPIO_SUBMITTED; 1739 } 1740 } else { 1741 /* 1742 * Hit. 1743 */ 1744 inc_hit_counter(cache, bio); 1745 1746 /* 1747 * Passthrough always maps to the origin, invalidating any 1748 * cache blocks that are written to. 1749 */ 1750 if (passthrough_mode(cache)) { 1751 if (bio_data_dir(bio) == WRITE) { 1752 bio_drop_shared_lock(cache, bio); 1753 atomic_inc(&cache->stats.demotion); 1754 invalidate_start(cache, cblock, block, bio); 1755 } else 1756 remap_to_origin_clear_discard(cache, bio, block); 1757 } else { 1758 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1759 !is_dirty(cache, cblock)) { 1760 remap_to_origin_and_cache(cache, bio, block, cblock); 1761 accounted_begin(cache, bio); 1762 } else 1763 remap_to_cache_dirty(cache, bio, block, cblock); 1764 } 1765 } 1766 1767 /* 1768 * dm core turns FUA requests into a separate payload and FLUSH req. 1769 */ 1770 if (bio->bi_opf & REQ_FUA) { 1771 /* 1772 * issue_after_commit will call accounted_begin a second time. So 1773 * we call accounted_complete() to avoid double accounting. 1774 */ 1775 accounted_complete(cache, bio); 1776 issue_after_commit(&cache->committer, bio); 1777 *commit_needed = true; 1778 return DM_MAPIO_SUBMITTED; 1779 } 1780 1781 return DM_MAPIO_REMAPPED; 1782} 1783 1784static bool process_bio(struct cache *cache, struct bio *bio) 1785{ 1786 bool commit_needed; 1787 1788 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1789 submit_bio_noacct(bio); 1790 1791 return commit_needed; 1792} 1793 1794/* 1795 * A non-zero return indicates read_only or fail_io mode. 1796 */ 1797static int commit(struct cache *cache, bool clean_shutdown) 1798{ 1799 int r; 1800 1801 if (get_cache_mode(cache) >= CM_READ_ONLY) 1802 return -EINVAL; 1803 1804 atomic_inc(&cache->stats.commit_count); 1805 r = dm_cache_commit(cache->cmd, clean_shutdown); 1806 if (r) 1807 metadata_operation_failed(cache, "dm_cache_commit", r); 1808 1809 return r; 1810} 1811 1812/* 1813 * Used by the batcher. 1814 */ 1815static blk_status_t commit_op(void *context) 1816{ 1817 struct cache *cache = context; 1818 1819 if (dm_cache_changed_this_transaction(cache->cmd)) 1820 return errno_to_blk_status(commit(cache, false)); 1821 1822 return 0; 1823} 1824 1825/*----------------------------------------------------------------*/ 1826 1827static bool process_flush_bio(struct cache *cache, struct bio *bio) 1828{ 1829 struct per_bio_data *pb = get_per_bio_data(bio); 1830 1831 if (!pb->req_nr) 1832 remap_to_origin(cache, bio); 1833 else 1834 remap_to_cache(cache, bio, 0); 1835 1836 issue_after_commit(&cache->committer, bio); 1837 return true; 1838} 1839 1840static bool process_discard_bio(struct cache *cache, struct bio *bio) 1841{ 1842 dm_dblock_t b, e; 1843 1844 // FIXME: do we need to lock the region? Or can we just assume the 1845 // user wont be so foolish as to issue discard concurrently with 1846 // other IO? 1847 calc_discard_block_range(cache, bio, &b, &e); 1848 while (b != e) { 1849 set_discard(cache, b); 1850 b = to_dblock(from_dblock(b) + 1); 1851 } 1852 1853 if (cache->features.discard_passdown) { 1854 remap_to_origin(cache, bio); 1855 submit_bio_noacct(bio); 1856 } else 1857 bio_endio(bio); 1858 1859 return false; 1860} 1861 1862static void process_deferred_bios(struct work_struct *ws) 1863{ 1864 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1865 1866 bool commit_needed = false; 1867 struct bio_list bios; 1868 struct bio *bio; 1869 1870 bio_list_init(&bios); 1871 1872 spin_lock_irq(&cache->lock); 1873 bio_list_merge(&bios, &cache->deferred_bios); 1874 bio_list_init(&cache->deferred_bios); 1875 spin_unlock_irq(&cache->lock); 1876 1877 while ((bio = bio_list_pop(&bios))) { 1878 if (bio->bi_opf & REQ_PREFLUSH) 1879 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1880 1881 else if (bio_op(bio) == REQ_OP_DISCARD) 1882 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1883 1884 else 1885 commit_needed = process_bio(cache, bio) || commit_needed; 1886 cond_resched(); 1887 } 1888 1889 if (commit_needed) 1890 schedule_commit(&cache->committer); 1891} 1892 1893/*---------------------------------------------------------------- 1894 * Main worker loop 1895 *--------------------------------------------------------------*/ 1896 1897static void requeue_deferred_bios(struct cache *cache) 1898{ 1899 struct bio *bio; 1900 struct bio_list bios; 1901 1902 bio_list_init(&bios); 1903 bio_list_merge(&bios, &cache->deferred_bios); 1904 bio_list_init(&cache->deferred_bios); 1905 1906 while ((bio = bio_list_pop(&bios))) { 1907 bio->bi_status = BLK_STS_DM_REQUEUE; 1908 bio_endio(bio); 1909 cond_resched(); 1910 } 1911} 1912 1913/* 1914 * We want to commit periodically so that not too much 1915 * unwritten metadata builds up. 1916 */ 1917static void do_waker(struct work_struct *ws) 1918{ 1919 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1920 1921 policy_tick(cache->policy, true); 1922 wake_migration_worker(cache); 1923 schedule_commit(&cache->committer); 1924 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1925} 1926 1927static void check_migrations(struct work_struct *ws) 1928{ 1929 int r; 1930 struct policy_work *op; 1931 struct cache *cache = container_of(ws, struct cache, migration_worker); 1932 enum busy b; 1933 1934 for (;;) { 1935 b = spare_migration_bandwidth(cache); 1936 1937 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1938 if (r == -ENODATA) 1939 break; 1940 1941 if (r) { 1942 DMERR_LIMIT("%s: policy_background_work failed", 1943 cache_device_name(cache)); 1944 break; 1945 } 1946 1947 r = mg_start(cache, op, NULL); 1948 if (r) 1949 break; 1950 1951 cond_resched(); 1952 } 1953} 1954 1955/*---------------------------------------------------------------- 1956 * Target methods 1957 *--------------------------------------------------------------*/ 1958 1959/* 1960 * This function gets called on the error paths of the constructor, so we 1961 * have to cope with a partially initialised struct. 1962 */ 1963static void destroy(struct cache *cache) 1964{ 1965 unsigned i; 1966 1967 mempool_exit(&cache->migration_pool); 1968 1969 if (cache->prison) 1970 dm_bio_prison_destroy_v2(cache->prison); 1971 1972 cancel_delayed_work_sync(&cache->waker); 1973 if (cache->wq) 1974 destroy_workqueue(cache->wq); 1975 1976 if (cache->dirty_bitset) 1977 free_bitset(cache->dirty_bitset); 1978 1979 if (cache->discard_bitset) 1980 free_bitset(cache->discard_bitset); 1981 1982 if (cache->copier) 1983 dm_kcopyd_client_destroy(cache->copier); 1984 1985 if (cache->cmd) 1986 dm_cache_metadata_close(cache->cmd); 1987 1988 if (cache->metadata_dev) 1989 dm_put_device(cache->ti, cache->metadata_dev); 1990 1991 if (cache->origin_dev) 1992 dm_put_device(cache->ti, cache->origin_dev); 1993 1994 if (cache->cache_dev) 1995 dm_put_device(cache->ti, cache->cache_dev); 1996 1997 if (cache->policy) 1998 dm_cache_policy_destroy(cache->policy); 1999 2000 for (i = 0; i < cache->nr_ctr_args ; i++) 2001 kfree(cache->ctr_args[i]); 2002 kfree(cache->ctr_args); 2003 2004 bioset_exit(&cache->bs); 2005 2006 kfree(cache); 2007} 2008 2009static void cache_dtr(struct dm_target *ti) 2010{ 2011 struct cache *cache = ti->private; 2012 2013 destroy(cache); 2014} 2015 2016static sector_t get_dev_size(struct dm_dev *dev) 2017{ 2018 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2019} 2020 2021/*----------------------------------------------------------------*/ 2022 2023/* 2024 * Construct a cache device mapping. 2025 * 2026 * cache <metadata dev> <cache dev> <origin dev> <block size> 2027 * <#feature args> [<feature arg>]* 2028 * <policy> <#policy args> [<policy arg>]* 2029 * 2030 * metadata dev : fast device holding the persistent metadata 2031 * cache dev : fast device holding cached data blocks 2032 * origin dev : slow device holding original data blocks 2033 * block size : cache unit size in sectors 2034 * 2035 * #feature args : number of feature arguments passed 2036 * feature args : writethrough. (The default is writeback.) 2037 * 2038 * policy : the replacement policy to use 2039 * #policy args : an even number of policy arguments corresponding 2040 * to key/value pairs passed to the policy 2041 * policy args : key/value pairs passed to the policy 2042 * E.g. 'sequential_threshold 1024' 2043 * See cache-policies.txt for details. 2044 * 2045 * Optional feature arguments are: 2046 * writethrough : write through caching that prohibits cache block 2047 * content from being different from origin block content. 2048 * Without this argument, the default behaviour is to write 2049 * back cache block contents later for performance reasons, 2050 * so they may differ from the corresponding origin blocks. 2051 */ 2052struct cache_args { 2053 struct dm_target *ti; 2054 2055 struct dm_dev *metadata_dev; 2056 2057 struct dm_dev *cache_dev; 2058 sector_t cache_sectors; 2059 2060 struct dm_dev *origin_dev; 2061 sector_t origin_sectors; 2062 2063 uint32_t block_size; 2064 2065 const char *policy_name; 2066 int policy_argc; 2067 const char **policy_argv; 2068 2069 struct cache_features features; 2070}; 2071 2072static void destroy_cache_args(struct cache_args *ca) 2073{ 2074 if (ca->metadata_dev) 2075 dm_put_device(ca->ti, ca->metadata_dev); 2076 2077 if (ca->cache_dev) 2078 dm_put_device(ca->ti, ca->cache_dev); 2079 2080 if (ca->origin_dev) 2081 dm_put_device(ca->ti, ca->origin_dev); 2082 2083 kfree(ca); 2084} 2085 2086static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2087{ 2088 if (!as->argc) { 2089 *error = "Insufficient args"; 2090 return false; 2091 } 2092 2093 return true; 2094} 2095 2096static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2097 char **error) 2098{ 2099 int r; 2100 sector_t metadata_dev_size; 2101 char b[BDEVNAME_SIZE]; 2102 2103 if (!at_least_one_arg(as, error)) 2104 return -EINVAL; 2105 2106 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2107 &ca->metadata_dev); 2108 if (r) { 2109 *error = "Error opening metadata device"; 2110 return r; 2111 } 2112 2113 metadata_dev_size = get_dev_size(ca->metadata_dev); 2114 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2115 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2116 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2117 2118 return 0; 2119} 2120 2121static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2122 char **error) 2123{ 2124 int r; 2125 2126 if (!at_least_one_arg(as, error)) 2127 return -EINVAL; 2128 2129 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2130 &ca->cache_dev); 2131 if (r) { 2132 *error = "Error opening cache device"; 2133 return r; 2134 } 2135 ca->cache_sectors = get_dev_size(ca->cache_dev); 2136 2137 return 0; 2138} 2139 2140static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2141 char **error) 2142{ 2143 int r; 2144 2145 if (!at_least_one_arg(as, error)) 2146 return -EINVAL; 2147 2148 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2149 &ca->origin_dev); 2150 if (r) { 2151 *error = "Error opening origin device"; 2152 return r; 2153 } 2154 2155 ca->origin_sectors = get_dev_size(ca->origin_dev); 2156 if (ca->ti->len > ca->origin_sectors) { 2157 *error = "Device size larger than cached device"; 2158 return -EINVAL; 2159 } 2160 2161 return 0; 2162} 2163 2164static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2165 char **error) 2166{ 2167 unsigned long block_size; 2168 2169 if (!at_least_one_arg(as, error)) 2170 return -EINVAL; 2171 2172 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2173 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2174 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2175 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2176 *error = "Invalid data block size"; 2177 return -EINVAL; 2178 } 2179 2180 if (block_size > ca->cache_sectors) { 2181 *error = "Data block size is larger than the cache device"; 2182 return -EINVAL; 2183 } 2184 2185 ca->block_size = block_size; 2186 2187 return 0; 2188} 2189 2190static void init_features(struct cache_features *cf) 2191{ 2192 cf->mode = CM_WRITE; 2193 cf->io_mode = CM_IO_WRITEBACK; 2194 cf->metadata_version = 1; 2195 cf->discard_passdown = true; 2196} 2197 2198static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2199 char **error) 2200{ 2201 static const struct dm_arg _args[] = { 2202 {0, 3, "Invalid number of cache feature arguments"}, 2203 }; 2204 2205 int r, mode_ctr = 0; 2206 unsigned argc; 2207 const char *arg; 2208 struct cache_features *cf = &ca->features; 2209 2210 init_features(cf); 2211 2212 r = dm_read_arg_group(_args, as, &argc, error); 2213 if (r) 2214 return -EINVAL; 2215 2216 while (argc--) { 2217 arg = dm_shift_arg(as); 2218 2219 if (!strcasecmp(arg, "writeback")) { 2220 cf->io_mode = CM_IO_WRITEBACK; 2221 mode_ctr++; 2222 } 2223 2224 else if (!strcasecmp(arg, "writethrough")) { 2225 cf->io_mode = CM_IO_WRITETHROUGH; 2226 mode_ctr++; 2227 } 2228 2229 else if (!strcasecmp(arg, "passthrough")) { 2230 cf->io_mode = CM_IO_PASSTHROUGH; 2231 mode_ctr++; 2232 } 2233 2234 else if (!strcasecmp(arg, "metadata2")) 2235 cf->metadata_version = 2; 2236 2237 else if (!strcasecmp(arg, "no_discard_passdown")) 2238 cf->discard_passdown = false; 2239 2240 else { 2241 *error = "Unrecognised cache feature requested"; 2242 return -EINVAL; 2243 } 2244 } 2245 2246 if (mode_ctr > 1) { 2247 *error = "Duplicate cache io_mode features requested"; 2248 return -EINVAL; 2249 } 2250 2251 return 0; 2252} 2253 2254static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2255 char **error) 2256{ 2257 static const struct dm_arg _args[] = { 2258 {0, 1024, "Invalid number of policy arguments"}, 2259 }; 2260 2261 int r; 2262 2263 if (!at_least_one_arg(as, error)) 2264 return -EINVAL; 2265 2266 ca->policy_name = dm_shift_arg(as); 2267 2268 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2269 if (r) 2270 return -EINVAL; 2271 2272 ca->policy_argv = (const char **)as->argv; 2273 dm_consume_args(as, ca->policy_argc); 2274 2275 return 0; 2276} 2277 2278static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2279 char **error) 2280{ 2281 int r; 2282 struct dm_arg_set as; 2283 2284 as.argc = argc; 2285 as.argv = argv; 2286 2287 r = parse_metadata_dev(ca, &as, error); 2288 if (r) 2289 return r; 2290 2291 r = parse_cache_dev(ca, &as, error); 2292 if (r) 2293 return r; 2294 2295 r = parse_origin_dev(ca, &as, error); 2296 if (r) 2297 return r; 2298 2299 r = parse_block_size(ca, &as, error); 2300 if (r) 2301 return r; 2302 2303 r = parse_features(ca, &as, error); 2304 if (r) 2305 return r; 2306 2307 r = parse_policy(ca, &as, error); 2308 if (r) 2309 return r; 2310 2311 return 0; 2312} 2313 2314/*----------------------------------------------------------------*/ 2315 2316static struct kmem_cache *migration_cache; 2317 2318#define NOT_CORE_OPTION 1 2319 2320static int process_config_option(struct cache *cache, const char *key, const char *value) 2321{ 2322 unsigned long tmp; 2323 2324 if (!strcasecmp(key, "migration_threshold")) { 2325 if (kstrtoul(value, 10, &tmp)) 2326 return -EINVAL; 2327 2328 cache->migration_threshold = tmp; 2329 return 0; 2330 } 2331 2332 return NOT_CORE_OPTION; 2333} 2334 2335static int set_config_value(struct cache *cache, const char *key, const char *value) 2336{ 2337 int r = process_config_option(cache, key, value); 2338 2339 if (r == NOT_CORE_OPTION) 2340 r = policy_set_config_value(cache->policy, key, value); 2341 2342 if (r) 2343 DMWARN("bad config value for %s: %s", key, value); 2344 2345 return r; 2346} 2347 2348static int set_config_values(struct cache *cache, int argc, const char **argv) 2349{ 2350 int r = 0; 2351 2352 if (argc & 1) { 2353 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2354 return -EINVAL; 2355 } 2356 2357 while (argc) { 2358 r = set_config_value(cache, argv[0], argv[1]); 2359 if (r) 2360 break; 2361 2362 argc -= 2; 2363 argv += 2; 2364 } 2365 2366 return r; 2367} 2368 2369static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2370 char **error) 2371{ 2372 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2373 cache->cache_size, 2374 cache->origin_sectors, 2375 cache->sectors_per_block); 2376 if (IS_ERR(p)) { 2377 *error = "Error creating cache's policy"; 2378 return PTR_ERR(p); 2379 } 2380 cache->policy = p; 2381 BUG_ON(!cache->policy); 2382 2383 return 0; 2384} 2385 2386/* 2387 * We want the discard block size to be at least the size of the cache 2388 * block size and have no more than 2^14 discard blocks across the origin. 2389 */ 2390#define MAX_DISCARD_BLOCKS (1 << 14) 2391 2392static bool too_many_discard_blocks(sector_t discard_block_size, 2393 sector_t origin_size) 2394{ 2395 (void) sector_div(origin_size, discard_block_size); 2396 2397 return origin_size > MAX_DISCARD_BLOCKS; 2398} 2399 2400static sector_t calculate_discard_block_size(sector_t cache_block_size, 2401 sector_t origin_size) 2402{ 2403 sector_t discard_block_size = cache_block_size; 2404 2405 if (origin_size) 2406 while (too_many_discard_blocks(discard_block_size, origin_size)) 2407 discard_block_size *= 2; 2408 2409 return discard_block_size; 2410} 2411 2412static void set_cache_size(struct cache *cache, dm_cblock_t size) 2413{ 2414 dm_block_t nr_blocks = from_cblock(size); 2415 2416 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2417 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2418 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2419 "Please consider increasing the cache block size to reduce the overall cache block count.", 2420 (unsigned long long) nr_blocks); 2421 2422 cache->cache_size = size; 2423} 2424 2425#define DEFAULT_MIGRATION_THRESHOLD 2048 2426 2427static int cache_create(struct cache_args *ca, struct cache **result) 2428{ 2429 int r = 0; 2430 char **error = &ca->ti->error; 2431 struct cache *cache; 2432 struct dm_target *ti = ca->ti; 2433 dm_block_t origin_blocks; 2434 struct dm_cache_metadata *cmd; 2435 bool may_format = ca->features.mode == CM_WRITE; 2436 2437 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2438 if (!cache) 2439 return -ENOMEM; 2440 2441 cache->ti = ca->ti; 2442 ti->private = cache; 2443 ti->num_flush_bios = 2; 2444 ti->flush_supported = true; 2445 2446 ti->num_discard_bios = 1; 2447 ti->discards_supported = true; 2448 2449 ti->per_io_data_size = sizeof(struct per_bio_data); 2450 2451 cache->features = ca->features; 2452 if (writethrough_mode(cache)) { 2453 /* Create bioset for writethrough bios issued to origin */ 2454 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2455 if (r) 2456 goto bad; 2457 } 2458 2459 cache->metadata_dev = ca->metadata_dev; 2460 cache->origin_dev = ca->origin_dev; 2461 cache->cache_dev = ca->cache_dev; 2462 2463 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2464 2465 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2466 origin_blocks = block_div(origin_blocks, ca->block_size); 2467 cache->origin_blocks = to_oblock(origin_blocks); 2468 2469 cache->sectors_per_block = ca->block_size; 2470 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2471 r = -EINVAL; 2472 goto bad; 2473 } 2474 2475 if (ca->block_size & (ca->block_size - 1)) { 2476 dm_block_t cache_size = ca->cache_sectors; 2477 2478 cache->sectors_per_block_shift = -1; 2479 cache_size = block_div(cache_size, ca->block_size); 2480 set_cache_size(cache, to_cblock(cache_size)); 2481 } else { 2482 cache->sectors_per_block_shift = __ffs(ca->block_size); 2483 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2484 } 2485 2486 r = create_cache_policy(cache, ca, error); 2487 if (r) 2488 goto bad; 2489 2490 cache->policy_nr_args = ca->policy_argc; 2491 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2492 2493 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2494 if (r) { 2495 *error = "Error setting cache policy's config values"; 2496 goto bad; 2497 } 2498 2499 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2500 ca->block_size, may_format, 2501 dm_cache_policy_get_hint_size(cache->policy), 2502 ca->features.metadata_version); 2503 if (IS_ERR(cmd)) { 2504 *error = "Error creating metadata object"; 2505 r = PTR_ERR(cmd); 2506 goto bad; 2507 } 2508 cache->cmd = cmd; 2509 set_cache_mode(cache, CM_WRITE); 2510 if (get_cache_mode(cache) != CM_WRITE) { 2511 *error = "Unable to get write access to metadata, please check/repair metadata."; 2512 r = -EINVAL; 2513 goto bad; 2514 } 2515 2516 if (passthrough_mode(cache)) { 2517 bool all_clean; 2518 2519 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2520 if (r) { 2521 *error = "dm_cache_metadata_all_clean() failed"; 2522 goto bad; 2523 } 2524 2525 if (!all_clean) { 2526 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2527 r = -EINVAL; 2528 goto bad; 2529 } 2530 2531 policy_allow_migrations(cache->policy, false); 2532 } 2533 2534 spin_lock_init(&cache->lock); 2535 bio_list_init(&cache->deferred_bios); 2536 atomic_set(&cache->nr_allocated_migrations, 0); 2537 atomic_set(&cache->nr_io_migrations, 0); 2538 init_waitqueue_head(&cache->migration_wait); 2539 2540 r = -ENOMEM; 2541 atomic_set(&cache->nr_dirty, 0); 2542 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2543 if (!cache->dirty_bitset) { 2544 *error = "could not allocate dirty bitset"; 2545 goto bad; 2546 } 2547 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2548 2549 cache->discard_block_size = 2550 calculate_discard_block_size(cache->sectors_per_block, 2551 cache->origin_sectors); 2552 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2553 cache->discard_block_size)); 2554 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2555 if (!cache->discard_bitset) { 2556 *error = "could not allocate discard bitset"; 2557 goto bad; 2558 } 2559 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2560 2561 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2562 if (IS_ERR(cache->copier)) { 2563 *error = "could not create kcopyd client"; 2564 r = PTR_ERR(cache->copier); 2565 goto bad; 2566 } 2567 2568 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2569 if (!cache->wq) { 2570 *error = "could not create workqueue for metadata object"; 2571 goto bad; 2572 } 2573 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2574 INIT_WORK(&cache->migration_worker, check_migrations); 2575 INIT_DELAYED_WORK(&cache->waker, do_waker); 2576 2577 cache->prison = dm_bio_prison_create_v2(cache->wq); 2578 if (!cache->prison) { 2579 *error = "could not create bio prison"; 2580 goto bad; 2581 } 2582 2583 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2584 migration_cache); 2585 if (r) { 2586 *error = "Error creating cache's migration mempool"; 2587 goto bad; 2588 } 2589 2590 cache->need_tick_bio = true; 2591 cache->sized = false; 2592 cache->invalidate = false; 2593 cache->commit_requested = false; 2594 cache->loaded_mappings = false; 2595 cache->loaded_discards = false; 2596 2597 load_stats(cache); 2598 2599 atomic_set(&cache->stats.demotion, 0); 2600 atomic_set(&cache->stats.promotion, 0); 2601 atomic_set(&cache->stats.copies_avoided, 0); 2602 atomic_set(&cache->stats.cache_cell_clash, 0); 2603 atomic_set(&cache->stats.commit_count, 0); 2604 atomic_set(&cache->stats.discard_count, 0); 2605 2606 spin_lock_init(&cache->invalidation_lock); 2607 INIT_LIST_HEAD(&cache->invalidation_requests); 2608 2609 batcher_init(&cache->committer, commit_op, cache, 2610 issue_op, cache, cache->wq); 2611 iot_init(&cache->tracker); 2612 2613 init_rwsem(&cache->background_work_lock); 2614 prevent_background_work(cache); 2615 2616 *result = cache; 2617 return 0; 2618bad: 2619 destroy(cache); 2620 return r; 2621} 2622 2623static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2624{ 2625 unsigned i; 2626 const char **copy; 2627 2628 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2629 if (!copy) 2630 return -ENOMEM; 2631 for (i = 0; i < argc; i++) { 2632 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2633 if (!copy[i]) { 2634 while (i--) 2635 kfree(copy[i]); 2636 kfree(copy); 2637 return -ENOMEM; 2638 } 2639 } 2640 2641 cache->nr_ctr_args = argc; 2642 cache->ctr_args = copy; 2643 2644 return 0; 2645} 2646 2647static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2648{ 2649 int r = -EINVAL; 2650 struct cache_args *ca; 2651 struct cache *cache = NULL; 2652 2653 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2654 if (!ca) { 2655 ti->error = "Error allocating memory for cache"; 2656 return -ENOMEM; 2657 } 2658 ca->ti = ti; 2659 2660 r = parse_cache_args(ca, argc, argv, &ti->error); 2661 if (r) 2662 goto out; 2663 2664 r = cache_create(ca, &cache); 2665 if (r) 2666 goto out; 2667 2668 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2669 if (r) { 2670 destroy(cache); 2671 goto out; 2672 } 2673 2674 ti->private = cache; 2675out: 2676 destroy_cache_args(ca); 2677 return r; 2678} 2679 2680/*----------------------------------------------------------------*/ 2681 2682static int cache_map(struct dm_target *ti, struct bio *bio) 2683{ 2684 struct cache *cache = ti->private; 2685 2686 int r; 2687 bool commit_needed; 2688 dm_oblock_t block = get_bio_block(cache, bio); 2689 2690 init_per_bio_data(bio); 2691 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2692 /* 2693 * This can only occur if the io goes to a partial block at 2694 * the end of the origin device. We don't cache these. 2695 * Just remap to the origin and carry on. 2696 */ 2697 remap_to_origin(cache, bio); 2698 accounted_begin(cache, bio); 2699 return DM_MAPIO_REMAPPED; 2700 } 2701 2702 if (discard_or_flush(bio)) { 2703 defer_bio(cache, bio); 2704 return DM_MAPIO_SUBMITTED; 2705 } 2706 2707 r = map_bio(cache, bio, block, &commit_needed); 2708 if (commit_needed) 2709 schedule_commit(&cache->committer); 2710 2711 return r; 2712} 2713 2714static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2715{ 2716 struct cache *cache = ti->private; 2717 unsigned long flags; 2718 struct per_bio_data *pb = get_per_bio_data(bio); 2719 2720 if (pb->tick) { 2721 policy_tick(cache->policy, false); 2722 2723 spin_lock_irqsave(&cache->lock, flags); 2724 cache->need_tick_bio = true; 2725 spin_unlock_irqrestore(&cache->lock, flags); 2726 } 2727 2728 bio_drop_shared_lock(cache, bio); 2729 accounted_complete(cache, bio); 2730 2731 return DM_ENDIO_DONE; 2732} 2733 2734static int write_dirty_bitset(struct cache *cache) 2735{ 2736 int r; 2737 2738 if (get_cache_mode(cache) >= CM_READ_ONLY) 2739 return -EINVAL; 2740 2741 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2742 if (r) 2743 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2744 2745 return r; 2746} 2747 2748static int write_discard_bitset(struct cache *cache) 2749{ 2750 unsigned i, r; 2751 2752 if (get_cache_mode(cache) >= CM_READ_ONLY) 2753 return -EINVAL; 2754 2755 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2756 cache->discard_nr_blocks); 2757 if (r) { 2758 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2759 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2760 return r; 2761 } 2762 2763 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2764 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2765 is_discarded(cache, to_dblock(i))); 2766 if (r) { 2767 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2768 return r; 2769 } 2770 } 2771 2772 return 0; 2773} 2774 2775static int write_hints(struct cache *cache) 2776{ 2777 int r; 2778 2779 if (get_cache_mode(cache) >= CM_READ_ONLY) 2780 return -EINVAL; 2781 2782 r = dm_cache_write_hints(cache->cmd, cache->policy); 2783 if (r) { 2784 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2785 return r; 2786 } 2787 2788 return 0; 2789} 2790 2791/* 2792 * returns true on success 2793 */ 2794static bool sync_metadata(struct cache *cache) 2795{ 2796 int r1, r2, r3, r4; 2797 2798 r1 = write_dirty_bitset(cache); 2799 if (r1) 2800 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2801 2802 r2 = write_discard_bitset(cache); 2803 if (r2) 2804 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2805 2806 save_stats(cache); 2807 2808 r3 = write_hints(cache); 2809 if (r3) 2810 DMERR("%s: could not write hints", cache_device_name(cache)); 2811 2812 /* 2813 * If writing the above metadata failed, we still commit, but don't 2814 * set the clean shutdown flag. This will effectively force every 2815 * dirty bit to be set on reload. 2816 */ 2817 r4 = commit(cache, !r1 && !r2 && !r3); 2818 if (r4) 2819 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2820 2821 return !r1 && !r2 && !r3 && !r4; 2822} 2823 2824static void cache_postsuspend(struct dm_target *ti) 2825{ 2826 struct cache *cache = ti->private; 2827 2828 prevent_background_work(cache); 2829 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2830 2831 cancel_delayed_work_sync(&cache->waker); 2832 drain_workqueue(cache->wq); 2833 WARN_ON(cache->tracker.in_flight); 2834 2835 /* 2836 * If it's a flush suspend there won't be any deferred bios, so this 2837 * call is harmless. 2838 */ 2839 requeue_deferred_bios(cache); 2840 2841 if (get_cache_mode(cache) == CM_WRITE) 2842 (void) sync_metadata(cache); 2843} 2844 2845static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2846 bool dirty, uint32_t hint, bool hint_valid) 2847{ 2848 int r; 2849 struct cache *cache = context; 2850 2851 if (dirty) { 2852 set_bit(from_cblock(cblock), cache->dirty_bitset); 2853 atomic_inc(&cache->nr_dirty); 2854 } else 2855 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2856 2857 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2858 if (r) 2859 return r; 2860 2861 return 0; 2862} 2863 2864/* 2865 * The discard block size in the on disk metadata is not 2866 * neccessarily the same as we're currently using. So we have to 2867 * be careful to only set the discarded attribute if we know it 2868 * covers a complete block of the new size. 2869 */ 2870struct discard_load_info { 2871 struct cache *cache; 2872 2873 /* 2874 * These blocks are sized using the on disk dblock size, rather 2875 * than the current one. 2876 */ 2877 dm_block_t block_size; 2878 dm_block_t discard_begin, discard_end; 2879}; 2880 2881static void discard_load_info_init(struct cache *cache, 2882 struct discard_load_info *li) 2883{ 2884 li->cache = cache; 2885 li->discard_begin = li->discard_end = 0; 2886} 2887 2888static void set_discard_range(struct discard_load_info *li) 2889{ 2890 sector_t b, e; 2891 2892 if (li->discard_begin == li->discard_end) 2893 return; 2894 2895 /* 2896 * Convert to sectors. 2897 */ 2898 b = li->discard_begin * li->block_size; 2899 e = li->discard_end * li->block_size; 2900 2901 /* 2902 * Then convert back to the current dblock size. 2903 */ 2904 b = dm_sector_div_up(b, li->cache->discard_block_size); 2905 sector_div(e, li->cache->discard_block_size); 2906 2907 /* 2908 * The origin may have shrunk, so we need to check we're still in 2909 * bounds. 2910 */ 2911 if (e > from_dblock(li->cache->discard_nr_blocks)) 2912 e = from_dblock(li->cache->discard_nr_blocks); 2913 2914 for (; b < e; b++) 2915 set_discard(li->cache, to_dblock(b)); 2916} 2917 2918static int load_discard(void *context, sector_t discard_block_size, 2919 dm_dblock_t dblock, bool discard) 2920{ 2921 struct discard_load_info *li = context; 2922 2923 li->block_size = discard_block_size; 2924 2925 if (discard) { 2926 if (from_dblock(dblock) == li->discard_end) 2927 /* 2928 * We're already in a discard range, just extend it. 2929 */ 2930 li->discard_end = li->discard_end + 1ULL; 2931 2932 else { 2933 /* 2934 * Emit the old range and start a new one. 2935 */ 2936 set_discard_range(li); 2937 li->discard_begin = from_dblock(dblock); 2938 li->discard_end = li->discard_begin + 1ULL; 2939 } 2940 } else { 2941 set_discard_range(li); 2942 li->discard_begin = li->discard_end = 0; 2943 } 2944 2945 return 0; 2946} 2947 2948static dm_cblock_t get_cache_dev_size(struct cache *cache) 2949{ 2950 sector_t size = get_dev_size(cache->cache_dev); 2951 (void) sector_div(size, cache->sectors_per_block); 2952 return to_cblock(size); 2953} 2954 2955static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2956{ 2957 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2958 if (cache->sized) { 2959 DMERR("%s: unable to extend cache due to missing cache table reload", 2960 cache_device_name(cache)); 2961 return false; 2962 } 2963 } 2964 2965 /* 2966 * We can't drop a dirty block when shrinking the cache. 2967 */ 2968 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 2969 new_size = to_cblock(from_cblock(new_size) + 1); 2970 if (is_dirty(cache, new_size)) { 2971 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2972 cache_device_name(cache), 2973 (unsigned long long) from_cblock(new_size)); 2974 return false; 2975 } 2976 } 2977 2978 return true; 2979} 2980 2981static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2982{ 2983 int r; 2984 2985 r = dm_cache_resize(cache->cmd, new_size); 2986 if (r) { 2987 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 2988 metadata_operation_failed(cache, "dm_cache_resize", r); 2989 return r; 2990 } 2991 2992 set_cache_size(cache, new_size); 2993 2994 return 0; 2995} 2996 2997static int cache_preresume(struct dm_target *ti) 2998{ 2999 int r = 0; 3000 struct cache *cache = ti->private; 3001 dm_cblock_t csize = get_cache_dev_size(cache); 3002 3003 /* 3004 * Check to see if the cache has resized. 3005 */ 3006 if (!cache->sized) { 3007 r = resize_cache_dev(cache, csize); 3008 if (r) 3009 return r; 3010 3011 cache->sized = true; 3012 3013 } else if (csize != cache->cache_size) { 3014 if (!can_resize(cache, csize)) 3015 return -EINVAL; 3016 3017 r = resize_cache_dev(cache, csize); 3018 if (r) 3019 return r; 3020 } 3021 3022 if (!cache->loaded_mappings) { 3023 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3024 load_mapping, cache); 3025 if (r) { 3026 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3027 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3028 return r; 3029 } 3030 3031 cache->loaded_mappings = true; 3032 } 3033 3034 if (!cache->loaded_discards) { 3035 struct discard_load_info li; 3036 3037 /* 3038 * The discard bitset could have been resized, or the 3039 * discard block size changed. To be safe we start by 3040 * setting every dblock to not discarded. 3041 */ 3042 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3043 3044 discard_load_info_init(cache, &li); 3045 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3046 if (r) { 3047 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3048 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3049 return r; 3050 } 3051 set_discard_range(&li); 3052 3053 cache->loaded_discards = true; 3054 } 3055 3056 return r; 3057} 3058 3059static void cache_resume(struct dm_target *ti) 3060{ 3061 struct cache *cache = ti->private; 3062 3063 cache->need_tick_bio = true; 3064 allow_background_work(cache); 3065 do_waker(&cache->waker.work); 3066} 3067 3068static void emit_flags(struct cache *cache, char *result, 3069 unsigned maxlen, ssize_t *sz_ptr) 3070{ 3071 ssize_t sz = *sz_ptr; 3072 struct cache_features *cf = &cache->features; 3073 unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3074 3075 DMEMIT("%u ", count); 3076 3077 if (cf->metadata_version == 2) 3078 DMEMIT("metadata2 "); 3079 3080 if (writethrough_mode(cache)) 3081 DMEMIT("writethrough "); 3082 3083 else if (passthrough_mode(cache)) 3084 DMEMIT("passthrough "); 3085 3086 else if (writeback_mode(cache)) 3087 DMEMIT("writeback "); 3088 3089 else { 3090 DMEMIT("unknown "); 3091 DMERR("%s: internal error: unknown io mode: %d", 3092 cache_device_name(cache), (int) cf->io_mode); 3093 } 3094 3095 if (!cf->discard_passdown) 3096 DMEMIT("no_discard_passdown "); 3097 3098 *sz_ptr = sz; 3099} 3100 3101/* 3102 * Status format: 3103 * 3104 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3105 * <cache block size> <#used cache blocks>/<#total cache blocks> 3106 * <#read hits> <#read misses> <#write hits> <#write misses> 3107 * <#demotions> <#promotions> <#dirty> 3108 * <#features> <features>* 3109 * <#core args> <core args> 3110 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3111 */ 3112static void cache_status(struct dm_target *ti, status_type_t type, 3113 unsigned status_flags, char *result, unsigned maxlen) 3114{ 3115 int r = 0; 3116 unsigned i; 3117 ssize_t sz = 0; 3118 dm_block_t nr_free_blocks_metadata = 0; 3119 dm_block_t nr_blocks_metadata = 0; 3120 char buf[BDEVNAME_SIZE]; 3121 struct cache *cache = ti->private; 3122 dm_cblock_t residency; 3123 bool needs_check; 3124 3125 switch (type) { 3126 case STATUSTYPE_INFO: 3127 if (get_cache_mode(cache) == CM_FAIL) { 3128 DMEMIT("Fail"); 3129 break; 3130 } 3131 3132 /* Commit to ensure statistics aren't out-of-date */ 3133 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3134 (void) commit(cache, false); 3135 3136 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3137 if (r) { 3138 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3139 cache_device_name(cache), r); 3140 goto err; 3141 } 3142 3143 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3144 if (r) { 3145 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3146 cache_device_name(cache), r); 3147 goto err; 3148 } 3149 3150 residency = policy_residency(cache->policy); 3151 3152 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3153 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3154 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3155 (unsigned long long)nr_blocks_metadata, 3156 (unsigned long long)cache->sectors_per_block, 3157 (unsigned long long) from_cblock(residency), 3158 (unsigned long long) from_cblock(cache->cache_size), 3159 (unsigned) atomic_read(&cache->stats.read_hit), 3160 (unsigned) atomic_read(&cache->stats.read_miss), 3161 (unsigned) atomic_read(&cache->stats.write_hit), 3162 (unsigned) atomic_read(&cache->stats.write_miss), 3163 (unsigned) atomic_read(&cache->stats.demotion), 3164 (unsigned) atomic_read(&cache->stats.promotion), 3165 (unsigned long) atomic_read(&cache->nr_dirty)); 3166 3167 emit_flags(cache, result, maxlen, &sz); 3168 3169 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3170 3171 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3172 if (sz < maxlen) { 3173 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3174 if (r) 3175 DMERR("%s: policy_emit_config_values returned %d", 3176 cache_device_name(cache), r); 3177 } 3178 3179 if (get_cache_mode(cache) == CM_READ_ONLY) 3180 DMEMIT("ro "); 3181 else 3182 DMEMIT("rw "); 3183 3184 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3185 3186 if (r || needs_check) 3187 DMEMIT("needs_check "); 3188 else 3189 DMEMIT("- "); 3190 3191 break; 3192 3193 case STATUSTYPE_TABLE: 3194 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3195 DMEMIT("%s ", buf); 3196 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3197 DMEMIT("%s ", buf); 3198 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3199 DMEMIT("%s", buf); 3200 3201 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3202 DMEMIT(" %s", cache->ctr_args[i]); 3203 if (cache->nr_ctr_args) 3204 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3205 } 3206 3207 return; 3208 3209err: 3210 DMEMIT("Error"); 3211} 3212 3213/* 3214 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3215 * the one-past-the-end value. 3216 */ 3217struct cblock_range { 3218 dm_cblock_t begin; 3219 dm_cblock_t end; 3220}; 3221 3222/* 3223 * A cache block range can take two forms: 3224 * 3225 * i) A single cblock, eg. '3456' 3226 * ii) A begin and end cblock with a dash between, eg. 123-234 3227 */ 3228static int parse_cblock_range(struct cache *cache, const char *str, 3229 struct cblock_range *result) 3230{ 3231 char dummy; 3232 uint64_t b, e; 3233 int r; 3234 3235 /* 3236 * Try and parse form (ii) first. 3237 */ 3238 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3239 if (r < 0) 3240 return r; 3241 3242 if (r == 2) { 3243 result->begin = to_cblock(b); 3244 result->end = to_cblock(e); 3245 return 0; 3246 } 3247 3248 /* 3249 * That didn't work, try form (i). 3250 */ 3251 r = sscanf(str, "%llu%c", &b, &dummy); 3252 if (r < 0) 3253 return r; 3254 3255 if (r == 1) { 3256 result->begin = to_cblock(b); 3257 result->end = to_cblock(from_cblock(result->begin) + 1u); 3258 return 0; 3259 } 3260 3261 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3262 return -EINVAL; 3263} 3264 3265static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3266{ 3267 uint64_t b = from_cblock(range->begin); 3268 uint64_t e = from_cblock(range->end); 3269 uint64_t n = from_cblock(cache->cache_size); 3270 3271 if (b >= n) { 3272 DMERR("%s: begin cblock out of range: %llu >= %llu", 3273 cache_device_name(cache), b, n); 3274 return -EINVAL; 3275 } 3276 3277 if (e > n) { 3278 DMERR("%s: end cblock out of range: %llu > %llu", 3279 cache_device_name(cache), e, n); 3280 return -EINVAL; 3281 } 3282 3283 if (b >= e) { 3284 DMERR("%s: invalid cblock range: %llu >= %llu", 3285 cache_device_name(cache), b, e); 3286 return -EINVAL; 3287 } 3288 3289 return 0; 3290} 3291 3292static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3293{ 3294 return to_cblock(from_cblock(b) + 1); 3295} 3296 3297static int request_invalidation(struct cache *cache, struct cblock_range *range) 3298{ 3299 int r = 0; 3300 3301 /* 3302 * We don't need to do any locking here because we know we're in 3303 * passthrough mode. There's is potential for a race between an 3304 * invalidation triggered by an io and an invalidation message. This 3305 * is harmless, we must not worry if the policy call fails. 3306 */ 3307 while (range->begin != range->end) { 3308 r = invalidate_cblock(cache, range->begin); 3309 if (r) 3310 return r; 3311 3312 range->begin = cblock_succ(range->begin); 3313 } 3314 3315 cache->commit_requested = true; 3316 return r; 3317} 3318 3319static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3320 const char **cblock_ranges) 3321{ 3322 int r = 0; 3323 unsigned i; 3324 struct cblock_range range; 3325 3326 if (!passthrough_mode(cache)) { 3327 DMERR("%s: cache has to be in passthrough mode for invalidation", 3328 cache_device_name(cache)); 3329 return -EPERM; 3330 } 3331 3332 for (i = 0; i < count; i++) { 3333 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3334 if (r) 3335 break; 3336 3337 r = validate_cblock_range(cache, &range); 3338 if (r) 3339 break; 3340 3341 /* 3342 * Pass begin and end origin blocks to the worker and wake it. 3343 */ 3344 r = request_invalidation(cache, &range); 3345 if (r) 3346 break; 3347 } 3348 3349 return r; 3350} 3351 3352/* 3353 * Supports 3354 * "<key> <value>" 3355 * and 3356 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3357 * 3358 * The key migration_threshold is supported by the cache target core. 3359 */ 3360static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3361 char *result, unsigned maxlen) 3362{ 3363 struct cache *cache = ti->private; 3364 3365 if (!argc) 3366 return -EINVAL; 3367 3368 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3369 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3370 cache_device_name(cache)); 3371 return -EOPNOTSUPP; 3372 } 3373 3374 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3375 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3376 3377 if (argc != 2) 3378 return -EINVAL; 3379 3380 return set_config_value(cache, argv[0], argv[1]); 3381} 3382 3383static int cache_iterate_devices(struct dm_target *ti, 3384 iterate_devices_callout_fn fn, void *data) 3385{ 3386 int r = 0; 3387 struct cache *cache = ti->private; 3388 3389 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3390 if (!r) 3391 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3392 3393 return r; 3394} 3395 3396static bool origin_dev_supports_discard(struct block_device *origin_bdev) 3397{ 3398 struct request_queue *q = bdev_get_queue(origin_bdev); 3399 3400 return q && blk_queue_discard(q); 3401} 3402 3403/* 3404 * If discard_passdown was enabled verify that the origin device 3405 * supports discards. Disable discard_passdown if not. 3406 */ 3407static void disable_passdown_if_not_supported(struct cache *cache) 3408{ 3409 struct block_device *origin_bdev = cache->origin_dev->bdev; 3410 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3411 const char *reason = NULL; 3412 char buf[BDEVNAME_SIZE]; 3413 3414 if (!cache->features.discard_passdown) 3415 return; 3416 3417 if (!origin_dev_supports_discard(origin_bdev)) 3418 reason = "discard unsupported"; 3419 3420 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3421 reason = "max discard sectors smaller than a block"; 3422 3423 if (reason) { 3424 DMWARN("Origin device (%s) %s: Disabling discard passdown.", 3425 bdevname(origin_bdev, buf), reason); 3426 cache->features.discard_passdown = false; 3427 } 3428} 3429 3430static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3431{ 3432 struct block_device *origin_bdev = cache->origin_dev->bdev; 3433 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3434 3435 if (!cache->features.discard_passdown) { 3436 /* No passdown is done so setting own virtual limits */ 3437 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3438 cache->origin_sectors); 3439 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3440 return; 3441 } 3442 3443 /* 3444 * cache_iterate_devices() is stacking both origin and fast device limits 3445 * but discards aren't passed to fast device, so inherit origin's limits. 3446 */ 3447 limits->max_discard_sectors = origin_limits->max_discard_sectors; 3448 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3449 limits->discard_granularity = origin_limits->discard_granularity; 3450 limits->discard_alignment = origin_limits->discard_alignment; 3451 limits->discard_misaligned = origin_limits->discard_misaligned; 3452} 3453 3454static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3455{ 3456 struct cache *cache = ti->private; 3457 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3458 3459 /* 3460 * If the system-determined stacked limits are compatible with the 3461 * cache's blocksize (io_opt is a factor) do not override them. 3462 */ 3463 if (io_opt_sectors < cache->sectors_per_block || 3464 do_div(io_opt_sectors, cache->sectors_per_block)) { 3465 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3466 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3467 } 3468 3469 disable_passdown_if_not_supported(cache); 3470 set_discard_limits(cache, limits); 3471} 3472 3473/*----------------------------------------------------------------*/ 3474 3475static struct target_type cache_target = { 3476 .name = "cache", 3477 .version = {2, 2, 0}, 3478 .module = THIS_MODULE, 3479 .ctr = cache_ctr, 3480 .dtr = cache_dtr, 3481 .map = cache_map, 3482 .end_io = cache_end_io, 3483 .postsuspend = cache_postsuspend, 3484 .preresume = cache_preresume, 3485 .resume = cache_resume, 3486 .status = cache_status, 3487 .message = cache_message, 3488 .iterate_devices = cache_iterate_devices, 3489 .io_hints = cache_io_hints, 3490}; 3491 3492static int __init dm_cache_init(void) 3493{ 3494 int r; 3495 3496 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3497 if (!migration_cache) 3498 return -ENOMEM; 3499 3500 r = dm_register_target(&cache_target); 3501 if (r) { 3502 DMERR("cache target registration failed: %d", r); 3503 kmem_cache_destroy(migration_cache); 3504 return r; 3505 } 3506 3507 return 0; 3508} 3509 3510static void __exit dm_cache_exit(void) 3511{ 3512 dm_unregister_target(&cache_target); 3513 kmem_cache_destroy(migration_cache); 3514} 3515 3516module_init(dm_cache_init); 3517module_exit(dm_cache_exit); 3518 3519MODULE_DESCRIPTION(DM_NAME " cache target"); 3520MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3521MODULE_LICENSE("GPL"); 3522