18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Uses a block device as cache for other block devices; optimized for SSDs. 68c2ecf20Sopenharmony_ci * All allocation is done in buckets, which should match the erase block size 78c2ecf20Sopenharmony_ci * of the device. 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * Buckets containing cached data are kept on a heap sorted by priority; 108c2ecf20Sopenharmony_ci * bucket priority is increased on cache hit, and periodically all the buckets 118c2ecf20Sopenharmony_ci * on the heap have their priority scaled down. This currently is just used as 128c2ecf20Sopenharmony_ci * an LRU but in the future should allow for more intelligent heuristics. 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 158c2ecf20Sopenharmony_ci * counter. Garbage collection is used to remove stale pointers. 168c2ecf20Sopenharmony_ci * 178c2ecf20Sopenharmony_ci * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 188c2ecf20Sopenharmony_ci * as keys are inserted we only sort the pages that have not yet been written. 198c2ecf20Sopenharmony_ci * When garbage collection is run, we resort the entire node. 208c2ecf20Sopenharmony_ci * 218c2ecf20Sopenharmony_ci * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. 228c2ecf20Sopenharmony_ci */ 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_ci#include "bcache.h" 258c2ecf20Sopenharmony_ci#include "btree.h" 268c2ecf20Sopenharmony_ci#include "debug.h" 278c2ecf20Sopenharmony_ci#include "extents.h" 288c2ecf20Sopenharmony_ci 298c2ecf20Sopenharmony_ci#include <linux/slab.h> 308c2ecf20Sopenharmony_ci#include <linux/bitops.h> 318c2ecf20Sopenharmony_ci#include <linux/hash.h> 328c2ecf20Sopenharmony_ci#include <linux/kthread.h> 338c2ecf20Sopenharmony_ci#include <linux/prefetch.h> 348c2ecf20Sopenharmony_ci#include <linux/random.h> 358c2ecf20Sopenharmony_ci#include <linux/rcupdate.h> 368c2ecf20Sopenharmony_ci#include <linux/sched/clock.h> 378c2ecf20Sopenharmony_ci#include <linux/rculist.h> 388c2ecf20Sopenharmony_ci#include <linux/delay.h> 398c2ecf20Sopenharmony_ci#include <trace/events/bcache.h> 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci/* 428c2ecf20Sopenharmony_ci * Todo: 438c2ecf20Sopenharmony_ci * register_bcache: Return errors out to userspace correctly 448c2ecf20Sopenharmony_ci * 458c2ecf20Sopenharmony_ci * Writeback: don't undirty key until after a cache flush 468c2ecf20Sopenharmony_ci * 478c2ecf20Sopenharmony_ci * Create an iterator for key pointers 488c2ecf20Sopenharmony_ci * 498c2ecf20Sopenharmony_ci * On btree write error, mark bucket such that it won't be freed from the cache 508c2ecf20Sopenharmony_ci * 518c2ecf20Sopenharmony_ci * Journalling: 528c2ecf20Sopenharmony_ci * Check for bad keys in replay 538c2ecf20Sopenharmony_ci * Propagate barriers 548c2ecf20Sopenharmony_ci * Refcount journal entries in journal_replay 558c2ecf20Sopenharmony_ci * 568c2ecf20Sopenharmony_ci * Garbage collection: 578c2ecf20Sopenharmony_ci * Finish incremental gc 588c2ecf20Sopenharmony_ci * Gc should free old UUIDs, data for invalid UUIDs 598c2ecf20Sopenharmony_ci * 608c2ecf20Sopenharmony_ci * Provide a way to list backing device UUIDs we have data cached for, and 618c2ecf20Sopenharmony_ci * probably how long it's been since we've seen them, and a way to invalidate 628c2ecf20Sopenharmony_ci * dirty data for devices that will never be attached again 638c2ecf20Sopenharmony_ci * 648c2ecf20Sopenharmony_ci * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so 658c2ecf20Sopenharmony_ci * that based on that and how much dirty data we have we can keep writeback 668c2ecf20Sopenharmony_ci * from being starved 678c2ecf20Sopenharmony_ci * 688c2ecf20Sopenharmony_ci * Add a tracepoint or somesuch to watch for writeback starvation 698c2ecf20Sopenharmony_ci * 708c2ecf20Sopenharmony_ci * When btree depth > 1 and splitting an interior node, we have to make sure 718c2ecf20Sopenharmony_ci * alloc_bucket() cannot fail. This should be true but is not completely 728c2ecf20Sopenharmony_ci * obvious. 738c2ecf20Sopenharmony_ci * 748c2ecf20Sopenharmony_ci * Plugging? 758c2ecf20Sopenharmony_ci * 768c2ecf20Sopenharmony_ci * If data write is less than hard sector size of ssd, round up offset in open 778c2ecf20Sopenharmony_ci * bucket to the next whole sector 788c2ecf20Sopenharmony_ci * 798c2ecf20Sopenharmony_ci * Superblock needs to be fleshed out for multiple cache devices 808c2ecf20Sopenharmony_ci * 818c2ecf20Sopenharmony_ci * Add a sysfs tunable for the number of writeback IOs in flight 828c2ecf20Sopenharmony_ci * 838c2ecf20Sopenharmony_ci * Add a sysfs tunable for the number of open data buckets 848c2ecf20Sopenharmony_ci * 858c2ecf20Sopenharmony_ci * IO tracking: Can we track when one process is doing io on behalf of another? 868c2ecf20Sopenharmony_ci * IO tracking: Don't use just an average, weigh more recent stuff higher 878c2ecf20Sopenharmony_ci * 888c2ecf20Sopenharmony_ci * Test module load/unload 898c2ecf20Sopenharmony_ci */ 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_ci#define MAX_NEED_GC 64 928c2ecf20Sopenharmony_ci#define MAX_SAVE_PRIO 72 938c2ecf20Sopenharmony_ci#define MAX_GC_TIMES 100 948c2ecf20Sopenharmony_ci#define MIN_GC_NODES 100 958c2ecf20Sopenharmony_ci#define GC_SLEEP_MS 100 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_ci#define PTR_HASH(c, k) \ 1008c2ecf20Sopenharmony_ci (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_cistatic struct workqueue_struct *btree_io_wq; 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci#define insert_lock(s, b) ((b)->level <= (s)->lock) 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_cistatic inline struct bset *write_block(struct btree *b) 1088c2ecf20Sopenharmony_ci{ 1098c2ecf20Sopenharmony_ci return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); 1108c2ecf20Sopenharmony_ci} 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_cistatic void bch_btree_init_next(struct btree *b) 1138c2ecf20Sopenharmony_ci{ 1148c2ecf20Sopenharmony_ci /* If not a leaf node, always sort */ 1158c2ecf20Sopenharmony_ci if (b->level && b->keys.nsets) 1168c2ecf20Sopenharmony_ci bch_btree_sort(&b->keys, &b->c->sort); 1178c2ecf20Sopenharmony_ci else 1188c2ecf20Sopenharmony_ci bch_btree_sort_lazy(&b->keys, &b->c->sort); 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci if (b->written < btree_blocks(b)) 1218c2ecf20Sopenharmony_ci bch_bset_init_next(&b->keys, write_block(b), 1228c2ecf20Sopenharmony_ci bset_magic(&b->c->cache->sb)); 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci} 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci/* Btree key manipulation */ 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_civoid bkey_put(struct cache_set *c, struct bkey *k) 1298c2ecf20Sopenharmony_ci{ 1308c2ecf20Sopenharmony_ci unsigned int i; 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(k); i++) 1338c2ecf20Sopenharmony_ci if (ptr_available(c, k, i)) 1348c2ecf20Sopenharmony_ci atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 1358c2ecf20Sopenharmony_ci} 1368c2ecf20Sopenharmony_ci 1378c2ecf20Sopenharmony_ci/* Btree IO */ 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_cistatic uint64_t btree_csum_set(struct btree *b, struct bset *i) 1408c2ecf20Sopenharmony_ci{ 1418c2ecf20Sopenharmony_ci uint64_t crc = b->key.ptr[0]; 1428c2ecf20Sopenharmony_ci void *data = (void *) i + 8, *end = bset_bkey_last(i); 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci crc = bch_crc64_update(crc, data, end - data); 1458c2ecf20Sopenharmony_ci return crc ^ 0xffffffffffffffffULL; 1468c2ecf20Sopenharmony_ci} 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_civoid bch_btree_node_read_done(struct btree *b) 1498c2ecf20Sopenharmony_ci{ 1508c2ecf20Sopenharmony_ci const char *err = "bad btree header"; 1518c2ecf20Sopenharmony_ci struct bset *i = btree_bset_first(b); 1528c2ecf20Sopenharmony_ci struct btree_iter *iter; 1538c2ecf20Sopenharmony_ci 1548c2ecf20Sopenharmony_ci /* 1558c2ecf20Sopenharmony_ci * c->fill_iter can allocate an iterator with more memory space 1568c2ecf20Sopenharmony_ci * than static MAX_BSETS. 1578c2ecf20Sopenharmony_ci * See the comment arount cache_set->fill_iter. 1588c2ecf20Sopenharmony_ci */ 1598c2ecf20Sopenharmony_ci iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); 1608c2ecf20Sopenharmony_ci iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; 1618c2ecf20Sopenharmony_ci iter->used = 0; 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 1648c2ecf20Sopenharmony_ci iter->b = &b->keys; 1658c2ecf20Sopenharmony_ci#endif 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci if (!i->seq) 1688c2ecf20Sopenharmony_ci goto err; 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_ci for (; 1718c2ecf20Sopenharmony_ci b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; 1728c2ecf20Sopenharmony_ci i = write_block(b)) { 1738c2ecf20Sopenharmony_ci err = "unsupported bset version"; 1748c2ecf20Sopenharmony_ci if (i->version > BCACHE_BSET_VERSION) 1758c2ecf20Sopenharmony_ci goto err; 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_ci err = "bad btree header"; 1788c2ecf20Sopenharmony_ci if (b->written + set_blocks(i, block_bytes(b->c->cache)) > 1798c2ecf20Sopenharmony_ci btree_blocks(b)) 1808c2ecf20Sopenharmony_ci goto err; 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci err = "bad magic"; 1838c2ecf20Sopenharmony_ci if (i->magic != bset_magic(&b->c->cache->sb)) 1848c2ecf20Sopenharmony_ci goto err; 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_ci err = "bad checksum"; 1878c2ecf20Sopenharmony_ci switch (i->version) { 1888c2ecf20Sopenharmony_ci case 0: 1898c2ecf20Sopenharmony_ci if (i->csum != csum_set(i)) 1908c2ecf20Sopenharmony_ci goto err; 1918c2ecf20Sopenharmony_ci break; 1928c2ecf20Sopenharmony_ci case BCACHE_BSET_VERSION: 1938c2ecf20Sopenharmony_ci if (i->csum != btree_csum_set(b, i)) 1948c2ecf20Sopenharmony_ci goto err; 1958c2ecf20Sopenharmony_ci break; 1968c2ecf20Sopenharmony_ci } 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_ci err = "empty set"; 1998c2ecf20Sopenharmony_ci if (i != b->keys.set[0].data && !i->keys) 2008c2ecf20Sopenharmony_ci goto err; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci b->written += set_blocks(i, block_bytes(b->c->cache)); 2058c2ecf20Sopenharmony_ci } 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_ci err = "corrupted btree"; 2088c2ecf20Sopenharmony_ci for (i = write_block(b); 2098c2ecf20Sopenharmony_ci bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); 2108c2ecf20Sopenharmony_ci i = ((void *) i) + block_bytes(b->c->cache)) 2118c2ecf20Sopenharmony_ci if (i->seq == b->keys.set[0].data->seq) 2128c2ecf20Sopenharmony_ci goto err; 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci i = b->keys.set[0].data; 2178c2ecf20Sopenharmony_ci err = "short btree key"; 2188c2ecf20Sopenharmony_ci if (b->keys.set[0].size && 2198c2ecf20Sopenharmony_ci bkey_cmp(&b->key, &b->keys.set[0].end) < 0) 2208c2ecf20Sopenharmony_ci goto err; 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci if (b->written < btree_blocks(b)) 2238c2ecf20Sopenharmony_ci bch_bset_init_next(&b->keys, write_block(b), 2248c2ecf20Sopenharmony_ci bset_magic(&b->c->cache->sb)); 2258c2ecf20Sopenharmony_ciout: 2268c2ecf20Sopenharmony_ci mempool_free(iter, &b->c->fill_iter); 2278c2ecf20Sopenharmony_ci return; 2288c2ecf20Sopenharmony_cierr: 2298c2ecf20Sopenharmony_ci set_btree_node_io_error(b); 2308c2ecf20Sopenharmony_ci bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", 2318c2ecf20Sopenharmony_ci err, PTR_BUCKET_NR(b->c, &b->key, 0), 2328c2ecf20Sopenharmony_ci bset_block_offset(b, i), i->keys); 2338c2ecf20Sopenharmony_ci goto out; 2348c2ecf20Sopenharmony_ci} 2358c2ecf20Sopenharmony_ci 2368c2ecf20Sopenharmony_cistatic void btree_node_read_endio(struct bio *bio) 2378c2ecf20Sopenharmony_ci{ 2388c2ecf20Sopenharmony_ci struct closure *cl = bio->bi_private; 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci closure_put(cl); 2418c2ecf20Sopenharmony_ci} 2428c2ecf20Sopenharmony_ci 2438c2ecf20Sopenharmony_cistatic void bch_btree_node_read(struct btree *b) 2448c2ecf20Sopenharmony_ci{ 2458c2ecf20Sopenharmony_ci uint64_t start_time = local_clock(); 2468c2ecf20Sopenharmony_ci struct closure cl; 2478c2ecf20Sopenharmony_ci struct bio *bio; 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_ci trace_bcache_btree_read(b); 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci closure_init_stack(&cl); 2528c2ecf20Sopenharmony_ci 2538c2ecf20Sopenharmony_ci bio = bch_bbio_alloc(b->c); 2548c2ecf20Sopenharmony_ci bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; 2558c2ecf20Sopenharmony_ci bio->bi_end_io = btree_node_read_endio; 2568c2ecf20Sopenharmony_ci bio->bi_private = &cl; 2578c2ecf20Sopenharmony_ci bio->bi_opf = REQ_OP_READ | REQ_META; 2588c2ecf20Sopenharmony_ci 2598c2ecf20Sopenharmony_ci bch_bio_map(bio, b->keys.set[0].data); 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci bch_submit_bbio(bio, b->c, &b->key, 0); 2628c2ecf20Sopenharmony_ci closure_sync(&cl); 2638c2ecf20Sopenharmony_ci 2648c2ecf20Sopenharmony_ci if (bio->bi_status) 2658c2ecf20Sopenharmony_ci set_btree_node_io_error(b); 2668c2ecf20Sopenharmony_ci 2678c2ecf20Sopenharmony_ci bch_bbio_free(bio, b->c); 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci if (btree_node_io_error(b)) 2708c2ecf20Sopenharmony_ci goto err; 2718c2ecf20Sopenharmony_ci 2728c2ecf20Sopenharmony_ci bch_btree_node_read_done(b); 2738c2ecf20Sopenharmony_ci bch_time_stats_update(&b->c->btree_read_time, start_time); 2748c2ecf20Sopenharmony_ci 2758c2ecf20Sopenharmony_ci return; 2768c2ecf20Sopenharmony_cierr: 2778c2ecf20Sopenharmony_ci bch_cache_set_error(b->c, "io error reading bucket %zu", 2788c2ecf20Sopenharmony_ci PTR_BUCKET_NR(b->c, &b->key, 0)); 2798c2ecf20Sopenharmony_ci} 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_cistatic void btree_complete_write(struct btree *b, struct btree_write *w) 2828c2ecf20Sopenharmony_ci{ 2838c2ecf20Sopenharmony_ci if (w->prio_blocked && 2848c2ecf20Sopenharmony_ci !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 2858c2ecf20Sopenharmony_ci wake_up_allocators(b->c); 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci if (w->journal) { 2888c2ecf20Sopenharmony_ci atomic_dec_bug(w->journal); 2898c2ecf20Sopenharmony_ci __closure_wake_up(&b->c->journal.wait); 2908c2ecf20Sopenharmony_ci } 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_ci w->prio_blocked = 0; 2938c2ecf20Sopenharmony_ci w->journal = NULL; 2948c2ecf20Sopenharmony_ci} 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_cistatic void btree_node_write_unlock(struct closure *cl) 2978c2ecf20Sopenharmony_ci{ 2988c2ecf20Sopenharmony_ci struct btree *b = container_of(cl, struct btree, io); 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci up(&b->io_mutex); 3018c2ecf20Sopenharmony_ci} 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_cistatic void __btree_node_write_done(struct closure *cl) 3048c2ecf20Sopenharmony_ci{ 3058c2ecf20Sopenharmony_ci struct btree *b = container_of(cl, struct btree, io); 3068c2ecf20Sopenharmony_ci struct btree_write *w = btree_prev_write(b); 3078c2ecf20Sopenharmony_ci 3088c2ecf20Sopenharmony_ci bch_bbio_free(b->bio, b->c); 3098c2ecf20Sopenharmony_ci b->bio = NULL; 3108c2ecf20Sopenharmony_ci btree_complete_write(b, w); 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) 3138c2ecf20Sopenharmony_ci queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 3148c2ecf20Sopenharmony_ci 3158c2ecf20Sopenharmony_ci closure_return_with_destructor(cl, btree_node_write_unlock); 3168c2ecf20Sopenharmony_ci} 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_cistatic void btree_node_write_done(struct closure *cl) 3198c2ecf20Sopenharmony_ci{ 3208c2ecf20Sopenharmony_ci struct btree *b = container_of(cl, struct btree, io); 3218c2ecf20Sopenharmony_ci 3228c2ecf20Sopenharmony_ci bio_free_pages(b->bio); 3238c2ecf20Sopenharmony_ci __btree_node_write_done(cl); 3248c2ecf20Sopenharmony_ci} 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_cistatic void btree_node_write_endio(struct bio *bio) 3278c2ecf20Sopenharmony_ci{ 3288c2ecf20Sopenharmony_ci struct closure *cl = bio->bi_private; 3298c2ecf20Sopenharmony_ci struct btree *b = container_of(cl, struct btree, io); 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci if (bio->bi_status) 3328c2ecf20Sopenharmony_ci set_btree_node_io_error(b); 3338c2ecf20Sopenharmony_ci 3348c2ecf20Sopenharmony_ci bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); 3358c2ecf20Sopenharmony_ci closure_put(cl); 3368c2ecf20Sopenharmony_ci} 3378c2ecf20Sopenharmony_ci 3388c2ecf20Sopenharmony_cistatic void do_btree_node_write(struct btree *b) 3398c2ecf20Sopenharmony_ci{ 3408c2ecf20Sopenharmony_ci struct closure *cl = &b->io; 3418c2ecf20Sopenharmony_ci struct bset *i = btree_bset_last(b); 3428c2ecf20Sopenharmony_ci BKEY_PADDED(key) k; 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci i->version = BCACHE_BSET_VERSION; 3458c2ecf20Sopenharmony_ci i->csum = btree_csum_set(b, i); 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_ci BUG_ON(b->bio); 3488c2ecf20Sopenharmony_ci b->bio = bch_bbio_alloc(b->c); 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci b->bio->bi_end_io = btree_node_write_endio; 3518c2ecf20Sopenharmony_ci b->bio->bi_private = cl; 3528c2ecf20Sopenharmony_ci b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); 3538c2ecf20Sopenharmony_ci b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; 3548c2ecf20Sopenharmony_ci bch_bio_map(b->bio, i); 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_ci /* 3578c2ecf20Sopenharmony_ci * If we're appending to a leaf node, we don't technically need FUA - 3588c2ecf20Sopenharmony_ci * this write just needs to be persisted before the next journal write, 3598c2ecf20Sopenharmony_ci * which will be marked FLUSH|FUA. 3608c2ecf20Sopenharmony_ci * 3618c2ecf20Sopenharmony_ci * Similarly if we're writing a new btree root - the pointer is going to 3628c2ecf20Sopenharmony_ci * be in the next journal entry. 3638c2ecf20Sopenharmony_ci * 3648c2ecf20Sopenharmony_ci * But if we're writing a new btree node (that isn't a root) or 3658c2ecf20Sopenharmony_ci * appending to a non leaf btree node, we need either FUA or a flush 3668c2ecf20Sopenharmony_ci * when we write the parent with the new pointer. FUA is cheaper than a 3678c2ecf20Sopenharmony_ci * flush, and writes appending to leaf nodes aren't blocking anything so 3688c2ecf20Sopenharmony_ci * just make all btree node writes FUA to keep things sane. 3698c2ecf20Sopenharmony_ci */ 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci bkey_copy(&k.key, &b->key); 3728c2ecf20Sopenharmony_ci SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 3738c2ecf20Sopenharmony_ci bset_sector_offset(&b->keys, i)); 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 3768c2ecf20Sopenharmony_ci struct bio_vec *bv; 3778c2ecf20Sopenharmony_ci void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 3788c2ecf20Sopenharmony_ci struct bvec_iter_all iter_all; 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_ci bio_for_each_segment_all(bv, b->bio, iter_all) { 3818c2ecf20Sopenharmony_ci memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); 3828c2ecf20Sopenharmony_ci addr += PAGE_SIZE; 3838c2ecf20Sopenharmony_ci } 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci bch_submit_bbio(b->bio, b->c, &k.key, 0); 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci continue_at(cl, btree_node_write_done, NULL); 3888c2ecf20Sopenharmony_ci } else { 3898c2ecf20Sopenharmony_ci /* 3908c2ecf20Sopenharmony_ci * No problem for multipage bvec since the bio is 3918c2ecf20Sopenharmony_ci * just allocated 3928c2ecf20Sopenharmony_ci */ 3938c2ecf20Sopenharmony_ci b->bio->bi_vcnt = 0; 3948c2ecf20Sopenharmony_ci bch_bio_map(b->bio, i); 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_ci bch_submit_bbio(b->bio, b->c, &k.key, 0); 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci closure_sync(cl); 3998c2ecf20Sopenharmony_ci continue_at_nobarrier(cl, __btree_node_write_done, NULL); 4008c2ecf20Sopenharmony_ci } 4018c2ecf20Sopenharmony_ci} 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_civoid __bch_btree_node_write(struct btree *b, struct closure *parent) 4048c2ecf20Sopenharmony_ci{ 4058c2ecf20Sopenharmony_ci struct bset *i = btree_bset_last(b); 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci lockdep_assert_held(&b->write_lock); 4088c2ecf20Sopenharmony_ci 4098c2ecf20Sopenharmony_ci trace_bcache_btree_write(b); 4108c2ecf20Sopenharmony_ci 4118c2ecf20Sopenharmony_ci BUG_ON(current->bio_list); 4128c2ecf20Sopenharmony_ci BUG_ON(b->written >= btree_blocks(b)); 4138c2ecf20Sopenharmony_ci BUG_ON(b->written && !i->keys); 4148c2ecf20Sopenharmony_ci BUG_ON(btree_bset_first(b)->seq != i->seq); 4158c2ecf20Sopenharmony_ci bch_check_keys(&b->keys, "writing"); 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci cancel_delayed_work(&b->work); 4188c2ecf20Sopenharmony_ci 4198c2ecf20Sopenharmony_ci /* If caller isn't waiting for write, parent refcount is cache set */ 4208c2ecf20Sopenharmony_ci down(&b->io_mutex); 4218c2ecf20Sopenharmony_ci closure_init(&b->io, parent ?: &b->c->cl); 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_ci clear_bit(BTREE_NODE_dirty, &b->flags); 4248c2ecf20Sopenharmony_ci change_bit(BTREE_NODE_write_idx, &b->flags); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci do_btree_node_write(b); 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, 4298c2ecf20Sopenharmony_ci &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 4308c2ecf20Sopenharmony_ci 4318c2ecf20Sopenharmony_ci b->written += set_blocks(i, block_bytes(b->c->cache)); 4328c2ecf20Sopenharmony_ci} 4338c2ecf20Sopenharmony_ci 4348c2ecf20Sopenharmony_civoid bch_btree_node_write(struct btree *b, struct closure *parent) 4358c2ecf20Sopenharmony_ci{ 4368c2ecf20Sopenharmony_ci unsigned int nsets = b->keys.nsets; 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci lockdep_assert_held(&b->lock); 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci __bch_btree_node_write(b, parent); 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci /* 4438c2ecf20Sopenharmony_ci * do verify if there was more than one set initially (i.e. we did a 4448c2ecf20Sopenharmony_ci * sort) and we sorted down to a single set: 4458c2ecf20Sopenharmony_ci */ 4468c2ecf20Sopenharmony_ci if (nsets && !b->keys.nsets) 4478c2ecf20Sopenharmony_ci bch_btree_verify(b); 4488c2ecf20Sopenharmony_ci 4498c2ecf20Sopenharmony_ci bch_btree_init_next(b); 4508c2ecf20Sopenharmony_ci} 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_cistatic void bch_btree_node_write_sync(struct btree *b) 4538c2ecf20Sopenharmony_ci{ 4548c2ecf20Sopenharmony_ci struct closure cl; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci closure_init_stack(&cl); 4578c2ecf20Sopenharmony_ci 4588c2ecf20Sopenharmony_ci mutex_lock(&b->write_lock); 4598c2ecf20Sopenharmony_ci bch_btree_node_write(b, &cl); 4608c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 4618c2ecf20Sopenharmony_ci 4628c2ecf20Sopenharmony_ci closure_sync(&cl); 4638c2ecf20Sopenharmony_ci} 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_cistatic void btree_node_write_work(struct work_struct *w) 4668c2ecf20Sopenharmony_ci{ 4678c2ecf20Sopenharmony_ci struct btree *b = container_of(to_delayed_work(w), struct btree, work); 4688c2ecf20Sopenharmony_ci 4698c2ecf20Sopenharmony_ci mutex_lock(&b->write_lock); 4708c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) 4718c2ecf20Sopenharmony_ci __bch_btree_node_write(b, NULL); 4728c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 4738c2ecf20Sopenharmony_ci} 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_cistatic void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 4768c2ecf20Sopenharmony_ci{ 4778c2ecf20Sopenharmony_ci struct bset *i = btree_bset_last(b); 4788c2ecf20Sopenharmony_ci struct btree_write *w = btree_current_write(b); 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci lockdep_assert_held(&b->write_lock); 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci BUG_ON(!b->written); 4838c2ecf20Sopenharmony_ci BUG_ON(!i->keys); 4848c2ecf20Sopenharmony_ci 4858c2ecf20Sopenharmony_ci if (!btree_node_dirty(b)) 4868c2ecf20Sopenharmony_ci queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 4878c2ecf20Sopenharmony_ci 4888c2ecf20Sopenharmony_ci set_btree_node_dirty(b); 4898c2ecf20Sopenharmony_ci 4908c2ecf20Sopenharmony_ci /* 4918c2ecf20Sopenharmony_ci * w->journal is always the oldest journal pin of all bkeys 4928c2ecf20Sopenharmony_ci * in the leaf node, to make sure the oldest jset seq won't 4938c2ecf20Sopenharmony_ci * be increased before this btree node is flushed. 4948c2ecf20Sopenharmony_ci */ 4958c2ecf20Sopenharmony_ci if (journal_ref) { 4968c2ecf20Sopenharmony_ci if (w->journal && 4978c2ecf20Sopenharmony_ci journal_pin_cmp(b->c, w->journal, journal_ref)) { 4988c2ecf20Sopenharmony_ci atomic_dec_bug(w->journal); 4998c2ecf20Sopenharmony_ci w->journal = NULL; 5008c2ecf20Sopenharmony_ci } 5018c2ecf20Sopenharmony_ci 5028c2ecf20Sopenharmony_ci if (!w->journal) { 5038c2ecf20Sopenharmony_ci w->journal = journal_ref; 5048c2ecf20Sopenharmony_ci atomic_inc(w->journal); 5058c2ecf20Sopenharmony_ci } 5068c2ecf20Sopenharmony_ci } 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci /* Force write if set is too big */ 5098c2ecf20Sopenharmony_ci if (set_bytes(i) > PAGE_SIZE - 48 && 5108c2ecf20Sopenharmony_ci !current->bio_list) 5118c2ecf20Sopenharmony_ci bch_btree_node_write(b, NULL); 5128c2ecf20Sopenharmony_ci} 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci/* 5158c2ecf20Sopenharmony_ci * Btree in memory cache - allocation/freeing 5168c2ecf20Sopenharmony_ci * mca -> memory cache 5178c2ecf20Sopenharmony_ci */ 5188c2ecf20Sopenharmony_ci 5198c2ecf20Sopenharmony_ci#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ 5208c2ecf20Sopenharmony_ci ? c->root->level : 1) * 8 + 16) 5218c2ecf20Sopenharmony_ci#define mca_can_free(c) \ 5228c2ecf20Sopenharmony_ci max_t(int, 0, c->btree_cache_used - mca_reserve(c)) 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_cistatic void mca_data_free(struct btree *b) 5258c2ecf20Sopenharmony_ci{ 5268c2ecf20Sopenharmony_ci BUG_ON(b->io_mutex.count != 1); 5278c2ecf20Sopenharmony_ci 5288c2ecf20Sopenharmony_ci bch_btree_keys_free(&b->keys); 5298c2ecf20Sopenharmony_ci 5308c2ecf20Sopenharmony_ci b->c->btree_cache_used--; 5318c2ecf20Sopenharmony_ci list_move(&b->list, &b->c->btree_cache_freed); 5328c2ecf20Sopenharmony_ci} 5338c2ecf20Sopenharmony_ci 5348c2ecf20Sopenharmony_cistatic void mca_bucket_free(struct btree *b) 5358c2ecf20Sopenharmony_ci{ 5368c2ecf20Sopenharmony_ci BUG_ON(btree_node_dirty(b)); 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci b->key.ptr[0] = 0; 5398c2ecf20Sopenharmony_ci hlist_del_init_rcu(&b->hash); 5408c2ecf20Sopenharmony_ci list_move(&b->list, &b->c->btree_cache_freeable); 5418c2ecf20Sopenharmony_ci} 5428c2ecf20Sopenharmony_ci 5438c2ecf20Sopenharmony_cistatic unsigned int btree_order(struct bkey *k) 5448c2ecf20Sopenharmony_ci{ 5458c2ecf20Sopenharmony_ci return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 5468c2ecf20Sopenharmony_ci} 5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_cistatic void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 5498c2ecf20Sopenharmony_ci{ 5508c2ecf20Sopenharmony_ci if (!bch_btree_keys_alloc(&b->keys, 5518c2ecf20Sopenharmony_ci max_t(unsigned int, 5528c2ecf20Sopenharmony_ci ilog2(b->c->btree_pages), 5538c2ecf20Sopenharmony_ci btree_order(k)), 5548c2ecf20Sopenharmony_ci gfp)) { 5558c2ecf20Sopenharmony_ci b->c->btree_cache_used++; 5568c2ecf20Sopenharmony_ci list_move(&b->list, &b->c->btree_cache); 5578c2ecf20Sopenharmony_ci } else { 5588c2ecf20Sopenharmony_ci list_move(&b->list, &b->c->btree_cache_freed); 5598c2ecf20Sopenharmony_ci } 5608c2ecf20Sopenharmony_ci} 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_cistatic struct btree *mca_bucket_alloc(struct cache_set *c, 5638c2ecf20Sopenharmony_ci struct bkey *k, gfp_t gfp) 5648c2ecf20Sopenharmony_ci{ 5658c2ecf20Sopenharmony_ci /* 5668c2ecf20Sopenharmony_ci * kzalloc() is necessary here for initialization, 5678c2ecf20Sopenharmony_ci * see code comments in bch_btree_keys_init(). 5688c2ecf20Sopenharmony_ci */ 5698c2ecf20Sopenharmony_ci struct btree *b = kzalloc(sizeof(struct btree), gfp); 5708c2ecf20Sopenharmony_ci 5718c2ecf20Sopenharmony_ci if (!b) 5728c2ecf20Sopenharmony_ci return NULL; 5738c2ecf20Sopenharmony_ci 5748c2ecf20Sopenharmony_ci init_rwsem(&b->lock); 5758c2ecf20Sopenharmony_ci lockdep_set_novalidate_class(&b->lock); 5768c2ecf20Sopenharmony_ci mutex_init(&b->write_lock); 5778c2ecf20Sopenharmony_ci lockdep_set_novalidate_class(&b->write_lock); 5788c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&b->list); 5798c2ecf20Sopenharmony_ci INIT_DELAYED_WORK(&b->work, btree_node_write_work); 5808c2ecf20Sopenharmony_ci b->c = c; 5818c2ecf20Sopenharmony_ci sema_init(&b->io_mutex, 1); 5828c2ecf20Sopenharmony_ci 5838c2ecf20Sopenharmony_ci mca_data_alloc(b, k, gfp); 5848c2ecf20Sopenharmony_ci return b; 5858c2ecf20Sopenharmony_ci} 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_cistatic int mca_reap(struct btree *b, unsigned int min_order, bool flush) 5888c2ecf20Sopenharmony_ci{ 5898c2ecf20Sopenharmony_ci struct closure cl; 5908c2ecf20Sopenharmony_ci 5918c2ecf20Sopenharmony_ci closure_init_stack(&cl); 5928c2ecf20Sopenharmony_ci lockdep_assert_held(&b->c->bucket_lock); 5938c2ecf20Sopenharmony_ci 5948c2ecf20Sopenharmony_ci if (!down_write_trylock(&b->lock)) 5958c2ecf20Sopenharmony_ci return -ENOMEM; 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_ci BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); 5988c2ecf20Sopenharmony_ci 5998c2ecf20Sopenharmony_ci if (b->keys.page_order < min_order) 6008c2ecf20Sopenharmony_ci goto out_unlock; 6018c2ecf20Sopenharmony_ci 6028c2ecf20Sopenharmony_ci if (!flush) { 6038c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) 6048c2ecf20Sopenharmony_ci goto out_unlock; 6058c2ecf20Sopenharmony_ci 6068c2ecf20Sopenharmony_ci if (down_trylock(&b->io_mutex)) 6078c2ecf20Sopenharmony_ci goto out_unlock; 6088c2ecf20Sopenharmony_ci up(&b->io_mutex); 6098c2ecf20Sopenharmony_ci } 6108c2ecf20Sopenharmony_ci 6118c2ecf20Sopenharmony_ciretry: 6128c2ecf20Sopenharmony_ci /* 6138c2ecf20Sopenharmony_ci * BTREE_NODE_dirty might be cleared in btree_flush_btree() by 6148c2ecf20Sopenharmony_ci * __bch_btree_node_write(). To avoid an extra flush, acquire 6158c2ecf20Sopenharmony_ci * b->write_lock before checking BTREE_NODE_dirty bit. 6168c2ecf20Sopenharmony_ci */ 6178c2ecf20Sopenharmony_ci mutex_lock(&b->write_lock); 6188c2ecf20Sopenharmony_ci /* 6198c2ecf20Sopenharmony_ci * If this btree node is selected in btree_flush_write() by journal 6208c2ecf20Sopenharmony_ci * code, delay and retry until the node is flushed by journal code 6218c2ecf20Sopenharmony_ci * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). 6228c2ecf20Sopenharmony_ci */ 6238c2ecf20Sopenharmony_ci if (btree_node_journal_flush(b)) { 6248c2ecf20Sopenharmony_ci pr_debug("bnode %p is flushing by journal, retry\n", b); 6258c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 6268c2ecf20Sopenharmony_ci udelay(1); 6278c2ecf20Sopenharmony_ci goto retry; 6288c2ecf20Sopenharmony_ci } 6298c2ecf20Sopenharmony_ci 6308c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) 6318c2ecf20Sopenharmony_ci __bch_btree_node_write(b, &cl); 6328c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 6338c2ecf20Sopenharmony_ci 6348c2ecf20Sopenharmony_ci closure_sync(&cl); 6358c2ecf20Sopenharmony_ci 6368c2ecf20Sopenharmony_ci /* wait for any in flight btree write */ 6378c2ecf20Sopenharmony_ci down(&b->io_mutex); 6388c2ecf20Sopenharmony_ci up(&b->io_mutex); 6398c2ecf20Sopenharmony_ci 6408c2ecf20Sopenharmony_ci return 0; 6418c2ecf20Sopenharmony_ciout_unlock: 6428c2ecf20Sopenharmony_ci rw_unlock(true, b); 6438c2ecf20Sopenharmony_ci return -ENOMEM; 6448c2ecf20Sopenharmony_ci} 6458c2ecf20Sopenharmony_ci 6468c2ecf20Sopenharmony_cistatic unsigned long bch_mca_scan(struct shrinker *shrink, 6478c2ecf20Sopenharmony_ci struct shrink_control *sc) 6488c2ecf20Sopenharmony_ci{ 6498c2ecf20Sopenharmony_ci struct cache_set *c = container_of(shrink, struct cache_set, shrink); 6508c2ecf20Sopenharmony_ci struct btree *b, *t; 6518c2ecf20Sopenharmony_ci unsigned long i, nr = sc->nr_to_scan; 6528c2ecf20Sopenharmony_ci unsigned long freed = 0; 6538c2ecf20Sopenharmony_ci unsigned int btree_cache_used; 6548c2ecf20Sopenharmony_ci 6558c2ecf20Sopenharmony_ci if (c->shrinker_disabled) 6568c2ecf20Sopenharmony_ci return SHRINK_STOP; 6578c2ecf20Sopenharmony_ci 6588c2ecf20Sopenharmony_ci if (c->btree_cache_alloc_lock) 6598c2ecf20Sopenharmony_ci return SHRINK_STOP; 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci /* Return -1 if we can't do anything right now */ 6628c2ecf20Sopenharmony_ci if (sc->gfp_mask & __GFP_IO) 6638c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 6648c2ecf20Sopenharmony_ci else if (!mutex_trylock(&c->bucket_lock)) 6658c2ecf20Sopenharmony_ci return -1; 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_ci /* 6688c2ecf20Sopenharmony_ci * It's _really_ critical that we don't free too many btree nodes - we 6698c2ecf20Sopenharmony_ci * have to always leave ourselves a reserve. The reserve is how we 6708c2ecf20Sopenharmony_ci * guarantee that allocating memory for a new btree node can always 6718c2ecf20Sopenharmony_ci * succeed, so that inserting keys into the btree can always succeed and 6728c2ecf20Sopenharmony_ci * IO can always make forward progress: 6738c2ecf20Sopenharmony_ci */ 6748c2ecf20Sopenharmony_ci nr /= c->btree_pages; 6758c2ecf20Sopenharmony_ci if (nr == 0) 6768c2ecf20Sopenharmony_ci nr = 1; 6778c2ecf20Sopenharmony_ci nr = min_t(unsigned long, nr, mca_can_free(c)); 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_ci i = 0; 6808c2ecf20Sopenharmony_ci btree_cache_used = c->btree_cache_used; 6818c2ecf20Sopenharmony_ci list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { 6828c2ecf20Sopenharmony_ci if (nr <= 0) 6838c2ecf20Sopenharmony_ci goto out; 6848c2ecf20Sopenharmony_ci 6858c2ecf20Sopenharmony_ci if (!mca_reap(b, 0, false)) { 6868c2ecf20Sopenharmony_ci mca_data_free(b); 6878c2ecf20Sopenharmony_ci rw_unlock(true, b); 6888c2ecf20Sopenharmony_ci freed++; 6898c2ecf20Sopenharmony_ci } 6908c2ecf20Sopenharmony_ci nr--; 6918c2ecf20Sopenharmony_ci i++; 6928c2ecf20Sopenharmony_ci } 6938c2ecf20Sopenharmony_ci 6948c2ecf20Sopenharmony_ci list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { 6958c2ecf20Sopenharmony_ci if (nr <= 0 || i >= btree_cache_used) 6968c2ecf20Sopenharmony_ci goto out; 6978c2ecf20Sopenharmony_ci 6988c2ecf20Sopenharmony_ci if (!mca_reap(b, 0, false)) { 6998c2ecf20Sopenharmony_ci mca_bucket_free(b); 7008c2ecf20Sopenharmony_ci mca_data_free(b); 7018c2ecf20Sopenharmony_ci rw_unlock(true, b); 7028c2ecf20Sopenharmony_ci freed++; 7038c2ecf20Sopenharmony_ci } 7048c2ecf20Sopenharmony_ci 7058c2ecf20Sopenharmony_ci nr--; 7068c2ecf20Sopenharmony_ci i++; 7078c2ecf20Sopenharmony_ci } 7088c2ecf20Sopenharmony_ciout: 7098c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 7108c2ecf20Sopenharmony_ci return freed * c->btree_pages; 7118c2ecf20Sopenharmony_ci} 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_cistatic unsigned long bch_mca_count(struct shrinker *shrink, 7148c2ecf20Sopenharmony_ci struct shrink_control *sc) 7158c2ecf20Sopenharmony_ci{ 7168c2ecf20Sopenharmony_ci struct cache_set *c = container_of(shrink, struct cache_set, shrink); 7178c2ecf20Sopenharmony_ci 7188c2ecf20Sopenharmony_ci if (c->shrinker_disabled) 7198c2ecf20Sopenharmony_ci return 0; 7208c2ecf20Sopenharmony_ci 7218c2ecf20Sopenharmony_ci if (c->btree_cache_alloc_lock) 7228c2ecf20Sopenharmony_ci return 0; 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_ci return mca_can_free(c) * c->btree_pages; 7258c2ecf20Sopenharmony_ci} 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_civoid bch_btree_cache_free(struct cache_set *c) 7288c2ecf20Sopenharmony_ci{ 7298c2ecf20Sopenharmony_ci struct btree *b; 7308c2ecf20Sopenharmony_ci struct closure cl; 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci closure_init_stack(&cl); 7338c2ecf20Sopenharmony_ci 7348c2ecf20Sopenharmony_ci if (c->shrink.list.next) 7358c2ecf20Sopenharmony_ci unregister_shrinker(&c->shrink); 7368c2ecf20Sopenharmony_ci 7378c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 7388c2ecf20Sopenharmony_ci 7398c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 7408c2ecf20Sopenharmony_ci if (c->verify_data) 7418c2ecf20Sopenharmony_ci list_move(&c->verify_data->list, &c->btree_cache); 7428c2ecf20Sopenharmony_ci 7438c2ecf20Sopenharmony_ci free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); 7448c2ecf20Sopenharmony_ci#endif 7458c2ecf20Sopenharmony_ci 7468c2ecf20Sopenharmony_ci list_splice(&c->btree_cache_freeable, 7478c2ecf20Sopenharmony_ci &c->btree_cache); 7488c2ecf20Sopenharmony_ci 7498c2ecf20Sopenharmony_ci while (!list_empty(&c->btree_cache)) { 7508c2ecf20Sopenharmony_ci b = list_first_entry(&c->btree_cache, struct btree, list); 7518c2ecf20Sopenharmony_ci 7528c2ecf20Sopenharmony_ci /* 7538c2ecf20Sopenharmony_ci * This function is called by cache_set_free(), no I/O 7548c2ecf20Sopenharmony_ci * request on cache now, it is unnecessary to acquire 7558c2ecf20Sopenharmony_ci * b->write_lock before clearing BTREE_NODE_dirty anymore. 7568c2ecf20Sopenharmony_ci */ 7578c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) { 7588c2ecf20Sopenharmony_ci btree_complete_write(b, btree_current_write(b)); 7598c2ecf20Sopenharmony_ci clear_bit(BTREE_NODE_dirty, &b->flags); 7608c2ecf20Sopenharmony_ci } 7618c2ecf20Sopenharmony_ci mca_data_free(b); 7628c2ecf20Sopenharmony_ci } 7638c2ecf20Sopenharmony_ci 7648c2ecf20Sopenharmony_ci while (!list_empty(&c->btree_cache_freed)) { 7658c2ecf20Sopenharmony_ci b = list_first_entry(&c->btree_cache_freed, 7668c2ecf20Sopenharmony_ci struct btree, list); 7678c2ecf20Sopenharmony_ci list_del(&b->list); 7688c2ecf20Sopenharmony_ci cancel_delayed_work_sync(&b->work); 7698c2ecf20Sopenharmony_ci kfree(b); 7708c2ecf20Sopenharmony_ci } 7718c2ecf20Sopenharmony_ci 7728c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 7738c2ecf20Sopenharmony_ci} 7748c2ecf20Sopenharmony_ci 7758c2ecf20Sopenharmony_ciint bch_btree_cache_alloc(struct cache_set *c) 7768c2ecf20Sopenharmony_ci{ 7778c2ecf20Sopenharmony_ci unsigned int i; 7788c2ecf20Sopenharmony_ci 7798c2ecf20Sopenharmony_ci for (i = 0; i < mca_reserve(c); i++) 7808c2ecf20Sopenharmony_ci if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) 7818c2ecf20Sopenharmony_ci return -ENOMEM; 7828c2ecf20Sopenharmony_ci 7838c2ecf20Sopenharmony_ci list_splice_init(&c->btree_cache, 7848c2ecf20Sopenharmony_ci &c->btree_cache_freeable); 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 7878c2ecf20Sopenharmony_ci mutex_init(&c->verify_lock); 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_ci c->verify_ondisk = (void *) 7908c2ecf20Sopenharmony_ci __get_free_pages(GFP_KERNEL|__GFP_COMP, 7918c2ecf20Sopenharmony_ci ilog2(meta_bucket_pages(&c->cache->sb))); 7928c2ecf20Sopenharmony_ci if (!c->verify_ondisk) { 7938c2ecf20Sopenharmony_ci /* 7948c2ecf20Sopenharmony_ci * Don't worry about the mca_rereserve buckets 7958c2ecf20Sopenharmony_ci * allocated in previous for-loop, they will be 7968c2ecf20Sopenharmony_ci * handled properly in bch_cache_set_unregister(). 7978c2ecf20Sopenharmony_ci */ 7988c2ecf20Sopenharmony_ci return -ENOMEM; 7998c2ecf20Sopenharmony_ci } 8008c2ecf20Sopenharmony_ci 8018c2ecf20Sopenharmony_ci c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 8028c2ecf20Sopenharmony_ci 8038c2ecf20Sopenharmony_ci if (c->verify_data && 8048c2ecf20Sopenharmony_ci c->verify_data->keys.set->data) 8058c2ecf20Sopenharmony_ci list_del_init(&c->verify_data->list); 8068c2ecf20Sopenharmony_ci else 8078c2ecf20Sopenharmony_ci c->verify_data = NULL; 8088c2ecf20Sopenharmony_ci#endif 8098c2ecf20Sopenharmony_ci 8108c2ecf20Sopenharmony_ci c->shrink.count_objects = bch_mca_count; 8118c2ecf20Sopenharmony_ci c->shrink.scan_objects = bch_mca_scan; 8128c2ecf20Sopenharmony_ci c->shrink.seeks = 4; 8138c2ecf20Sopenharmony_ci c->shrink.batch = c->btree_pages * 2; 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci if (register_shrinker(&c->shrink)) 8168c2ecf20Sopenharmony_ci pr_warn("bcache: %s: could not register shrinker\n", 8178c2ecf20Sopenharmony_ci __func__); 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ci return 0; 8208c2ecf20Sopenharmony_ci} 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci/* Btree in memory cache - hash table */ 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_cistatic struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 8258c2ecf20Sopenharmony_ci{ 8268c2ecf20Sopenharmony_ci return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 8278c2ecf20Sopenharmony_ci} 8288c2ecf20Sopenharmony_ci 8298c2ecf20Sopenharmony_cistatic struct btree *mca_find(struct cache_set *c, struct bkey *k) 8308c2ecf20Sopenharmony_ci{ 8318c2ecf20Sopenharmony_ci struct btree *b; 8328c2ecf20Sopenharmony_ci 8338c2ecf20Sopenharmony_ci rcu_read_lock(); 8348c2ecf20Sopenharmony_ci hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 8358c2ecf20Sopenharmony_ci if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 8368c2ecf20Sopenharmony_ci goto out; 8378c2ecf20Sopenharmony_ci b = NULL; 8388c2ecf20Sopenharmony_ciout: 8398c2ecf20Sopenharmony_ci rcu_read_unlock(); 8408c2ecf20Sopenharmony_ci return b; 8418c2ecf20Sopenharmony_ci} 8428c2ecf20Sopenharmony_ci 8438c2ecf20Sopenharmony_cistatic int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) 8448c2ecf20Sopenharmony_ci{ 8458c2ecf20Sopenharmony_ci spin_lock(&c->btree_cannibalize_lock); 8468c2ecf20Sopenharmony_ci if (likely(c->btree_cache_alloc_lock == NULL)) { 8478c2ecf20Sopenharmony_ci c->btree_cache_alloc_lock = current; 8488c2ecf20Sopenharmony_ci } else if (c->btree_cache_alloc_lock != current) { 8498c2ecf20Sopenharmony_ci if (op) 8508c2ecf20Sopenharmony_ci prepare_to_wait(&c->btree_cache_wait, &op->wait, 8518c2ecf20Sopenharmony_ci TASK_UNINTERRUPTIBLE); 8528c2ecf20Sopenharmony_ci spin_unlock(&c->btree_cannibalize_lock); 8538c2ecf20Sopenharmony_ci return -EINTR; 8548c2ecf20Sopenharmony_ci } 8558c2ecf20Sopenharmony_ci spin_unlock(&c->btree_cannibalize_lock); 8568c2ecf20Sopenharmony_ci 8578c2ecf20Sopenharmony_ci return 0; 8588c2ecf20Sopenharmony_ci} 8598c2ecf20Sopenharmony_ci 8608c2ecf20Sopenharmony_cistatic struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, 8618c2ecf20Sopenharmony_ci struct bkey *k) 8628c2ecf20Sopenharmony_ci{ 8638c2ecf20Sopenharmony_ci struct btree *b; 8648c2ecf20Sopenharmony_ci 8658c2ecf20Sopenharmony_ci trace_bcache_btree_cache_cannibalize(c); 8668c2ecf20Sopenharmony_ci 8678c2ecf20Sopenharmony_ci if (mca_cannibalize_lock(c, op)) 8688c2ecf20Sopenharmony_ci return ERR_PTR(-EINTR); 8698c2ecf20Sopenharmony_ci 8708c2ecf20Sopenharmony_ci list_for_each_entry_reverse(b, &c->btree_cache, list) 8718c2ecf20Sopenharmony_ci if (!mca_reap(b, btree_order(k), false)) 8728c2ecf20Sopenharmony_ci return b; 8738c2ecf20Sopenharmony_ci 8748c2ecf20Sopenharmony_ci list_for_each_entry_reverse(b, &c->btree_cache, list) 8758c2ecf20Sopenharmony_ci if (!mca_reap(b, btree_order(k), true)) 8768c2ecf20Sopenharmony_ci return b; 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci WARN(1, "btree cache cannibalize failed\n"); 8798c2ecf20Sopenharmony_ci return ERR_PTR(-ENOMEM); 8808c2ecf20Sopenharmony_ci} 8818c2ecf20Sopenharmony_ci 8828c2ecf20Sopenharmony_ci/* 8838c2ecf20Sopenharmony_ci * We can only have one thread cannibalizing other cached btree nodes at a time, 8848c2ecf20Sopenharmony_ci * or we'll deadlock. We use an open coded mutex to ensure that, which a 8858c2ecf20Sopenharmony_ci * cannibalize_bucket() will take. This means every time we unlock the root of 8868c2ecf20Sopenharmony_ci * the btree, we need to release this lock if we have it held. 8878c2ecf20Sopenharmony_ci */ 8888c2ecf20Sopenharmony_civoid bch_cannibalize_unlock(struct cache_set *c) 8898c2ecf20Sopenharmony_ci{ 8908c2ecf20Sopenharmony_ci spin_lock(&c->btree_cannibalize_lock); 8918c2ecf20Sopenharmony_ci if (c->btree_cache_alloc_lock == current) { 8928c2ecf20Sopenharmony_ci c->btree_cache_alloc_lock = NULL; 8938c2ecf20Sopenharmony_ci wake_up(&c->btree_cache_wait); 8948c2ecf20Sopenharmony_ci } 8958c2ecf20Sopenharmony_ci spin_unlock(&c->btree_cannibalize_lock); 8968c2ecf20Sopenharmony_ci} 8978c2ecf20Sopenharmony_ci 8988c2ecf20Sopenharmony_cistatic struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, 8998c2ecf20Sopenharmony_ci struct bkey *k, int level) 9008c2ecf20Sopenharmony_ci{ 9018c2ecf20Sopenharmony_ci struct btree *b; 9028c2ecf20Sopenharmony_ci 9038c2ecf20Sopenharmony_ci BUG_ON(current->bio_list); 9048c2ecf20Sopenharmony_ci 9058c2ecf20Sopenharmony_ci lockdep_assert_held(&c->bucket_lock); 9068c2ecf20Sopenharmony_ci 9078c2ecf20Sopenharmony_ci if (mca_find(c, k)) 9088c2ecf20Sopenharmony_ci return NULL; 9098c2ecf20Sopenharmony_ci 9108c2ecf20Sopenharmony_ci /* btree_free() doesn't free memory; it sticks the node on the end of 9118c2ecf20Sopenharmony_ci * the list. Check if there's any freed nodes there: 9128c2ecf20Sopenharmony_ci */ 9138c2ecf20Sopenharmony_ci list_for_each_entry(b, &c->btree_cache_freeable, list) 9148c2ecf20Sopenharmony_ci if (!mca_reap(b, btree_order(k), false)) 9158c2ecf20Sopenharmony_ci goto out; 9168c2ecf20Sopenharmony_ci 9178c2ecf20Sopenharmony_ci /* We never free struct btree itself, just the memory that holds the on 9188c2ecf20Sopenharmony_ci * disk node. Check the freed list before allocating a new one: 9198c2ecf20Sopenharmony_ci */ 9208c2ecf20Sopenharmony_ci list_for_each_entry(b, &c->btree_cache_freed, list) 9218c2ecf20Sopenharmony_ci if (!mca_reap(b, 0, false)) { 9228c2ecf20Sopenharmony_ci mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 9238c2ecf20Sopenharmony_ci if (!b->keys.set[0].data) 9248c2ecf20Sopenharmony_ci goto err; 9258c2ecf20Sopenharmony_ci else 9268c2ecf20Sopenharmony_ci goto out; 9278c2ecf20Sopenharmony_ci } 9288c2ecf20Sopenharmony_ci 9298c2ecf20Sopenharmony_ci b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 9308c2ecf20Sopenharmony_ci if (!b) 9318c2ecf20Sopenharmony_ci goto err; 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_ci BUG_ON(!down_write_trylock(&b->lock)); 9348c2ecf20Sopenharmony_ci if (!b->keys.set->data) 9358c2ecf20Sopenharmony_ci goto err; 9368c2ecf20Sopenharmony_ciout: 9378c2ecf20Sopenharmony_ci BUG_ON(b->io_mutex.count != 1); 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci bkey_copy(&b->key, k); 9408c2ecf20Sopenharmony_ci list_move(&b->list, &c->btree_cache); 9418c2ecf20Sopenharmony_ci hlist_del_init_rcu(&b->hash); 9428c2ecf20Sopenharmony_ci hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 9438c2ecf20Sopenharmony_ci 9448c2ecf20Sopenharmony_ci lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 9458c2ecf20Sopenharmony_ci b->parent = (void *) ~0UL; 9468c2ecf20Sopenharmony_ci b->flags = 0; 9478c2ecf20Sopenharmony_ci b->written = 0; 9488c2ecf20Sopenharmony_ci b->level = level; 9498c2ecf20Sopenharmony_ci 9508c2ecf20Sopenharmony_ci if (!b->level) 9518c2ecf20Sopenharmony_ci bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, 9528c2ecf20Sopenharmony_ci &b->c->expensive_debug_checks); 9538c2ecf20Sopenharmony_ci else 9548c2ecf20Sopenharmony_ci bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, 9558c2ecf20Sopenharmony_ci &b->c->expensive_debug_checks); 9568c2ecf20Sopenharmony_ci 9578c2ecf20Sopenharmony_ci return b; 9588c2ecf20Sopenharmony_cierr: 9598c2ecf20Sopenharmony_ci if (b) 9608c2ecf20Sopenharmony_ci rw_unlock(true, b); 9618c2ecf20Sopenharmony_ci 9628c2ecf20Sopenharmony_ci b = mca_cannibalize(c, op, k); 9638c2ecf20Sopenharmony_ci if (!IS_ERR(b)) 9648c2ecf20Sopenharmony_ci goto out; 9658c2ecf20Sopenharmony_ci 9668c2ecf20Sopenharmony_ci return b; 9678c2ecf20Sopenharmony_ci} 9688c2ecf20Sopenharmony_ci 9698c2ecf20Sopenharmony_ci/* 9708c2ecf20Sopenharmony_ci * bch_btree_node_get - find a btree node in the cache and lock it, reading it 9718c2ecf20Sopenharmony_ci * in from disk if necessary. 9728c2ecf20Sopenharmony_ci * 9738c2ecf20Sopenharmony_ci * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. 9748c2ecf20Sopenharmony_ci * 9758c2ecf20Sopenharmony_ci * The btree node will have either a read or a write lock held, depending on 9768c2ecf20Sopenharmony_ci * level and op->lock. 9778c2ecf20Sopenharmony_ci * 9788c2ecf20Sopenharmony_ci * Note: Only error code or btree pointer will be returned, it is unncessary 9798c2ecf20Sopenharmony_ci * for callers to check NULL pointer. 9808c2ecf20Sopenharmony_ci */ 9818c2ecf20Sopenharmony_cistruct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, 9828c2ecf20Sopenharmony_ci struct bkey *k, int level, bool write, 9838c2ecf20Sopenharmony_ci struct btree *parent) 9848c2ecf20Sopenharmony_ci{ 9858c2ecf20Sopenharmony_ci int i = 0; 9868c2ecf20Sopenharmony_ci struct btree *b; 9878c2ecf20Sopenharmony_ci 9888c2ecf20Sopenharmony_ci BUG_ON(level < 0); 9898c2ecf20Sopenharmony_ciretry: 9908c2ecf20Sopenharmony_ci b = mca_find(c, k); 9918c2ecf20Sopenharmony_ci 9928c2ecf20Sopenharmony_ci if (!b) { 9938c2ecf20Sopenharmony_ci if (current->bio_list) 9948c2ecf20Sopenharmony_ci return ERR_PTR(-EAGAIN); 9958c2ecf20Sopenharmony_ci 9968c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 9978c2ecf20Sopenharmony_ci b = mca_alloc(c, op, k, level); 9988c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 9998c2ecf20Sopenharmony_ci 10008c2ecf20Sopenharmony_ci if (!b) 10018c2ecf20Sopenharmony_ci goto retry; 10028c2ecf20Sopenharmony_ci if (IS_ERR(b)) 10038c2ecf20Sopenharmony_ci return b; 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci bch_btree_node_read(b); 10068c2ecf20Sopenharmony_ci 10078c2ecf20Sopenharmony_ci if (!write) 10088c2ecf20Sopenharmony_ci downgrade_write(&b->lock); 10098c2ecf20Sopenharmony_ci } else { 10108c2ecf20Sopenharmony_ci rw_lock(write, b, level); 10118c2ecf20Sopenharmony_ci if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 10128c2ecf20Sopenharmony_ci rw_unlock(write, b); 10138c2ecf20Sopenharmony_ci goto retry; 10148c2ecf20Sopenharmony_ci } 10158c2ecf20Sopenharmony_ci BUG_ON(b->level != level); 10168c2ecf20Sopenharmony_ci } 10178c2ecf20Sopenharmony_ci 10188c2ecf20Sopenharmony_ci if (btree_node_io_error(b)) { 10198c2ecf20Sopenharmony_ci rw_unlock(write, b); 10208c2ecf20Sopenharmony_ci return ERR_PTR(-EIO); 10218c2ecf20Sopenharmony_ci } 10228c2ecf20Sopenharmony_ci 10238c2ecf20Sopenharmony_ci BUG_ON(!b->written); 10248c2ecf20Sopenharmony_ci 10258c2ecf20Sopenharmony_ci b->parent = parent; 10268c2ecf20Sopenharmony_ci 10278c2ecf20Sopenharmony_ci for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { 10288c2ecf20Sopenharmony_ci prefetch(b->keys.set[i].tree); 10298c2ecf20Sopenharmony_ci prefetch(b->keys.set[i].data); 10308c2ecf20Sopenharmony_ci } 10318c2ecf20Sopenharmony_ci 10328c2ecf20Sopenharmony_ci for (; i <= b->keys.nsets; i++) 10338c2ecf20Sopenharmony_ci prefetch(b->keys.set[i].data); 10348c2ecf20Sopenharmony_ci 10358c2ecf20Sopenharmony_ci return b; 10368c2ecf20Sopenharmony_ci} 10378c2ecf20Sopenharmony_ci 10388c2ecf20Sopenharmony_cistatic void btree_node_prefetch(struct btree *parent, struct bkey *k) 10398c2ecf20Sopenharmony_ci{ 10408c2ecf20Sopenharmony_ci struct btree *b; 10418c2ecf20Sopenharmony_ci 10428c2ecf20Sopenharmony_ci mutex_lock(&parent->c->bucket_lock); 10438c2ecf20Sopenharmony_ci b = mca_alloc(parent->c, NULL, k, parent->level - 1); 10448c2ecf20Sopenharmony_ci mutex_unlock(&parent->c->bucket_lock); 10458c2ecf20Sopenharmony_ci 10468c2ecf20Sopenharmony_ci if (!IS_ERR_OR_NULL(b)) { 10478c2ecf20Sopenharmony_ci b->parent = parent; 10488c2ecf20Sopenharmony_ci bch_btree_node_read(b); 10498c2ecf20Sopenharmony_ci rw_unlock(true, b); 10508c2ecf20Sopenharmony_ci } 10518c2ecf20Sopenharmony_ci} 10528c2ecf20Sopenharmony_ci 10538c2ecf20Sopenharmony_ci/* Btree alloc */ 10548c2ecf20Sopenharmony_ci 10558c2ecf20Sopenharmony_cistatic void btree_node_free(struct btree *b) 10568c2ecf20Sopenharmony_ci{ 10578c2ecf20Sopenharmony_ci trace_bcache_btree_node_free(b); 10588c2ecf20Sopenharmony_ci 10598c2ecf20Sopenharmony_ci BUG_ON(b == b->c->root); 10608c2ecf20Sopenharmony_ci 10618c2ecf20Sopenharmony_ciretry: 10628c2ecf20Sopenharmony_ci mutex_lock(&b->write_lock); 10638c2ecf20Sopenharmony_ci /* 10648c2ecf20Sopenharmony_ci * If the btree node is selected and flushing in btree_flush_write(), 10658c2ecf20Sopenharmony_ci * delay and retry until the BTREE_NODE_journal_flush bit cleared, 10668c2ecf20Sopenharmony_ci * then it is safe to free the btree node here. Otherwise this btree 10678c2ecf20Sopenharmony_ci * node will be in race condition. 10688c2ecf20Sopenharmony_ci */ 10698c2ecf20Sopenharmony_ci if (btree_node_journal_flush(b)) { 10708c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 10718c2ecf20Sopenharmony_ci pr_debug("bnode %p journal_flush set, retry\n", b); 10728c2ecf20Sopenharmony_ci udelay(1); 10738c2ecf20Sopenharmony_ci goto retry; 10748c2ecf20Sopenharmony_ci } 10758c2ecf20Sopenharmony_ci 10768c2ecf20Sopenharmony_ci if (btree_node_dirty(b)) { 10778c2ecf20Sopenharmony_ci btree_complete_write(b, btree_current_write(b)); 10788c2ecf20Sopenharmony_ci clear_bit(BTREE_NODE_dirty, &b->flags); 10798c2ecf20Sopenharmony_ci } 10808c2ecf20Sopenharmony_ci 10818c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 10828c2ecf20Sopenharmony_ci 10838c2ecf20Sopenharmony_ci cancel_delayed_work(&b->work); 10848c2ecf20Sopenharmony_ci 10858c2ecf20Sopenharmony_ci mutex_lock(&b->c->bucket_lock); 10868c2ecf20Sopenharmony_ci bch_bucket_free(b->c, &b->key); 10878c2ecf20Sopenharmony_ci mca_bucket_free(b); 10888c2ecf20Sopenharmony_ci mutex_unlock(&b->c->bucket_lock); 10898c2ecf20Sopenharmony_ci} 10908c2ecf20Sopenharmony_ci 10918c2ecf20Sopenharmony_ci/* 10928c2ecf20Sopenharmony_ci * Only error code or btree pointer will be returned, it is unncessary for 10938c2ecf20Sopenharmony_ci * callers to check NULL pointer. 10948c2ecf20Sopenharmony_ci */ 10958c2ecf20Sopenharmony_cistruct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, 10968c2ecf20Sopenharmony_ci int level, bool wait, 10978c2ecf20Sopenharmony_ci struct btree *parent) 10988c2ecf20Sopenharmony_ci{ 10998c2ecf20Sopenharmony_ci BKEY_PADDED(key) k; 11008c2ecf20Sopenharmony_ci struct btree *b; 11018c2ecf20Sopenharmony_ci 11028c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 11038c2ecf20Sopenharmony_ciretry: 11048c2ecf20Sopenharmony_ci /* return ERR_PTR(-EAGAIN) when it fails */ 11058c2ecf20Sopenharmony_ci b = ERR_PTR(-EAGAIN); 11068c2ecf20Sopenharmony_ci if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) 11078c2ecf20Sopenharmony_ci goto err; 11088c2ecf20Sopenharmony_ci 11098c2ecf20Sopenharmony_ci bkey_put(c, &k.key); 11108c2ecf20Sopenharmony_ci SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 11118c2ecf20Sopenharmony_ci 11128c2ecf20Sopenharmony_ci b = mca_alloc(c, op, &k.key, level); 11138c2ecf20Sopenharmony_ci if (IS_ERR(b)) 11148c2ecf20Sopenharmony_ci goto err_free; 11158c2ecf20Sopenharmony_ci 11168c2ecf20Sopenharmony_ci if (!b) { 11178c2ecf20Sopenharmony_ci cache_bug(c, 11188c2ecf20Sopenharmony_ci "Tried to allocate bucket that was in btree cache"); 11198c2ecf20Sopenharmony_ci goto retry; 11208c2ecf20Sopenharmony_ci } 11218c2ecf20Sopenharmony_ci 11228c2ecf20Sopenharmony_ci b->parent = parent; 11238c2ecf20Sopenharmony_ci bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); 11248c2ecf20Sopenharmony_ci 11258c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 11268c2ecf20Sopenharmony_ci 11278c2ecf20Sopenharmony_ci trace_bcache_btree_node_alloc(b); 11288c2ecf20Sopenharmony_ci return b; 11298c2ecf20Sopenharmony_cierr_free: 11308c2ecf20Sopenharmony_ci bch_bucket_free(c, &k.key); 11318c2ecf20Sopenharmony_cierr: 11328c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 11338c2ecf20Sopenharmony_ci 11348c2ecf20Sopenharmony_ci trace_bcache_btree_node_alloc_fail(c); 11358c2ecf20Sopenharmony_ci return b; 11368c2ecf20Sopenharmony_ci} 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_cistatic struct btree *bch_btree_node_alloc(struct cache_set *c, 11398c2ecf20Sopenharmony_ci struct btree_op *op, int level, 11408c2ecf20Sopenharmony_ci struct btree *parent) 11418c2ecf20Sopenharmony_ci{ 11428c2ecf20Sopenharmony_ci return __bch_btree_node_alloc(c, op, level, op != NULL, parent); 11438c2ecf20Sopenharmony_ci} 11448c2ecf20Sopenharmony_ci 11458c2ecf20Sopenharmony_cistatic struct btree *btree_node_alloc_replacement(struct btree *b, 11468c2ecf20Sopenharmony_ci struct btree_op *op) 11478c2ecf20Sopenharmony_ci{ 11488c2ecf20Sopenharmony_ci struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); 11498c2ecf20Sopenharmony_ci 11508c2ecf20Sopenharmony_ci if (!IS_ERR(n)) { 11518c2ecf20Sopenharmony_ci mutex_lock(&n->write_lock); 11528c2ecf20Sopenharmony_ci bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); 11538c2ecf20Sopenharmony_ci bkey_copy_key(&n->key, &b->key); 11548c2ecf20Sopenharmony_ci mutex_unlock(&n->write_lock); 11558c2ecf20Sopenharmony_ci } 11568c2ecf20Sopenharmony_ci 11578c2ecf20Sopenharmony_ci return n; 11588c2ecf20Sopenharmony_ci} 11598c2ecf20Sopenharmony_ci 11608c2ecf20Sopenharmony_cistatic void make_btree_freeing_key(struct btree *b, struct bkey *k) 11618c2ecf20Sopenharmony_ci{ 11628c2ecf20Sopenharmony_ci unsigned int i; 11638c2ecf20Sopenharmony_ci 11648c2ecf20Sopenharmony_ci mutex_lock(&b->c->bucket_lock); 11658c2ecf20Sopenharmony_ci 11668c2ecf20Sopenharmony_ci atomic_inc(&b->c->prio_blocked); 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci bkey_copy(k, &b->key); 11698c2ecf20Sopenharmony_ci bkey_copy_key(k, &ZERO_KEY); 11708c2ecf20Sopenharmony_ci 11718c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(k); i++) 11728c2ecf20Sopenharmony_ci SET_PTR_GEN(k, i, 11738c2ecf20Sopenharmony_ci bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 11748c2ecf20Sopenharmony_ci PTR_BUCKET(b->c, &b->key, i))); 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci mutex_unlock(&b->c->bucket_lock); 11778c2ecf20Sopenharmony_ci} 11788c2ecf20Sopenharmony_ci 11798c2ecf20Sopenharmony_cistatic int btree_check_reserve(struct btree *b, struct btree_op *op) 11808c2ecf20Sopenharmony_ci{ 11818c2ecf20Sopenharmony_ci struct cache_set *c = b->c; 11828c2ecf20Sopenharmony_ci struct cache *ca = c->cache; 11838c2ecf20Sopenharmony_ci unsigned int reserve = (c->root->level - b->level) * 2 + 1; 11848c2ecf20Sopenharmony_ci 11858c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_ci if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { 11888c2ecf20Sopenharmony_ci if (op) 11898c2ecf20Sopenharmony_ci prepare_to_wait(&c->btree_cache_wait, &op->wait, 11908c2ecf20Sopenharmony_ci TASK_UNINTERRUPTIBLE); 11918c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 11928c2ecf20Sopenharmony_ci return -EINTR; 11938c2ecf20Sopenharmony_ci } 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci return mca_cannibalize_lock(b->c, op); 11988c2ecf20Sopenharmony_ci} 11998c2ecf20Sopenharmony_ci 12008c2ecf20Sopenharmony_ci/* Garbage collection */ 12018c2ecf20Sopenharmony_ci 12028c2ecf20Sopenharmony_cistatic uint8_t __bch_btree_mark_key(struct cache_set *c, int level, 12038c2ecf20Sopenharmony_ci struct bkey *k) 12048c2ecf20Sopenharmony_ci{ 12058c2ecf20Sopenharmony_ci uint8_t stale = 0; 12068c2ecf20Sopenharmony_ci unsigned int i; 12078c2ecf20Sopenharmony_ci struct bucket *g; 12088c2ecf20Sopenharmony_ci 12098c2ecf20Sopenharmony_ci /* 12108c2ecf20Sopenharmony_ci * ptr_invalid() can't return true for the keys that mark btree nodes as 12118c2ecf20Sopenharmony_ci * freed, but since ptr_bad() returns true we'll never actually use them 12128c2ecf20Sopenharmony_ci * for anything and thus we don't want mark their pointers here 12138c2ecf20Sopenharmony_ci */ 12148c2ecf20Sopenharmony_ci if (!bkey_cmp(k, &ZERO_KEY)) 12158c2ecf20Sopenharmony_ci return stale; 12168c2ecf20Sopenharmony_ci 12178c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(k); i++) { 12188c2ecf20Sopenharmony_ci if (!ptr_available(c, k, i)) 12198c2ecf20Sopenharmony_ci continue; 12208c2ecf20Sopenharmony_ci 12218c2ecf20Sopenharmony_ci g = PTR_BUCKET(c, k, i); 12228c2ecf20Sopenharmony_ci 12238c2ecf20Sopenharmony_ci if (gen_after(g->last_gc, PTR_GEN(k, i))) 12248c2ecf20Sopenharmony_ci g->last_gc = PTR_GEN(k, i); 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_ci if (ptr_stale(c, k, i)) { 12278c2ecf20Sopenharmony_ci stale = max(stale, ptr_stale(c, k, i)); 12288c2ecf20Sopenharmony_ci continue; 12298c2ecf20Sopenharmony_ci } 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci cache_bug_on(GC_MARK(g) && 12328c2ecf20Sopenharmony_ci (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 12338c2ecf20Sopenharmony_ci c, "inconsistent ptrs: mark = %llu, level = %i", 12348c2ecf20Sopenharmony_ci GC_MARK(g), level); 12358c2ecf20Sopenharmony_ci 12368c2ecf20Sopenharmony_ci if (level) 12378c2ecf20Sopenharmony_ci SET_GC_MARK(g, GC_MARK_METADATA); 12388c2ecf20Sopenharmony_ci else if (KEY_DIRTY(k)) 12398c2ecf20Sopenharmony_ci SET_GC_MARK(g, GC_MARK_DIRTY); 12408c2ecf20Sopenharmony_ci else if (!GC_MARK(g)) 12418c2ecf20Sopenharmony_ci SET_GC_MARK(g, GC_MARK_RECLAIMABLE); 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci /* guard against overflow */ 12448c2ecf20Sopenharmony_ci SET_GC_SECTORS_USED(g, min_t(unsigned int, 12458c2ecf20Sopenharmony_ci GC_SECTORS_USED(g) + KEY_SIZE(k), 12468c2ecf20Sopenharmony_ci MAX_GC_SECTORS_USED)); 12478c2ecf20Sopenharmony_ci 12488c2ecf20Sopenharmony_ci BUG_ON(!GC_SECTORS_USED(g)); 12498c2ecf20Sopenharmony_ci } 12508c2ecf20Sopenharmony_ci 12518c2ecf20Sopenharmony_ci return stale; 12528c2ecf20Sopenharmony_ci} 12538c2ecf20Sopenharmony_ci 12548c2ecf20Sopenharmony_ci#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 12558c2ecf20Sopenharmony_ci 12568c2ecf20Sopenharmony_civoid bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) 12578c2ecf20Sopenharmony_ci{ 12588c2ecf20Sopenharmony_ci unsigned int i; 12598c2ecf20Sopenharmony_ci 12608c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(k); i++) 12618c2ecf20Sopenharmony_ci if (ptr_available(c, k, i) && 12628c2ecf20Sopenharmony_ci !ptr_stale(c, k, i)) { 12638c2ecf20Sopenharmony_ci struct bucket *b = PTR_BUCKET(c, k, i); 12648c2ecf20Sopenharmony_ci 12658c2ecf20Sopenharmony_ci b->gen = PTR_GEN(k, i); 12668c2ecf20Sopenharmony_ci 12678c2ecf20Sopenharmony_ci if (level && bkey_cmp(k, &ZERO_KEY)) 12688c2ecf20Sopenharmony_ci b->prio = BTREE_PRIO; 12698c2ecf20Sopenharmony_ci else if (!level && b->prio == BTREE_PRIO) 12708c2ecf20Sopenharmony_ci b->prio = INITIAL_PRIO; 12718c2ecf20Sopenharmony_ci } 12728c2ecf20Sopenharmony_ci 12738c2ecf20Sopenharmony_ci __bch_btree_mark_key(c, level, k); 12748c2ecf20Sopenharmony_ci} 12758c2ecf20Sopenharmony_ci 12768c2ecf20Sopenharmony_civoid bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) 12778c2ecf20Sopenharmony_ci{ 12788c2ecf20Sopenharmony_ci stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; 12798c2ecf20Sopenharmony_ci} 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_cistatic bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 12828c2ecf20Sopenharmony_ci{ 12838c2ecf20Sopenharmony_ci uint8_t stale = 0; 12848c2ecf20Sopenharmony_ci unsigned int keys = 0, good_keys = 0; 12858c2ecf20Sopenharmony_ci struct bkey *k; 12868c2ecf20Sopenharmony_ci struct btree_iter iter; 12878c2ecf20Sopenharmony_ci struct bset_tree *t; 12888c2ecf20Sopenharmony_ci 12898c2ecf20Sopenharmony_ci gc->nodes++; 12908c2ecf20Sopenharmony_ci 12918c2ecf20Sopenharmony_ci for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 12928c2ecf20Sopenharmony_ci stale = max(stale, btree_mark_key(b, k)); 12938c2ecf20Sopenharmony_ci keys++; 12948c2ecf20Sopenharmony_ci 12958c2ecf20Sopenharmony_ci if (bch_ptr_bad(&b->keys, k)) 12968c2ecf20Sopenharmony_ci continue; 12978c2ecf20Sopenharmony_ci 12988c2ecf20Sopenharmony_ci gc->key_bytes += bkey_u64s(k); 12998c2ecf20Sopenharmony_ci gc->nkeys++; 13008c2ecf20Sopenharmony_ci good_keys++; 13018c2ecf20Sopenharmony_ci 13028c2ecf20Sopenharmony_ci gc->data += KEY_SIZE(k); 13038c2ecf20Sopenharmony_ci } 13048c2ecf20Sopenharmony_ci 13058c2ecf20Sopenharmony_ci for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) 13068c2ecf20Sopenharmony_ci btree_bug_on(t->size && 13078c2ecf20Sopenharmony_ci bset_written(&b->keys, t) && 13088c2ecf20Sopenharmony_ci bkey_cmp(&b->key, &t->end) < 0, 13098c2ecf20Sopenharmony_ci b, "found short btree key in gc"); 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci if (b->c->gc_always_rewrite) 13128c2ecf20Sopenharmony_ci return true; 13138c2ecf20Sopenharmony_ci 13148c2ecf20Sopenharmony_ci if (stale > 10) 13158c2ecf20Sopenharmony_ci return true; 13168c2ecf20Sopenharmony_ci 13178c2ecf20Sopenharmony_ci if ((keys - good_keys) * 2 > keys) 13188c2ecf20Sopenharmony_ci return true; 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci return false; 13218c2ecf20Sopenharmony_ci} 13228c2ecf20Sopenharmony_ci 13238c2ecf20Sopenharmony_ci#define GC_MERGE_NODES 4U 13248c2ecf20Sopenharmony_ci 13258c2ecf20Sopenharmony_cistruct gc_merge_info { 13268c2ecf20Sopenharmony_ci struct btree *b; 13278c2ecf20Sopenharmony_ci unsigned int keys; 13288c2ecf20Sopenharmony_ci}; 13298c2ecf20Sopenharmony_ci 13308c2ecf20Sopenharmony_cistatic int bch_btree_insert_node(struct btree *b, struct btree_op *op, 13318c2ecf20Sopenharmony_ci struct keylist *insert_keys, 13328c2ecf20Sopenharmony_ci atomic_t *journal_ref, 13338c2ecf20Sopenharmony_ci struct bkey *replace_key); 13348c2ecf20Sopenharmony_ci 13358c2ecf20Sopenharmony_cistatic int btree_gc_coalesce(struct btree *b, struct btree_op *op, 13368c2ecf20Sopenharmony_ci struct gc_stat *gc, struct gc_merge_info *r) 13378c2ecf20Sopenharmony_ci{ 13388c2ecf20Sopenharmony_ci unsigned int i, nodes = 0, keys = 0, blocks; 13398c2ecf20Sopenharmony_ci struct btree *new_nodes[GC_MERGE_NODES]; 13408c2ecf20Sopenharmony_ci struct keylist keylist; 13418c2ecf20Sopenharmony_ci struct closure cl; 13428c2ecf20Sopenharmony_ci struct bkey *k; 13438c2ecf20Sopenharmony_ci 13448c2ecf20Sopenharmony_ci bch_keylist_init(&keylist); 13458c2ecf20Sopenharmony_ci 13468c2ecf20Sopenharmony_ci if (btree_check_reserve(b, NULL)) 13478c2ecf20Sopenharmony_ci return 0; 13488c2ecf20Sopenharmony_ci 13498c2ecf20Sopenharmony_ci memset(new_nodes, 0, sizeof(new_nodes)); 13508c2ecf20Sopenharmony_ci closure_init_stack(&cl); 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 13538c2ecf20Sopenharmony_ci keys += r[nodes++].keys; 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_ci blocks = btree_default_blocks(b->c) * 2 / 3; 13568c2ecf20Sopenharmony_ci 13578c2ecf20Sopenharmony_ci if (nodes < 2 || 13588c2ecf20Sopenharmony_ci __set_blocks(b->keys.set[0].data, keys, 13598c2ecf20Sopenharmony_ci block_bytes(b->c->cache)) > blocks * (nodes - 1)) 13608c2ecf20Sopenharmony_ci return 0; 13618c2ecf20Sopenharmony_ci 13628c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) { 13638c2ecf20Sopenharmony_ci new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); 13648c2ecf20Sopenharmony_ci if (IS_ERR(new_nodes[i])) 13658c2ecf20Sopenharmony_ci goto out_nocoalesce; 13668c2ecf20Sopenharmony_ci } 13678c2ecf20Sopenharmony_ci 13688c2ecf20Sopenharmony_ci /* 13698c2ecf20Sopenharmony_ci * We have to check the reserve here, after we've allocated our new 13708c2ecf20Sopenharmony_ci * nodes, to make sure the insert below will succeed - we also check 13718c2ecf20Sopenharmony_ci * before as an optimization to potentially avoid a bunch of expensive 13728c2ecf20Sopenharmony_ci * allocs/sorts 13738c2ecf20Sopenharmony_ci */ 13748c2ecf20Sopenharmony_ci if (btree_check_reserve(b, NULL)) 13758c2ecf20Sopenharmony_ci goto out_nocoalesce; 13768c2ecf20Sopenharmony_ci 13778c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) 13788c2ecf20Sopenharmony_ci mutex_lock(&new_nodes[i]->write_lock); 13798c2ecf20Sopenharmony_ci 13808c2ecf20Sopenharmony_ci for (i = nodes - 1; i > 0; --i) { 13818c2ecf20Sopenharmony_ci struct bset *n1 = btree_bset_first(new_nodes[i]); 13828c2ecf20Sopenharmony_ci struct bset *n2 = btree_bset_first(new_nodes[i - 1]); 13838c2ecf20Sopenharmony_ci struct bkey *k, *last = NULL; 13848c2ecf20Sopenharmony_ci 13858c2ecf20Sopenharmony_ci keys = 0; 13868c2ecf20Sopenharmony_ci 13878c2ecf20Sopenharmony_ci if (i > 1) { 13888c2ecf20Sopenharmony_ci for (k = n2->start; 13898c2ecf20Sopenharmony_ci k < bset_bkey_last(n2); 13908c2ecf20Sopenharmony_ci k = bkey_next(k)) { 13918c2ecf20Sopenharmony_ci if (__set_blocks(n1, n1->keys + keys + 13928c2ecf20Sopenharmony_ci bkey_u64s(k), 13938c2ecf20Sopenharmony_ci block_bytes(b->c->cache)) > blocks) 13948c2ecf20Sopenharmony_ci break; 13958c2ecf20Sopenharmony_ci 13968c2ecf20Sopenharmony_ci last = k; 13978c2ecf20Sopenharmony_ci keys += bkey_u64s(k); 13988c2ecf20Sopenharmony_ci } 13998c2ecf20Sopenharmony_ci } else { 14008c2ecf20Sopenharmony_ci /* 14018c2ecf20Sopenharmony_ci * Last node we're not getting rid of - we're getting 14028c2ecf20Sopenharmony_ci * rid of the node at r[0]. Have to try and fit all of 14038c2ecf20Sopenharmony_ci * the remaining keys into this node; we can't ensure 14048c2ecf20Sopenharmony_ci * they will always fit due to rounding and variable 14058c2ecf20Sopenharmony_ci * length keys (shouldn't be possible in practice, 14068c2ecf20Sopenharmony_ci * though) 14078c2ecf20Sopenharmony_ci */ 14088c2ecf20Sopenharmony_ci if (__set_blocks(n1, n1->keys + n2->keys, 14098c2ecf20Sopenharmony_ci block_bytes(b->c->cache)) > 14108c2ecf20Sopenharmony_ci btree_blocks(new_nodes[i])) 14118c2ecf20Sopenharmony_ci goto out_unlock_nocoalesce; 14128c2ecf20Sopenharmony_ci 14138c2ecf20Sopenharmony_ci keys = n2->keys; 14148c2ecf20Sopenharmony_ci /* Take the key of the node we're getting rid of */ 14158c2ecf20Sopenharmony_ci last = &r->b->key; 14168c2ecf20Sopenharmony_ci } 14178c2ecf20Sopenharmony_ci 14188c2ecf20Sopenharmony_ci BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > 14198c2ecf20Sopenharmony_ci btree_blocks(new_nodes[i])); 14208c2ecf20Sopenharmony_ci 14218c2ecf20Sopenharmony_ci if (last) 14228c2ecf20Sopenharmony_ci bkey_copy_key(&new_nodes[i]->key, last); 14238c2ecf20Sopenharmony_ci 14248c2ecf20Sopenharmony_ci memcpy(bset_bkey_last(n1), 14258c2ecf20Sopenharmony_ci n2->start, 14268c2ecf20Sopenharmony_ci (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); 14278c2ecf20Sopenharmony_ci 14288c2ecf20Sopenharmony_ci n1->keys += keys; 14298c2ecf20Sopenharmony_ci r[i].keys = n1->keys; 14308c2ecf20Sopenharmony_ci 14318c2ecf20Sopenharmony_ci memmove(n2->start, 14328c2ecf20Sopenharmony_ci bset_bkey_idx(n2, keys), 14338c2ecf20Sopenharmony_ci (void *) bset_bkey_last(n2) - 14348c2ecf20Sopenharmony_ci (void *) bset_bkey_idx(n2, keys)); 14358c2ecf20Sopenharmony_ci 14368c2ecf20Sopenharmony_ci n2->keys -= keys; 14378c2ecf20Sopenharmony_ci 14388c2ecf20Sopenharmony_ci if (__bch_keylist_realloc(&keylist, 14398c2ecf20Sopenharmony_ci bkey_u64s(&new_nodes[i]->key))) 14408c2ecf20Sopenharmony_ci goto out_unlock_nocoalesce; 14418c2ecf20Sopenharmony_ci 14428c2ecf20Sopenharmony_ci bch_btree_node_write(new_nodes[i], &cl); 14438c2ecf20Sopenharmony_ci bch_keylist_add(&keylist, &new_nodes[i]->key); 14448c2ecf20Sopenharmony_ci } 14458c2ecf20Sopenharmony_ci 14468c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) 14478c2ecf20Sopenharmony_ci mutex_unlock(&new_nodes[i]->write_lock); 14488c2ecf20Sopenharmony_ci 14498c2ecf20Sopenharmony_ci closure_sync(&cl); 14508c2ecf20Sopenharmony_ci 14518c2ecf20Sopenharmony_ci /* We emptied out this node */ 14528c2ecf20Sopenharmony_ci BUG_ON(btree_bset_first(new_nodes[0])->keys); 14538c2ecf20Sopenharmony_ci btree_node_free(new_nodes[0]); 14548c2ecf20Sopenharmony_ci rw_unlock(true, new_nodes[0]); 14558c2ecf20Sopenharmony_ci new_nodes[0] = NULL; 14568c2ecf20Sopenharmony_ci 14578c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) { 14588c2ecf20Sopenharmony_ci if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) 14598c2ecf20Sopenharmony_ci goto out_nocoalesce; 14608c2ecf20Sopenharmony_ci 14618c2ecf20Sopenharmony_ci make_btree_freeing_key(r[i].b, keylist.top); 14628c2ecf20Sopenharmony_ci bch_keylist_push(&keylist); 14638c2ecf20Sopenharmony_ci } 14648c2ecf20Sopenharmony_ci 14658c2ecf20Sopenharmony_ci bch_btree_insert_node(b, op, &keylist, NULL, NULL); 14668c2ecf20Sopenharmony_ci BUG_ON(!bch_keylist_empty(&keylist)); 14678c2ecf20Sopenharmony_ci 14688c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) { 14698c2ecf20Sopenharmony_ci btree_node_free(r[i].b); 14708c2ecf20Sopenharmony_ci rw_unlock(true, r[i].b); 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci r[i].b = new_nodes[i]; 14738c2ecf20Sopenharmony_ci } 14748c2ecf20Sopenharmony_ci 14758c2ecf20Sopenharmony_ci memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); 14768c2ecf20Sopenharmony_ci r[nodes - 1].b = ERR_PTR(-EINTR); 14778c2ecf20Sopenharmony_ci 14788c2ecf20Sopenharmony_ci trace_bcache_btree_gc_coalesce(nodes); 14798c2ecf20Sopenharmony_ci gc->nodes--; 14808c2ecf20Sopenharmony_ci 14818c2ecf20Sopenharmony_ci bch_keylist_free(&keylist); 14828c2ecf20Sopenharmony_ci 14838c2ecf20Sopenharmony_ci /* Invalidated our iterator */ 14848c2ecf20Sopenharmony_ci return -EINTR; 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ciout_unlock_nocoalesce: 14878c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) 14888c2ecf20Sopenharmony_ci mutex_unlock(&new_nodes[i]->write_lock); 14898c2ecf20Sopenharmony_ci 14908c2ecf20Sopenharmony_ciout_nocoalesce: 14918c2ecf20Sopenharmony_ci closure_sync(&cl); 14928c2ecf20Sopenharmony_ci 14938c2ecf20Sopenharmony_ci while ((k = bch_keylist_pop(&keylist))) 14948c2ecf20Sopenharmony_ci if (!bkey_cmp(k, &ZERO_KEY)) 14958c2ecf20Sopenharmony_ci atomic_dec(&b->c->prio_blocked); 14968c2ecf20Sopenharmony_ci bch_keylist_free(&keylist); 14978c2ecf20Sopenharmony_ci 14988c2ecf20Sopenharmony_ci for (i = 0; i < nodes; i++) 14998c2ecf20Sopenharmony_ci if (!IS_ERR_OR_NULL(new_nodes[i])) { 15008c2ecf20Sopenharmony_ci btree_node_free(new_nodes[i]); 15018c2ecf20Sopenharmony_ci rw_unlock(true, new_nodes[i]); 15028c2ecf20Sopenharmony_ci } 15038c2ecf20Sopenharmony_ci return 0; 15048c2ecf20Sopenharmony_ci} 15058c2ecf20Sopenharmony_ci 15068c2ecf20Sopenharmony_cistatic int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, 15078c2ecf20Sopenharmony_ci struct btree *replace) 15088c2ecf20Sopenharmony_ci{ 15098c2ecf20Sopenharmony_ci struct keylist keys; 15108c2ecf20Sopenharmony_ci struct btree *n; 15118c2ecf20Sopenharmony_ci 15128c2ecf20Sopenharmony_ci if (btree_check_reserve(b, NULL)) 15138c2ecf20Sopenharmony_ci return 0; 15148c2ecf20Sopenharmony_ci 15158c2ecf20Sopenharmony_ci n = btree_node_alloc_replacement(replace, NULL); 15168c2ecf20Sopenharmony_ci if (IS_ERR(n)) 15178c2ecf20Sopenharmony_ci return 0; 15188c2ecf20Sopenharmony_ci 15198c2ecf20Sopenharmony_ci /* recheck reserve after allocating replacement node */ 15208c2ecf20Sopenharmony_ci if (btree_check_reserve(b, NULL)) { 15218c2ecf20Sopenharmony_ci btree_node_free(n); 15228c2ecf20Sopenharmony_ci rw_unlock(true, n); 15238c2ecf20Sopenharmony_ci return 0; 15248c2ecf20Sopenharmony_ci } 15258c2ecf20Sopenharmony_ci 15268c2ecf20Sopenharmony_ci bch_btree_node_write_sync(n); 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_ci bch_keylist_init(&keys); 15298c2ecf20Sopenharmony_ci bch_keylist_add(&keys, &n->key); 15308c2ecf20Sopenharmony_ci 15318c2ecf20Sopenharmony_ci make_btree_freeing_key(replace, keys.top); 15328c2ecf20Sopenharmony_ci bch_keylist_push(&keys); 15338c2ecf20Sopenharmony_ci 15348c2ecf20Sopenharmony_ci bch_btree_insert_node(b, op, &keys, NULL, NULL); 15358c2ecf20Sopenharmony_ci BUG_ON(!bch_keylist_empty(&keys)); 15368c2ecf20Sopenharmony_ci 15378c2ecf20Sopenharmony_ci btree_node_free(replace); 15388c2ecf20Sopenharmony_ci rw_unlock(true, n); 15398c2ecf20Sopenharmony_ci 15408c2ecf20Sopenharmony_ci /* Invalidated our iterator */ 15418c2ecf20Sopenharmony_ci return -EINTR; 15428c2ecf20Sopenharmony_ci} 15438c2ecf20Sopenharmony_ci 15448c2ecf20Sopenharmony_cistatic unsigned int btree_gc_count_keys(struct btree *b) 15458c2ecf20Sopenharmony_ci{ 15468c2ecf20Sopenharmony_ci struct bkey *k; 15478c2ecf20Sopenharmony_ci struct btree_iter iter; 15488c2ecf20Sopenharmony_ci unsigned int ret = 0; 15498c2ecf20Sopenharmony_ci 15508c2ecf20Sopenharmony_ci for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 15518c2ecf20Sopenharmony_ci ret += bkey_u64s(k); 15528c2ecf20Sopenharmony_ci 15538c2ecf20Sopenharmony_ci return ret; 15548c2ecf20Sopenharmony_ci} 15558c2ecf20Sopenharmony_ci 15568c2ecf20Sopenharmony_cistatic size_t btree_gc_min_nodes(struct cache_set *c) 15578c2ecf20Sopenharmony_ci{ 15588c2ecf20Sopenharmony_ci size_t min_nodes; 15598c2ecf20Sopenharmony_ci 15608c2ecf20Sopenharmony_ci /* 15618c2ecf20Sopenharmony_ci * Since incremental GC would stop 100ms when front 15628c2ecf20Sopenharmony_ci * side I/O comes, so when there are many btree nodes, 15638c2ecf20Sopenharmony_ci * if GC only processes constant (100) nodes each time, 15648c2ecf20Sopenharmony_ci * GC would last a long time, and the front side I/Os 15658c2ecf20Sopenharmony_ci * would run out of the buckets (since no new bucket 15668c2ecf20Sopenharmony_ci * can be allocated during GC), and be blocked again. 15678c2ecf20Sopenharmony_ci * So GC should not process constant nodes, but varied 15688c2ecf20Sopenharmony_ci * nodes according to the number of btree nodes, which 15698c2ecf20Sopenharmony_ci * realized by dividing GC into constant(100) times, 15708c2ecf20Sopenharmony_ci * so when there are many btree nodes, GC can process 15718c2ecf20Sopenharmony_ci * more nodes each time, otherwise, GC will process less 15728c2ecf20Sopenharmony_ci * nodes each time (but no less than MIN_GC_NODES) 15738c2ecf20Sopenharmony_ci */ 15748c2ecf20Sopenharmony_ci min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; 15758c2ecf20Sopenharmony_ci if (min_nodes < MIN_GC_NODES) 15768c2ecf20Sopenharmony_ci min_nodes = MIN_GC_NODES; 15778c2ecf20Sopenharmony_ci 15788c2ecf20Sopenharmony_ci return min_nodes; 15798c2ecf20Sopenharmony_ci} 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_ci 15828c2ecf20Sopenharmony_cistatic int btree_gc_recurse(struct btree *b, struct btree_op *op, 15838c2ecf20Sopenharmony_ci struct closure *writes, struct gc_stat *gc) 15848c2ecf20Sopenharmony_ci{ 15858c2ecf20Sopenharmony_ci int ret = 0; 15868c2ecf20Sopenharmony_ci bool should_rewrite; 15878c2ecf20Sopenharmony_ci struct bkey *k; 15888c2ecf20Sopenharmony_ci struct btree_iter iter; 15898c2ecf20Sopenharmony_ci struct gc_merge_info r[GC_MERGE_NODES]; 15908c2ecf20Sopenharmony_ci struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; 15918c2ecf20Sopenharmony_ci 15928c2ecf20Sopenharmony_ci bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 15938c2ecf20Sopenharmony_ci 15948c2ecf20Sopenharmony_ci for (i = r; i < r + ARRAY_SIZE(r); i++) 15958c2ecf20Sopenharmony_ci i->b = ERR_PTR(-EINTR); 15968c2ecf20Sopenharmony_ci 15978c2ecf20Sopenharmony_ci while (1) { 15988c2ecf20Sopenharmony_ci k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 15998c2ecf20Sopenharmony_ci if (k) { 16008c2ecf20Sopenharmony_ci r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 16018c2ecf20Sopenharmony_ci true, b); 16028c2ecf20Sopenharmony_ci if (IS_ERR(r->b)) { 16038c2ecf20Sopenharmony_ci ret = PTR_ERR(r->b); 16048c2ecf20Sopenharmony_ci break; 16058c2ecf20Sopenharmony_ci } 16068c2ecf20Sopenharmony_ci 16078c2ecf20Sopenharmony_ci r->keys = btree_gc_count_keys(r->b); 16088c2ecf20Sopenharmony_ci 16098c2ecf20Sopenharmony_ci ret = btree_gc_coalesce(b, op, gc, r); 16108c2ecf20Sopenharmony_ci if (ret) 16118c2ecf20Sopenharmony_ci break; 16128c2ecf20Sopenharmony_ci } 16138c2ecf20Sopenharmony_ci 16148c2ecf20Sopenharmony_ci if (!last->b) 16158c2ecf20Sopenharmony_ci break; 16168c2ecf20Sopenharmony_ci 16178c2ecf20Sopenharmony_ci if (!IS_ERR(last->b)) { 16188c2ecf20Sopenharmony_ci should_rewrite = btree_gc_mark_node(last->b, gc); 16198c2ecf20Sopenharmony_ci if (should_rewrite) { 16208c2ecf20Sopenharmony_ci ret = btree_gc_rewrite_node(b, op, last->b); 16218c2ecf20Sopenharmony_ci if (ret) 16228c2ecf20Sopenharmony_ci break; 16238c2ecf20Sopenharmony_ci } 16248c2ecf20Sopenharmony_ci 16258c2ecf20Sopenharmony_ci if (last->b->level) { 16268c2ecf20Sopenharmony_ci ret = btree_gc_recurse(last->b, op, writes, gc); 16278c2ecf20Sopenharmony_ci if (ret) 16288c2ecf20Sopenharmony_ci break; 16298c2ecf20Sopenharmony_ci } 16308c2ecf20Sopenharmony_ci 16318c2ecf20Sopenharmony_ci bkey_copy_key(&b->c->gc_done, &last->b->key); 16328c2ecf20Sopenharmony_ci 16338c2ecf20Sopenharmony_ci /* 16348c2ecf20Sopenharmony_ci * Must flush leaf nodes before gc ends, since replace 16358c2ecf20Sopenharmony_ci * operations aren't journalled 16368c2ecf20Sopenharmony_ci */ 16378c2ecf20Sopenharmony_ci mutex_lock(&last->b->write_lock); 16388c2ecf20Sopenharmony_ci if (btree_node_dirty(last->b)) 16398c2ecf20Sopenharmony_ci bch_btree_node_write(last->b, writes); 16408c2ecf20Sopenharmony_ci mutex_unlock(&last->b->write_lock); 16418c2ecf20Sopenharmony_ci rw_unlock(true, last->b); 16428c2ecf20Sopenharmony_ci } 16438c2ecf20Sopenharmony_ci 16448c2ecf20Sopenharmony_ci memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); 16458c2ecf20Sopenharmony_ci r->b = NULL; 16468c2ecf20Sopenharmony_ci 16478c2ecf20Sopenharmony_ci if (atomic_read(&b->c->search_inflight) && 16488c2ecf20Sopenharmony_ci gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { 16498c2ecf20Sopenharmony_ci gc->nodes_pre = gc->nodes; 16508c2ecf20Sopenharmony_ci ret = -EAGAIN; 16518c2ecf20Sopenharmony_ci break; 16528c2ecf20Sopenharmony_ci } 16538c2ecf20Sopenharmony_ci 16548c2ecf20Sopenharmony_ci if (need_resched()) { 16558c2ecf20Sopenharmony_ci ret = -EAGAIN; 16568c2ecf20Sopenharmony_ci break; 16578c2ecf20Sopenharmony_ci } 16588c2ecf20Sopenharmony_ci } 16598c2ecf20Sopenharmony_ci 16608c2ecf20Sopenharmony_ci for (i = r; i < r + ARRAY_SIZE(r); i++) 16618c2ecf20Sopenharmony_ci if (!IS_ERR_OR_NULL(i->b)) { 16628c2ecf20Sopenharmony_ci mutex_lock(&i->b->write_lock); 16638c2ecf20Sopenharmony_ci if (btree_node_dirty(i->b)) 16648c2ecf20Sopenharmony_ci bch_btree_node_write(i->b, writes); 16658c2ecf20Sopenharmony_ci mutex_unlock(&i->b->write_lock); 16668c2ecf20Sopenharmony_ci rw_unlock(true, i->b); 16678c2ecf20Sopenharmony_ci } 16688c2ecf20Sopenharmony_ci 16698c2ecf20Sopenharmony_ci return ret; 16708c2ecf20Sopenharmony_ci} 16718c2ecf20Sopenharmony_ci 16728c2ecf20Sopenharmony_cistatic int bch_btree_gc_root(struct btree *b, struct btree_op *op, 16738c2ecf20Sopenharmony_ci struct closure *writes, struct gc_stat *gc) 16748c2ecf20Sopenharmony_ci{ 16758c2ecf20Sopenharmony_ci struct btree *n = NULL; 16768c2ecf20Sopenharmony_ci int ret = 0; 16778c2ecf20Sopenharmony_ci bool should_rewrite; 16788c2ecf20Sopenharmony_ci 16798c2ecf20Sopenharmony_ci should_rewrite = btree_gc_mark_node(b, gc); 16808c2ecf20Sopenharmony_ci if (should_rewrite) { 16818c2ecf20Sopenharmony_ci n = btree_node_alloc_replacement(b, NULL); 16828c2ecf20Sopenharmony_ci 16838c2ecf20Sopenharmony_ci if (!IS_ERR(n)) { 16848c2ecf20Sopenharmony_ci bch_btree_node_write_sync(n); 16858c2ecf20Sopenharmony_ci 16868c2ecf20Sopenharmony_ci bch_btree_set_root(n); 16878c2ecf20Sopenharmony_ci btree_node_free(b); 16888c2ecf20Sopenharmony_ci rw_unlock(true, n); 16898c2ecf20Sopenharmony_ci 16908c2ecf20Sopenharmony_ci return -EINTR; 16918c2ecf20Sopenharmony_ci } 16928c2ecf20Sopenharmony_ci } 16938c2ecf20Sopenharmony_ci 16948c2ecf20Sopenharmony_ci __bch_btree_mark_key(b->c, b->level + 1, &b->key); 16958c2ecf20Sopenharmony_ci 16968c2ecf20Sopenharmony_ci if (b->level) { 16978c2ecf20Sopenharmony_ci ret = btree_gc_recurse(b, op, writes, gc); 16988c2ecf20Sopenharmony_ci if (ret) 16998c2ecf20Sopenharmony_ci return ret; 17008c2ecf20Sopenharmony_ci } 17018c2ecf20Sopenharmony_ci 17028c2ecf20Sopenharmony_ci bkey_copy_key(&b->c->gc_done, &b->key); 17038c2ecf20Sopenharmony_ci 17048c2ecf20Sopenharmony_ci return ret; 17058c2ecf20Sopenharmony_ci} 17068c2ecf20Sopenharmony_ci 17078c2ecf20Sopenharmony_cistatic void btree_gc_start(struct cache_set *c) 17088c2ecf20Sopenharmony_ci{ 17098c2ecf20Sopenharmony_ci struct cache *ca; 17108c2ecf20Sopenharmony_ci struct bucket *b; 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci if (!c->gc_mark_valid) 17138c2ecf20Sopenharmony_ci return; 17148c2ecf20Sopenharmony_ci 17158c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 17168c2ecf20Sopenharmony_ci 17178c2ecf20Sopenharmony_ci c->gc_mark_valid = 0; 17188c2ecf20Sopenharmony_ci c->gc_done = ZERO_KEY; 17198c2ecf20Sopenharmony_ci 17208c2ecf20Sopenharmony_ci ca = c->cache; 17218c2ecf20Sopenharmony_ci for_each_bucket(b, ca) { 17228c2ecf20Sopenharmony_ci b->last_gc = b->gen; 17238c2ecf20Sopenharmony_ci if (!atomic_read(&b->pin)) { 17248c2ecf20Sopenharmony_ci SET_GC_MARK(b, 0); 17258c2ecf20Sopenharmony_ci SET_GC_SECTORS_USED(b, 0); 17268c2ecf20Sopenharmony_ci } 17278c2ecf20Sopenharmony_ci } 17288c2ecf20Sopenharmony_ci 17298c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 17308c2ecf20Sopenharmony_ci} 17318c2ecf20Sopenharmony_ci 17328c2ecf20Sopenharmony_cistatic void bch_btree_gc_finish(struct cache_set *c) 17338c2ecf20Sopenharmony_ci{ 17348c2ecf20Sopenharmony_ci struct bucket *b; 17358c2ecf20Sopenharmony_ci struct cache *ca; 17368c2ecf20Sopenharmony_ci unsigned int i, j; 17378c2ecf20Sopenharmony_ci uint64_t *k; 17388c2ecf20Sopenharmony_ci 17398c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 17408c2ecf20Sopenharmony_ci 17418c2ecf20Sopenharmony_ci set_gc_sectors(c); 17428c2ecf20Sopenharmony_ci c->gc_mark_valid = 1; 17438c2ecf20Sopenharmony_ci c->need_gc = 0; 17448c2ecf20Sopenharmony_ci 17458c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 17468c2ecf20Sopenharmony_ci SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 17478c2ecf20Sopenharmony_ci GC_MARK_METADATA); 17488c2ecf20Sopenharmony_ci 17498c2ecf20Sopenharmony_ci /* don't reclaim buckets to which writeback keys point */ 17508c2ecf20Sopenharmony_ci rcu_read_lock(); 17518c2ecf20Sopenharmony_ci for (i = 0; i < c->devices_max_used; i++) { 17528c2ecf20Sopenharmony_ci struct bcache_device *d = c->devices[i]; 17538c2ecf20Sopenharmony_ci struct cached_dev *dc; 17548c2ecf20Sopenharmony_ci struct keybuf_key *w, *n; 17558c2ecf20Sopenharmony_ci 17568c2ecf20Sopenharmony_ci if (!d || UUID_FLASH_ONLY(&c->uuids[i])) 17578c2ecf20Sopenharmony_ci continue; 17588c2ecf20Sopenharmony_ci dc = container_of(d, struct cached_dev, disk); 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci spin_lock(&dc->writeback_keys.lock); 17618c2ecf20Sopenharmony_ci rbtree_postorder_for_each_entry_safe(w, n, 17628c2ecf20Sopenharmony_ci &dc->writeback_keys.keys, node) 17638c2ecf20Sopenharmony_ci for (j = 0; j < KEY_PTRS(&w->key); j++) 17648c2ecf20Sopenharmony_ci SET_GC_MARK(PTR_BUCKET(c, &w->key, j), 17658c2ecf20Sopenharmony_ci GC_MARK_DIRTY); 17668c2ecf20Sopenharmony_ci spin_unlock(&dc->writeback_keys.lock); 17678c2ecf20Sopenharmony_ci } 17688c2ecf20Sopenharmony_ci rcu_read_unlock(); 17698c2ecf20Sopenharmony_ci 17708c2ecf20Sopenharmony_ci c->avail_nbuckets = 0; 17718c2ecf20Sopenharmony_ci 17728c2ecf20Sopenharmony_ci ca = c->cache; 17738c2ecf20Sopenharmony_ci ca->invalidate_needs_gc = 0; 17748c2ecf20Sopenharmony_ci 17758c2ecf20Sopenharmony_ci for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) 17768c2ecf20Sopenharmony_ci SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); 17778c2ecf20Sopenharmony_ci 17788c2ecf20Sopenharmony_ci for (k = ca->prio_buckets; 17798c2ecf20Sopenharmony_ci k < ca->prio_buckets + prio_buckets(ca) * 2; k++) 17808c2ecf20Sopenharmony_ci SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); 17818c2ecf20Sopenharmony_ci 17828c2ecf20Sopenharmony_ci for_each_bucket(b, ca) { 17838c2ecf20Sopenharmony_ci c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 17848c2ecf20Sopenharmony_ci 17858c2ecf20Sopenharmony_ci if (atomic_read(&b->pin)) 17868c2ecf20Sopenharmony_ci continue; 17878c2ecf20Sopenharmony_ci 17888c2ecf20Sopenharmony_ci BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); 17898c2ecf20Sopenharmony_ci 17908c2ecf20Sopenharmony_ci if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) 17918c2ecf20Sopenharmony_ci c->avail_nbuckets++; 17928c2ecf20Sopenharmony_ci } 17938c2ecf20Sopenharmony_ci 17948c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 17958c2ecf20Sopenharmony_ci} 17968c2ecf20Sopenharmony_ci 17978c2ecf20Sopenharmony_cistatic void bch_btree_gc(struct cache_set *c) 17988c2ecf20Sopenharmony_ci{ 17998c2ecf20Sopenharmony_ci int ret; 18008c2ecf20Sopenharmony_ci struct gc_stat stats; 18018c2ecf20Sopenharmony_ci struct closure writes; 18028c2ecf20Sopenharmony_ci struct btree_op op; 18038c2ecf20Sopenharmony_ci uint64_t start_time = local_clock(); 18048c2ecf20Sopenharmony_ci 18058c2ecf20Sopenharmony_ci trace_bcache_gc_start(c); 18068c2ecf20Sopenharmony_ci 18078c2ecf20Sopenharmony_ci memset(&stats, 0, sizeof(struct gc_stat)); 18088c2ecf20Sopenharmony_ci closure_init_stack(&writes); 18098c2ecf20Sopenharmony_ci bch_btree_op_init(&op, SHRT_MAX); 18108c2ecf20Sopenharmony_ci 18118c2ecf20Sopenharmony_ci btree_gc_start(c); 18128c2ecf20Sopenharmony_ci 18138c2ecf20Sopenharmony_ci /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ 18148c2ecf20Sopenharmony_ci do { 18158c2ecf20Sopenharmony_ci ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); 18168c2ecf20Sopenharmony_ci closure_sync(&writes); 18178c2ecf20Sopenharmony_ci cond_resched(); 18188c2ecf20Sopenharmony_ci 18198c2ecf20Sopenharmony_ci if (ret == -EAGAIN) 18208c2ecf20Sopenharmony_ci schedule_timeout_interruptible(msecs_to_jiffies 18218c2ecf20Sopenharmony_ci (GC_SLEEP_MS)); 18228c2ecf20Sopenharmony_ci else if (ret) 18238c2ecf20Sopenharmony_ci pr_warn("gc failed!\n"); 18248c2ecf20Sopenharmony_ci } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); 18258c2ecf20Sopenharmony_ci 18268c2ecf20Sopenharmony_ci bch_btree_gc_finish(c); 18278c2ecf20Sopenharmony_ci wake_up_allocators(c); 18288c2ecf20Sopenharmony_ci 18298c2ecf20Sopenharmony_ci bch_time_stats_update(&c->btree_gc_time, start_time); 18308c2ecf20Sopenharmony_ci 18318c2ecf20Sopenharmony_ci stats.key_bytes *= sizeof(uint64_t); 18328c2ecf20Sopenharmony_ci stats.data <<= 9; 18338c2ecf20Sopenharmony_ci bch_update_bucket_in_use(c, &stats); 18348c2ecf20Sopenharmony_ci memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 18358c2ecf20Sopenharmony_ci 18368c2ecf20Sopenharmony_ci trace_bcache_gc_end(c); 18378c2ecf20Sopenharmony_ci 18388c2ecf20Sopenharmony_ci bch_moving_gc(c); 18398c2ecf20Sopenharmony_ci} 18408c2ecf20Sopenharmony_ci 18418c2ecf20Sopenharmony_cistatic bool gc_should_run(struct cache_set *c) 18428c2ecf20Sopenharmony_ci{ 18438c2ecf20Sopenharmony_ci struct cache *ca = c->cache; 18448c2ecf20Sopenharmony_ci 18458c2ecf20Sopenharmony_ci if (ca->invalidate_needs_gc) 18468c2ecf20Sopenharmony_ci return true; 18478c2ecf20Sopenharmony_ci 18488c2ecf20Sopenharmony_ci if (atomic_read(&c->sectors_to_gc) < 0) 18498c2ecf20Sopenharmony_ci return true; 18508c2ecf20Sopenharmony_ci 18518c2ecf20Sopenharmony_ci return false; 18528c2ecf20Sopenharmony_ci} 18538c2ecf20Sopenharmony_ci 18548c2ecf20Sopenharmony_cistatic int bch_gc_thread(void *arg) 18558c2ecf20Sopenharmony_ci{ 18568c2ecf20Sopenharmony_ci struct cache_set *c = arg; 18578c2ecf20Sopenharmony_ci 18588c2ecf20Sopenharmony_ci while (1) { 18598c2ecf20Sopenharmony_ci wait_event_interruptible(c->gc_wait, 18608c2ecf20Sopenharmony_ci kthread_should_stop() || 18618c2ecf20Sopenharmony_ci test_bit(CACHE_SET_IO_DISABLE, &c->flags) || 18628c2ecf20Sopenharmony_ci gc_should_run(c)); 18638c2ecf20Sopenharmony_ci 18648c2ecf20Sopenharmony_ci if (kthread_should_stop() || 18658c2ecf20Sopenharmony_ci test_bit(CACHE_SET_IO_DISABLE, &c->flags)) 18668c2ecf20Sopenharmony_ci break; 18678c2ecf20Sopenharmony_ci 18688c2ecf20Sopenharmony_ci set_gc_sectors(c); 18698c2ecf20Sopenharmony_ci bch_btree_gc(c); 18708c2ecf20Sopenharmony_ci } 18718c2ecf20Sopenharmony_ci 18728c2ecf20Sopenharmony_ci wait_for_kthread_stop(); 18738c2ecf20Sopenharmony_ci return 0; 18748c2ecf20Sopenharmony_ci} 18758c2ecf20Sopenharmony_ci 18768c2ecf20Sopenharmony_ciint bch_gc_thread_start(struct cache_set *c) 18778c2ecf20Sopenharmony_ci{ 18788c2ecf20Sopenharmony_ci c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); 18798c2ecf20Sopenharmony_ci return PTR_ERR_OR_ZERO(c->gc_thread); 18808c2ecf20Sopenharmony_ci} 18818c2ecf20Sopenharmony_ci 18828c2ecf20Sopenharmony_ci/* Initial partial gc */ 18838c2ecf20Sopenharmony_ci 18848c2ecf20Sopenharmony_cistatic int bch_btree_check_recurse(struct btree *b, struct btree_op *op) 18858c2ecf20Sopenharmony_ci{ 18868c2ecf20Sopenharmony_ci int ret = 0; 18878c2ecf20Sopenharmony_ci struct bkey *k, *p = NULL; 18888c2ecf20Sopenharmony_ci struct btree_iter iter; 18898c2ecf20Sopenharmony_ci 18908c2ecf20Sopenharmony_ci for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) 18918c2ecf20Sopenharmony_ci bch_initial_mark_key(b->c, b->level, k); 18928c2ecf20Sopenharmony_ci 18938c2ecf20Sopenharmony_ci bch_initial_mark_key(b->c, b->level + 1, &b->key); 18948c2ecf20Sopenharmony_ci 18958c2ecf20Sopenharmony_ci if (b->level) { 18968c2ecf20Sopenharmony_ci bch_btree_iter_init(&b->keys, &iter, NULL); 18978c2ecf20Sopenharmony_ci 18988c2ecf20Sopenharmony_ci do { 18998c2ecf20Sopenharmony_ci k = bch_btree_iter_next_filter(&iter, &b->keys, 19008c2ecf20Sopenharmony_ci bch_ptr_bad); 19018c2ecf20Sopenharmony_ci if (k) { 19028c2ecf20Sopenharmony_ci btree_node_prefetch(b, k); 19038c2ecf20Sopenharmony_ci /* 19048c2ecf20Sopenharmony_ci * initiallize c->gc_stats.nodes 19058c2ecf20Sopenharmony_ci * for incremental GC 19068c2ecf20Sopenharmony_ci */ 19078c2ecf20Sopenharmony_ci b->c->gc_stats.nodes++; 19088c2ecf20Sopenharmony_ci } 19098c2ecf20Sopenharmony_ci 19108c2ecf20Sopenharmony_ci if (p) 19118c2ecf20Sopenharmony_ci ret = bcache_btree(check_recurse, p, b, op); 19128c2ecf20Sopenharmony_ci 19138c2ecf20Sopenharmony_ci p = k; 19148c2ecf20Sopenharmony_ci } while (p && !ret); 19158c2ecf20Sopenharmony_ci } 19168c2ecf20Sopenharmony_ci 19178c2ecf20Sopenharmony_ci return ret; 19188c2ecf20Sopenharmony_ci} 19198c2ecf20Sopenharmony_ci 19208c2ecf20Sopenharmony_ci 19218c2ecf20Sopenharmony_cistatic int bch_btree_check_thread(void *arg) 19228c2ecf20Sopenharmony_ci{ 19238c2ecf20Sopenharmony_ci int ret; 19248c2ecf20Sopenharmony_ci struct btree_check_info *info = arg; 19258c2ecf20Sopenharmony_ci struct btree_check_state *check_state = info->state; 19268c2ecf20Sopenharmony_ci struct cache_set *c = check_state->c; 19278c2ecf20Sopenharmony_ci struct btree_iter iter; 19288c2ecf20Sopenharmony_ci struct bkey *k, *p; 19298c2ecf20Sopenharmony_ci int cur_idx, prev_idx, skip_nr; 19308c2ecf20Sopenharmony_ci 19318c2ecf20Sopenharmony_ci k = p = NULL; 19328c2ecf20Sopenharmony_ci cur_idx = prev_idx = 0; 19338c2ecf20Sopenharmony_ci ret = 0; 19348c2ecf20Sopenharmony_ci 19358c2ecf20Sopenharmony_ci /* root node keys are checked before thread created */ 19368c2ecf20Sopenharmony_ci bch_btree_iter_init(&c->root->keys, &iter, NULL); 19378c2ecf20Sopenharmony_ci k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); 19388c2ecf20Sopenharmony_ci BUG_ON(!k); 19398c2ecf20Sopenharmony_ci 19408c2ecf20Sopenharmony_ci p = k; 19418c2ecf20Sopenharmony_ci while (k) { 19428c2ecf20Sopenharmony_ci /* 19438c2ecf20Sopenharmony_ci * Fetch a root node key index, skip the keys which 19448c2ecf20Sopenharmony_ci * should be fetched by other threads, then check the 19458c2ecf20Sopenharmony_ci * sub-tree indexed by the fetched key. 19468c2ecf20Sopenharmony_ci */ 19478c2ecf20Sopenharmony_ci spin_lock(&check_state->idx_lock); 19488c2ecf20Sopenharmony_ci cur_idx = check_state->key_idx; 19498c2ecf20Sopenharmony_ci check_state->key_idx++; 19508c2ecf20Sopenharmony_ci spin_unlock(&check_state->idx_lock); 19518c2ecf20Sopenharmony_ci 19528c2ecf20Sopenharmony_ci skip_nr = cur_idx - prev_idx; 19538c2ecf20Sopenharmony_ci 19548c2ecf20Sopenharmony_ci while (skip_nr) { 19558c2ecf20Sopenharmony_ci k = bch_btree_iter_next_filter(&iter, 19568c2ecf20Sopenharmony_ci &c->root->keys, 19578c2ecf20Sopenharmony_ci bch_ptr_bad); 19588c2ecf20Sopenharmony_ci if (k) 19598c2ecf20Sopenharmony_ci p = k; 19608c2ecf20Sopenharmony_ci else { 19618c2ecf20Sopenharmony_ci /* 19628c2ecf20Sopenharmony_ci * No more keys to check in root node, 19638c2ecf20Sopenharmony_ci * current checking threads are enough, 19648c2ecf20Sopenharmony_ci * stop creating more. 19658c2ecf20Sopenharmony_ci */ 19668c2ecf20Sopenharmony_ci atomic_set(&check_state->enough, 1); 19678c2ecf20Sopenharmony_ci /* Update check_state->enough earlier */ 19688c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 19698c2ecf20Sopenharmony_ci goto out; 19708c2ecf20Sopenharmony_ci } 19718c2ecf20Sopenharmony_ci skip_nr--; 19728c2ecf20Sopenharmony_ci cond_resched(); 19738c2ecf20Sopenharmony_ci } 19748c2ecf20Sopenharmony_ci 19758c2ecf20Sopenharmony_ci if (p) { 19768c2ecf20Sopenharmony_ci struct btree_op op; 19778c2ecf20Sopenharmony_ci 19788c2ecf20Sopenharmony_ci btree_node_prefetch(c->root, p); 19798c2ecf20Sopenharmony_ci c->gc_stats.nodes++; 19808c2ecf20Sopenharmony_ci bch_btree_op_init(&op, 0); 19818c2ecf20Sopenharmony_ci ret = bcache_btree(check_recurse, p, c->root, &op); 19828c2ecf20Sopenharmony_ci /* 19838c2ecf20Sopenharmony_ci * The op may be added to cache_set's btree_cache_wait 19848c2ecf20Sopenharmony_ci * in mca_cannibalize(), must ensure it is removed from 19858c2ecf20Sopenharmony_ci * the list and release btree_cache_alloc_lock before 19868c2ecf20Sopenharmony_ci * free op memory. 19878c2ecf20Sopenharmony_ci * Otherwise, the btree_cache_wait will be damaged. 19888c2ecf20Sopenharmony_ci */ 19898c2ecf20Sopenharmony_ci bch_cannibalize_unlock(c); 19908c2ecf20Sopenharmony_ci finish_wait(&c->btree_cache_wait, &(&op)->wait); 19918c2ecf20Sopenharmony_ci if (ret) 19928c2ecf20Sopenharmony_ci goto out; 19938c2ecf20Sopenharmony_ci } 19948c2ecf20Sopenharmony_ci p = NULL; 19958c2ecf20Sopenharmony_ci prev_idx = cur_idx; 19968c2ecf20Sopenharmony_ci cond_resched(); 19978c2ecf20Sopenharmony_ci } 19988c2ecf20Sopenharmony_ci 19998c2ecf20Sopenharmony_ciout: 20008c2ecf20Sopenharmony_ci info->result = ret; 20018c2ecf20Sopenharmony_ci /* update check_state->started among all CPUs */ 20028c2ecf20Sopenharmony_ci smp_mb__before_atomic(); 20038c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&check_state->started)) 20048c2ecf20Sopenharmony_ci wake_up(&check_state->wait); 20058c2ecf20Sopenharmony_ci 20068c2ecf20Sopenharmony_ci return ret; 20078c2ecf20Sopenharmony_ci} 20088c2ecf20Sopenharmony_ci 20098c2ecf20Sopenharmony_ci 20108c2ecf20Sopenharmony_ci 20118c2ecf20Sopenharmony_cistatic int bch_btree_chkthread_nr(void) 20128c2ecf20Sopenharmony_ci{ 20138c2ecf20Sopenharmony_ci int n = num_online_cpus()/2; 20148c2ecf20Sopenharmony_ci 20158c2ecf20Sopenharmony_ci if (n == 0) 20168c2ecf20Sopenharmony_ci n = 1; 20178c2ecf20Sopenharmony_ci else if (n > BCH_BTR_CHKTHREAD_MAX) 20188c2ecf20Sopenharmony_ci n = BCH_BTR_CHKTHREAD_MAX; 20198c2ecf20Sopenharmony_ci 20208c2ecf20Sopenharmony_ci return n; 20218c2ecf20Sopenharmony_ci} 20228c2ecf20Sopenharmony_ci 20238c2ecf20Sopenharmony_ciint bch_btree_check(struct cache_set *c) 20248c2ecf20Sopenharmony_ci{ 20258c2ecf20Sopenharmony_ci int ret = 0; 20268c2ecf20Sopenharmony_ci int i; 20278c2ecf20Sopenharmony_ci struct bkey *k = NULL; 20288c2ecf20Sopenharmony_ci struct btree_iter iter; 20298c2ecf20Sopenharmony_ci struct btree_check_state check_state; 20308c2ecf20Sopenharmony_ci 20318c2ecf20Sopenharmony_ci /* check and mark root node keys */ 20328c2ecf20Sopenharmony_ci for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) 20338c2ecf20Sopenharmony_ci bch_initial_mark_key(c, c->root->level, k); 20348c2ecf20Sopenharmony_ci 20358c2ecf20Sopenharmony_ci bch_initial_mark_key(c, c->root->level + 1, &c->root->key); 20368c2ecf20Sopenharmony_ci 20378c2ecf20Sopenharmony_ci if (c->root->level == 0) 20388c2ecf20Sopenharmony_ci return 0; 20398c2ecf20Sopenharmony_ci 20408c2ecf20Sopenharmony_ci memset(&check_state, 0, sizeof(struct btree_check_state)); 20418c2ecf20Sopenharmony_ci check_state.c = c; 20428c2ecf20Sopenharmony_ci check_state.total_threads = bch_btree_chkthread_nr(); 20438c2ecf20Sopenharmony_ci check_state.key_idx = 0; 20448c2ecf20Sopenharmony_ci spin_lock_init(&check_state.idx_lock); 20458c2ecf20Sopenharmony_ci atomic_set(&check_state.started, 0); 20468c2ecf20Sopenharmony_ci atomic_set(&check_state.enough, 0); 20478c2ecf20Sopenharmony_ci init_waitqueue_head(&check_state.wait); 20488c2ecf20Sopenharmony_ci 20498c2ecf20Sopenharmony_ci rw_lock(0, c->root, c->root->level); 20508c2ecf20Sopenharmony_ci /* 20518c2ecf20Sopenharmony_ci * Run multiple threads to check btree nodes in parallel, 20528c2ecf20Sopenharmony_ci * if check_state.enough is non-zero, it means current 20538c2ecf20Sopenharmony_ci * running check threads are enough, unncessary to create 20548c2ecf20Sopenharmony_ci * more. 20558c2ecf20Sopenharmony_ci */ 20568c2ecf20Sopenharmony_ci for (i = 0; i < check_state.total_threads; i++) { 20578c2ecf20Sopenharmony_ci /* fetch latest check_state.enough earlier */ 20588c2ecf20Sopenharmony_ci smp_mb__before_atomic(); 20598c2ecf20Sopenharmony_ci if (atomic_read(&check_state.enough)) 20608c2ecf20Sopenharmony_ci break; 20618c2ecf20Sopenharmony_ci 20628c2ecf20Sopenharmony_ci check_state.infos[i].result = 0; 20638c2ecf20Sopenharmony_ci check_state.infos[i].state = &check_state; 20648c2ecf20Sopenharmony_ci 20658c2ecf20Sopenharmony_ci check_state.infos[i].thread = 20668c2ecf20Sopenharmony_ci kthread_run(bch_btree_check_thread, 20678c2ecf20Sopenharmony_ci &check_state.infos[i], 20688c2ecf20Sopenharmony_ci "bch_btrchk[%d]", i); 20698c2ecf20Sopenharmony_ci if (IS_ERR(check_state.infos[i].thread)) { 20708c2ecf20Sopenharmony_ci pr_err("fails to run thread bch_btrchk[%d]\n", i); 20718c2ecf20Sopenharmony_ci for (--i; i >= 0; i--) 20728c2ecf20Sopenharmony_ci kthread_stop(check_state.infos[i].thread); 20738c2ecf20Sopenharmony_ci ret = -ENOMEM; 20748c2ecf20Sopenharmony_ci goto out; 20758c2ecf20Sopenharmony_ci } 20768c2ecf20Sopenharmony_ci atomic_inc(&check_state.started); 20778c2ecf20Sopenharmony_ci } 20788c2ecf20Sopenharmony_ci 20798c2ecf20Sopenharmony_ci /* 20808c2ecf20Sopenharmony_ci * Must wait for all threads to stop. 20818c2ecf20Sopenharmony_ci */ 20828c2ecf20Sopenharmony_ci wait_event(check_state.wait, atomic_read(&check_state.started) == 0); 20838c2ecf20Sopenharmony_ci 20848c2ecf20Sopenharmony_ci for (i = 0; i < check_state.total_threads; i++) { 20858c2ecf20Sopenharmony_ci if (check_state.infos[i].result) { 20868c2ecf20Sopenharmony_ci ret = check_state.infos[i].result; 20878c2ecf20Sopenharmony_ci goto out; 20888c2ecf20Sopenharmony_ci } 20898c2ecf20Sopenharmony_ci } 20908c2ecf20Sopenharmony_ci 20918c2ecf20Sopenharmony_ciout: 20928c2ecf20Sopenharmony_ci rw_unlock(0, c->root); 20938c2ecf20Sopenharmony_ci return ret; 20948c2ecf20Sopenharmony_ci} 20958c2ecf20Sopenharmony_ci 20968c2ecf20Sopenharmony_civoid bch_initial_gc_finish(struct cache_set *c) 20978c2ecf20Sopenharmony_ci{ 20988c2ecf20Sopenharmony_ci struct cache *ca = c->cache; 20998c2ecf20Sopenharmony_ci struct bucket *b; 21008c2ecf20Sopenharmony_ci 21018c2ecf20Sopenharmony_ci bch_btree_gc_finish(c); 21028c2ecf20Sopenharmony_ci 21038c2ecf20Sopenharmony_ci mutex_lock(&c->bucket_lock); 21048c2ecf20Sopenharmony_ci 21058c2ecf20Sopenharmony_ci /* 21068c2ecf20Sopenharmony_ci * We need to put some unused buckets directly on the prio freelist in 21078c2ecf20Sopenharmony_ci * order to get the allocator thread started - it needs freed buckets in 21088c2ecf20Sopenharmony_ci * order to rewrite the prios and gens, and it needs to rewrite prios 21098c2ecf20Sopenharmony_ci * and gens in order to free buckets. 21108c2ecf20Sopenharmony_ci * 21118c2ecf20Sopenharmony_ci * This is only safe for buckets that have no live data in them, which 21128c2ecf20Sopenharmony_ci * there should always be some of. 21138c2ecf20Sopenharmony_ci */ 21148c2ecf20Sopenharmony_ci for_each_bucket(b, ca) { 21158c2ecf20Sopenharmony_ci if (fifo_full(&ca->free[RESERVE_PRIO]) && 21168c2ecf20Sopenharmony_ci fifo_full(&ca->free[RESERVE_BTREE])) 21178c2ecf20Sopenharmony_ci break; 21188c2ecf20Sopenharmony_ci 21198c2ecf20Sopenharmony_ci if (bch_can_invalidate_bucket(ca, b) && 21208c2ecf20Sopenharmony_ci !GC_MARK(b)) { 21218c2ecf20Sopenharmony_ci __bch_invalidate_one_bucket(ca, b); 21228c2ecf20Sopenharmony_ci if (!fifo_push(&ca->free[RESERVE_PRIO], 21238c2ecf20Sopenharmony_ci b - ca->buckets)) 21248c2ecf20Sopenharmony_ci fifo_push(&ca->free[RESERVE_BTREE], 21258c2ecf20Sopenharmony_ci b - ca->buckets); 21268c2ecf20Sopenharmony_ci } 21278c2ecf20Sopenharmony_ci } 21288c2ecf20Sopenharmony_ci 21298c2ecf20Sopenharmony_ci mutex_unlock(&c->bucket_lock); 21308c2ecf20Sopenharmony_ci} 21318c2ecf20Sopenharmony_ci 21328c2ecf20Sopenharmony_ci/* Btree insertion */ 21338c2ecf20Sopenharmony_ci 21348c2ecf20Sopenharmony_cistatic bool btree_insert_key(struct btree *b, struct bkey *k, 21358c2ecf20Sopenharmony_ci struct bkey *replace_key) 21368c2ecf20Sopenharmony_ci{ 21378c2ecf20Sopenharmony_ci unsigned int status; 21388c2ecf20Sopenharmony_ci 21398c2ecf20Sopenharmony_ci BUG_ON(bkey_cmp(k, &b->key) > 0); 21408c2ecf20Sopenharmony_ci 21418c2ecf20Sopenharmony_ci status = bch_btree_insert_key(&b->keys, k, replace_key); 21428c2ecf20Sopenharmony_ci if (status != BTREE_INSERT_STATUS_NO_INSERT) { 21438c2ecf20Sopenharmony_ci bch_check_keys(&b->keys, "%u for %s", status, 21448c2ecf20Sopenharmony_ci replace_key ? "replace" : "insert"); 21458c2ecf20Sopenharmony_ci 21468c2ecf20Sopenharmony_ci trace_bcache_btree_insert_key(b, k, replace_key != NULL, 21478c2ecf20Sopenharmony_ci status); 21488c2ecf20Sopenharmony_ci return true; 21498c2ecf20Sopenharmony_ci } else 21508c2ecf20Sopenharmony_ci return false; 21518c2ecf20Sopenharmony_ci} 21528c2ecf20Sopenharmony_ci 21538c2ecf20Sopenharmony_cistatic size_t insert_u64s_remaining(struct btree *b) 21548c2ecf20Sopenharmony_ci{ 21558c2ecf20Sopenharmony_ci long ret = bch_btree_keys_u64s_remaining(&b->keys); 21568c2ecf20Sopenharmony_ci 21578c2ecf20Sopenharmony_ci /* 21588c2ecf20Sopenharmony_ci * Might land in the middle of an existing extent and have to split it 21598c2ecf20Sopenharmony_ci */ 21608c2ecf20Sopenharmony_ci if (b->keys.ops->is_extents) 21618c2ecf20Sopenharmony_ci ret -= KEY_MAX_U64S; 21628c2ecf20Sopenharmony_ci 21638c2ecf20Sopenharmony_ci return max(ret, 0L); 21648c2ecf20Sopenharmony_ci} 21658c2ecf20Sopenharmony_ci 21668c2ecf20Sopenharmony_cistatic bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 21678c2ecf20Sopenharmony_ci struct keylist *insert_keys, 21688c2ecf20Sopenharmony_ci struct bkey *replace_key) 21698c2ecf20Sopenharmony_ci{ 21708c2ecf20Sopenharmony_ci bool ret = false; 21718c2ecf20Sopenharmony_ci int oldsize = bch_count_data(&b->keys); 21728c2ecf20Sopenharmony_ci 21738c2ecf20Sopenharmony_ci while (!bch_keylist_empty(insert_keys)) { 21748c2ecf20Sopenharmony_ci struct bkey *k = insert_keys->keys; 21758c2ecf20Sopenharmony_ci 21768c2ecf20Sopenharmony_ci if (bkey_u64s(k) > insert_u64s_remaining(b)) 21778c2ecf20Sopenharmony_ci break; 21788c2ecf20Sopenharmony_ci 21798c2ecf20Sopenharmony_ci if (bkey_cmp(k, &b->key) <= 0) { 21808c2ecf20Sopenharmony_ci if (!b->level) 21818c2ecf20Sopenharmony_ci bkey_put(b->c, k); 21828c2ecf20Sopenharmony_ci 21838c2ecf20Sopenharmony_ci ret |= btree_insert_key(b, k, replace_key); 21848c2ecf20Sopenharmony_ci bch_keylist_pop_front(insert_keys); 21858c2ecf20Sopenharmony_ci } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 21868c2ecf20Sopenharmony_ci BKEY_PADDED(key) temp; 21878c2ecf20Sopenharmony_ci bkey_copy(&temp.key, insert_keys->keys); 21888c2ecf20Sopenharmony_ci 21898c2ecf20Sopenharmony_ci bch_cut_back(&b->key, &temp.key); 21908c2ecf20Sopenharmony_ci bch_cut_front(&b->key, insert_keys->keys); 21918c2ecf20Sopenharmony_ci 21928c2ecf20Sopenharmony_ci ret |= btree_insert_key(b, &temp.key, replace_key); 21938c2ecf20Sopenharmony_ci break; 21948c2ecf20Sopenharmony_ci } else { 21958c2ecf20Sopenharmony_ci break; 21968c2ecf20Sopenharmony_ci } 21978c2ecf20Sopenharmony_ci } 21988c2ecf20Sopenharmony_ci 21998c2ecf20Sopenharmony_ci if (!ret) 22008c2ecf20Sopenharmony_ci op->insert_collision = true; 22018c2ecf20Sopenharmony_ci 22028c2ecf20Sopenharmony_ci BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 22038c2ecf20Sopenharmony_ci 22048c2ecf20Sopenharmony_ci BUG_ON(bch_count_data(&b->keys) < oldsize); 22058c2ecf20Sopenharmony_ci return ret; 22068c2ecf20Sopenharmony_ci} 22078c2ecf20Sopenharmony_ci 22088c2ecf20Sopenharmony_cistatic int btree_split(struct btree *b, struct btree_op *op, 22098c2ecf20Sopenharmony_ci struct keylist *insert_keys, 22108c2ecf20Sopenharmony_ci struct bkey *replace_key) 22118c2ecf20Sopenharmony_ci{ 22128c2ecf20Sopenharmony_ci bool split; 22138c2ecf20Sopenharmony_ci struct btree *n1, *n2 = NULL, *n3 = NULL; 22148c2ecf20Sopenharmony_ci uint64_t start_time = local_clock(); 22158c2ecf20Sopenharmony_ci struct closure cl; 22168c2ecf20Sopenharmony_ci struct keylist parent_keys; 22178c2ecf20Sopenharmony_ci 22188c2ecf20Sopenharmony_ci closure_init_stack(&cl); 22198c2ecf20Sopenharmony_ci bch_keylist_init(&parent_keys); 22208c2ecf20Sopenharmony_ci 22218c2ecf20Sopenharmony_ci if (btree_check_reserve(b, op)) { 22228c2ecf20Sopenharmony_ci if (!b->level) 22238c2ecf20Sopenharmony_ci return -EINTR; 22248c2ecf20Sopenharmony_ci else 22258c2ecf20Sopenharmony_ci WARN(1, "insufficient reserve for split\n"); 22268c2ecf20Sopenharmony_ci } 22278c2ecf20Sopenharmony_ci 22288c2ecf20Sopenharmony_ci n1 = btree_node_alloc_replacement(b, op); 22298c2ecf20Sopenharmony_ci if (IS_ERR(n1)) 22308c2ecf20Sopenharmony_ci goto err; 22318c2ecf20Sopenharmony_ci 22328c2ecf20Sopenharmony_ci split = set_blocks(btree_bset_first(n1), 22338c2ecf20Sopenharmony_ci block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; 22348c2ecf20Sopenharmony_ci 22358c2ecf20Sopenharmony_ci if (split) { 22368c2ecf20Sopenharmony_ci unsigned int keys = 0; 22378c2ecf20Sopenharmony_ci 22388c2ecf20Sopenharmony_ci trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 22398c2ecf20Sopenharmony_ci 22408c2ecf20Sopenharmony_ci n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); 22418c2ecf20Sopenharmony_ci if (IS_ERR(n2)) 22428c2ecf20Sopenharmony_ci goto err_free1; 22438c2ecf20Sopenharmony_ci 22448c2ecf20Sopenharmony_ci if (!b->parent) { 22458c2ecf20Sopenharmony_ci n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); 22468c2ecf20Sopenharmony_ci if (IS_ERR(n3)) 22478c2ecf20Sopenharmony_ci goto err_free2; 22488c2ecf20Sopenharmony_ci } 22498c2ecf20Sopenharmony_ci 22508c2ecf20Sopenharmony_ci mutex_lock(&n1->write_lock); 22518c2ecf20Sopenharmony_ci mutex_lock(&n2->write_lock); 22528c2ecf20Sopenharmony_ci 22538c2ecf20Sopenharmony_ci bch_btree_insert_keys(n1, op, insert_keys, replace_key); 22548c2ecf20Sopenharmony_ci 22558c2ecf20Sopenharmony_ci /* 22568c2ecf20Sopenharmony_ci * Has to be a linear search because we don't have an auxiliary 22578c2ecf20Sopenharmony_ci * search tree yet 22588c2ecf20Sopenharmony_ci */ 22598c2ecf20Sopenharmony_ci 22608c2ecf20Sopenharmony_ci while (keys < (btree_bset_first(n1)->keys * 3) / 5) 22618c2ecf20Sopenharmony_ci keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), 22628c2ecf20Sopenharmony_ci keys)); 22638c2ecf20Sopenharmony_ci 22648c2ecf20Sopenharmony_ci bkey_copy_key(&n1->key, 22658c2ecf20Sopenharmony_ci bset_bkey_idx(btree_bset_first(n1), keys)); 22668c2ecf20Sopenharmony_ci keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); 22678c2ecf20Sopenharmony_ci 22688c2ecf20Sopenharmony_ci btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; 22698c2ecf20Sopenharmony_ci btree_bset_first(n1)->keys = keys; 22708c2ecf20Sopenharmony_ci 22718c2ecf20Sopenharmony_ci memcpy(btree_bset_first(n2)->start, 22728c2ecf20Sopenharmony_ci bset_bkey_last(btree_bset_first(n1)), 22738c2ecf20Sopenharmony_ci btree_bset_first(n2)->keys * sizeof(uint64_t)); 22748c2ecf20Sopenharmony_ci 22758c2ecf20Sopenharmony_ci bkey_copy_key(&n2->key, &b->key); 22768c2ecf20Sopenharmony_ci 22778c2ecf20Sopenharmony_ci bch_keylist_add(&parent_keys, &n2->key); 22788c2ecf20Sopenharmony_ci bch_btree_node_write(n2, &cl); 22798c2ecf20Sopenharmony_ci mutex_unlock(&n2->write_lock); 22808c2ecf20Sopenharmony_ci rw_unlock(true, n2); 22818c2ecf20Sopenharmony_ci } else { 22828c2ecf20Sopenharmony_ci trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); 22838c2ecf20Sopenharmony_ci 22848c2ecf20Sopenharmony_ci mutex_lock(&n1->write_lock); 22858c2ecf20Sopenharmony_ci bch_btree_insert_keys(n1, op, insert_keys, replace_key); 22868c2ecf20Sopenharmony_ci } 22878c2ecf20Sopenharmony_ci 22888c2ecf20Sopenharmony_ci bch_keylist_add(&parent_keys, &n1->key); 22898c2ecf20Sopenharmony_ci bch_btree_node_write(n1, &cl); 22908c2ecf20Sopenharmony_ci mutex_unlock(&n1->write_lock); 22918c2ecf20Sopenharmony_ci 22928c2ecf20Sopenharmony_ci if (n3) { 22938c2ecf20Sopenharmony_ci /* Depth increases, make a new root */ 22948c2ecf20Sopenharmony_ci mutex_lock(&n3->write_lock); 22958c2ecf20Sopenharmony_ci bkey_copy_key(&n3->key, &MAX_KEY); 22968c2ecf20Sopenharmony_ci bch_btree_insert_keys(n3, op, &parent_keys, NULL); 22978c2ecf20Sopenharmony_ci bch_btree_node_write(n3, &cl); 22988c2ecf20Sopenharmony_ci mutex_unlock(&n3->write_lock); 22998c2ecf20Sopenharmony_ci 23008c2ecf20Sopenharmony_ci closure_sync(&cl); 23018c2ecf20Sopenharmony_ci bch_btree_set_root(n3); 23028c2ecf20Sopenharmony_ci rw_unlock(true, n3); 23038c2ecf20Sopenharmony_ci } else if (!b->parent) { 23048c2ecf20Sopenharmony_ci /* Root filled up but didn't need to be split */ 23058c2ecf20Sopenharmony_ci closure_sync(&cl); 23068c2ecf20Sopenharmony_ci bch_btree_set_root(n1); 23078c2ecf20Sopenharmony_ci } else { 23088c2ecf20Sopenharmony_ci /* Split a non root node */ 23098c2ecf20Sopenharmony_ci closure_sync(&cl); 23108c2ecf20Sopenharmony_ci make_btree_freeing_key(b, parent_keys.top); 23118c2ecf20Sopenharmony_ci bch_keylist_push(&parent_keys); 23128c2ecf20Sopenharmony_ci 23138c2ecf20Sopenharmony_ci bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); 23148c2ecf20Sopenharmony_ci BUG_ON(!bch_keylist_empty(&parent_keys)); 23158c2ecf20Sopenharmony_ci } 23168c2ecf20Sopenharmony_ci 23178c2ecf20Sopenharmony_ci btree_node_free(b); 23188c2ecf20Sopenharmony_ci rw_unlock(true, n1); 23198c2ecf20Sopenharmony_ci 23208c2ecf20Sopenharmony_ci bch_time_stats_update(&b->c->btree_split_time, start_time); 23218c2ecf20Sopenharmony_ci 23228c2ecf20Sopenharmony_ci return 0; 23238c2ecf20Sopenharmony_cierr_free2: 23248c2ecf20Sopenharmony_ci bkey_put(b->c, &n2->key); 23258c2ecf20Sopenharmony_ci btree_node_free(n2); 23268c2ecf20Sopenharmony_ci rw_unlock(true, n2); 23278c2ecf20Sopenharmony_cierr_free1: 23288c2ecf20Sopenharmony_ci bkey_put(b->c, &n1->key); 23298c2ecf20Sopenharmony_ci btree_node_free(n1); 23308c2ecf20Sopenharmony_ci rw_unlock(true, n1); 23318c2ecf20Sopenharmony_cierr: 23328c2ecf20Sopenharmony_ci WARN(1, "bcache: btree split failed (level %u)", b->level); 23338c2ecf20Sopenharmony_ci 23348c2ecf20Sopenharmony_ci if (n3 == ERR_PTR(-EAGAIN) || 23358c2ecf20Sopenharmony_ci n2 == ERR_PTR(-EAGAIN) || 23368c2ecf20Sopenharmony_ci n1 == ERR_PTR(-EAGAIN)) 23378c2ecf20Sopenharmony_ci return -EAGAIN; 23388c2ecf20Sopenharmony_ci 23398c2ecf20Sopenharmony_ci return -ENOMEM; 23408c2ecf20Sopenharmony_ci} 23418c2ecf20Sopenharmony_ci 23428c2ecf20Sopenharmony_cistatic int bch_btree_insert_node(struct btree *b, struct btree_op *op, 23438c2ecf20Sopenharmony_ci struct keylist *insert_keys, 23448c2ecf20Sopenharmony_ci atomic_t *journal_ref, 23458c2ecf20Sopenharmony_ci struct bkey *replace_key) 23468c2ecf20Sopenharmony_ci{ 23478c2ecf20Sopenharmony_ci struct closure cl; 23488c2ecf20Sopenharmony_ci 23498c2ecf20Sopenharmony_ci BUG_ON(b->level && replace_key); 23508c2ecf20Sopenharmony_ci 23518c2ecf20Sopenharmony_ci closure_init_stack(&cl); 23528c2ecf20Sopenharmony_ci 23538c2ecf20Sopenharmony_ci mutex_lock(&b->write_lock); 23548c2ecf20Sopenharmony_ci 23558c2ecf20Sopenharmony_ci if (write_block(b) != btree_bset_last(b) && 23568c2ecf20Sopenharmony_ci b->keys.last_set_unwritten) 23578c2ecf20Sopenharmony_ci bch_btree_init_next(b); /* just wrote a set */ 23588c2ecf20Sopenharmony_ci 23598c2ecf20Sopenharmony_ci if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { 23608c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 23618c2ecf20Sopenharmony_ci goto split; 23628c2ecf20Sopenharmony_ci } 23638c2ecf20Sopenharmony_ci 23648c2ecf20Sopenharmony_ci BUG_ON(write_block(b) != btree_bset_last(b)); 23658c2ecf20Sopenharmony_ci 23668c2ecf20Sopenharmony_ci if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 23678c2ecf20Sopenharmony_ci if (!b->level) 23688c2ecf20Sopenharmony_ci bch_btree_leaf_dirty(b, journal_ref); 23698c2ecf20Sopenharmony_ci else 23708c2ecf20Sopenharmony_ci bch_btree_node_write(b, &cl); 23718c2ecf20Sopenharmony_ci } 23728c2ecf20Sopenharmony_ci 23738c2ecf20Sopenharmony_ci mutex_unlock(&b->write_lock); 23748c2ecf20Sopenharmony_ci 23758c2ecf20Sopenharmony_ci /* wait for btree node write if necessary, after unlock */ 23768c2ecf20Sopenharmony_ci closure_sync(&cl); 23778c2ecf20Sopenharmony_ci 23788c2ecf20Sopenharmony_ci return 0; 23798c2ecf20Sopenharmony_cisplit: 23808c2ecf20Sopenharmony_ci if (current->bio_list) { 23818c2ecf20Sopenharmony_ci op->lock = b->c->root->level + 1; 23828c2ecf20Sopenharmony_ci return -EAGAIN; 23838c2ecf20Sopenharmony_ci } else if (op->lock <= b->c->root->level) { 23848c2ecf20Sopenharmony_ci op->lock = b->c->root->level + 1; 23858c2ecf20Sopenharmony_ci return -EINTR; 23868c2ecf20Sopenharmony_ci } else { 23878c2ecf20Sopenharmony_ci /* Invalidated all iterators */ 23888c2ecf20Sopenharmony_ci int ret = btree_split(b, op, insert_keys, replace_key); 23898c2ecf20Sopenharmony_ci 23908c2ecf20Sopenharmony_ci if (bch_keylist_empty(insert_keys)) 23918c2ecf20Sopenharmony_ci return 0; 23928c2ecf20Sopenharmony_ci else if (!ret) 23938c2ecf20Sopenharmony_ci return -EINTR; 23948c2ecf20Sopenharmony_ci return ret; 23958c2ecf20Sopenharmony_ci } 23968c2ecf20Sopenharmony_ci} 23978c2ecf20Sopenharmony_ci 23988c2ecf20Sopenharmony_ciint bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 23998c2ecf20Sopenharmony_ci struct bkey *check_key) 24008c2ecf20Sopenharmony_ci{ 24018c2ecf20Sopenharmony_ci int ret = -EINTR; 24028c2ecf20Sopenharmony_ci uint64_t btree_ptr = b->key.ptr[0]; 24038c2ecf20Sopenharmony_ci unsigned long seq = b->seq; 24048c2ecf20Sopenharmony_ci struct keylist insert; 24058c2ecf20Sopenharmony_ci bool upgrade = op->lock == -1; 24068c2ecf20Sopenharmony_ci 24078c2ecf20Sopenharmony_ci bch_keylist_init(&insert); 24088c2ecf20Sopenharmony_ci 24098c2ecf20Sopenharmony_ci if (upgrade) { 24108c2ecf20Sopenharmony_ci rw_unlock(false, b); 24118c2ecf20Sopenharmony_ci rw_lock(true, b, b->level); 24128c2ecf20Sopenharmony_ci 24138c2ecf20Sopenharmony_ci if (b->key.ptr[0] != btree_ptr || 24148c2ecf20Sopenharmony_ci b->seq != seq + 1) { 24158c2ecf20Sopenharmony_ci op->lock = b->level; 24168c2ecf20Sopenharmony_ci goto out; 24178c2ecf20Sopenharmony_ci } 24188c2ecf20Sopenharmony_ci } 24198c2ecf20Sopenharmony_ci 24208c2ecf20Sopenharmony_ci SET_KEY_PTRS(check_key, 1); 24218c2ecf20Sopenharmony_ci get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); 24228c2ecf20Sopenharmony_ci 24238c2ecf20Sopenharmony_ci SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); 24248c2ecf20Sopenharmony_ci 24258c2ecf20Sopenharmony_ci bch_keylist_add(&insert, check_key); 24268c2ecf20Sopenharmony_ci 24278c2ecf20Sopenharmony_ci ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); 24288c2ecf20Sopenharmony_ci 24298c2ecf20Sopenharmony_ci BUG_ON(!ret && !bch_keylist_empty(&insert)); 24308c2ecf20Sopenharmony_ciout: 24318c2ecf20Sopenharmony_ci if (upgrade) 24328c2ecf20Sopenharmony_ci downgrade_write(&b->lock); 24338c2ecf20Sopenharmony_ci return ret; 24348c2ecf20Sopenharmony_ci} 24358c2ecf20Sopenharmony_ci 24368c2ecf20Sopenharmony_cistruct btree_insert_op { 24378c2ecf20Sopenharmony_ci struct btree_op op; 24388c2ecf20Sopenharmony_ci struct keylist *keys; 24398c2ecf20Sopenharmony_ci atomic_t *journal_ref; 24408c2ecf20Sopenharmony_ci struct bkey *replace_key; 24418c2ecf20Sopenharmony_ci}; 24428c2ecf20Sopenharmony_ci 24438c2ecf20Sopenharmony_cistatic int btree_insert_fn(struct btree_op *b_op, struct btree *b) 24448c2ecf20Sopenharmony_ci{ 24458c2ecf20Sopenharmony_ci struct btree_insert_op *op = container_of(b_op, 24468c2ecf20Sopenharmony_ci struct btree_insert_op, op); 24478c2ecf20Sopenharmony_ci 24488c2ecf20Sopenharmony_ci int ret = bch_btree_insert_node(b, &op->op, op->keys, 24498c2ecf20Sopenharmony_ci op->journal_ref, op->replace_key); 24508c2ecf20Sopenharmony_ci if (ret && !bch_keylist_empty(op->keys)) 24518c2ecf20Sopenharmony_ci return ret; 24528c2ecf20Sopenharmony_ci else 24538c2ecf20Sopenharmony_ci return MAP_DONE; 24548c2ecf20Sopenharmony_ci} 24558c2ecf20Sopenharmony_ci 24568c2ecf20Sopenharmony_ciint bch_btree_insert(struct cache_set *c, struct keylist *keys, 24578c2ecf20Sopenharmony_ci atomic_t *journal_ref, struct bkey *replace_key) 24588c2ecf20Sopenharmony_ci{ 24598c2ecf20Sopenharmony_ci struct btree_insert_op op; 24608c2ecf20Sopenharmony_ci int ret = 0; 24618c2ecf20Sopenharmony_ci 24628c2ecf20Sopenharmony_ci BUG_ON(current->bio_list); 24638c2ecf20Sopenharmony_ci BUG_ON(bch_keylist_empty(keys)); 24648c2ecf20Sopenharmony_ci 24658c2ecf20Sopenharmony_ci bch_btree_op_init(&op.op, 0); 24668c2ecf20Sopenharmony_ci op.keys = keys; 24678c2ecf20Sopenharmony_ci op.journal_ref = journal_ref; 24688c2ecf20Sopenharmony_ci op.replace_key = replace_key; 24698c2ecf20Sopenharmony_ci 24708c2ecf20Sopenharmony_ci while (!ret && !bch_keylist_empty(keys)) { 24718c2ecf20Sopenharmony_ci op.op.lock = 0; 24728c2ecf20Sopenharmony_ci ret = bch_btree_map_leaf_nodes(&op.op, c, 24738c2ecf20Sopenharmony_ci &START_KEY(keys->keys), 24748c2ecf20Sopenharmony_ci btree_insert_fn); 24758c2ecf20Sopenharmony_ci } 24768c2ecf20Sopenharmony_ci 24778c2ecf20Sopenharmony_ci if (ret) { 24788c2ecf20Sopenharmony_ci struct bkey *k; 24798c2ecf20Sopenharmony_ci 24808c2ecf20Sopenharmony_ci pr_err("error %i\n", ret); 24818c2ecf20Sopenharmony_ci 24828c2ecf20Sopenharmony_ci while ((k = bch_keylist_pop(keys))) 24838c2ecf20Sopenharmony_ci bkey_put(c, k); 24848c2ecf20Sopenharmony_ci } else if (op.op.insert_collision) 24858c2ecf20Sopenharmony_ci ret = -ESRCH; 24868c2ecf20Sopenharmony_ci 24878c2ecf20Sopenharmony_ci return ret; 24888c2ecf20Sopenharmony_ci} 24898c2ecf20Sopenharmony_ci 24908c2ecf20Sopenharmony_civoid bch_btree_set_root(struct btree *b) 24918c2ecf20Sopenharmony_ci{ 24928c2ecf20Sopenharmony_ci unsigned int i; 24938c2ecf20Sopenharmony_ci struct closure cl; 24948c2ecf20Sopenharmony_ci 24958c2ecf20Sopenharmony_ci closure_init_stack(&cl); 24968c2ecf20Sopenharmony_ci 24978c2ecf20Sopenharmony_ci trace_bcache_btree_set_root(b); 24988c2ecf20Sopenharmony_ci 24998c2ecf20Sopenharmony_ci BUG_ON(!b->written); 25008c2ecf20Sopenharmony_ci 25018c2ecf20Sopenharmony_ci for (i = 0; i < KEY_PTRS(&b->key); i++) 25028c2ecf20Sopenharmony_ci BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 25038c2ecf20Sopenharmony_ci 25048c2ecf20Sopenharmony_ci mutex_lock(&b->c->bucket_lock); 25058c2ecf20Sopenharmony_ci list_del_init(&b->list); 25068c2ecf20Sopenharmony_ci mutex_unlock(&b->c->bucket_lock); 25078c2ecf20Sopenharmony_ci 25088c2ecf20Sopenharmony_ci b->c->root = b; 25098c2ecf20Sopenharmony_ci 25108c2ecf20Sopenharmony_ci bch_journal_meta(b->c, &cl); 25118c2ecf20Sopenharmony_ci closure_sync(&cl); 25128c2ecf20Sopenharmony_ci} 25138c2ecf20Sopenharmony_ci 25148c2ecf20Sopenharmony_ci/* Map across nodes or keys */ 25158c2ecf20Sopenharmony_ci 25168c2ecf20Sopenharmony_cistatic int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, 25178c2ecf20Sopenharmony_ci struct bkey *from, 25188c2ecf20Sopenharmony_ci btree_map_nodes_fn *fn, int flags) 25198c2ecf20Sopenharmony_ci{ 25208c2ecf20Sopenharmony_ci int ret = MAP_CONTINUE; 25218c2ecf20Sopenharmony_ci 25228c2ecf20Sopenharmony_ci if (b->level) { 25238c2ecf20Sopenharmony_ci struct bkey *k; 25248c2ecf20Sopenharmony_ci struct btree_iter iter; 25258c2ecf20Sopenharmony_ci 25268c2ecf20Sopenharmony_ci bch_btree_iter_init(&b->keys, &iter, from); 25278c2ecf20Sopenharmony_ci 25288c2ecf20Sopenharmony_ci while ((k = bch_btree_iter_next_filter(&iter, &b->keys, 25298c2ecf20Sopenharmony_ci bch_ptr_bad))) { 25308c2ecf20Sopenharmony_ci ret = bcache_btree(map_nodes_recurse, k, b, 25318c2ecf20Sopenharmony_ci op, from, fn, flags); 25328c2ecf20Sopenharmony_ci from = NULL; 25338c2ecf20Sopenharmony_ci 25348c2ecf20Sopenharmony_ci if (ret != MAP_CONTINUE) 25358c2ecf20Sopenharmony_ci return ret; 25368c2ecf20Sopenharmony_ci } 25378c2ecf20Sopenharmony_ci } 25388c2ecf20Sopenharmony_ci 25398c2ecf20Sopenharmony_ci if (!b->level || flags == MAP_ALL_NODES) 25408c2ecf20Sopenharmony_ci ret = fn(op, b); 25418c2ecf20Sopenharmony_ci 25428c2ecf20Sopenharmony_ci return ret; 25438c2ecf20Sopenharmony_ci} 25448c2ecf20Sopenharmony_ci 25458c2ecf20Sopenharmony_ciint __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, 25468c2ecf20Sopenharmony_ci struct bkey *from, btree_map_nodes_fn *fn, int flags) 25478c2ecf20Sopenharmony_ci{ 25488c2ecf20Sopenharmony_ci return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); 25498c2ecf20Sopenharmony_ci} 25508c2ecf20Sopenharmony_ci 25518c2ecf20Sopenharmony_ciint bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, 25528c2ecf20Sopenharmony_ci struct bkey *from, btree_map_keys_fn *fn, 25538c2ecf20Sopenharmony_ci int flags) 25548c2ecf20Sopenharmony_ci{ 25558c2ecf20Sopenharmony_ci int ret = MAP_CONTINUE; 25568c2ecf20Sopenharmony_ci struct bkey *k; 25578c2ecf20Sopenharmony_ci struct btree_iter iter; 25588c2ecf20Sopenharmony_ci 25598c2ecf20Sopenharmony_ci bch_btree_iter_init(&b->keys, &iter, from); 25608c2ecf20Sopenharmony_ci 25618c2ecf20Sopenharmony_ci while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { 25628c2ecf20Sopenharmony_ci ret = !b->level 25638c2ecf20Sopenharmony_ci ? fn(op, b, k) 25648c2ecf20Sopenharmony_ci : bcache_btree(map_keys_recurse, k, 25658c2ecf20Sopenharmony_ci b, op, from, fn, flags); 25668c2ecf20Sopenharmony_ci from = NULL; 25678c2ecf20Sopenharmony_ci 25688c2ecf20Sopenharmony_ci if (ret != MAP_CONTINUE) 25698c2ecf20Sopenharmony_ci return ret; 25708c2ecf20Sopenharmony_ci } 25718c2ecf20Sopenharmony_ci 25728c2ecf20Sopenharmony_ci if (!b->level && (flags & MAP_END_KEY)) 25738c2ecf20Sopenharmony_ci ret = fn(op, b, &KEY(KEY_INODE(&b->key), 25748c2ecf20Sopenharmony_ci KEY_OFFSET(&b->key), 0)); 25758c2ecf20Sopenharmony_ci 25768c2ecf20Sopenharmony_ci return ret; 25778c2ecf20Sopenharmony_ci} 25788c2ecf20Sopenharmony_ci 25798c2ecf20Sopenharmony_ciint bch_btree_map_keys(struct btree_op *op, struct cache_set *c, 25808c2ecf20Sopenharmony_ci struct bkey *from, btree_map_keys_fn *fn, int flags) 25818c2ecf20Sopenharmony_ci{ 25828c2ecf20Sopenharmony_ci return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); 25838c2ecf20Sopenharmony_ci} 25848c2ecf20Sopenharmony_ci 25858c2ecf20Sopenharmony_ci/* Keybuf code */ 25868c2ecf20Sopenharmony_ci 25878c2ecf20Sopenharmony_cistatic inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 25888c2ecf20Sopenharmony_ci{ 25898c2ecf20Sopenharmony_ci /* Overlapping keys compare equal */ 25908c2ecf20Sopenharmony_ci if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) 25918c2ecf20Sopenharmony_ci return -1; 25928c2ecf20Sopenharmony_ci if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) 25938c2ecf20Sopenharmony_ci return 1; 25948c2ecf20Sopenharmony_ci return 0; 25958c2ecf20Sopenharmony_ci} 25968c2ecf20Sopenharmony_ci 25978c2ecf20Sopenharmony_cistatic inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, 25988c2ecf20Sopenharmony_ci struct keybuf_key *r) 25998c2ecf20Sopenharmony_ci{ 26008c2ecf20Sopenharmony_ci return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 26018c2ecf20Sopenharmony_ci} 26028c2ecf20Sopenharmony_ci 26038c2ecf20Sopenharmony_cistruct refill { 26048c2ecf20Sopenharmony_ci struct btree_op op; 26058c2ecf20Sopenharmony_ci unsigned int nr_found; 26068c2ecf20Sopenharmony_ci struct keybuf *buf; 26078c2ecf20Sopenharmony_ci struct bkey *end; 26088c2ecf20Sopenharmony_ci keybuf_pred_fn *pred; 26098c2ecf20Sopenharmony_ci}; 26108c2ecf20Sopenharmony_ci 26118c2ecf20Sopenharmony_cistatic int refill_keybuf_fn(struct btree_op *op, struct btree *b, 26128c2ecf20Sopenharmony_ci struct bkey *k) 26138c2ecf20Sopenharmony_ci{ 26148c2ecf20Sopenharmony_ci struct refill *refill = container_of(op, struct refill, op); 26158c2ecf20Sopenharmony_ci struct keybuf *buf = refill->buf; 26168c2ecf20Sopenharmony_ci int ret = MAP_CONTINUE; 26178c2ecf20Sopenharmony_ci 26188c2ecf20Sopenharmony_ci if (bkey_cmp(k, refill->end) > 0) { 26198c2ecf20Sopenharmony_ci ret = MAP_DONE; 26208c2ecf20Sopenharmony_ci goto out; 26218c2ecf20Sopenharmony_ci } 26228c2ecf20Sopenharmony_ci 26238c2ecf20Sopenharmony_ci if (!KEY_SIZE(k)) /* end key */ 26248c2ecf20Sopenharmony_ci goto out; 26258c2ecf20Sopenharmony_ci 26268c2ecf20Sopenharmony_ci if (refill->pred(buf, k)) { 26278c2ecf20Sopenharmony_ci struct keybuf_key *w; 26288c2ecf20Sopenharmony_ci 26298c2ecf20Sopenharmony_ci spin_lock(&buf->lock); 26308c2ecf20Sopenharmony_ci 26318c2ecf20Sopenharmony_ci w = array_alloc(&buf->freelist); 26328c2ecf20Sopenharmony_ci if (!w) { 26338c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 26348c2ecf20Sopenharmony_ci return MAP_DONE; 26358c2ecf20Sopenharmony_ci } 26368c2ecf20Sopenharmony_ci 26378c2ecf20Sopenharmony_ci w->private = NULL; 26388c2ecf20Sopenharmony_ci bkey_copy(&w->key, k); 26398c2ecf20Sopenharmony_ci 26408c2ecf20Sopenharmony_ci if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 26418c2ecf20Sopenharmony_ci array_free(&buf->freelist, w); 26428c2ecf20Sopenharmony_ci else 26438c2ecf20Sopenharmony_ci refill->nr_found++; 26448c2ecf20Sopenharmony_ci 26458c2ecf20Sopenharmony_ci if (array_freelist_empty(&buf->freelist)) 26468c2ecf20Sopenharmony_ci ret = MAP_DONE; 26478c2ecf20Sopenharmony_ci 26488c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 26498c2ecf20Sopenharmony_ci } 26508c2ecf20Sopenharmony_ciout: 26518c2ecf20Sopenharmony_ci buf->last_scanned = *k; 26528c2ecf20Sopenharmony_ci return ret; 26538c2ecf20Sopenharmony_ci} 26548c2ecf20Sopenharmony_ci 26558c2ecf20Sopenharmony_civoid bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 26568c2ecf20Sopenharmony_ci struct bkey *end, keybuf_pred_fn *pred) 26578c2ecf20Sopenharmony_ci{ 26588c2ecf20Sopenharmony_ci struct bkey start = buf->last_scanned; 26598c2ecf20Sopenharmony_ci struct refill refill; 26608c2ecf20Sopenharmony_ci 26618c2ecf20Sopenharmony_ci cond_resched(); 26628c2ecf20Sopenharmony_ci 26638c2ecf20Sopenharmony_ci bch_btree_op_init(&refill.op, -1); 26648c2ecf20Sopenharmony_ci refill.nr_found = 0; 26658c2ecf20Sopenharmony_ci refill.buf = buf; 26668c2ecf20Sopenharmony_ci refill.end = end; 26678c2ecf20Sopenharmony_ci refill.pred = pred; 26688c2ecf20Sopenharmony_ci 26698c2ecf20Sopenharmony_ci bch_btree_map_keys(&refill.op, c, &buf->last_scanned, 26708c2ecf20Sopenharmony_ci refill_keybuf_fn, MAP_END_KEY); 26718c2ecf20Sopenharmony_ci 26728c2ecf20Sopenharmony_ci trace_bcache_keyscan(refill.nr_found, 26738c2ecf20Sopenharmony_ci KEY_INODE(&start), KEY_OFFSET(&start), 26748c2ecf20Sopenharmony_ci KEY_INODE(&buf->last_scanned), 26758c2ecf20Sopenharmony_ci KEY_OFFSET(&buf->last_scanned)); 26768c2ecf20Sopenharmony_ci 26778c2ecf20Sopenharmony_ci spin_lock(&buf->lock); 26788c2ecf20Sopenharmony_ci 26798c2ecf20Sopenharmony_ci if (!RB_EMPTY_ROOT(&buf->keys)) { 26808c2ecf20Sopenharmony_ci struct keybuf_key *w; 26818c2ecf20Sopenharmony_ci 26828c2ecf20Sopenharmony_ci w = RB_FIRST(&buf->keys, struct keybuf_key, node); 26838c2ecf20Sopenharmony_ci buf->start = START_KEY(&w->key); 26848c2ecf20Sopenharmony_ci 26858c2ecf20Sopenharmony_ci w = RB_LAST(&buf->keys, struct keybuf_key, node); 26868c2ecf20Sopenharmony_ci buf->end = w->key; 26878c2ecf20Sopenharmony_ci } else { 26888c2ecf20Sopenharmony_ci buf->start = MAX_KEY; 26898c2ecf20Sopenharmony_ci buf->end = MAX_KEY; 26908c2ecf20Sopenharmony_ci } 26918c2ecf20Sopenharmony_ci 26928c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 26938c2ecf20Sopenharmony_ci} 26948c2ecf20Sopenharmony_ci 26958c2ecf20Sopenharmony_cistatic void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 26968c2ecf20Sopenharmony_ci{ 26978c2ecf20Sopenharmony_ci rb_erase(&w->node, &buf->keys); 26988c2ecf20Sopenharmony_ci array_free(&buf->freelist, w); 26998c2ecf20Sopenharmony_ci} 27008c2ecf20Sopenharmony_ci 27018c2ecf20Sopenharmony_civoid bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 27028c2ecf20Sopenharmony_ci{ 27038c2ecf20Sopenharmony_ci spin_lock(&buf->lock); 27048c2ecf20Sopenharmony_ci __bch_keybuf_del(buf, w); 27058c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 27068c2ecf20Sopenharmony_ci} 27078c2ecf20Sopenharmony_ci 27088c2ecf20Sopenharmony_cibool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, 27098c2ecf20Sopenharmony_ci struct bkey *end) 27108c2ecf20Sopenharmony_ci{ 27118c2ecf20Sopenharmony_ci bool ret = false; 27128c2ecf20Sopenharmony_ci struct keybuf_key *p, *w, s; 27138c2ecf20Sopenharmony_ci 27148c2ecf20Sopenharmony_ci s.key = *start; 27158c2ecf20Sopenharmony_ci 27168c2ecf20Sopenharmony_ci if (bkey_cmp(end, &buf->start) <= 0 || 27178c2ecf20Sopenharmony_ci bkey_cmp(start, &buf->end) >= 0) 27188c2ecf20Sopenharmony_ci return false; 27198c2ecf20Sopenharmony_ci 27208c2ecf20Sopenharmony_ci spin_lock(&buf->lock); 27218c2ecf20Sopenharmony_ci w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); 27228c2ecf20Sopenharmony_ci 27238c2ecf20Sopenharmony_ci while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { 27248c2ecf20Sopenharmony_ci p = w; 27258c2ecf20Sopenharmony_ci w = RB_NEXT(w, node); 27268c2ecf20Sopenharmony_ci 27278c2ecf20Sopenharmony_ci if (p->private) 27288c2ecf20Sopenharmony_ci ret = true; 27298c2ecf20Sopenharmony_ci else 27308c2ecf20Sopenharmony_ci __bch_keybuf_del(buf, p); 27318c2ecf20Sopenharmony_ci } 27328c2ecf20Sopenharmony_ci 27338c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 27348c2ecf20Sopenharmony_ci return ret; 27358c2ecf20Sopenharmony_ci} 27368c2ecf20Sopenharmony_ci 27378c2ecf20Sopenharmony_cistruct keybuf_key *bch_keybuf_next(struct keybuf *buf) 27388c2ecf20Sopenharmony_ci{ 27398c2ecf20Sopenharmony_ci struct keybuf_key *w; 27408c2ecf20Sopenharmony_ci 27418c2ecf20Sopenharmony_ci spin_lock(&buf->lock); 27428c2ecf20Sopenharmony_ci 27438c2ecf20Sopenharmony_ci w = RB_FIRST(&buf->keys, struct keybuf_key, node); 27448c2ecf20Sopenharmony_ci 27458c2ecf20Sopenharmony_ci while (w && w->private) 27468c2ecf20Sopenharmony_ci w = RB_NEXT(w, node); 27478c2ecf20Sopenharmony_ci 27488c2ecf20Sopenharmony_ci if (w) 27498c2ecf20Sopenharmony_ci w->private = ERR_PTR(-EINTR); 27508c2ecf20Sopenharmony_ci 27518c2ecf20Sopenharmony_ci spin_unlock(&buf->lock); 27528c2ecf20Sopenharmony_ci return w; 27538c2ecf20Sopenharmony_ci} 27548c2ecf20Sopenharmony_ci 27558c2ecf20Sopenharmony_cistruct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 27568c2ecf20Sopenharmony_ci struct keybuf *buf, 27578c2ecf20Sopenharmony_ci struct bkey *end, 27588c2ecf20Sopenharmony_ci keybuf_pred_fn *pred) 27598c2ecf20Sopenharmony_ci{ 27608c2ecf20Sopenharmony_ci struct keybuf_key *ret; 27618c2ecf20Sopenharmony_ci 27628c2ecf20Sopenharmony_ci while (1) { 27638c2ecf20Sopenharmony_ci ret = bch_keybuf_next(buf); 27648c2ecf20Sopenharmony_ci if (ret) 27658c2ecf20Sopenharmony_ci break; 27668c2ecf20Sopenharmony_ci 27678c2ecf20Sopenharmony_ci if (bkey_cmp(&buf->last_scanned, end) >= 0) { 27688c2ecf20Sopenharmony_ci pr_debug("scan finished\n"); 27698c2ecf20Sopenharmony_ci break; 27708c2ecf20Sopenharmony_ci } 27718c2ecf20Sopenharmony_ci 27728c2ecf20Sopenharmony_ci bch_refill_keybuf(c, buf, end, pred); 27738c2ecf20Sopenharmony_ci } 27748c2ecf20Sopenharmony_ci 27758c2ecf20Sopenharmony_ci return ret; 27768c2ecf20Sopenharmony_ci} 27778c2ecf20Sopenharmony_ci 27788c2ecf20Sopenharmony_civoid bch_keybuf_init(struct keybuf *buf) 27798c2ecf20Sopenharmony_ci{ 27808c2ecf20Sopenharmony_ci buf->last_scanned = MAX_KEY; 27818c2ecf20Sopenharmony_ci buf->keys = RB_ROOT; 27828c2ecf20Sopenharmony_ci 27838c2ecf20Sopenharmony_ci spin_lock_init(&buf->lock); 27848c2ecf20Sopenharmony_ci array_allocator_init(&buf->freelist); 27858c2ecf20Sopenharmony_ci} 27868c2ecf20Sopenharmony_ci 27878c2ecf20Sopenharmony_civoid bch_btree_exit(void) 27888c2ecf20Sopenharmony_ci{ 27898c2ecf20Sopenharmony_ci if (btree_io_wq) 27908c2ecf20Sopenharmony_ci destroy_workqueue(btree_io_wq); 27918c2ecf20Sopenharmony_ci} 27928c2ecf20Sopenharmony_ci 27938c2ecf20Sopenharmony_ciint __init bch_btree_init(void) 27948c2ecf20Sopenharmony_ci{ 27958c2ecf20Sopenharmony_ci btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); 27968c2ecf20Sopenharmony_ci if (!btree_io_wq) 27978c2ecf20Sopenharmony_ci return -ENOMEM; 27988c2ecf20Sopenharmony_ci 27998c2ecf20Sopenharmony_ci return 0; 28008c2ecf20Sopenharmony_ci} 2801