18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 28c2ecf20Sopenharmony_ci#ifndef _BCACHE_BSET_H 38c2ecf20Sopenharmony_ci#define _BCACHE_BSET_H 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci#include <linux/bcache.h> 68c2ecf20Sopenharmony_ci#include <linux/kernel.h> 78c2ecf20Sopenharmony_ci#include <linux/types.h> 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_ci#include "util.h" /* for time_stats */ 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci/* 128c2ecf20Sopenharmony_ci * BKEYS: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * A bkey contains a key, a size field, a variable number of pointers, and some 158c2ecf20Sopenharmony_ci * ancillary flag bits. 168c2ecf20Sopenharmony_ci * 178c2ecf20Sopenharmony_ci * We use two different functions for validating bkeys, bch_ptr_invalid and 188c2ecf20Sopenharmony_ci * bch_ptr_bad(). 198c2ecf20Sopenharmony_ci * 208c2ecf20Sopenharmony_ci * bch_ptr_invalid() primarily filters out keys and pointers that would be 218c2ecf20Sopenharmony_ci * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and 228c2ecf20Sopenharmony_ci * pointer that occur in normal practice but don't point to real data. 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * The one exception to the rule that ptr_invalid() filters out invalid keys is 258c2ecf20Sopenharmony_ci * that it also filters out keys of size 0 - these are keys that have been 268c2ecf20Sopenharmony_ci * completely overwritten. It'd be safe to delete these in memory while leaving 278c2ecf20Sopenharmony_ci * them on disk, just unnecessary work - so we filter them out when resorting 288c2ecf20Sopenharmony_ci * instead. 298c2ecf20Sopenharmony_ci * 308c2ecf20Sopenharmony_ci * We can't filter out stale keys when we're resorting, because garbage 318c2ecf20Sopenharmony_ci * collection needs to find them to ensure bucket gens don't wrap around - 328c2ecf20Sopenharmony_ci * unless we're rewriting the btree node those stale keys still exist on disk. 338c2ecf20Sopenharmony_ci * 348c2ecf20Sopenharmony_ci * We also implement functions here for removing some number of sectors from the 358c2ecf20Sopenharmony_ci * front or the back of a bkey - this is mainly used for fixing overlapping 368c2ecf20Sopenharmony_ci * extents, by removing the overlapping sectors from the older key. 378c2ecf20Sopenharmony_ci * 388c2ecf20Sopenharmony_ci * BSETS: 398c2ecf20Sopenharmony_ci * 408c2ecf20Sopenharmony_ci * A bset is an array of bkeys laid out contiguously in memory in sorted order, 418c2ecf20Sopenharmony_ci * along with a header. A btree node is made up of a number of these, written at 428c2ecf20Sopenharmony_ci * different times. 438c2ecf20Sopenharmony_ci * 448c2ecf20Sopenharmony_ci * There could be many of them on disk, but we never allow there to be more than 458c2ecf20Sopenharmony_ci * 4 in memory - we lazily resort as needed. 468c2ecf20Sopenharmony_ci * 478c2ecf20Sopenharmony_ci * We implement code here for creating and maintaining auxiliary search trees 488c2ecf20Sopenharmony_ci * (described below) for searching an individial bset, and on top of that we 498c2ecf20Sopenharmony_ci * implement a btree iterator. 508c2ecf20Sopenharmony_ci * 518c2ecf20Sopenharmony_ci * BTREE ITERATOR: 528c2ecf20Sopenharmony_ci * 538c2ecf20Sopenharmony_ci * Most of the code in bcache doesn't care about an individual bset - it needs 548c2ecf20Sopenharmony_ci * to search entire btree nodes and iterate over them in sorted order. 558c2ecf20Sopenharmony_ci * 568c2ecf20Sopenharmony_ci * The btree iterator code serves both functions; it iterates through the keys 578c2ecf20Sopenharmony_ci * in a btree node in sorted order, starting from either keys after a specific 588c2ecf20Sopenharmony_ci * point (if you pass it a search key) or the start of the btree node. 598c2ecf20Sopenharmony_ci * 608c2ecf20Sopenharmony_ci * AUXILIARY SEARCH TREES: 618c2ecf20Sopenharmony_ci * 628c2ecf20Sopenharmony_ci * Since keys are variable length, we can't use a binary search on a bset - we 638c2ecf20Sopenharmony_ci * wouldn't be able to find the start of the next key. But binary searches are 648c2ecf20Sopenharmony_ci * slow anyways, due to terrible cache behaviour; bcache originally used binary 658c2ecf20Sopenharmony_ci * searches and that code topped out at under 50k lookups/second. 668c2ecf20Sopenharmony_ci * 678c2ecf20Sopenharmony_ci * So we need to construct some sort of lookup table. Since we only insert keys 688c2ecf20Sopenharmony_ci * into the last (unwritten) set, most of the keys within a given btree node are 698c2ecf20Sopenharmony_ci * usually in sets that are mostly constant. We use two different types of 708c2ecf20Sopenharmony_ci * lookup tables to take advantage of this. 718c2ecf20Sopenharmony_ci * 728c2ecf20Sopenharmony_ci * Both lookup tables share in common that they don't index every key in the 738c2ecf20Sopenharmony_ci * set; they index one key every BSET_CACHELINE bytes, and then a linear search 748c2ecf20Sopenharmony_ci * is used for the rest. 758c2ecf20Sopenharmony_ci * 768c2ecf20Sopenharmony_ci * For sets that have been written to disk and are no longer being inserted 778c2ecf20Sopenharmony_ci * into, we construct a binary search tree in an array - traversing a binary 788c2ecf20Sopenharmony_ci * search tree in an array gives excellent locality of reference and is very 798c2ecf20Sopenharmony_ci * fast, since both children of any node are adjacent to each other in memory 808c2ecf20Sopenharmony_ci * (and their grandchildren, and great grandchildren...) - this means 818c2ecf20Sopenharmony_ci * prefetching can be used to great effect. 828c2ecf20Sopenharmony_ci * 838c2ecf20Sopenharmony_ci * It's quite useful performance wise to keep these nodes small - not just 848c2ecf20Sopenharmony_ci * because they're more likely to be in L2, but also because we can prefetch 858c2ecf20Sopenharmony_ci * more nodes on a single cacheline and thus prefetch more iterations in advance 868c2ecf20Sopenharmony_ci * when traversing this tree. 878c2ecf20Sopenharmony_ci * 888c2ecf20Sopenharmony_ci * Nodes in the auxiliary search tree must contain both a key to compare against 898c2ecf20Sopenharmony_ci * (we don't want to fetch the key from the set, that would defeat the purpose), 908c2ecf20Sopenharmony_ci * and a pointer to the key. We use a few tricks to compress both of these. 918c2ecf20Sopenharmony_ci * 928c2ecf20Sopenharmony_ci * To compress the pointer, we take advantage of the fact that one node in the 938c2ecf20Sopenharmony_ci * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have 948c2ecf20Sopenharmony_ci * a function (to_inorder()) that takes the index of a node in a binary tree and 958c2ecf20Sopenharmony_ci * returns what its index would be in an inorder traversal, so we only have to 968c2ecf20Sopenharmony_ci * store the low bits of the offset. 978c2ecf20Sopenharmony_ci * 988c2ecf20Sopenharmony_ci * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To 998c2ecf20Sopenharmony_ci * compress that, we take advantage of the fact that when we're traversing the 1008c2ecf20Sopenharmony_ci * search tree at every iteration we know that both our search key and the key 1018c2ecf20Sopenharmony_ci * we're looking for lie within some range - bounded by our previous 1028c2ecf20Sopenharmony_ci * comparisons. (We special case the start of a search so that this is true even 1038c2ecf20Sopenharmony_ci * at the root of the tree). 1048c2ecf20Sopenharmony_ci * 1058c2ecf20Sopenharmony_ci * So we know the key we're looking for is between a and b, and a and b don't 1068c2ecf20Sopenharmony_ci * differ higher than bit 50, we don't need to check anything higher than bit 1078c2ecf20Sopenharmony_ci * 50. 1088c2ecf20Sopenharmony_ci * 1098c2ecf20Sopenharmony_ci * We don't usually need the rest of the bits, either; we only need enough bits 1108c2ecf20Sopenharmony_ci * to partition the key range we're currently checking. Consider key n - the 1118c2ecf20Sopenharmony_ci * key our auxiliary search tree node corresponds to, and key p, the key 1128c2ecf20Sopenharmony_ci * immediately preceding n. The lowest bit we need to store in the auxiliary 1138c2ecf20Sopenharmony_ci * search tree is the highest bit that differs between n and p. 1148c2ecf20Sopenharmony_ci * 1158c2ecf20Sopenharmony_ci * Note that this could be bit 0 - we might sometimes need all 80 bits to do the 1168c2ecf20Sopenharmony_ci * comparison. But we'd really like our nodes in the auxiliary search tree to be 1178c2ecf20Sopenharmony_ci * of fixed size. 1188c2ecf20Sopenharmony_ci * 1198c2ecf20Sopenharmony_ci * The solution is to make them fixed size, and when we're constructing a node 1208c2ecf20Sopenharmony_ci * check if p and n differed in the bits we needed them to. If they don't we 1218c2ecf20Sopenharmony_ci * flag that node, and when doing lookups we fallback to comparing against the 1228c2ecf20Sopenharmony_ci * real key. As long as this doesn't happen to often (and it seems to reliably 1238c2ecf20Sopenharmony_ci * happen a bit less than 1% of the time), we win - even on failures, that key 1248c2ecf20Sopenharmony_ci * is then more likely to be in cache than if we were doing binary searches all 1258c2ecf20Sopenharmony_ci * the way, since we're touching so much less memory. 1268c2ecf20Sopenharmony_ci * 1278c2ecf20Sopenharmony_ci * The keys in the auxiliary search tree are stored in (software) floating 1288c2ecf20Sopenharmony_ci * point, with an exponent and a mantissa. The exponent needs to be big enough 1298c2ecf20Sopenharmony_ci * to address all the bits in the original key, but the number of bits in the 1308c2ecf20Sopenharmony_ci * mantissa is somewhat arbitrary; more bits just gets us fewer failures. 1318c2ecf20Sopenharmony_ci * 1328c2ecf20Sopenharmony_ci * We need 7 bits for the exponent and 3 bits for the key's offset (since keys 1338c2ecf20Sopenharmony_ci * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. 1348c2ecf20Sopenharmony_ci * We need one node per 128 bytes in the btree node, which means the auxiliary 1358c2ecf20Sopenharmony_ci * search trees take up 3% as much memory as the btree itself. 1368c2ecf20Sopenharmony_ci * 1378c2ecf20Sopenharmony_ci * Constructing these auxiliary search trees is moderately expensive, and we 1388c2ecf20Sopenharmony_ci * don't want to be constantly rebuilding the search tree for the last set 1398c2ecf20Sopenharmony_ci * whenever we insert another key into it. For the unwritten set, we use a much 1408c2ecf20Sopenharmony_ci * simpler lookup table - it's just a flat array, so index i in the lookup table 1418c2ecf20Sopenharmony_ci * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing 1428c2ecf20Sopenharmony_ci * within each byte range works the same as with the auxiliary search trees. 1438c2ecf20Sopenharmony_ci * 1448c2ecf20Sopenharmony_ci * These are much easier to keep up to date when we insert a key - we do it 1458c2ecf20Sopenharmony_ci * somewhat lazily; when we shift a key up we usually just increment the pointer 1468c2ecf20Sopenharmony_ci * to it, only when it would overflow do we go to the trouble of finding the 1478c2ecf20Sopenharmony_ci * first key in that range of bytes again. 1488c2ecf20Sopenharmony_ci */ 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_cistruct btree_keys; 1518c2ecf20Sopenharmony_cistruct btree_iter; 1528c2ecf20Sopenharmony_cistruct btree_iter_set; 1538c2ecf20Sopenharmony_cistruct bkey_float; 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci#define MAX_BSETS 4U 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_cistruct bset_tree { 1588c2ecf20Sopenharmony_ci /* 1598c2ecf20Sopenharmony_ci * We construct a binary tree in an array as if the array 1608c2ecf20Sopenharmony_ci * started at 1, so that things line up on the same cachelines 1618c2ecf20Sopenharmony_ci * better: see comments in bset.c at cacheline_to_bkey() for 1628c2ecf20Sopenharmony_ci * details 1638c2ecf20Sopenharmony_ci */ 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci /* size of the binary tree and prev array */ 1668c2ecf20Sopenharmony_ci unsigned int size; 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci /* function of size - precalculated for to_inorder() */ 1698c2ecf20Sopenharmony_ci unsigned int extra; 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci /* copy of the last key in the set */ 1728c2ecf20Sopenharmony_ci struct bkey end; 1738c2ecf20Sopenharmony_ci struct bkey_float *tree; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci /* 1768c2ecf20Sopenharmony_ci * The nodes in the bset tree point to specific keys - this 1778c2ecf20Sopenharmony_ci * array holds the sizes of the previous key. 1788c2ecf20Sopenharmony_ci * 1798c2ecf20Sopenharmony_ci * Conceptually it's a member of struct bkey_float, but we want 1808c2ecf20Sopenharmony_ci * to keep bkey_float to 4 bytes and prev isn't used in the fast 1818c2ecf20Sopenharmony_ci * path. 1828c2ecf20Sopenharmony_ci */ 1838c2ecf20Sopenharmony_ci uint8_t *prev; 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci /* The actual btree node, with pointers to each sorted set */ 1868c2ecf20Sopenharmony_ci struct bset *data; 1878c2ecf20Sopenharmony_ci}; 1888c2ecf20Sopenharmony_ci 1898c2ecf20Sopenharmony_cistruct btree_keys_ops { 1908c2ecf20Sopenharmony_ci bool (*sort_cmp)(struct btree_iter_set l, 1918c2ecf20Sopenharmony_ci struct btree_iter_set r); 1928c2ecf20Sopenharmony_ci struct bkey *(*sort_fixup)(struct btree_iter *iter, 1938c2ecf20Sopenharmony_ci struct bkey *tmp); 1948c2ecf20Sopenharmony_ci bool (*insert_fixup)(struct btree_keys *b, 1958c2ecf20Sopenharmony_ci struct bkey *insert, 1968c2ecf20Sopenharmony_ci struct btree_iter *iter, 1978c2ecf20Sopenharmony_ci struct bkey *replace_key); 1988c2ecf20Sopenharmony_ci bool (*key_invalid)(struct btree_keys *bk, 1998c2ecf20Sopenharmony_ci const struct bkey *k); 2008c2ecf20Sopenharmony_ci bool (*key_bad)(struct btree_keys *bk, 2018c2ecf20Sopenharmony_ci const struct bkey *k); 2028c2ecf20Sopenharmony_ci bool (*key_merge)(struct btree_keys *bk, 2038c2ecf20Sopenharmony_ci struct bkey *l, struct bkey *r); 2048c2ecf20Sopenharmony_ci void (*key_to_text)(char *buf, 2058c2ecf20Sopenharmony_ci size_t size, 2068c2ecf20Sopenharmony_ci const struct bkey *k); 2078c2ecf20Sopenharmony_ci void (*key_dump)(struct btree_keys *keys, 2088c2ecf20Sopenharmony_ci const struct bkey *k); 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci /* 2118c2ecf20Sopenharmony_ci * Only used for deciding whether to use START_KEY(k) or just the key 2128c2ecf20Sopenharmony_ci * itself in a couple places 2138c2ecf20Sopenharmony_ci */ 2148c2ecf20Sopenharmony_ci bool is_extents; 2158c2ecf20Sopenharmony_ci}; 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_cistruct btree_keys { 2188c2ecf20Sopenharmony_ci const struct btree_keys_ops *ops; 2198c2ecf20Sopenharmony_ci uint8_t page_order; 2208c2ecf20Sopenharmony_ci uint8_t nsets; 2218c2ecf20Sopenharmony_ci unsigned int last_set_unwritten:1; 2228c2ecf20Sopenharmony_ci bool *expensive_debug_checks; 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci /* 2258c2ecf20Sopenharmony_ci * Sets of sorted keys - the real btree node - plus a binary search tree 2268c2ecf20Sopenharmony_ci * 2278c2ecf20Sopenharmony_ci * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point 2288c2ecf20Sopenharmony_ci * to the memory we have allocated for this btree node. Additionally, 2298c2ecf20Sopenharmony_ci * set[0]->data points to the entire btree node as it exists on disk. 2308c2ecf20Sopenharmony_ci */ 2318c2ecf20Sopenharmony_ci struct bset_tree set[MAX_BSETS]; 2328c2ecf20Sopenharmony_ci}; 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_cistatic inline struct bset_tree *bset_tree_last(struct btree_keys *b) 2358c2ecf20Sopenharmony_ci{ 2368c2ecf20Sopenharmony_ci return b->set + b->nsets; 2378c2ecf20Sopenharmony_ci} 2388c2ecf20Sopenharmony_ci 2398c2ecf20Sopenharmony_cistatic inline bool bset_written(struct btree_keys *b, struct bset_tree *t) 2408c2ecf20Sopenharmony_ci{ 2418c2ecf20Sopenharmony_ci return t <= b->set + b->nsets - b->last_set_unwritten; 2428c2ecf20Sopenharmony_ci} 2438c2ecf20Sopenharmony_ci 2448c2ecf20Sopenharmony_cistatic inline bool bkey_written(struct btree_keys *b, struct bkey *k) 2458c2ecf20Sopenharmony_ci{ 2468c2ecf20Sopenharmony_ci return !b->last_set_unwritten || k < b->set[b->nsets].data->start; 2478c2ecf20Sopenharmony_ci} 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_cistatic inline unsigned int bset_byte_offset(struct btree_keys *b, 2508c2ecf20Sopenharmony_ci struct bset *i) 2518c2ecf20Sopenharmony_ci{ 2528c2ecf20Sopenharmony_ci return ((size_t) i) - ((size_t) b->set->data); 2538c2ecf20Sopenharmony_ci} 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_cistatic inline unsigned int bset_sector_offset(struct btree_keys *b, 2568c2ecf20Sopenharmony_ci struct bset *i) 2578c2ecf20Sopenharmony_ci{ 2588c2ecf20Sopenharmony_ci return bset_byte_offset(b, i) >> 9; 2598c2ecf20Sopenharmony_ci} 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) 2628c2ecf20Sopenharmony_ci#define set_bytes(i) __set_bytes(i, i->keys) 2638c2ecf20Sopenharmony_ci 2648c2ecf20Sopenharmony_ci#define __set_blocks(i, k, block_bytes) \ 2658c2ecf20Sopenharmony_ci DIV_ROUND_UP(__set_bytes(i, k), block_bytes) 2668c2ecf20Sopenharmony_ci#define set_blocks(i, block_bytes) \ 2678c2ecf20Sopenharmony_ci __set_blocks(i, (i)->keys, block_bytes) 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_cistatic inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) 2708c2ecf20Sopenharmony_ci{ 2718c2ecf20Sopenharmony_ci struct bset_tree *t = bset_tree_last(b); 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci BUG_ON((PAGE_SIZE << b->page_order) < 2748c2ecf20Sopenharmony_ci (bset_byte_offset(b, t->data) + set_bytes(t->data))); 2758c2ecf20Sopenharmony_ci 2768c2ecf20Sopenharmony_ci if (!b->last_set_unwritten) 2778c2ecf20Sopenharmony_ci return 0; 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci return ((PAGE_SIZE << b->page_order) - 2808c2ecf20Sopenharmony_ci (bset_byte_offset(b, t->data) + set_bytes(t->data))) / 2818c2ecf20Sopenharmony_ci sizeof(u64); 2828c2ecf20Sopenharmony_ci} 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_cistatic inline struct bset *bset_next_set(struct btree_keys *b, 2858c2ecf20Sopenharmony_ci unsigned int block_bytes) 2868c2ecf20Sopenharmony_ci{ 2878c2ecf20Sopenharmony_ci struct bset *i = bset_tree_last(b)->data; 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_ci return ((void *) i) + roundup(set_bytes(i), block_bytes); 2908c2ecf20Sopenharmony_ci} 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_civoid bch_btree_keys_free(struct btree_keys *b); 2938c2ecf20Sopenharmony_ciint bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, 2948c2ecf20Sopenharmony_ci gfp_t gfp); 2958c2ecf20Sopenharmony_civoid bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, 2968c2ecf20Sopenharmony_ci bool *expensive_debug_checks); 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_civoid bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic); 2998c2ecf20Sopenharmony_civoid bch_bset_build_written_tree(struct btree_keys *b); 3008c2ecf20Sopenharmony_civoid bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k); 3018c2ecf20Sopenharmony_cibool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r); 3028c2ecf20Sopenharmony_civoid bch_bset_insert(struct btree_keys *b, struct bkey *where, 3038c2ecf20Sopenharmony_ci struct bkey *insert); 3048c2ecf20Sopenharmony_ciunsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, 3058c2ecf20Sopenharmony_ci struct bkey *replace_key); 3068c2ecf20Sopenharmony_ci 3078c2ecf20Sopenharmony_cienum { 3088c2ecf20Sopenharmony_ci BTREE_INSERT_STATUS_NO_INSERT = 0, 3098c2ecf20Sopenharmony_ci BTREE_INSERT_STATUS_INSERT, 3108c2ecf20Sopenharmony_ci BTREE_INSERT_STATUS_BACK_MERGE, 3118c2ecf20Sopenharmony_ci BTREE_INSERT_STATUS_OVERWROTE, 3128c2ecf20Sopenharmony_ci BTREE_INSERT_STATUS_FRONT_MERGE, 3138c2ecf20Sopenharmony_ci}; 3148c2ecf20Sopenharmony_ci 3158c2ecf20Sopenharmony_ci/* Btree key iteration */ 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_cistruct btree_iter { 3188c2ecf20Sopenharmony_ci size_t size, used; 3198c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 3208c2ecf20Sopenharmony_ci struct btree_keys *b; 3218c2ecf20Sopenharmony_ci#endif 3228c2ecf20Sopenharmony_ci struct btree_iter_set { 3238c2ecf20Sopenharmony_ci struct bkey *k, *end; 3248c2ecf20Sopenharmony_ci } data[MAX_BSETS]; 3258c2ecf20Sopenharmony_ci}; 3268c2ecf20Sopenharmony_ci 3278c2ecf20Sopenharmony_citypedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); 3288c2ecf20Sopenharmony_ci 3298c2ecf20Sopenharmony_cistruct bkey *bch_btree_iter_next(struct btree_iter *iter); 3308c2ecf20Sopenharmony_cistruct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 3318c2ecf20Sopenharmony_ci struct btree_keys *b, 3328c2ecf20Sopenharmony_ci ptr_filter_fn fn); 3338c2ecf20Sopenharmony_ci 3348c2ecf20Sopenharmony_civoid bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, 3358c2ecf20Sopenharmony_ci struct bkey *end); 3368c2ecf20Sopenharmony_cistruct bkey *bch_btree_iter_init(struct btree_keys *b, 3378c2ecf20Sopenharmony_ci struct btree_iter *iter, 3388c2ecf20Sopenharmony_ci struct bkey *search); 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_cistruct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, 3418c2ecf20Sopenharmony_ci const struct bkey *search); 3428c2ecf20Sopenharmony_ci 3438c2ecf20Sopenharmony_ci/* 3448c2ecf20Sopenharmony_ci * Returns the first key that is strictly greater than search 3458c2ecf20Sopenharmony_ci */ 3468c2ecf20Sopenharmony_cistatic inline struct bkey *bch_bset_search(struct btree_keys *b, 3478c2ecf20Sopenharmony_ci struct bset_tree *t, 3488c2ecf20Sopenharmony_ci const struct bkey *search) 3498c2ecf20Sopenharmony_ci{ 3508c2ecf20Sopenharmony_ci return search ? __bch_bset_search(b, t, search) : t->data->start; 3518c2ecf20Sopenharmony_ci} 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci#define for_each_key_filter(b, k, iter, filter) \ 3548c2ecf20Sopenharmony_ci for (bch_btree_iter_init((b), (iter), NULL); \ 3558c2ecf20Sopenharmony_ci ((k) = bch_btree_iter_next_filter((iter), (b), filter));) 3568c2ecf20Sopenharmony_ci 3578c2ecf20Sopenharmony_ci#define for_each_key(b, k, iter) \ 3588c2ecf20Sopenharmony_ci for (bch_btree_iter_init((b), (iter), NULL); \ 3598c2ecf20Sopenharmony_ci ((k) = bch_btree_iter_next(iter));) 3608c2ecf20Sopenharmony_ci 3618c2ecf20Sopenharmony_ci/* Sorting */ 3628c2ecf20Sopenharmony_ci 3638c2ecf20Sopenharmony_cistruct bset_sort_state { 3648c2ecf20Sopenharmony_ci mempool_t pool; 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci unsigned int page_order; 3678c2ecf20Sopenharmony_ci unsigned int crit_factor; 3688c2ecf20Sopenharmony_ci 3698c2ecf20Sopenharmony_ci struct time_stats time; 3708c2ecf20Sopenharmony_ci}; 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_civoid bch_bset_sort_state_free(struct bset_sort_state *state); 3738c2ecf20Sopenharmony_ciint bch_bset_sort_state_init(struct bset_sort_state *state, 3748c2ecf20Sopenharmony_ci unsigned int page_order); 3758c2ecf20Sopenharmony_civoid bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state); 3768c2ecf20Sopenharmony_civoid bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, 3778c2ecf20Sopenharmony_ci struct bset_sort_state *state); 3788c2ecf20Sopenharmony_civoid bch_btree_sort_and_fix_extents(struct btree_keys *b, 3798c2ecf20Sopenharmony_ci struct btree_iter *iter, 3808c2ecf20Sopenharmony_ci struct bset_sort_state *state); 3818c2ecf20Sopenharmony_civoid bch_btree_sort_partial(struct btree_keys *b, unsigned int start, 3828c2ecf20Sopenharmony_ci struct bset_sort_state *state); 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_cistatic inline void bch_btree_sort(struct btree_keys *b, 3858c2ecf20Sopenharmony_ci struct bset_sort_state *state) 3868c2ecf20Sopenharmony_ci{ 3878c2ecf20Sopenharmony_ci bch_btree_sort_partial(b, 0, state); 3888c2ecf20Sopenharmony_ci} 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_cistruct bset_stats { 3918c2ecf20Sopenharmony_ci size_t sets_written, sets_unwritten; 3928c2ecf20Sopenharmony_ci size_t bytes_written, bytes_unwritten; 3938c2ecf20Sopenharmony_ci size_t floats, failed; 3948c2ecf20Sopenharmony_ci}; 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_civoid bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); 3978c2ecf20Sopenharmony_ci 3988c2ecf20Sopenharmony_ci/* Bkey utility code */ 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_ci#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \ 4018c2ecf20Sopenharmony_ci (unsigned int)(i)->keys) 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_cistatic inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) 4048c2ecf20Sopenharmony_ci{ 4058c2ecf20Sopenharmony_ci return bkey_idx(i->start, idx); 4068c2ecf20Sopenharmony_ci} 4078c2ecf20Sopenharmony_ci 4088c2ecf20Sopenharmony_cistatic inline void bkey_init(struct bkey *k) 4098c2ecf20Sopenharmony_ci{ 4108c2ecf20Sopenharmony_ci *k = ZERO_KEY; 4118c2ecf20Sopenharmony_ci} 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_cistatic __always_inline int64_t bkey_cmp(const struct bkey *l, 4148c2ecf20Sopenharmony_ci const struct bkey *r) 4158c2ecf20Sopenharmony_ci{ 4168c2ecf20Sopenharmony_ci return unlikely(KEY_INODE(l) != KEY_INODE(r)) 4178c2ecf20Sopenharmony_ci ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) 4188c2ecf20Sopenharmony_ci : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 4198c2ecf20Sopenharmony_ci} 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_civoid bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, 4228c2ecf20Sopenharmony_ci unsigned int i); 4238c2ecf20Sopenharmony_cibool __bch_cut_front(const struct bkey *where, struct bkey *k); 4248c2ecf20Sopenharmony_cibool __bch_cut_back(const struct bkey *where, struct bkey *k); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_cistatic inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 4278c2ecf20Sopenharmony_ci{ 4288c2ecf20Sopenharmony_ci BUG_ON(bkey_cmp(where, k) > 0); 4298c2ecf20Sopenharmony_ci return __bch_cut_front(where, k); 4308c2ecf20Sopenharmony_ci} 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_cistatic inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 4338c2ecf20Sopenharmony_ci{ 4348c2ecf20Sopenharmony_ci BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); 4358c2ecf20Sopenharmony_ci return __bch_cut_back(where, k); 4368c2ecf20Sopenharmony_ci} 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci/* 4398c2ecf20Sopenharmony_ci * Pointer '*preceding_key_p' points to a memory object to store preceding 4408c2ecf20Sopenharmony_ci * key of k. If the preceding key does not exist, set '*preceding_key_p' to 4418c2ecf20Sopenharmony_ci * NULL. So the caller of preceding_key() needs to take care of memory 4428c2ecf20Sopenharmony_ci * which '*preceding_key_p' pointed to before calling preceding_key(). 4438c2ecf20Sopenharmony_ci * Currently the only caller of preceding_key() is bch_btree_insert_key(), 4448c2ecf20Sopenharmony_ci * and it points to an on-stack variable, so the memory release is handled 4458c2ecf20Sopenharmony_ci * by stackframe itself. 4468c2ecf20Sopenharmony_ci */ 4478c2ecf20Sopenharmony_cistatic inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p) 4488c2ecf20Sopenharmony_ci{ 4498c2ecf20Sopenharmony_ci if (KEY_INODE(k) || KEY_OFFSET(k)) { 4508c2ecf20Sopenharmony_ci (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0); 4518c2ecf20Sopenharmony_ci if (!(*preceding_key_p)->low) 4528c2ecf20Sopenharmony_ci (*preceding_key_p)->high--; 4538c2ecf20Sopenharmony_ci (*preceding_key_p)->low--; 4548c2ecf20Sopenharmony_ci } else { 4558c2ecf20Sopenharmony_ci (*preceding_key_p) = NULL; 4568c2ecf20Sopenharmony_ci } 4578c2ecf20Sopenharmony_ci} 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_cistatic inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) 4608c2ecf20Sopenharmony_ci{ 4618c2ecf20Sopenharmony_ci return b->ops->key_invalid(b, k); 4628c2ecf20Sopenharmony_ci} 4638c2ecf20Sopenharmony_ci 4648c2ecf20Sopenharmony_cistatic inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) 4658c2ecf20Sopenharmony_ci{ 4668c2ecf20Sopenharmony_ci return b->ops->key_bad(b, k); 4678c2ecf20Sopenharmony_ci} 4688c2ecf20Sopenharmony_ci 4698c2ecf20Sopenharmony_cistatic inline void bch_bkey_to_text(struct btree_keys *b, char *buf, 4708c2ecf20Sopenharmony_ci size_t size, const struct bkey *k) 4718c2ecf20Sopenharmony_ci{ 4728c2ecf20Sopenharmony_ci return b->ops->key_to_text(buf, size, k); 4738c2ecf20Sopenharmony_ci} 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_cistatic inline bool bch_bkey_equal_header(const struct bkey *l, 4768c2ecf20Sopenharmony_ci const struct bkey *r) 4778c2ecf20Sopenharmony_ci{ 4788c2ecf20Sopenharmony_ci return (KEY_DIRTY(l) == KEY_DIRTY(r) && 4798c2ecf20Sopenharmony_ci KEY_PTRS(l) == KEY_PTRS(r) && 4808c2ecf20Sopenharmony_ci KEY_CSUM(l) == KEY_CSUM(r)); 4818c2ecf20Sopenharmony_ci} 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci/* Keylists */ 4848c2ecf20Sopenharmony_ci 4858c2ecf20Sopenharmony_cistruct keylist { 4868c2ecf20Sopenharmony_ci union { 4878c2ecf20Sopenharmony_ci struct bkey *keys; 4888c2ecf20Sopenharmony_ci uint64_t *keys_p; 4898c2ecf20Sopenharmony_ci }; 4908c2ecf20Sopenharmony_ci union { 4918c2ecf20Sopenharmony_ci struct bkey *top; 4928c2ecf20Sopenharmony_ci uint64_t *top_p; 4938c2ecf20Sopenharmony_ci }; 4948c2ecf20Sopenharmony_ci 4958c2ecf20Sopenharmony_ci /* Enough room for btree_split's keys without realloc */ 4968c2ecf20Sopenharmony_ci#define KEYLIST_INLINE 16 4978c2ecf20Sopenharmony_ci uint64_t inline_keys[KEYLIST_INLINE]; 4988c2ecf20Sopenharmony_ci}; 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_cistatic inline void bch_keylist_init(struct keylist *l) 5018c2ecf20Sopenharmony_ci{ 5028c2ecf20Sopenharmony_ci l->top_p = l->keys_p = l->inline_keys; 5038c2ecf20Sopenharmony_ci} 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_cistatic inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) 5068c2ecf20Sopenharmony_ci{ 5078c2ecf20Sopenharmony_ci l->keys = k; 5088c2ecf20Sopenharmony_ci l->top = bkey_next(k); 5098c2ecf20Sopenharmony_ci} 5108c2ecf20Sopenharmony_ci 5118c2ecf20Sopenharmony_cistatic inline void bch_keylist_push(struct keylist *l) 5128c2ecf20Sopenharmony_ci{ 5138c2ecf20Sopenharmony_ci l->top = bkey_next(l->top); 5148c2ecf20Sopenharmony_ci} 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_cistatic inline void bch_keylist_add(struct keylist *l, struct bkey *k) 5178c2ecf20Sopenharmony_ci{ 5188c2ecf20Sopenharmony_ci bkey_copy(l->top, k); 5198c2ecf20Sopenharmony_ci bch_keylist_push(l); 5208c2ecf20Sopenharmony_ci} 5218c2ecf20Sopenharmony_ci 5228c2ecf20Sopenharmony_cistatic inline bool bch_keylist_empty(struct keylist *l) 5238c2ecf20Sopenharmony_ci{ 5248c2ecf20Sopenharmony_ci return l->top == l->keys; 5258c2ecf20Sopenharmony_ci} 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_cistatic inline void bch_keylist_reset(struct keylist *l) 5288c2ecf20Sopenharmony_ci{ 5298c2ecf20Sopenharmony_ci l->top = l->keys; 5308c2ecf20Sopenharmony_ci} 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_cistatic inline void bch_keylist_free(struct keylist *l) 5338c2ecf20Sopenharmony_ci{ 5348c2ecf20Sopenharmony_ci if (l->keys_p != l->inline_keys) 5358c2ecf20Sopenharmony_ci kfree(l->keys_p); 5368c2ecf20Sopenharmony_ci} 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_cistatic inline size_t bch_keylist_nkeys(struct keylist *l) 5398c2ecf20Sopenharmony_ci{ 5408c2ecf20Sopenharmony_ci return l->top_p - l->keys_p; 5418c2ecf20Sopenharmony_ci} 5428c2ecf20Sopenharmony_ci 5438c2ecf20Sopenharmony_cistatic inline size_t bch_keylist_bytes(struct keylist *l) 5448c2ecf20Sopenharmony_ci{ 5458c2ecf20Sopenharmony_ci return bch_keylist_nkeys(l) * sizeof(uint64_t); 5468c2ecf20Sopenharmony_ci} 5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_cistruct bkey *bch_keylist_pop(struct keylist *l); 5498c2ecf20Sopenharmony_civoid bch_keylist_pop_front(struct keylist *l); 5508c2ecf20Sopenharmony_ciint __bch_keylist_realloc(struct keylist *l, unsigned int u64s); 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci/* Debug stuff */ 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 5558c2ecf20Sopenharmony_ci 5568c2ecf20Sopenharmony_ciint __bch_count_data(struct btree_keys *b); 5578c2ecf20Sopenharmony_civoid __printf(2, 3) __bch_check_keys(struct btree_keys *b, 5588c2ecf20Sopenharmony_ci const char *fmt, 5598c2ecf20Sopenharmony_ci ...); 5608c2ecf20Sopenharmony_civoid bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); 5618c2ecf20Sopenharmony_civoid bch_dump_bucket(struct btree_keys *b); 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci#else 5648c2ecf20Sopenharmony_ci 5658c2ecf20Sopenharmony_cistatic inline int __bch_count_data(struct btree_keys *b) { return -1; } 5668c2ecf20Sopenharmony_cistatic inline void __printf(2, 3) 5678c2ecf20Sopenharmony_ci __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} 5688c2ecf20Sopenharmony_cistatic inline void bch_dump_bucket(struct btree_keys *b) {} 5698c2ecf20Sopenharmony_civoid bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); 5708c2ecf20Sopenharmony_ci 5718c2ecf20Sopenharmony_ci#endif 5728c2ecf20Sopenharmony_ci 5738c2ecf20Sopenharmony_cistatic inline bool btree_keys_expensive_checks(struct btree_keys *b) 5748c2ecf20Sopenharmony_ci{ 5758c2ecf20Sopenharmony_ci#ifdef CONFIG_BCACHE_DEBUG 5768c2ecf20Sopenharmony_ci return *b->expensive_debug_checks; 5778c2ecf20Sopenharmony_ci#else 5788c2ecf20Sopenharmony_ci return false; 5798c2ecf20Sopenharmony_ci#endif 5808c2ecf20Sopenharmony_ci} 5818c2ecf20Sopenharmony_ci 5828c2ecf20Sopenharmony_cistatic inline int bch_count_data(struct btree_keys *b) 5838c2ecf20Sopenharmony_ci{ 5848c2ecf20Sopenharmony_ci return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; 5858c2ecf20Sopenharmony_ci} 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci#define bch_check_keys(b, ...) \ 5888c2ecf20Sopenharmony_cido { \ 5898c2ecf20Sopenharmony_ci if (btree_keys_expensive_checks(b)) \ 5908c2ecf20Sopenharmony_ci __bch_check_keys(b, __VA_ARGS__); \ 5918c2ecf20Sopenharmony_ci} while (0) 5928c2ecf20Sopenharmony_ci 5938c2ecf20Sopenharmony_ci#endif 594