/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_JOURNAL_H
#define _BCACHE_JOURNAL_H

/*
 * THE JOURNAL:
 *
 * The journal is treated as a circular buffer of buckets - a journal entry
 * never spans two buckets. This means (not implemented yet) we can resize the
 * journal at runtime, and will be needed for bcache on raw flash support.
 *
 * Journal entries contain a list of keys, ordered by the time they were
 * inserted; thus journal replay just has to reinsert the keys.
 *
 * We also keep some things in the journal header that are logically part of the
 * superblock - all the things that are frequently updated. This is for future
 * bcache on raw flash support; the superblock (which will become another
 * journal) can't be moved or wear leveled, so it contains just enough
 * information to find the main journal, and the superblock only has to be
 * rewritten when we want to move/wear level the main journal.
 *
 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
 * from cache misses, which don't have to be journaled, and for writeback and
 * moving gc we work around it by flushing the btree to disk before updating the
 * gc information. But it is a potential issue with incremental garbage
 * collection, and it's fragile.
 *
 * OPEN JOURNAL ENTRIES:
 *
 * Each journal entry contains, in the header, the sequence number of the last
 * journal entry still open - i.e. that has keys that haven't been flushed to
 * disk in the btree.
 *
 * We track this by maintaining a refcount for every open journal entry, in a
 * fifo; each entry in the fifo corresponds to a particular journal
 * entry/sequence number. When the refcount at the tail of the fifo goes to
 * zero, we pop it off - thus, the size of the fifo tells us the number of open
 * journal entries.
 *
 * We take a refcount on a journal entry when we add some keys to a journal
 * entry that we're going to insert (held by struct btree_op), and then when we
 * insert those keys into the btree the btree write we're setting up takes a
 * copy of that refcount (held by struct btree_write). That refcount is dropped
 * when the btree write completes.
 *
 * A struct btree_write can only hold a refcount on a single journal entry, but
 * might contain keys for many journal entries - we handle this by making sure
 * it always has a refcount on the _oldest_ journal entry of all the journal
 * entries it has keys for.
 *
 * JOURNAL RECLAIM:
 *
 * As mentioned previously, our fifo of refcounts tells us the number of open
 * journal entries; from that and the current journal sequence number we compute
 * last_seq - the oldest journal entry we still need. We write last_seq in each
 * journal entry, and we also have to keep track of where it exists on disk so
 * we don't overwrite it when we loop around the journal.
 *
 * To do that we track, for each journal bucket, the sequence number of the
 * newest journal entry it contains - if we don't need that journal entry we
 * don't need anything in that bucket anymore. From that we track the last
 * journal bucket we still need; all this is tracked in struct journal_device
 * and updated by journal_reclaim().
 *
 * JOURNAL FILLING UP:
 *
 * There are two ways the journal could fill up; either we could run out of
 * space to write to, or we could have too many open journal entries and run out
 * of room in the fifo of refcounts. Since those refcounts are decremented
 * without any locking we can't safely resize that fifo, so we handle it the
 * same way.
 *
 * If the journal fills up, we start flushing dirty btree nodes until we can
 * allocate space for a journal write again - preferentially flushing btree
 * nodes that are pinning the oldest journal entries first.
778c2ecf20Sopenharmony_ci */ 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci/* 808c2ecf20Sopenharmony_ci * Only used for holding the journal entries we read in btree_journal_read() 818c2ecf20Sopenharmony_ci * during cache_registration 828c2ecf20Sopenharmony_ci */ 838c2ecf20Sopenharmony_cistruct journal_replay { 848c2ecf20Sopenharmony_ci struct list_head list; 858c2ecf20Sopenharmony_ci atomic_t *pin; 868c2ecf20Sopenharmony_ci struct jset j; 878c2ecf20Sopenharmony_ci}; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci/* 908c2ecf20Sopenharmony_ci * We put two of these in struct journal; we used them for writes to the 918c2ecf20Sopenharmony_ci * journal that are being staged or in flight. 928c2ecf20Sopenharmony_ci */ 938c2ecf20Sopenharmony_cistruct journal_write { 948c2ecf20Sopenharmony_ci struct jset *data; 958c2ecf20Sopenharmony_ci#define JSET_BITS 3 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci struct cache_set *c; 988c2ecf20Sopenharmony_ci struct closure_waitlist wait; 998c2ecf20Sopenharmony_ci bool dirty; 1008c2ecf20Sopenharmony_ci bool need_write; 1018c2ecf20Sopenharmony_ci}; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci/* Embedded in struct cache_set */ 1048c2ecf20Sopenharmony_cistruct journal { 1058c2ecf20Sopenharmony_ci spinlock_t lock; 1068c2ecf20Sopenharmony_ci spinlock_t flush_write_lock; 1078c2ecf20Sopenharmony_ci bool btree_flushing; 1088c2ecf20Sopenharmony_ci bool do_reserve; 1098c2ecf20Sopenharmony_ci /* used when waiting because the journal was full */ 1108c2ecf20Sopenharmony_ci struct closure_waitlist wait; 1118c2ecf20Sopenharmony_ci struct closure io; 1128c2ecf20Sopenharmony_ci int io_in_flight; 1138c2ecf20Sopenharmony_ci struct delayed_work work; 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci /* Number of blocks free in the bucket(s) we're currently writing to */ 1168c2ecf20Sopenharmony_ci unsigned int blocks_free; 1178c2ecf20Sopenharmony_ci uint64_t seq; 1188c2ecf20Sopenharmony_ci DECLARE_FIFO(atomic_t, pin); 
1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci BKEY_PADDED(key); 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci struct journal_write w[2], *cur; 1238c2ecf20Sopenharmony_ci}; 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_ci/* 1268c2ecf20Sopenharmony_ci * Embedded in struct cache. First three fields refer to the array of journal 1278c2ecf20Sopenharmony_ci * buckets, in cache_sb. 1288c2ecf20Sopenharmony_ci */ 1298c2ecf20Sopenharmony_cistruct journal_device { 1308c2ecf20Sopenharmony_ci /* 1318c2ecf20Sopenharmony_ci * For each journal bucket, contains the max sequence number of the 1328c2ecf20Sopenharmony_ci * journal writes it contains - so we know when a bucket can be reused. 1338c2ecf20Sopenharmony_ci */ 1348c2ecf20Sopenharmony_ci uint64_t seq[SB_JOURNAL_BUCKETS]; 1358c2ecf20Sopenharmony_ci 1368c2ecf20Sopenharmony_ci /* Journal bucket we're currently writing to */ 1378c2ecf20Sopenharmony_ci unsigned int cur_idx; 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_ci /* Last journal bucket that still contains an open journal entry */ 1408c2ecf20Sopenharmony_ci unsigned int last_idx; 1418c2ecf20Sopenharmony_ci 1428c2ecf20Sopenharmony_ci /* Next journal bucket to be discarded */ 1438c2ecf20Sopenharmony_ci unsigned int discard_idx; 1448c2ecf20Sopenharmony_ci 1458c2ecf20Sopenharmony_ci#define DISCARD_READY 0 1468c2ecf20Sopenharmony_ci#define DISCARD_IN_FLIGHT 1 1478c2ecf20Sopenharmony_ci#define DISCARD_DONE 2 1488c2ecf20Sopenharmony_ci /* 1 - discard in flight, -1 - discard completed */ 1498c2ecf20Sopenharmony_ci atomic_t discard_in_flight; 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci struct work_struct discard_work; 1528c2ecf20Sopenharmony_ci struct bio discard_bio; 1538c2ecf20Sopenharmony_ci struct bio_vec discard_bv; 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci /* Bio for journal reads/writes to this device */ 1568c2ecf20Sopenharmony_ci struct bio bio; 1578c2ecf20Sopenharmony_ci struct bio_vec bv[8]; 1588c2ecf20Sopenharmony_ci}; 
1598c2ecf20Sopenharmony_ci 1608c2ecf20Sopenharmony_ci#define BTREE_FLUSH_NR 8 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci#define journal_pin_cmp(c, l, r) \ 1638c2ecf20Sopenharmony_ci (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci#define JOURNAL_PIN 20000 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci#define journal_full(j) \ 1688c2ecf20Sopenharmony_ci (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_cistruct closure; 1718c2ecf20Sopenharmony_cistruct cache_set; 1728c2ecf20Sopenharmony_cistruct btree_op; 1738c2ecf20Sopenharmony_cistruct keylist; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ciatomic_t *bch_journal(struct cache_set *c, 1768c2ecf20Sopenharmony_ci struct keylist *keys, 1778c2ecf20Sopenharmony_ci struct closure *parent); 1788c2ecf20Sopenharmony_civoid bch_journal_next(struct journal *j); 1798c2ecf20Sopenharmony_civoid bch_journal_mark(struct cache_set *c, struct list_head *list); 1808c2ecf20Sopenharmony_civoid bch_journal_meta(struct cache_set *c, struct closure *cl); 1818c2ecf20Sopenharmony_ciint bch_journal_read(struct cache_set *c, struct list_head *list); 1828c2ecf20Sopenharmony_ciint bch_journal_replay(struct cache_set *c, struct list_head *list); 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_civoid bch_journal_free(struct cache_set *c); 1858c2ecf20Sopenharmony_ciint bch_journal_alloc(struct cache_set *c); 1868c2ecf20Sopenharmony_civoid bch_journal_space_reserve(struct journal *j); 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci#endif /* _BCACHE_JOURNAL_H */ 189