xref: /kernel/linux/linux-6.6/fs/btrfs/tree-mod-log.c (revision 62306a36)
// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree modification log sequence number, even if no
 * new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * A blocker with a lower sequence number exists, so we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
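
/*
 * Illustrative usage sketch (not part of this file): a reader that wants a
 * stable, rewindable view of tree state pins a sequence number and drops it
 * when done. This assumes the btrfs_seq_list definition and the
 * BTRFS_SEQ_LIST_INIT() initializer from fs.h.
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... search/replay tree mod log entries with min_seq == seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the element sits on fs_info->tree_mod_seq_list, no log entry with a
 * sequence number >= elem.seq is freed by btrfs_put_tree_mod_seq().
 */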

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
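
/*
 * Note on the comparisons above: larger (logical, seq) keys deliberately go
 * into the *left* subtree, so the rb-tree is kept in descending key order.
 * __tree_mod_log_search() below walks the tree with the same inverted
 * convention (going left when cur->logical < start). A worked example,
 * assuming three entries for the same logical address:
 *
 *	seqs, leftmost to rightmost:   9   5   3
 *	rb_first() yields seq 9; rb_next() then yields 5, then 3.
 *
 * tree_mod_log_rewind() relies on this: it starts at the newest entry and
 * follows rb_next() towards older (smaller seq) entries.
 */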

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}
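
/*
 * The two helpers above implement the two-phase pattern shared by all the
 * btrfs_tree_mod_log_insert_*() entry points below: a lockless
 * tree_mod_need_log() check lets the GFP_NOFS allocations happen without
 * holding tree_mod_log_lock, followed by a re-check under the lock with
 * tree_mod_dont_log() right before inserting. A minimal sketch of the shape,
 * with details elided:
 *
 *	if (!tree_mod_need_log(fs_info, eb))
 *		return 0;
 *	tm = alloc_tree_mod_elem(eb, slot, op);  // may fail, note it in ret
 *	if (tree_mod_dont_log(fs_info, eb)) {    // takes the lock on false
 *		kfree(tm);                       // no users anymore, drop it
 *		return 0;
 *	} else if (ret != 0) {
 *		goto out_unlock;                 // must log but alloc failed
 *	}
 *	ret = tree_mod_log_insert(fs_info, tm);
 * out_unlock:
 *	write_unlock(&fs_info->tree_mod_log_lock);
 *
 * An allocation failure is only an error if, under the lock, logging turns
 * out to still be required.
 */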

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
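
/*
 * Illustrative caller sketch (modeled on the ctree.c users of this API;
 * variable names are hypothetical, not part of this file): before changing a
 * key in an internal node, the change is logged so that
 * tree_mod_log_rewind() can later restore the old key:
 *
 *	ret = btrfs_tree_mod_log_insert_key(node, slot,
 *					    BTRFS_MOD_LOG_KEY_REPLACE);
 *	if (ret < 0) {
 *		btrfs_abort_transaction(trans, ret);
 *		return ret;
 *	}
 *	btrfs_set_node_key(node, &new_key, slot);
 *
 * The log entry must be recorded *before* the buffer is changed, since
 * alloc_tree_mod_elem() snapshots the current key, blockptr and generation
 * of the slot.
 */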

static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
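
/*
 * Worked example for the move logging above (illustrative, derived from the
 * code): suppose a node holds slots 0..9 and the caller moves nr_items == 4
 * items from src_slot == 5 to dst_slot == 2. Slots 2..4 are overwritten by
 * the move, so three BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING entries are
 * logged for slots 2, 3 and 4 (the loop condition i + dst_slot < src_slot
 * yields i == 0, 1, 2), followed by one BTRFS_MOD_LOG_MOVE_KEYS entry with
 * slot == 5, move.dst_slot == 2 and move.nr_items == 4. On rewind, the move
 * is undone by memmoving the 4 items back from slot 2 to slot 5, and the
 * removal entries restore the overwritten keys.
 */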

static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
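
/*
 * Note on the reverse iteration above (derived from the rewind logic): the
 * highest slot gets the smallest sequence number and slot 0 the largest.
 * tree_mod_log_rewind() replays entries from the largest seq downwards, so a
 * freed node is re-populated as slot 0, 1, 2, ..., which is exactly what the
 * BUG_ON(tm->slot < n) check in the REMOVE_WHILE_FREEING case expects. The
 * inner loop unwinds already inserted entries on failure, since the caller
 * frees all of tm_list afterwards.
 */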

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
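
/*
 * Worked example for the two search variants (illustrative): with log
 * entries for a block at seqs {3, 5, 9} and min_seq == 4,
 * tree_mod_log_search_oldest() returns the entry with seq 5 (the smallest
 * seq not below min_seq), while tree_mod_log_search() returns the entry with
 * seq 9 (the largest seq overall). Entries below min_seq (here seq 3) are
 * never returned, as they may already have been freed by
 * btrfs_put_tree_mod_seq() once no blocker with a lower seq exists.
 */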

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
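
/*
 * Worked example for the copy logging above (illustrative, matching how the
 * node balancing code in ctree.c uses this function): moving the last 2
 * items of a 6-item src node to the front of a 4-item dst node means
 * calling btrfs_tree_mod_log_eb_copy(dst, src, 0, 4, 2) before the actual
 * copy. That logs, in order:
 *
 *   1. a MOVE in dst from slot 0 to slot 2 covering the 4 existing items
 *      (dst_move_nr_items == 4 - 0), making room at the front;
 *   2. a KEY_REMOVE in src and a KEY_ADD in dst for each of the 2 copied
 *      items (src slots 4..5, dst slots 0..1);
 *   3. no MOVE in src, since src_move_nr_items == 6 - (4 + 2) == 0, i.e.
 *      nothing follows the copied range.
 *
 * The function is called before dst and src are modified, which is why the
 * nritems values read at the top still reflect the pre-copy state.
 */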

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
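
/*
 * Worked example for the walk above (illustrative): suppose root A was
 * replaced by B, and B later by C, all after time_seq. The loop starts at
 * the current root C: C's oldest logged entry is the ROOT_REPLACE that
 * installed C (logged under the *new* root's logical address), which points
 * back at B via old_root.logical; likewise B's oldest entry points back at
 * A. Once A's oldest entry turns out not to be a ROOT_REPLACE (or A has no
 * entries at all), the walk stops and the oldest ROOT_REPLACE seen (the one
 * naming A as old_root) is returned, giving the caller A's logical address
 * and level.
 */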

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
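
/*
 * Worked example for the rewind above (illustrative): say a block had 3
 * items at time_seq, after which a key at slot 1 was replaced (seq 10) and
 * a new key was appended at slot 3 (KEY_ADD, seq 11). Rewinding a clone of
 * the current block with time_seq <= 10 starts at the newest entry (seq 11)
 * and walks towards older ones: the KEY_ADD is undone by decrementing n from
 * 4 to 3 (the appended key simply falls off the end), and the KEY_REPLACE is
 * undone by writing the saved key, blockptr and generation back into slot 1.
 * Entries with seq < time_seq are left alone, and btrfs_set_header_nritems()
 * commits the rewound item count.
 */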

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So look up a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users, it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}