xref: /kernel/linux/linux-5.10/fs/btrfs/ctree.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4 */
5
6#include <linux/sched.h>
7#include <linux/slab.h>
8#include <linux/rbtree.h>
9#include <linux/mm.h>
10#include "ctree.h"
11#include "disk-io.h"
12#include "transaction.h"
13#include "print-tree.h"
14#include "locking.h"
15#include "volumes.h"
16#include "qgroup.h"
17
18static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19		      *root, struct btrfs_path *path, int level);
20static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21		      const struct btrfs_key *ins_key, struct btrfs_path *path,
22		      int data_size, int extend);
23static int push_node_left(struct btrfs_trans_handle *trans,
24			  struct extent_buffer *dst,
25			  struct extent_buffer *src, int empty);
26static int balance_node_right(struct btrfs_trans_handle *trans,
27			      struct extent_buffer *dst_buf,
28			      struct extent_buffer *src_buf);
29static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30		    int level, int slot);
31
32static const struct btrfs_csums {
33	u16		size;
34	const char	name[10];
35	const char	driver[12];
36} btrfs_csums[] = {
37	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
41				     .driver = "blake2b-256" },
42};
43
44int btrfs_super_csum_size(const struct btrfs_super_block *s)
45{
46	u16 t = btrfs_super_csum_type(s);
47	/*
48	 * csum type is validated at mount time
49	 */
50	return btrfs_csums[t].size;
51}
52
53const char *btrfs_super_csum_name(u16 csum_type)
54{
55	/* csum type is validated at mount time */
56	return btrfs_csums[csum_type].name;
57}
58
59/*
60 * Return driver name if defined, otherwise the name that's also a valid driver
61 * name
62 */
63const char *btrfs_super_csum_driver(u16 csum_type)
64{
65	/* csum type is validated at mount time */
66	return btrfs_csums[csum_type].driver[0] ?
67		btrfs_csums[csum_type].driver :
68		btrfs_csums[csum_type].name;
69}
70
71size_t __attribute_const__ btrfs_get_num_csums(void)
72{
73	return ARRAY_SIZE(btrfs_csums);
74}
75
76struct btrfs_path *btrfs_alloc_path(void)
77{
78	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
79}
80
81/* this also releases the path */
82void btrfs_free_path(struct btrfs_path *p)
83{
84	if (!p)
85		return;
86	btrfs_release_path(p);
87	kmem_cache_free(btrfs_path_cachep, p);
88}
89
90/*
91 * path release drops references on the extent buffers in the path
92 * and it drops any locks held by this path
93 *
94 * It is safe to call this on paths that no locks or extent buffers held.
95 */
96noinline void btrfs_release_path(struct btrfs_path *p)
97{
98	int i;
99
100	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101		p->slots[i] = 0;
102		if (!p->nodes[i])
103			continue;
104		if (p->locks[i]) {
105			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106			p->locks[i] = 0;
107		}
108		free_extent_buffer(p->nodes[i]);
109		p->nodes[i] = NULL;
110	}
111}
112
113/*
114 * safely gets a reference on the root node of a tree.  A lock
115 * is not taken, so a concurrent writer may put a different node
116 * at the root of the tree.  See btrfs_lock_root_node for the
117 * looping required.
118 *
119 * The extent buffer returned by this has a reference taken, so
120 * it won't disappear.  It may stop being the root of the tree
121 * at any time because there are no locks held.
122 */
123struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
124{
125	struct extent_buffer *eb;
126
127	while (1) {
128		rcu_read_lock();
129		eb = rcu_dereference(root->node);
130
131		/*
132		 * RCU really hurts here, we could free up the root node because
133		 * it was COWed but we may not get the new root node yet so do
134		 * the inc_not_zero dance and if it doesn't work then
135		 * synchronize_rcu and try again.
136		 */
137		if (atomic_inc_not_zero(&eb->refs)) {
138			rcu_read_unlock();
139			break;
140		}
141		rcu_read_unlock();
142		synchronize_rcu();
143	}
144	return eb;
145}
146
147/*
148 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
149 * just get put onto a simple dirty list.  Transaction walks this list to make
150 * sure they get properly updated on disk.
151 */
152static void add_root_to_dirty_list(struct btrfs_root *root)
153{
154	struct btrfs_fs_info *fs_info = root->fs_info;
155
156	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
157	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
158		return;
159
160	spin_lock(&fs_info->trans_lock);
161	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
162		/* Want the extent tree to be the last on the list */
163		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
164			list_move_tail(&root->dirty_list,
165				       &fs_info->dirty_cowonly_roots);
166		else
167			list_move(&root->dirty_list,
168				  &fs_info->dirty_cowonly_roots);
169	}
170	spin_unlock(&fs_info->trans_lock);
171}
172
173/*
174 * used by snapshot creation to make a copy of a root for a tree with
175 * a given objectid.  The buffer with the new root node is returned in
176 * cow_ret, and this func returns zero on success or a negative error code.
177 */
178int btrfs_copy_root(struct btrfs_trans_handle *trans,
179		      struct btrfs_root *root,
180		      struct extent_buffer *buf,
181		      struct extent_buffer **cow_ret, u64 new_root_objectid)
182{
183	struct btrfs_fs_info *fs_info = root->fs_info;
184	struct extent_buffer *cow;
185	int ret = 0;
186	int level;
187	struct btrfs_disk_key disk_key;
188
189	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
190		trans->transid != fs_info->running_transaction->transid);
191	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
192		trans->transid != root->last_trans);
193
194	level = btrfs_header_level(buf);
195	if (level == 0)
196		btrfs_item_key(buf, &disk_key, 0);
197	else
198		btrfs_node_key(buf, &disk_key, 0);
199
200	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
201				     &disk_key, level, buf->start, 0,
202				     BTRFS_NESTING_NEW_ROOT);
203	if (IS_ERR(cow))
204		return PTR_ERR(cow);
205
206	copy_extent_buffer_full(cow, buf);
207	btrfs_set_header_bytenr(cow, cow->start);
208	btrfs_set_header_generation(cow, trans->transid);
209	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
210	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
211				     BTRFS_HEADER_FLAG_RELOC);
212	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
213		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
214	else
215		btrfs_set_header_owner(cow, new_root_objectid);
216
217	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
218
219	WARN_ON(btrfs_header_generation(buf) > trans->transid);
220	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
221		ret = btrfs_inc_ref(trans, root, cow, 1);
222	else
223		ret = btrfs_inc_ref(trans, root, cow, 0);
224	if (ret) {
225		btrfs_tree_unlock(cow);
226		free_extent_buffer(cow);
227		btrfs_abort_transaction(trans, ret);
228		return ret;
229	}
230
231	btrfs_mark_buffer_dirty(cow);
232	*cow_ret = cow;
233	return 0;
234}
235
236enum mod_log_op {
237	MOD_LOG_KEY_REPLACE,
238	MOD_LOG_KEY_ADD,
239	MOD_LOG_KEY_REMOVE,
240	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
241	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
242	MOD_LOG_MOVE_KEYS,
243	MOD_LOG_ROOT_REPLACE,
244};
245
246struct tree_mod_root {
247	u64 logical;
248	u8 level;
249};
250
251struct tree_mod_elem {
252	struct rb_node node;
253	u64 logical;
254	u64 seq;
255	enum mod_log_op op;
256
257	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
258	int slot;
259
260	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
261	u64 generation;
262
263	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
264	struct btrfs_disk_key key;
265	u64 blockptr;
266
267	/* this is used for op == MOD_LOG_MOVE_KEYS */
268	struct {
269		int dst_slot;
270		int nr_items;
271	} move;
272
273	/* this is used for op == MOD_LOG_ROOT_REPLACE */
274	struct tree_mod_root old_root;
275};
276
277/*
278 * Pull a new tree mod seq number for our operation.
279 */
280static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
281{
282	return atomic64_inc_return(&fs_info->tree_mod_seq);
283}
284
285/*
286 * This adds a new blocker to the tree mod log's blocker list if the @elem
287 * passed does not already have a sequence number set. So when a caller expects
288 * to record tree modifications, it should ensure to set elem->seq to zero
289 * before calling btrfs_get_tree_mod_seq.
290 * Returns a fresh, unused tree log modification sequence number, even if no new
291 * blocker was added.
292 */
293u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
294			   struct seq_list *elem)
295{
296	write_lock(&fs_info->tree_mod_log_lock);
297	if (!elem->seq) {
298		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
299		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
300	}
301	write_unlock(&fs_info->tree_mod_log_lock);
302
303	return elem->seq;
304}
305
306void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
307			    struct seq_list *elem)
308{
309	struct rb_root *tm_root;
310	struct rb_node *node;
311	struct rb_node *next;
312	struct tree_mod_elem *tm;
313	u64 min_seq = (u64)-1;
314	u64 seq_putting = elem->seq;
315
316	if (!seq_putting)
317		return;
318
319	write_lock(&fs_info->tree_mod_log_lock);
320	list_del(&elem->list);
321	elem->seq = 0;
322
323	if (!list_empty(&fs_info->tree_mod_seq_list)) {
324		struct seq_list *first;
325
326		first = list_first_entry(&fs_info->tree_mod_seq_list,
327					 struct seq_list, list);
328		if (seq_putting > first->seq) {
329			/*
330			 * Blocker with lower sequence number exists, we
331			 * cannot remove anything from the log.
332			 */
333			write_unlock(&fs_info->tree_mod_log_lock);
334			return;
335		}
336		min_seq = first->seq;
337	}
338
339	/*
340	 * anything that's lower than the lowest existing (read: blocked)
341	 * sequence number can be removed from the tree.
342	 */
343	tm_root = &fs_info->tree_mod_log;
344	for (node = rb_first(tm_root); node; node = next) {
345		next = rb_next(node);
346		tm = rb_entry(node, struct tree_mod_elem, node);
347		if (tm->seq >= min_seq)
348			continue;
349		rb_erase(node, tm_root);
350		kfree(tm);
351	}
352	write_unlock(&fs_info->tree_mod_log_lock);
353}
354
355/*
356 * key order of the log:
357 *       node/leaf start address -> sequence
358 *
359 * The 'start address' is the logical address of the *new* root node
360 * for root replace operations, or the logical address of the affected
361 * block for all other operations.
362 */
363static noinline int
364__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
365{
366	struct rb_root *tm_root;
367	struct rb_node **new;
368	struct rb_node *parent = NULL;
369	struct tree_mod_elem *cur;
370
371	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
372
373	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
374
375	tm_root = &fs_info->tree_mod_log;
376	new = &tm_root->rb_node;
377	while (*new) {
378		cur = rb_entry(*new, struct tree_mod_elem, node);
379		parent = *new;
380		if (cur->logical < tm->logical)
381			new = &((*new)->rb_left);
382		else if (cur->logical > tm->logical)
383			new = &((*new)->rb_right);
384		else if (cur->seq < tm->seq)
385			new = &((*new)->rb_left);
386		else if (cur->seq > tm->seq)
387			new = &((*new)->rb_right);
388		else
389			return -EEXIST;
390	}
391
392	rb_link_node(&tm->node, parent, new);
393	rb_insert_color(&tm->node, tm_root);
394	return 0;
395}
396
397/*
398 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
399 * returns zero with the tree_mod_log_lock acquired. The caller must hold
400 * this until all tree mod log insertions are recorded in the rb tree and then
401 * write unlock fs_info::tree_mod_log_lock.
402 */
403static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
404				    struct extent_buffer *eb) {
405	smp_mb();
406	if (list_empty(&(fs_info)->tree_mod_seq_list))
407		return 1;
408	if (eb && btrfs_header_level(eb) == 0)
409		return 1;
410
411	write_lock(&fs_info->tree_mod_log_lock);
412	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
413		write_unlock(&fs_info->tree_mod_log_lock);
414		return 1;
415	}
416
417	return 0;
418}
419
420/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
421static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
422				    struct extent_buffer *eb)
423{
424	smp_mb();
425	if (list_empty(&(fs_info)->tree_mod_seq_list))
426		return 0;
427	if (eb && btrfs_header_level(eb) == 0)
428		return 0;
429
430	return 1;
431}
432
433static struct tree_mod_elem *
434alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
435		    enum mod_log_op op, gfp_t flags)
436{
437	struct tree_mod_elem *tm;
438
439	tm = kzalloc(sizeof(*tm), flags);
440	if (!tm)
441		return NULL;
442
443	tm->logical = eb->start;
444	if (op != MOD_LOG_KEY_ADD) {
445		btrfs_node_key(eb, &tm->key, slot);
446		tm->blockptr = btrfs_node_blockptr(eb, slot);
447	}
448	tm->op = op;
449	tm->slot = slot;
450	tm->generation = btrfs_node_ptr_generation(eb, slot);
451	RB_CLEAR_NODE(&tm->node);
452
453	return tm;
454}
455
456static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
457		enum mod_log_op op, gfp_t flags)
458{
459	struct tree_mod_elem *tm;
460	int ret;
461
462	if (!tree_mod_need_log(eb->fs_info, eb))
463		return 0;
464
465	tm = alloc_tree_mod_elem(eb, slot, op, flags);
466	if (!tm)
467		return -ENOMEM;
468
469	if (tree_mod_dont_log(eb->fs_info, eb)) {
470		kfree(tm);
471		return 0;
472	}
473
474	ret = __tree_mod_log_insert(eb->fs_info, tm);
475	write_unlock(&eb->fs_info->tree_mod_log_lock);
476	if (ret)
477		kfree(tm);
478
479	return ret;
480}
481
482static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
483		int dst_slot, int src_slot, int nr_items)
484{
485	struct tree_mod_elem *tm = NULL;
486	struct tree_mod_elem **tm_list = NULL;
487	int ret = 0;
488	int i;
489	int locked = 0;
490
491	if (!tree_mod_need_log(eb->fs_info, eb))
492		return 0;
493
494	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
495	if (!tm_list)
496		return -ENOMEM;
497
498	tm = kzalloc(sizeof(*tm), GFP_NOFS);
499	if (!tm) {
500		ret = -ENOMEM;
501		goto free_tms;
502	}
503
504	tm->logical = eb->start;
505	tm->slot = src_slot;
506	tm->move.dst_slot = dst_slot;
507	tm->move.nr_items = nr_items;
508	tm->op = MOD_LOG_MOVE_KEYS;
509
510	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
511		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
512		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
513		if (!tm_list[i]) {
514			ret = -ENOMEM;
515			goto free_tms;
516		}
517	}
518
519	if (tree_mod_dont_log(eb->fs_info, eb))
520		goto free_tms;
521	locked = 1;
522
523	/*
524	 * When we override something during the move, we log these removals.
525	 * This can only happen when we move towards the beginning of the
526	 * buffer, i.e. dst_slot < src_slot.
527	 */
528	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
529		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
530		if (ret)
531			goto free_tms;
532	}
533
534	ret = __tree_mod_log_insert(eb->fs_info, tm);
535	if (ret)
536		goto free_tms;
537	write_unlock(&eb->fs_info->tree_mod_log_lock);
538	kfree(tm_list);
539
540	return 0;
541free_tms:
542	for (i = 0; i < nr_items; i++) {
543		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
544			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
545		kfree(tm_list[i]);
546	}
547	if (locked)
548		write_unlock(&eb->fs_info->tree_mod_log_lock);
549	kfree(tm_list);
550	kfree(tm);
551
552	return ret;
553}
554
555static inline int
556__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
557		       struct tree_mod_elem **tm_list,
558		       int nritems)
559{
560	int i, j;
561	int ret;
562
563	for (i = nritems - 1; i >= 0; i--) {
564		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
565		if (ret) {
566			for (j = nritems - 1; j > i; j--)
567				rb_erase(&tm_list[j]->node,
568					 &fs_info->tree_mod_log);
569			return ret;
570		}
571	}
572
573	return 0;
574}
575
576static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
577			 struct extent_buffer *new_root, int log_removal)
578{
579	struct btrfs_fs_info *fs_info = old_root->fs_info;
580	struct tree_mod_elem *tm = NULL;
581	struct tree_mod_elem **tm_list = NULL;
582	int nritems = 0;
583	int ret = 0;
584	int i;
585
586	if (!tree_mod_need_log(fs_info, NULL))
587		return 0;
588
589	if (log_removal && btrfs_header_level(old_root) > 0) {
590		nritems = btrfs_header_nritems(old_root);
591		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
592				  GFP_NOFS);
593		if (!tm_list) {
594			ret = -ENOMEM;
595			goto free_tms;
596		}
597		for (i = 0; i < nritems; i++) {
598			tm_list[i] = alloc_tree_mod_elem(old_root, i,
599			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
600			if (!tm_list[i]) {
601				ret = -ENOMEM;
602				goto free_tms;
603			}
604		}
605	}
606
607	tm = kzalloc(sizeof(*tm), GFP_NOFS);
608	if (!tm) {
609		ret = -ENOMEM;
610		goto free_tms;
611	}
612
613	tm->logical = new_root->start;
614	tm->old_root.logical = old_root->start;
615	tm->old_root.level = btrfs_header_level(old_root);
616	tm->generation = btrfs_header_generation(old_root);
617	tm->op = MOD_LOG_ROOT_REPLACE;
618
619	if (tree_mod_dont_log(fs_info, NULL))
620		goto free_tms;
621
622	if (tm_list)
623		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
624	if (!ret)
625		ret = __tree_mod_log_insert(fs_info, tm);
626
627	write_unlock(&fs_info->tree_mod_log_lock);
628	if (ret)
629		goto free_tms;
630	kfree(tm_list);
631
632	return ret;
633
634free_tms:
635	if (tm_list) {
636		for (i = 0; i < nritems; i++)
637			kfree(tm_list[i]);
638		kfree(tm_list);
639	}
640	kfree(tm);
641
642	return ret;
643}
644
645static struct tree_mod_elem *
646__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
647		      int smallest)
648{
649	struct rb_root *tm_root;
650	struct rb_node *node;
651	struct tree_mod_elem *cur = NULL;
652	struct tree_mod_elem *found = NULL;
653
654	read_lock(&fs_info->tree_mod_log_lock);
655	tm_root = &fs_info->tree_mod_log;
656	node = tm_root->rb_node;
657	while (node) {
658		cur = rb_entry(node, struct tree_mod_elem, node);
659		if (cur->logical < start) {
660			node = node->rb_left;
661		} else if (cur->logical > start) {
662			node = node->rb_right;
663		} else if (cur->seq < min_seq) {
664			node = node->rb_left;
665		} else if (!smallest) {
666			/* we want the node with the highest seq */
667			if (found)
668				BUG_ON(found->seq > cur->seq);
669			found = cur;
670			node = node->rb_left;
671		} else if (cur->seq > min_seq) {
672			/* we want the node with the smallest seq */
673			if (found)
674				BUG_ON(found->seq < cur->seq);
675			found = cur;
676			node = node->rb_right;
677		} else {
678			found = cur;
679			break;
680		}
681	}
682	read_unlock(&fs_info->tree_mod_log_lock);
683
684	return found;
685}
686
687/*
688 * this returns the element from the log with the smallest time sequence
689 * value that's in the log (the oldest log item). any element with a time
690 * sequence lower than min_seq will be ignored.
691 */
692static struct tree_mod_elem *
693tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
694			   u64 min_seq)
695{
696	return __tree_mod_log_search(fs_info, start, min_seq, 1);
697}
698
699/*
700 * this returns the element from the log with the largest time sequence
701 * value that's in the log (the most recent log item). any element with
702 * a time sequence lower than min_seq will be ignored.
703 */
704static struct tree_mod_elem *
705tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
706{
707	return __tree_mod_log_search(fs_info, start, min_seq, 0);
708}
709
710static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
711		     struct extent_buffer *src, unsigned long dst_offset,
712		     unsigned long src_offset, int nr_items)
713{
714	struct btrfs_fs_info *fs_info = dst->fs_info;
715	int ret = 0;
716	struct tree_mod_elem **tm_list = NULL;
717	struct tree_mod_elem **tm_list_add, **tm_list_rem;
718	int i;
719	int locked = 0;
720
721	if (!tree_mod_need_log(fs_info, NULL))
722		return 0;
723
724	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
725		return 0;
726
727	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
728			  GFP_NOFS);
729	if (!tm_list)
730		return -ENOMEM;
731
732	tm_list_add = tm_list;
733	tm_list_rem = tm_list + nr_items;
734	for (i = 0; i < nr_items; i++) {
735		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
736		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
737		if (!tm_list_rem[i]) {
738			ret = -ENOMEM;
739			goto free_tms;
740		}
741
742		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
743		    MOD_LOG_KEY_ADD, GFP_NOFS);
744		if (!tm_list_add[i]) {
745			ret = -ENOMEM;
746			goto free_tms;
747		}
748	}
749
750	if (tree_mod_dont_log(fs_info, NULL))
751		goto free_tms;
752	locked = 1;
753
754	for (i = 0; i < nr_items; i++) {
755		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
756		if (ret)
757			goto free_tms;
758		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
759		if (ret)
760			goto free_tms;
761	}
762
763	write_unlock(&fs_info->tree_mod_log_lock);
764	kfree(tm_list);
765
766	return 0;
767
768free_tms:
769	for (i = 0; i < nr_items * 2; i++) {
770		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
771			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
772		kfree(tm_list[i]);
773	}
774	if (locked)
775		write_unlock(&fs_info->tree_mod_log_lock);
776	kfree(tm_list);
777
778	return ret;
779}
780
781static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
782{
783	struct tree_mod_elem **tm_list = NULL;
784	int nritems = 0;
785	int i;
786	int ret = 0;
787
788	if (btrfs_header_level(eb) == 0)
789		return 0;
790
791	if (!tree_mod_need_log(eb->fs_info, NULL))
792		return 0;
793
794	nritems = btrfs_header_nritems(eb);
795	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
796	if (!tm_list)
797		return -ENOMEM;
798
799	for (i = 0; i < nritems; i++) {
800		tm_list[i] = alloc_tree_mod_elem(eb, i,
801		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
802		if (!tm_list[i]) {
803			ret = -ENOMEM;
804			goto free_tms;
805		}
806	}
807
808	if (tree_mod_dont_log(eb->fs_info, eb))
809		goto free_tms;
810
811	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
812	write_unlock(&eb->fs_info->tree_mod_log_lock);
813	if (ret)
814		goto free_tms;
815	kfree(tm_list);
816
817	return 0;
818
819free_tms:
820	for (i = 0; i < nritems; i++)
821		kfree(tm_list[i]);
822	kfree(tm_list);
823
824	return ret;
825}
826
827/*
828 * check if the tree block can be shared by multiple trees
829 */
830int btrfs_block_can_be_shared(struct btrfs_root *root,
831			      struct extent_buffer *buf)
832{
833	/*
834	 * Tree blocks not in shareable trees and tree roots are never shared.
835	 * If a block was allocated after the last snapshot and the block was
836	 * not allocated by tree relocation, we know the block is not shared.
837	 */
838	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
839	    buf != root->node && buf != root->commit_root &&
840	    (btrfs_header_generation(buf) <=
841	     btrfs_root_last_snapshot(&root->root_item) ||
842	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
843		return 1;
844
845	return 0;
846}
847
848static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
849				       struct btrfs_root *root,
850				       struct extent_buffer *buf,
851				       struct extent_buffer *cow,
852				       int *last_ref)
853{
854	struct btrfs_fs_info *fs_info = root->fs_info;
855	u64 refs;
856	u64 owner;
857	u64 flags;
858	u64 new_flags = 0;
859	int ret;
860
861	/*
862	 * Backrefs update rules:
863	 *
864	 * Always use full backrefs for extent pointers in tree block
865	 * allocated by tree relocation.
866	 *
867	 * If a shared tree block is no longer referenced by its owner
868	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
869	 * use full backrefs for extent pointers in tree block.
870	 *
871	 * If a tree block is been relocating
872	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
873	 * use full backrefs for extent pointers in tree block.
874	 * The reason for this is some operations (such as drop tree)
875	 * are only allowed for blocks use full backrefs.
876	 */
877
878	if (btrfs_block_can_be_shared(root, buf)) {
879		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
880					       btrfs_header_level(buf), 1,
881					       &refs, &flags);
882		if (ret)
883			return ret;
884		if (refs == 0) {
885			ret = -EROFS;
886			btrfs_handle_fs_error(fs_info, ret, NULL);
887			return ret;
888		}
889	} else {
890		refs = 1;
891		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
892		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
893			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
894		else
895			flags = 0;
896	}
897
898	owner = btrfs_header_owner(buf);
899	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
900	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
901
902	if (refs > 1) {
903		if ((owner == root->root_key.objectid ||
904		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
905		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
906			ret = btrfs_inc_ref(trans, root, buf, 1);
907			if (ret)
908				return ret;
909
910			if (root->root_key.objectid ==
911			    BTRFS_TREE_RELOC_OBJECTID) {
912				ret = btrfs_dec_ref(trans, root, buf, 0);
913				if (ret)
914					return ret;
915				ret = btrfs_inc_ref(trans, root, cow, 1);
916				if (ret)
917					return ret;
918			}
919			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
920		} else {
921
922			if (root->root_key.objectid ==
923			    BTRFS_TREE_RELOC_OBJECTID)
924				ret = btrfs_inc_ref(trans, root, cow, 1);
925			else
926				ret = btrfs_inc_ref(trans, root, cow, 0);
927			if (ret)
928				return ret;
929		}
930		if (new_flags != 0) {
931			int level = btrfs_header_level(buf);
932
933			ret = btrfs_set_disk_extent_flags(trans, buf,
934							  new_flags, level, 0);
935			if (ret)
936				return ret;
937		}
938	} else {
939		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
940			if (root->root_key.objectid ==
941			    BTRFS_TREE_RELOC_OBJECTID)
942				ret = btrfs_inc_ref(trans, root, cow, 1);
943			else
944				ret = btrfs_inc_ref(trans, root, cow, 0);
945			if (ret)
946				return ret;
947			ret = btrfs_dec_ref(trans, root, buf, 1);
948			if (ret)
949				return ret;
950		}
951		btrfs_clean_tree_block(buf);
952		*last_ref = 1;
953	}
954	return 0;
955}
956
957static struct extent_buffer *alloc_tree_block_no_bg_flush(
958					  struct btrfs_trans_handle *trans,
959					  struct btrfs_root *root,
960					  u64 parent_start,
961					  const struct btrfs_disk_key *disk_key,
962					  int level,
963					  u64 hint,
964					  u64 empty_size,
965					  enum btrfs_lock_nesting nest)
966{
967	struct btrfs_fs_info *fs_info = root->fs_info;
968	struct extent_buffer *ret;
969
970	/*
971	 * If we are COWing a node/leaf from the extent, chunk, device or free
972	 * space trees, make sure that we do not finish block group creation of
973	 * pending block groups. We do this to avoid a deadlock.
974	 * COWing can result in allocation of a new chunk, and flushing pending
975	 * block groups (btrfs_create_pending_block_groups()) can be triggered
976	 * when finishing allocation of a new chunk. Creation of a pending block
977	 * group modifies the extent, chunk, device and free space trees,
978	 * therefore we could deadlock with ourselves since we are holding a
979	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
980	 * try to COW later.
981	 * For similar reasons, we also need to delay flushing pending block
982	 * groups when splitting a leaf or node, from one of those trees, since
983	 * we are holding a write lock on it and its parent or when inserting a
984	 * new root node for one of those trees.
985	 */
986	if (root == fs_info->extent_root ||
987	    root == fs_info->chunk_root ||
988	    root == fs_info->dev_root ||
989	    root == fs_info->free_space_root)
990		trans->can_flush_pending_bgs = false;
991
992	ret = btrfs_alloc_tree_block(trans, root, parent_start,
993				     root->root_key.objectid, disk_key, level,
994				     hint, empty_size, nest);
995	trans->can_flush_pending_bgs = true;
996
997	return ret;
998}
999
1000/*
1001 * does the dirty work in cow of a single block.  The parent block (if
1002 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1003 * dirty and returned locked.  If you modify the block it needs to be marked
1004 * dirty again.
1005 *
1006 * search_start -- an allocation hint for the new block
1007 *
1008 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1009 * bytes the allocator should try to find free next to the block it returns.
1010 * This is just a hint and may be ignored by the allocator.
1011 */
1012static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1013			     struct btrfs_root *root,
1014			     struct extent_buffer *buf,
1015			     struct extent_buffer *parent, int parent_slot,
1016			     struct extent_buffer **cow_ret,
1017			     u64 search_start, u64 empty_size,
1018			     enum btrfs_lock_nesting nest)
1019{
1020	struct btrfs_fs_info *fs_info = root->fs_info;
1021	struct btrfs_disk_key disk_key;
1022	struct extent_buffer *cow;
1023	int level, ret;
1024	int last_ref = 0;
1025	int unlock_orig = 0;
1026	u64 parent_start = 0;
1027
1028	if (*cow_ret == buf)
1029		unlock_orig = 1;
1030
1031	btrfs_assert_tree_locked(buf);
1032
1033	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1034		trans->transid != fs_info->running_transaction->transid);
1035	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1036		trans->transid != root->last_trans);
1037
1038	level = btrfs_header_level(buf);
1039
1040	if (level == 0)
1041		btrfs_item_key(buf, &disk_key, 0);
1042	else
1043		btrfs_node_key(buf, &disk_key, 0);
1044
1045	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1046		parent_start = parent->start;
1047
1048	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1049					   level, search_start, empty_size, nest);
1050	if (IS_ERR(cow))
1051		return PTR_ERR(cow);
1052
1053	/* cow is set to blocking by btrfs_init_new_buffer */
1054
1055	copy_extent_buffer_full(cow, buf);
1056	btrfs_set_header_bytenr(cow, cow->start);
1057	btrfs_set_header_generation(cow, trans->transid);
1058	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1059	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1060				     BTRFS_HEADER_FLAG_RELOC);
1061	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1062		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1063	else
1064		btrfs_set_header_owner(cow, root->root_key.objectid);
1065
1066	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1067
1068	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1069	if (ret) {
1070		btrfs_tree_unlock(cow);
1071		free_extent_buffer(cow);
1072		btrfs_abort_transaction(trans, ret);
1073		return ret;
1074	}
1075
1076	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
1077		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1078		if (ret) {
1079			btrfs_tree_unlock(cow);
1080			free_extent_buffer(cow);
1081			btrfs_abort_transaction(trans, ret);
1082			return ret;
1083		}
1084	}
1085
1086	if (buf == root->node) {
1087		WARN_ON(parent && parent != buf);
1088		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1089		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1090			parent_start = buf->start;
1091
1092		atomic_inc(&cow->refs);
1093		ret = tree_mod_log_insert_root(root->node, cow, 1);
1094		BUG_ON(ret < 0);
1095		rcu_assign_pointer(root->node, cow);
1096
1097		btrfs_free_tree_block(trans, root, buf, parent_start,
1098				      last_ref);
1099		free_extent_buffer(buf);
1100		add_root_to_dirty_list(root);
1101	} else {
1102		WARN_ON(trans->transid != btrfs_header_generation(parent));
1103		tree_mod_log_insert_key(parent, parent_slot,
1104					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1105		btrfs_set_node_blockptr(parent, parent_slot,
1106					cow->start);
1107		btrfs_set_node_ptr_generation(parent, parent_slot,
1108					      trans->transid);
1109		btrfs_mark_buffer_dirty(parent);
1110		if (last_ref) {
1111			ret = tree_mod_log_free_eb(buf);
1112			if (ret) {
1113				btrfs_tree_unlock(cow);
1114				free_extent_buffer(cow);
1115				btrfs_abort_transaction(trans, ret);
1116				return ret;
1117			}
1118		}
1119		btrfs_free_tree_block(trans, root, buf, parent_start,
1120				      last_ref);
1121	}
1122	if (unlock_orig)
1123		btrfs_tree_unlock(buf);
1124	free_extent_buffer_stale(buf);
1125	btrfs_mark_buffer_dirty(cow);
1126	*cow_ret = cow;
1127	return 0;
1128}
1129
1130/*
1131 * returns the logical address of the oldest predecessor of the given root.
1132 * entries older than time_seq are ignored.
1133 */
1134static struct tree_mod_elem *__tree_mod_log_oldest_root(
1135		struct extent_buffer *eb_root, u64 time_seq)
1136{
1137	struct tree_mod_elem *tm;
1138	struct tree_mod_elem *found = NULL;
1139	u64 root_logical = eb_root->start;
1140	int looped = 0;
1141
1142	if (!time_seq)
1143		return NULL;
1144
1145	/*
1146	 * the very last operation that's logged for a root is the
1147	 * replacement operation (if it is replaced at all). this has
1148	 * the logical address of the *new* root, making it the very
1149	 * first operation that's logged for this root.
1150	 */
1151	while (1) {
1152		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1153						time_seq);
1154		if (!looped && !tm)
1155			return NULL;
1156		/*
1157		 * if there are no tree operation for the oldest root, we simply
1158		 * return it. this should only happen if that (old) root is at
1159		 * level 0.
1160		 */
1161		if (!tm)
1162			break;
1163
1164		/*
1165		 * if there's an operation that's not a root replacement, we
1166		 * found the oldest version of our root. normally, we'll find a
1167		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1168		 */
1169		if (tm->op != MOD_LOG_ROOT_REPLACE)
1170			break;
1171
1172		found = tm;
1173		root_logical = tm->old_root.logical;
1174		looped = 1;
1175	}
1176
1177	/* if there's no old root to return, return what we found instead */
1178	if (!found)
1179		found = tm;
1180
1181	return found;
1182}
1183
1184/*
1185 * tm is a pointer to the first operation to rewind within eb. then, all
1186 * previous operations will be rewound (until we reach something older than
1187 * time_seq).
1188 */
1189static void
1190__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1191		      u64 time_seq, struct tree_mod_elem *first_tm)
1192{
1193	u32 n;
1194	struct rb_node *next;
1195	struct tree_mod_elem *tm = first_tm;
1196	unsigned long o_dst;
1197	unsigned long o_src;
1198	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1199
1200	n = btrfs_header_nritems(eb);
1201	read_lock(&fs_info->tree_mod_log_lock);
1202	while (tm && tm->seq >= time_seq) {
1203		/*
1204		 * all the operations are recorded with the operator used for
1205		 * the modification. as we're going backwards, we do the
1206		 * opposite of each operation here.
1207		 */
1208		switch (tm->op) {
1209		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1210			BUG_ON(tm->slot < n);
1211			fallthrough;
1212		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1213		case MOD_LOG_KEY_REMOVE:
1214			btrfs_set_node_key(eb, &tm->key, tm->slot);
1215			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1216			btrfs_set_node_ptr_generation(eb, tm->slot,
1217						      tm->generation);
1218			n++;
1219			break;
1220		case MOD_LOG_KEY_REPLACE:
1221			BUG_ON(tm->slot >= n);
1222			btrfs_set_node_key(eb, &tm->key, tm->slot);
1223			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1224			btrfs_set_node_ptr_generation(eb, tm->slot,
1225						      tm->generation);
1226			break;
1227		case MOD_LOG_KEY_ADD:
1228			/* if a move operation is needed it's in the log */
1229			n--;
1230			break;
1231		case MOD_LOG_MOVE_KEYS:
1232			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1233			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1234			memmove_extent_buffer(eb, o_dst, o_src,
1235					      tm->move.nr_items * p_size);
1236			break;
1237		case MOD_LOG_ROOT_REPLACE:
1238			/*
1239			 * this operation is special. for roots, this must be
1240			 * handled explicitly before rewinding.
1241			 * for non-roots, this operation may exist if the node
1242			 * was a root: root A -> child B; then A gets empty and
1243			 * B is promoted to the new root. in the mod log, we'll
1244			 * have a root-replace operation for B, a tree block
1245			 * that is no root. we simply ignore that operation.
1246			 */
1247			break;
1248		}
1249		next = rb_next(&tm->node);
1250		if (!next)
1251			break;
1252		tm = rb_entry(next, struct tree_mod_elem, node);
1253		if (tm->logical != first_tm->logical)
1254			break;
1255	}
1256	read_unlock(&fs_info->tree_mod_log_lock);
1257	btrfs_set_header_nritems(eb, n);
1258}
1259
1260/*
1261 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1262 * is returned. If rewind operations happen, a fresh buffer is returned. The
1263 * returned buffer is always read-locked. If the returned buffer is not the
1264 * input buffer, the lock on the input buffer is released and the input buffer
1265 * is freed (its refcount is decremented).
1266 */
1267static struct extent_buffer *
1268tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1269		    struct extent_buffer *eb, u64 time_seq)
1270{
1271	struct extent_buffer *eb_rewin;
1272	struct tree_mod_elem *tm;
1273
1274	if (!time_seq)
1275		return eb;
1276
1277	if (btrfs_header_level(eb) == 0)
1278		return eb;
1279
1280	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1281	if (!tm)
1282		return eb;
1283
1284	btrfs_set_path_blocking(path);
1285	btrfs_set_lock_blocking_read(eb);
1286
1287	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1288		BUG_ON(tm->slot != 0);
1289		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1290		if (!eb_rewin) {
1291			btrfs_tree_read_unlock_blocking(eb);
1292			free_extent_buffer(eb);
1293			return NULL;
1294		}
1295		btrfs_set_header_bytenr(eb_rewin, eb->start);
1296		btrfs_set_header_backref_rev(eb_rewin,
1297					     btrfs_header_backref_rev(eb));
1298		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1299		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1300	} else {
1301		eb_rewin = btrfs_clone_extent_buffer(eb);
1302		if (!eb_rewin) {
1303			btrfs_tree_read_unlock_blocking(eb);
1304			free_extent_buffer(eb);
1305			return NULL;
1306		}
1307	}
1308
1309	btrfs_tree_read_unlock_blocking(eb);
1310	free_extent_buffer(eb);
1311
1312	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
1313				       eb_rewin, btrfs_header_level(eb_rewin));
1314	btrfs_tree_read_lock(eb_rewin);
1315	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1316	WARN_ON(btrfs_header_nritems(eb_rewin) >
1317		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1318
1319	return eb_rewin;
1320}
1321
1322/*
1323 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1324 * value. If there are no changes, the current root->root_node is returned. If
1325 * anything changed in between, there's a fresh buffer allocated on which the
1326 * rewind operations are done. In any case, the returned buffer is read locked.
1327 * Returns NULL on error (with no locks held).
1328 */
1329static inline struct extent_buffer *
1330get_old_root(struct btrfs_root *root, u64 time_seq)
1331{
1332	struct btrfs_fs_info *fs_info = root->fs_info;
1333	struct tree_mod_elem *tm;
1334	struct extent_buffer *eb = NULL;
1335	struct extent_buffer *eb_root;
1336	u64 eb_root_owner = 0;
1337	struct extent_buffer *old;
1338	struct tree_mod_root *old_root = NULL;
1339	u64 old_generation = 0;
1340	u64 logical;
1341	int level;
1342
1343	eb_root = btrfs_read_lock_root_node(root);
1344	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1345	if (!tm)
1346		return eb_root;
1347
1348	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1349		old_root = &tm->old_root;
1350		old_generation = tm->generation;
1351		logical = old_root->logical;
1352		level = old_root->level;
1353	} else {
1354		logical = eb_root->start;
1355		level = btrfs_header_level(eb_root);
1356	}
1357
1358	tm = tree_mod_log_search(fs_info, logical, time_seq);
1359	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1360		btrfs_tree_read_unlock(eb_root);
1361		free_extent_buffer(eb_root);
1362		old = read_tree_block(fs_info, logical, 0, level, NULL);
1363		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1364			if (!IS_ERR(old))
1365				free_extent_buffer(old);
1366			btrfs_warn(fs_info,
1367				   "failed to read tree block %llu from get_old_root",
1368				   logical);
1369		} else {
1370			struct tree_mod_elem *tm2;
1371
1372			btrfs_tree_read_lock(old);
1373			eb = btrfs_clone_extent_buffer(old);
1374			/*
1375			 * After the lookup for the most recent tree mod operation
1376			 * above and before we locked and cloned the extent buffer
1377			 * 'old', a new tree mod log operation may have been added.
1378			 * So lookup for a more recent one to make sure the number
1379			 * of mod log operations we replay is consistent with the
1380			 * number of items we have in the cloned extent buffer,
1381			 * otherwise we can hit a BUG_ON when rewinding the extent
1382			 * buffer.
1383			 */
1384			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
1385			btrfs_tree_read_unlock(old);
1386			free_extent_buffer(old);
1387			ASSERT(tm2);
1388			ASSERT(tm2 == tm || tm2->seq > tm->seq);
1389			if (!tm2 || tm2->seq < tm->seq) {
1390				free_extent_buffer(eb);
1391				return NULL;
1392			}
1393			tm = tm2;
1394		}
1395	} else if (old_root) {
1396		eb_root_owner = btrfs_header_owner(eb_root);
1397		btrfs_tree_read_unlock(eb_root);
1398		free_extent_buffer(eb_root);
1399		eb = alloc_dummy_extent_buffer(fs_info, logical);
1400	} else {
1401		btrfs_set_lock_blocking_read(eb_root);
1402		eb = btrfs_clone_extent_buffer(eb_root);
1403		btrfs_tree_read_unlock_blocking(eb_root);
1404		free_extent_buffer(eb_root);
1405	}
1406
1407	if (!eb)
1408		return NULL;
1409	if (old_root) {
1410		btrfs_set_header_bytenr(eb, eb->start);
1411		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1412		btrfs_set_header_owner(eb, eb_root_owner);
1413		btrfs_set_header_level(eb, old_root->level);
1414		btrfs_set_header_generation(eb, old_generation);
1415	}
1416	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1417				       btrfs_header_level(eb));
1418	btrfs_tree_read_lock(eb);
1419	if (tm)
1420		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1421	else
1422		WARN_ON(btrfs_header_level(eb) != 0);
1423	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1424
1425	return eb;
1426}
1427
1428int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1429{
1430	struct tree_mod_elem *tm;
1431	int level;
1432	struct extent_buffer *eb_root = btrfs_root_node(root);
1433
1434	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1435	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1436		level = tm->old_root.level;
1437	} else {
1438		level = btrfs_header_level(eb_root);
1439	}
1440	free_extent_buffer(eb_root);
1441
1442	return level;
1443}
1444
1445static inline int should_cow_block(struct btrfs_trans_handle *trans,
1446				   struct btrfs_root *root,
1447				   struct extent_buffer *buf)
1448{
1449	if (btrfs_is_testing(root->fs_info))
1450		return 0;
1451
1452	/* Ensure we can see the FORCE_COW bit */
1453	smp_mb__before_atomic();
1454
1455	/*
1456	 * We do not need to cow a block if
1457	 * 1) this block is not created or changed in this transaction;
1458	 * 2) this block does not belong to TREE_RELOC tree;
1459	 * 3) the root is not forced COW.
1460	 *
1461	 * What is forced COW:
1462	 *    when we create snapshot during committing the transaction,
1463	 *    after we've finished copying src root, we must COW the shared
1464	 *    block to ensure the metadata consistency.
1465	 */
1466	if (btrfs_header_generation(buf) == trans->transid &&
1467	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1468	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1469	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1470	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1471		return 0;
1472	return 1;
1473}
1474
1475/*
1476 * cows a single block, see __btrfs_cow_block for the real work.
1477 * This version of it has extra checks so that a block isn't COWed more than
1478 * once per transaction, as long as it hasn't been written yet
1479 */
1480noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1481		    struct btrfs_root *root, struct extent_buffer *buf,
1482		    struct extent_buffer *parent, int parent_slot,
1483		    struct extent_buffer **cow_ret,
1484		    enum btrfs_lock_nesting nest)
1485{
1486	struct btrfs_fs_info *fs_info = root->fs_info;
1487	u64 search_start;
1488	int ret;
1489
1490	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1491		btrfs_err(fs_info,
1492			"COW'ing blocks on a fs root that's being dropped");
1493
1494	if (trans->transaction != fs_info->running_transaction)
1495		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1496		       trans->transid,
1497		       fs_info->running_transaction->transid);
1498
1499	if (trans->transid != fs_info->generation)
1500		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1501		       trans->transid, fs_info->generation);
1502
1503	if (!should_cow_block(trans, root, buf)) {
1504		trans->dirty = true;
1505		*cow_ret = buf;
1506		return 0;
1507	}
1508
1509	search_start = buf->start & ~((u64)SZ_1G - 1);
1510
1511	if (parent)
1512		btrfs_set_lock_blocking_write(parent);
1513	btrfs_set_lock_blocking_write(buf);
1514
1515	/*
1516	 * Before CoWing this block for later modification, check if it's
1517	 * the subtree root and do the delayed subtree trace if needed.
1518	 *
1519	 * Also We don't care about the error, as it's handled internally.
1520	 */
1521	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1522	ret = __btrfs_cow_block(trans, root, buf, parent,
1523				 parent_slot, cow_ret, search_start, 0, nest);
1524
1525	trace_btrfs_cow_block(root, buf, *cow_ret);
1526
1527	return ret;
1528}
1529
1530/*
1531 * helper function for defrag to decide if two blocks pointed to by a
1532 * node are actually close by
1533 */
1534static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1535{
1536	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1537		return 1;
1538	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1539		return 1;
1540	return 0;
1541}
1542
1543#ifdef __LITTLE_ENDIAN
1544
1545/*
1546 * Compare two keys, on little-endian the disk order is same as CPU order and
1547 * we can avoid the conversion.
1548 */
1549static int comp_keys(const struct btrfs_disk_key *disk_key,
1550		     const struct btrfs_key *k2)
1551{
1552	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1553
1554	return btrfs_comp_cpu_keys(k1, k2);
1555}
1556
1557#else
1558
1559/*
1560 * compare two keys in a memcmp fashion
1561 */
1562static int comp_keys(const struct btrfs_disk_key *disk,
1563		     const struct btrfs_key *k2)
1564{
1565	struct btrfs_key k1;
1566
1567	btrfs_disk_key_to_cpu(&k1, disk);
1568
1569	return btrfs_comp_cpu_keys(&k1, k2);
1570}
1571#endif
1572
1573/*
1574 * same as comp_keys only with two btrfs_key's
1575 */
1576int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1577{
1578	if (k1->objectid > k2->objectid)
1579		return 1;
1580	if (k1->objectid < k2->objectid)
1581		return -1;
1582	if (k1->type > k2->type)
1583		return 1;
1584	if (k1->type < k2->type)
1585		return -1;
1586	if (k1->offset > k2->offset)
1587		return 1;
1588	if (k1->offset < k2->offset)
1589		return -1;
1590	return 0;
1591}
1592
1593/*
1594 * this is used by the defrag code to go through all the
1595 * leaves pointed to by a node and reallocate them so that
1596 * disk order is close to key order
1597 */
1598int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1599		       struct btrfs_root *root, struct extent_buffer *parent,
1600		       int start_slot, u64 *last_ret,
1601		       struct btrfs_key *progress)
1602{
1603	struct btrfs_fs_info *fs_info = root->fs_info;
1604	struct extent_buffer *cur;
1605	u64 blocknr;
1606	u64 gen;
1607	u64 search_start = *last_ret;
1608	u64 last_block = 0;
1609	u64 other;
1610	u32 parent_nritems;
1611	int end_slot;
1612	int i;
1613	int err = 0;
1614	int parent_level;
1615	int uptodate;
1616	u32 blocksize;
1617	int progress_passed = 0;
1618	struct btrfs_disk_key disk_key;
1619
1620	parent_level = btrfs_header_level(parent);
1621
1622	WARN_ON(trans->transaction != fs_info->running_transaction);
1623	WARN_ON(trans->transid != fs_info->generation);
1624
1625	parent_nritems = btrfs_header_nritems(parent);
1626	blocksize = fs_info->nodesize;
1627	end_slot = parent_nritems - 1;
1628
1629	if (parent_nritems <= 1)
1630		return 0;
1631
1632	btrfs_set_lock_blocking_write(parent);
1633
1634	for (i = start_slot; i <= end_slot; i++) {
1635		struct btrfs_key first_key;
1636		int close = 1;
1637
1638		btrfs_node_key(parent, &disk_key, i);
1639		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1640			continue;
1641
1642		progress_passed = 1;
1643		blocknr = btrfs_node_blockptr(parent, i);
1644		gen = btrfs_node_ptr_generation(parent, i);
1645		btrfs_node_key_to_cpu(parent, &first_key, i);
1646		if (last_block == 0)
1647			last_block = blocknr;
1648
1649		if (i > 0) {
1650			other = btrfs_node_blockptr(parent, i - 1);
1651			close = close_blocks(blocknr, other, blocksize);
1652		}
1653		if (!close && i < end_slot) {
1654			other = btrfs_node_blockptr(parent, i + 1);
1655			close = close_blocks(blocknr, other, blocksize);
1656		}
1657		if (close) {
1658			last_block = blocknr;
1659			continue;
1660		}
1661
1662		cur = find_extent_buffer(fs_info, blocknr);
1663		if (cur)
1664			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1665		else
1666			uptodate = 0;
1667		if (!cur || !uptodate) {
1668			if (!cur) {
1669				cur = read_tree_block(fs_info, blocknr, gen,
1670						      parent_level - 1,
1671						      &first_key);
1672				if (IS_ERR(cur)) {
1673					return PTR_ERR(cur);
1674				} else if (!extent_buffer_uptodate(cur)) {
1675					free_extent_buffer(cur);
1676					return -EIO;
1677				}
1678			} else if (!uptodate) {
1679				err = btrfs_read_buffer(cur, gen,
1680						parent_level - 1,&first_key);
1681				if (err) {
1682					free_extent_buffer(cur);
1683					return err;
1684				}
1685			}
1686		}
1687		if (search_start == 0)
1688			search_start = last_block;
1689
1690		btrfs_tree_lock(cur);
1691		btrfs_set_lock_blocking_write(cur);
1692		err = __btrfs_cow_block(trans, root, cur, parent, i,
1693					&cur, search_start,
1694					min(16 * blocksize,
1695					    (end_slot - i) * blocksize),
1696					BTRFS_NESTING_COW);
1697		if (err) {
1698			btrfs_tree_unlock(cur);
1699			free_extent_buffer(cur);
1700			break;
1701		}
1702		search_start = cur->start;
1703		last_block = cur->start;
1704		*last_ret = search_start;
1705		btrfs_tree_unlock(cur);
1706		free_extent_buffer(cur);
1707	}
1708	return err;
1709}
1710
1711/*
1712 * search for key in the extent_buffer.  The items start at offset p,
1713 * and they are item_size apart.  There are 'max' items in p.
1714 *
1715 * the slot in the array is returned via slot, and it points to
1716 * the place where you would insert key if it is not found in
1717 * the array.
1718 *
1719 * slot may point to max if the key is bigger than all of the keys
1720 */
1721static noinline int generic_bin_search(struct extent_buffer *eb,
1722				       unsigned long p, int item_size,
1723				       const struct btrfs_key *key,
1724				       int max, int *slot)
1725{
1726	int low = 0;
1727	int high = max;
1728	int ret;
1729	const int key_size = sizeof(struct btrfs_disk_key);
1730
1731	if (low > high) {
1732		btrfs_err(eb->fs_info,
1733		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1734			  __func__, low, high, eb->start,
1735			  btrfs_header_owner(eb), btrfs_header_level(eb));
1736		return -EINVAL;
1737	}
1738
1739	while (low < high) {
1740		unsigned long oip;
1741		unsigned long offset;
1742		struct btrfs_disk_key *tmp;
1743		struct btrfs_disk_key unaligned;
1744		int mid;
1745
1746		mid = (low + high) / 2;
1747		offset = p + mid * item_size;
1748		oip = offset_in_page(offset);
1749
1750		if (oip + key_size <= PAGE_SIZE) {
1751			const unsigned long idx = offset >> PAGE_SHIFT;
1752			char *kaddr = page_address(eb->pages[idx]);
1753
1754			tmp = (struct btrfs_disk_key *)(kaddr + oip);
1755		} else {
1756			read_extent_buffer(eb, &unaligned, offset, key_size);
1757			tmp = &unaligned;
1758		}
1759
1760		ret = comp_keys(tmp, key);
1761
1762		if (ret < 0)
1763			low = mid + 1;
1764		else if (ret > 0)
1765			high = mid;
1766		else {
1767			*slot = mid;
1768			return 0;
1769		}
1770	}
1771	*slot = low;
1772	return 1;
1773}
1774
1775/*
1776 * simple bin_search frontend that does the right thing for
1777 * leaves vs nodes
1778 */
1779int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1780		     int *slot)
1781{
1782	if (btrfs_header_level(eb) == 0)
1783		return generic_bin_search(eb,
1784					  offsetof(struct btrfs_leaf, items),
1785					  sizeof(struct btrfs_item),
1786					  key, btrfs_header_nritems(eb),
1787					  slot);
1788	else
1789		return generic_bin_search(eb,
1790					  offsetof(struct btrfs_node, ptrs),
1791					  sizeof(struct btrfs_key_ptr),
1792					  key, btrfs_header_nritems(eb),
1793					  slot);
1794}
1795
1796static void root_add_used(struct btrfs_root *root, u32 size)
1797{
1798	spin_lock(&root->accounting_lock);
1799	btrfs_set_root_used(&root->root_item,
1800			    btrfs_root_used(&root->root_item) + size);
1801	spin_unlock(&root->accounting_lock);
1802}
1803
1804static void root_sub_used(struct btrfs_root *root, u32 size)
1805{
1806	spin_lock(&root->accounting_lock);
1807	btrfs_set_root_used(&root->root_item,
1808			    btrfs_root_used(&root->root_item) - size);
1809	spin_unlock(&root->accounting_lock);
1810}
1811
1812/* given a node and slot number, this reads the blocks it points to.  The
1813 * extent buffer is returned with a reference taken (but unlocked).
1814 */
1815struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1816					   int slot)
1817{
1818	int level = btrfs_header_level(parent);
1819	struct extent_buffer *eb;
1820	struct btrfs_key first_key;
1821
1822	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1823		return ERR_PTR(-ENOENT);
1824
1825	BUG_ON(level == 0);
1826
1827	btrfs_node_key_to_cpu(parent, &first_key, slot);
1828	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1829			     btrfs_node_ptr_generation(parent, slot),
1830			     level - 1, &first_key);
1831	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1832		free_extent_buffer(eb);
1833		eb = ERR_PTR(-EIO);
1834	}
1835
1836	return eb;
1837}
1838
1839/*
1840 * node level balancing, used to make sure nodes are in proper order for
1841 * item deletion.  We balance from the top down, so we have to make sure
1842 * that a deletion won't leave an node completely empty later on.
1843 */
1844static noinline int balance_level(struct btrfs_trans_handle *trans,
1845			 struct btrfs_root *root,
1846			 struct btrfs_path *path, int level)
1847{
1848	struct btrfs_fs_info *fs_info = root->fs_info;
1849	struct extent_buffer *right = NULL;
1850	struct extent_buffer *mid;
1851	struct extent_buffer *left = NULL;
1852	struct extent_buffer *parent = NULL;
1853	int ret = 0;
1854	int wret;
1855	int pslot;
1856	int orig_slot = path->slots[level];
1857	u64 orig_ptr;
1858
1859	ASSERT(level > 0);
1860
1861	mid = path->nodes[level];
1862
1863	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1864		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1865	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1866
1867	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1868
1869	if (level < BTRFS_MAX_LEVEL - 1) {
1870		parent = path->nodes[level + 1];
1871		pslot = path->slots[level + 1];
1872	}
1873
1874	/*
1875	 * deal with the case where there is only one pointer in the root
1876	 * by promoting the node below to a root
1877	 */
1878	if (!parent) {
1879		struct extent_buffer *child;
1880
1881		if (btrfs_header_nritems(mid) != 1)
1882			return 0;
1883
1884		/* promote the child to a root */
1885		child = btrfs_read_node_slot(mid, 0);
1886		if (IS_ERR(child)) {
1887			ret = PTR_ERR(child);
1888			btrfs_handle_fs_error(fs_info, ret, NULL);
1889			goto enospc;
1890		}
1891
1892		btrfs_tree_lock(child);
1893		btrfs_set_lock_blocking_write(child);
1894		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1895				      BTRFS_NESTING_COW);
1896		if (ret) {
1897			btrfs_tree_unlock(child);
1898			free_extent_buffer(child);
1899			goto enospc;
1900		}
1901
1902		ret = tree_mod_log_insert_root(root->node, child, 1);
1903		BUG_ON(ret < 0);
1904		rcu_assign_pointer(root->node, child);
1905
1906		add_root_to_dirty_list(root);
1907		btrfs_tree_unlock(child);
1908
1909		path->locks[level] = 0;
1910		path->nodes[level] = NULL;
1911		btrfs_clean_tree_block(mid);
1912		btrfs_tree_unlock(mid);
1913		/* once for the path */
1914		free_extent_buffer(mid);
1915
1916		root_sub_used(root, mid->len);
1917		btrfs_free_tree_block(trans, root, mid, 0, 1);
1918		/* once for the root ptr */
1919		free_extent_buffer_stale(mid);
1920		return 0;
1921	}
1922	if (btrfs_header_nritems(mid) >
1923	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1924		return 0;
1925
1926	left = btrfs_read_node_slot(parent, pslot - 1);
1927	if (IS_ERR(left))
1928		left = NULL;
1929
1930	if (left) {
1931		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1932		btrfs_set_lock_blocking_write(left);
1933		wret = btrfs_cow_block(trans, root, left,
1934				       parent, pslot - 1, &left,
1935				       BTRFS_NESTING_LEFT_COW);
1936		if (wret) {
1937			ret = wret;
1938			goto enospc;
1939		}
1940	}
1941
1942	right = btrfs_read_node_slot(parent, pslot + 1);
1943	if (IS_ERR(right))
1944		right = NULL;
1945
1946	if (right) {
1947		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1948		btrfs_set_lock_blocking_write(right);
1949		wret = btrfs_cow_block(trans, root, right,
1950				       parent, pslot + 1, &right,
1951				       BTRFS_NESTING_RIGHT_COW);
1952		if (wret) {
1953			ret = wret;
1954			goto enospc;
1955		}
1956	}
1957
1958	/* first, try to make some room in the middle buffer */
1959	if (left) {
1960		orig_slot += btrfs_header_nritems(left);
1961		wret = push_node_left(trans, left, mid, 1);
1962		if (wret < 0)
1963			ret = wret;
1964	}
1965
1966	/*
1967	 * then try to empty the right most buffer into the middle
1968	 */
1969	if (right) {
1970		wret = push_node_left(trans, mid, right, 1);
1971		if (wret < 0 && wret != -ENOSPC)
1972			ret = wret;
1973		if (btrfs_header_nritems(right) == 0) {
1974			btrfs_clean_tree_block(right);
1975			btrfs_tree_unlock(right);
1976			del_ptr(root, path, level + 1, pslot + 1);
1977			root_sub_used(root, right->len);
1978			btrfs_free_tree_block(trans, root, right, 0, 1);
1979			free_extent_buffer_stale(right);
1980			right = NULL;
1981		} else {
1982			struct btrfs_disk_key right_key;
1983			btrfs_node_key(right, &right_key, 0);
1984			ret = tree_mod_log_insert_key(parent, pslot + 1,
1985					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1986			BUG_ON(ret < 0);
1987			btrfs_set_node_key(parent, &right_key, pslot + 1);
1988			btrfs_mark_buffer_dirty(parent);
1989		}
1990	}
1991	if (btrfs_header_nritems(mid) == 1) {
1992		/*
1993		 * we're not allowed to leave a node with one item in the
1994		 * tree during a delete.  A deletion from lower in the tree
1995		 * could try to delete the only pointer in this node.
1996		 * So, pull some keys from the left.
1997		 * There has to be a left pointer at this point because
1998		 * otherwise we would have pulled some pointers from the
1999		 * right
2000		 */
2001		if (!left) {
2002			ret = -EROFS;
2003			btrfs_handle_fs_error(fs_info, ret, NULL);
2004			goto enospc;
2005		}
2006		wret = balance_node_right(trans, mid, left);
2007		if (wret < 0) {
2008			ret = wret;
2009			goto enospc;
2010		}
2011		if (wret == 1) {
2012			wret = push_node_left(trans, left, mid, 1);
2013			if (wret < 0)
2014				ret = wret;
2015		}
2016		BUG_ON(wret == 1);
2017	}
2018	if (btrfs_header_nritems(mid) == 0) {
2019		btrfs_clean_tree_block(mid);
2020		btrfs_tree_unlock(mid);
2021		del_ptr(root, path, level + 1, pslot);
2022		root_sub_used(root, mid->len);
2023		btrfs_free_tree_block(trans, root, mid, 0, 1);
2024		free_extent_buffer_stale(mid);
2025		mid = NULL;
2026	} else {
2027		/* update the parent key to reflect our changes */
2028		struct btrfs_disk_key mid_key;
2029		btrfs_node_key(mid, &mid_key, 0);
2030		ret = tree_mod_log_insert_key(parent, pslot,
2031				MOD_LOG_KEY_REPLACE, GFP_NOFS);
2032		BUG_ON(ret < 0);
2033		btrfs_set_node_key(parent, &mid_key, pslot);
2034		btrfs_mark_buffer_dirty(parent);
2035	}
2036
2037	/* update the path */
2038	if (left) {
2039		if (btrfs_header_nritems(left) > orig_slot) {
2040			atomic_inc(&left->refs);
2041			/* left was locked after cow */
2042			path->nodes[level] = left;
2043			path->slots[level + 1] -= 1;
2044			path->slots[level] = orig_slot;
2045			if (mid) {
2046				btrfs_tree_unlock(mid);
2047				free_extent_buffer(mid);
2048			}
2049		} else {
2050			orig_slot -= btrfs_header_nritems(left);
2051			path->slots[level] = orig_slot;
2052		}
2053	}
2054	/* double check we haven't messed things up */
2055	if (orig_ptr !=
2056	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2057		BUG();
2058enospc:
2059	if (right) {
2060		btrfs_tree_unlock(right);
2061		free_extent_buffer(right);
2062	}
2063	if (left) {
2064		if (path->nodes[level] != left)
2065			btrfs_tree_unlock(left);
2066		free_extent_buffer(left);
2067	}
2068	return ret;
2069}
2070
2071/* Node balancing for insertion.  Here we only split or push nodes around
2072 * when they are completely full.  This is also done top down, so we
2073 * have to be pessimistic.
2074 */
2075static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2076					  struct btrfs_root *root,
2077					  struct btrfs_path *path, int level)
2078{
2079	struct btrfs_fs_info *fs_info = root->fs_info;
2080	struct extent_buffer *right = NULL;
2081	struct extent_buffer *mid;
2082	struct extent_buffer *left = NULL;
2083	struct extent_buffer *parent = NULL;
2084	int ret = 0;
2085	int wret;
2086	int pslot;
2087	int orig_slot = path->slots[level];
2088
2089	if (level == 0)
2090		return 1;
2091
2092	mid = path->nodes[level];
2093	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2094
2095	if (level < BTRFS_MAX_LEVEL - 1) {
2096		parent = path->nodes[level + 1];
2097		pslot = path->slots[level + 1];
2098	}
2099
2100	if (!parent)
2101		return 1;
2102
2103	left = btrfs_read_node_slot(parent, pslot - 1);
2104	if (IS_ERR(left))
2105		left = NULL;
2106
2107	/* first, try to make some room in the middle buffer */
2108	if (left) {
2109		u32 left_nr;
2110
2111		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2112		btrfs_set_lock_blocking_write(left);
2113
2114		left_nr = btrfs_header_nritems(left);
2115		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2116			wret = 1;
2117		} else {
2118			ret = btrfs_cow_block(trans, root, left, parent,
2119					      pslot - 1, &left,
2120					      BTRFS_NESTING_LEFT_COW);
2121			if (ret)
2122				wret = 1;
2123			else {
2124				wret = push_node_left(trans, left, mid, 0);
2125			}
2126		}
2127		if (wret < 0)
2128			ret = wret;
2129		if (wret == 0) {
2130			struct btrfs_disk_key disk_key;
2131			orig_slot += left_nr;
2132			btrfs_node_key(mid, &disk_key, 0);
2133			ret = tree_mod_log_insert_key(parent, pslot,
2134					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2135			BUG_ON(ret < 0);
2136			btrfs_set_node_key(parent, &disk_key, pslot);
2137			btrfs_mark_buffer_dirty(parent);
2138			if (btrfs_header_nritems(left) > orig_slot) {
2139				path->nodes[level] = left;
2140				path->slots[level + 1] -= 1;
2141				path->slots[level] = orig_slot;
2142				btrfs_tree_unlock(mid);
2143				free_extent_buffer(mid);
2144			} else {
2145				orig_slot -=
2146					btrfs_header_nritems(left);
2147				path->slots[level] = orig_slot;
2148				btrfs_tree_unlock(left);
2149				free_extent_buffer(left);
2150			}
2151			return 0;
2152		}
2153		btrfs_tree_unlock(left);
2154		free_extent_buffer(left);
2155	}
2156	right = btrfs_read_node_slot(parent, pslot + 1);
2157	if (IS_ERR(right))
2158		right = NULL;
2159
2160	/*
2161	 * then try to empty the right most buffer into the middle
2162	 */
2163	if (right) {
2164		u32 right_nr;
2165
2166		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2167		btrfs_set_lock_blocking_write(right);
2168
2169		right_nr = btrfs_header_nritems(right);
2170		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2171			wret = 1;
2172		} else {
2173			ret = btrfs_cow_block(trans, root, right,
2174					      parent, pslot + 1,
2175					      &right, BTRFS_NESTING_RIGHT_COW);
2176			if (ret)
2177				wret = 1;
2178			else {
2179				wret = balance_node_right(trans, right, mid);
2180			}
2181		}
2182		if (wret < 0)
2183			ret = wret;
2184		if (wret == 0) {
2185			struct btrfs_disk_key disk_key;
2186
2187			btrfs_node_key(right, &disk_key, 0);
2188			ret = tree_mod_log_insert_key(parent, pslot + 1,
2189					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2190			BUG_ON(ret < 0);
2191			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2192			btrfs_mark_buffer_dirty(parent);
2193
2194			if (btrfs_header_nritems(mid) <= orig_slot) {
2195				path->nodes[level] = right;
2196				path->slots[level + 1] += 1;
2197				path->slots[level] = orig_slot -
2198					btrfs_header_nritems(mid);
2199				btrfs_tree_unlock(mid);
2200				free_extent_buffer(mid);
2201			} else {
2202				btrfs_tree_unlock(right);
2203				free_extent_buffer(right);
2204			}
2205			return 0;
2206		}
2207		btrfs_tree_unlock(right);
2208		free_extent_buffer(right);
2209	}
2210	return 1;
2211}
2212
2213/*
2214 * readahead one full node of leaves, finding things that are close
2215 * to the block in 'slot', and triggering ra on them.
2216 */
2217static void reada_for_search(struct btrfs_fs_info *fs_info,
2218			     struct btrfs_path *path,
2219			     int level, int slot, u64 objectid)
2220{
2221	struct extent_buffer *node;
2222	struct btrfs_disk_key disk_key;
2223	u32 nritems;
2224	u64 search;
2225	u64 target;
2226	u64 nread = 0;
2227	struct extent_buffer *eb;
2228	u32 nr;
2229	u32 blocksize;
2230	u32 nscan = 0;
2231
2232	if (level != 1)
2233		return;
2234
2235	if (!path->nodes[level])
2236		return;
2237
2238	node = path->nodes[level];
2239
2240	search = btrfs_node_blockptr(node, slot);
2241	blocksize = fs_info->nodesize;
2242	eb = find_extent_buffer(fs_info, search);
2243	if (eb) {
2244		free_extent_buffer(eb);
2245		return;
2246	}
2247
2248	target = search;
2249
2250	nritems = btrfs_header_nritems(node);
2251	nr = slot;
2252
2253	while (1) {
2254		if (path->reada == READA_BACK) {
2255			if (nr == 0)
2256				break;
2257			nr--;
2258		} else if (path->reada == READA_FORWARD) {
2259			nr++;
2260			if (nr >= nritems)
2261				break;
2262		}
2263		if (path->reada == READA_BACK && objectid) {
2264			btrfs_node_key(node, &disk_key, nr);
2265			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2266				break;
2267		}
2268		search = btrfs_node_blockptr(node, nr);
2269		if ((search <= target && target - search <= 65536) ||
2270		    (search > target && search - target <= 65536)) {
2271			readahead_tree_block(fs_info, search);
2272			nread += blocksize;
2273		}
2274		nscan++;
2275		if ((nread > 65536 || nscan > 32))
2276			break;
2277	}
2278}
2279
2280static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2281				       struct btrfs_path *path, int level)
2282{
2283	int slot;
2284	int nritems;
2285	struct extent_buffer *parent;
2286	struct extent_buffer *eb;
2287	u64 gen;
2288	u64 block1 = 0;
2289	u64 block2 = 0;
2290
2291	parent = path->nodes[level + 1];
2292	if (!parent)
2293		return;
2294
2295	nritems = btrfs_header_nritems(parent);
2296	slot = path->slots[level + 1];
2297
2298	if (slot > 0) {
2299		block1 = btrfs_node_blockptr(parent, slot - 1);
2300		gen = btrfs_node_ptr_generation(parent, slot - 1);
2301		eb = find_extent_buffer(fs_info, block1);
2302		/*
2303		 * if we get -eagain from btrfs_buffer_uptodate, we
2304		 * don't want to return eagain here.  That will loop
2305		 * forever
2306		 */
2307		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2308			block1 = 0;
2309		free_extent_buffer(eb);
2310	}
2311	if (slot + 1 < nritems) {
2312		block2 = btrfs_node_blockptr(parent, slot + 1);
2313		gen = btrfs_node_ptr_generation(parent, slot + 1);
2314		eb = find_extent_buffer(fs_info, block2);
2315		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2316			block2 = 0;
2317		free_extent_buffer(eb);
2318	}
2319
2320	if (block1)
2321		readahead_tree_block(fs_info, block1);
2322	if (block2)
2323		readahead_tree_block(fs_info, block2);
2324}
2325
2326
2327/*
2328 * when we walk down the tree, it is usually safe to unlock the higher layers
2329 * in the tree.  The exceptions are when our path goes through slot 0, because
2330 * operations on the tree might require changing key pointers higher up in the
2331 * tree.
2332 *
2333 * callers might also have set path->keep_locks, which tells this code to keep
2334 * the lock if the path points to the last slot in the block.  This is part of
2335 * walking through the tree, and selecting the next slot in the higher block.
2336 *
2337 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2338 * if lowest_unlock is 1, level 0 won't be unlocked
2339 */
2340static noinline void unlock_up(struct btrfs_path *path, int level,
2341			       int lowest_unlock, int min_write_lock_level,
2342			       int *write_lock_level)
2343{
2344	int i;
2345	int skip_level = level;
2346	int no_skips = 0;
2347	struct extent_buffer *t;
2348
2349	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2350		if (!path->nodes[i])
2351			break;
2352		if (!path->locks[i])
2353			break;
2354		if (!no_skips && path->slots[i] == 0) {
2355			skip_level = i + 1;
2356			continue;
2357		}
2358		if (!no_skips && path->keep_locks) {
2359			u32 nritems;
2360			t = path->nodes[i];
2361			nritems = btrfs_header_nritems(t);
2362			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2363				skip_level = i + 1;
2364				continue;
2365			}
2366		}
2367		if (skip_level < i && i >= lowest_unlock)
2368			no_skips = 1;
2369
2370		t = path->nodes[i];
2371		if (i >= lowest_unlock && i > skip_level) {
2372			btrfs_tree_unlock_rw(t, path->locks[i]);
2373			path->locks[i] = 0;
2374			if (write_lock_level &&
2375			    i > min_write_lock_level &&
2376			    i <= *write_lock_level) {
2377				*write_lock_level = i - 1;
2378			}
2379		}
2380	}
2381}
2382
2383/*
2384 * helper function for btrfs_search_slot.  The goal is to find a block
2385 * in cache without setting the path to blocking.  If we find the block
2386 * we return zero and the path is unchanged.
2387 *
2388 * If we can't find the block, we set the path blocking and do some
2389 * reada.  -EAGAIN is returned and the search must be repeated.
2390 */
2391static int
2392read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2393		      struct extent_buffer **eb_ret, int level, int slot,
2394		      const struct btrfs_key *key)
2395{
2396	struct btrfs_fs_info *fs_info = root->fs_info;
2397	u64 blocknr;
2398	u64 gen;
2399	struct extent_buffer *tmp;
2400	struct btrfs_key first_key;
2401	int ret;
2402	int parent_level;
2403
2404	blocknr = btrfs_node_blockptr(*eb_ret, slot);
2405	gen = btrfs_node_ptr_generation(*eb_ret, slot);
2406	parent_level = btrfs_header_level(*eb_ret);
2407	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2408
2409	tmp = find_extent_buffer(fs_info, blocknr);
2410	if (tmp) {
2411		/* first we do an atomic uptodate check */
2412		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2413			/*
2414			 * Do extra check for first_key, eb can be stale due to
2415			 * being cached, read from scrub, or have multiple
2416			 * parents (shared tree blocks).
2417			 */
2418			if (btrfs_verify_level_key(tmp,
2419					parent_level - 1, &first_key, gen)) {
2420				free_extent_buffer(tmp);
2421				return -EUCLEAN;
2422			}
2423			*eb_ret = tmp;
2424			return 0;
2425		}
2426
2427		/* the pages were up to date, but we failed
2428		 * the generation number check.  Do a full
2429		 * read for the generation number that is correct.
2430		 * We must do this without dropping locks so
2431		 * we can trust our generation number
2432		 */
2433		btrfs_set_path_blocking(p);
2434
2435		/* now we're allowed to do a blocking uptodate check */
2436		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2437		if (!ret) {
2438			*eb_ret = tmp;
2439			return 0;
2440		}
2441		free_extent_buffer(tmp);
2442		btrfs_release_path(p);
2443		return -EIO;
2444	}
2445
2446	/*
2447	 * reduce lock contention at high levels
2448	 * of the btree by dropping locks before
2449	 * we read.  Don't release the lock on the current
2450	 * level because we need to walk this node to figure
2451	 * out which blocks to read.
2452	 */
2453	btrfs_unlock_up_safe(p, level + 1);
2454	btrfs_set_path_blocking(p);
2455
2456	if (p->reada != READA_NONE)
2457		reada_for_search(fs_info, p, level, slot, key->objectid);
2458
2459	ret = -EAGAIN;
2460	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2461			      &first_key);
2462	if (!IS_ERR(tmp)) {
2463		/*
2464		 * If the read above didn't mark this buffer up to date,
2465		 * it will never end up being up to date.  Set ret to EIO now
2466		 * and give up so that our caller doesn't loop forever
2467		 * on our EAGAINs.
2468		 */
2469		if (!extent_buffer_uptodate(tmp))
2470			ret = -EIO;
2471		free_extent_buffer(tmp);
2472	} else {
2473		ret = PTR_ERR(tmp);
2474	}
2475
2476	btrfs_release_path(p);
2477	return ret;
2478}
2479
2480/*
2481 * helper function for btrfs_search_slot.  This does all of the checks
2482 * for node-level blocks and does any balancing required based on
2483 * the ins_len.
2484 *
2485 * If no extra work was required, zero is returned.  If we had to
2486 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2487 * start over
2488 */
2489static int
2490setup_nodes_for_search(struct btrfs_trans_handle *trans,
2491		       struct btrfs_root *root, struct btrfs_path *p,
2492		       struct extent_buffer *b, int level, int ins_len,
2493		       int *write_lock_level)
2494{
2495	struct btrfs_fs_info *fs_info = root->fs_info;
2496	int ret;
2497
2498	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2499	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2500		int sret;
2501
2502		if (*write_lock_level < level + 1) {
2503			*write_lock_level = level + 1;
2504			btrfs_release_path(p);
2505			goto again;
2506		}
2507
2508		btrfs_set_path_blocking(p);
2509		reada_for_balance(fs_info, p, level);
2510		sret = split_node(trans, root, p, level);
2511
2512		BUG_ON(sret > 0);
2513		if (sret) {
2514			ret = sret;
2515			goto done;
2516		}
2517		b = p->nodes[level];
2518	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2519		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2520		int sret;
2521
2522		if (*write_lock_level < level + 1) {
2523			*write_lock_level = level + 1;
2524			btrfs_release_path(p);
2525			goto again;
2526		}
2527
2528		btrfs_set_path_blocking(p);
2529		reada_for_balance(fs_info, p, level);
2530		sret = balance_level(trans, root, p, level);
2531
2532		if (sret) {
2533			ret = sret;
2534			goto done;
2535		}
2536		b = p->nodes[level];
2537		if (!b) {
2538			btrfs_release_path(p);
2539			goto again;
2540		}
2541		BUG_ON(btrfs_header_nritems(b) == 1);
2542	}
2543	return 0;
2544
2545again:
2546	ret = -EAGAIN;
2547done:
2548	return ret;
2549}
2550
2551int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2552		u64 iobjectid, u64 ioff, u8 key_type,
2553		struct btrfs_key *found_key)
2554{
2555	int ret;
2556	struct btrfs_key key;
2557	struct extent_buffer *eb;
2558
2559	ASSERT(path);
2560	ASSERT(found_key);
2561
2562	key.type = key_type;
2563	key.objectid = iobjectid;
2564	key.offset = ioff;
2565
2566	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2567	if (ret < 0)
2568		return ret;
2569
2570	eb = path->nodes[0];
2571	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2572		ret = btrfs_next_leaf(fs_root, path);
2573		if (ret)
2574			return ret;
2575		eb = path->nodes[0];
2576	}
2577
2578	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2579	if (found_key->type != key.type ||
2580			found_key->objectid != key.objectid)
2581		return 1;
2582
2583	return 0;
2584}
2585
2586static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2587							struct btrfs_path *p,
2588							int write_lock_level)
2589{
2590	struct btrfs_fs_info *fs_info = root->fs_info;
2591	struct extent_buffer *b;
2592	int root_lock = 0;
2593	int level = 0;
2594
2595	if (p->search_commit_root) {
2596		/*
2597		 * The commit roots are read only so we always do read locks,
2598		 * and we always must hold the commit_root_sem when doing
2599		 * searches on them, the only exception is send where we don't
2600		 * want to block transaction commits for a long time, so
2601		 * we need to clone the commit root in order to avoid races
2602		 * with transaction commits that create a snapshot of one of
2603		 * the roots used by a send operation.
2604		 */
2605		if (p->need_commit_sem) {
2606			down_read(&fs_info->commit_root_sem);
2607			b = btrfs_clone_extent_buffer(root->commit_root);
2608			up_read(&fs_info->commit_root_sem);
2609			if (!b)
2610				return ERR_PTR(-ENOMEM);
2611
2612		} else {
2613			b = root->commit_root;
2614			atomic_inc(&b->refs);
2615		}
2616		level = btrfs_header_level(b);
2617		/*
2618		 * Ensure that all callers have set skip_locking when
2619		 * p->search_commit_root = 1.
2620		 */
2621		ASSERT(p->skip_locking == 1);
2622
2623		goto out;
2624	}
2625
2626	if (p->skip_locking) {
2627		b = btrfs_root_node(root);
2628		level = btrfs_header_level(b);
2629		goto out;
2630	}
2631
2632	/* We try very hard to do read locks on the root */
2633	root_lock = BTRFS_READ_LOCK;
2634
2635	/*
2636	 * If the level is set to maximum, we can skip trying to get the read
2637	 * lock.
2638	 */
2639	if (write_lock_level < BTRFS_MAX_LEVEL) {
2640		/*
2641		 * We don't know the level of the root node until we actually
2642		 * have it read locked
2643		 */
2644		b = __btrfs_read_lock_root_node(root, p->recurse);
2645		level = btrfs_header_level(b);
2646		if (level > write_lock_level)
2647			goto out;
2648
2649		/* Whoops, must trade for write lock */
2650		btrfs_tree_read_unlock(b);
2651		free_extent_buffer(b);
2652	}
2653
2654	b = btrfs_lock_root_node(root);
2655	root_lock = BTRFS_WRITE_LOCK;
2656
2657	/* The level might have changed, check again */
2658	level = btrfs_header_level(b);
2659
2660out:
2661	/*
2662	 * The root may have failed to write out at some point, and thus is no
2663	 * longer valid, return an error in this case.
2664	 */
2665	if (!extent_buffer_uptodate(b)) {
2666		if (root_lock)
2667			btrfs_tree_unlock_rw(b, root_lock);
2668		free_extent_buffer(b);
2669		return ERR_PTR(-EIO);
2670	}
2671
2672	p->nodes[level] = b;
2673	if (!p->skip_locking)
2674		p->locks[level] = root_lock;
2675	/*
2676	 * Callers are responsible for dropping b's references.
2677	 */
2678	return b;
2679}
2680
2681
2682/*
2683 * btrfs_search_slot - look for a key in a tree and perform necessary
2684 * modifications to preserve tree invariants.
2685 *
2686 * @trans:	Handle of transaction, used when modifying the tree
2687 * @p:		Holds all btree nodes along the search path
2688 * @root:	The root node of the tree
2689 * @key:	The key we are looking for
2690 * @ins_len:	Indicates purpose of search, for inserts it is 1, for
2691 *		deletions it's -1. 0 for plain searches
2692 * @cow:	boolean should CoW operations be performed. Must always be 1
2693 *		when modifying the tree.
2694 *
2695 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2696 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2697 *
2698 * If @key is found, 0 is returned and you can find the item in the leaf level
2699 * of the path (level 0)
2700 *
2701 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2702 * points to the slot where it should be inserted
2703 *
2704 * If an error is encountered while searching the tree a negative error number
2705 * is returned
2706 */
2707int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2708		      const struct btrfs_key *key, struct btrfs_path *p,
2709		      int ins_len, int cow)
2710{
2711	struct extent_buffer *b;
2712	int slot;
2713	int ret;
2714	int err;
2715	int level;
2716	int lowest_unlock = 1;
2717	/* everything at write_lock_level or lower must be write locked */
2718	int write_lock_level = 0;
2719	u8 lowest_level = 0;
2720	int min_write_lock_level;
2721	int prev_cmp;
2722
2723	lowest_level = p->lowest_level;
2724	WARN_ON(lowest_level && ins_len > 0);
2725	WARN_ON(p->nodes[0] != NULL);
2726	BUG_ON(!cow && ins_len);
2727
2728	if (ins_len < 0) {
2729		lowest_unlock = 2;
2730
2731		/* when we are removing items, we might have to go up to level
2732		 * two as we update tree pointers  Make sure we keep write
2733		 * for those levels as well
2734		 */
2735		write_lock_level = 2;
2736	} else if (ins_len > 0) {
2737		/*
2738		 * for inserting items, make sure we have a write lock on
2739		 * level 1 so we can update keys
2740		 */
2741		write_lock_level = 1;
2742	}
2743
2744	if (!cow)
2745		write_lock_level = -1;
2746
2747	if (cow && (p->keep_locks || p->lowest_level))
2748		write_lock_level = BTRFS_MAX_LEVEL;
2749
2750	min_write_lock_level = write_lock_level;
2751
2752again:
2753	prev_cmp = -1;
2754	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2755	if (IS_ERR(b)) {
2756		ret = PTR_ERR(b);
2757		goto done;
2758	}
2759
2760	while (b) {
2761		int dec = 0;
2762
2763		level = btrfs_header_level(b);
2764
2765		if (cow) {
2766			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2767
2768			/*
2769			 * if we don't really need to cow this block
2770			 * then we don't want to set the path blocking,
2771			 * so we test it here
2772			 */
2773			if (!should_cow_block(trans, root, b)) {
2774				trans->dirty = true;
2775				goto cow_done;
2776			}
2777
2778			/*
2779			 * must have write locks on this node and the
2780			 * parent
2781			 */
2782			if (level > write_lock_level ||
2783			    (level + 1 > write_lock_level &&
2784			    level + 1 < BTRFS_MAX_LEVEL &&
2785			    p->nodes[level + 1])) {
2786				write_lock_level = level + 1;
2787				btrfs_release_path(p);
2788				goto again;
2789			}
2790
2791			btrfs_set_path_blocking(p);
2792			if (last_level)
2793				err = btrfs_cow_block(trans, root, b, NULL, 0,
2794						      &b,
2795						      BTRFS_NESTING_COW);
2796			else
2797				err = btrfs_cow_block(trans, root, b,
2798						      p->nodes[level + 1],
2799						      p->slots[level + 1], &b,
2800						      BTRFS_NESTING_COW);
2801			if (err) {
2802				ret = err;
2803				goto done;
2804			}
2805		}
2806cow_done:
2807		p->nodes[level] = b;
2808		/*
2809		 * Leave path with blocking locks to avoid massive
2810		 * lock context switch, this is made on purpose.
2811		 */
2812
2813		/*
2814		 * we have a lock on b and as long as we aren't changing
2815		 * the tree, there is no way to for the items in b to change.
2816		 * It is safe to drop the lock on our parent before we
2817		 * go through the expensive btree search on b.
2818		 *
2819		 * If we're inserting or deleting (ins_len != 0), then we might
2820		 * be changing slot zero, which may require changing the parent.
2821		 * So, we can't drop the lock until after we know which slot
2822		 * we're operating on.
2823		 */
2824		if (!ins_len && !p->keep_locks) {
2825			int u = level + 1;
2826
2827			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2828				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2829				p->locks[u] = 0;
2830			}
2831		}
2832
2833		/*
2834		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2835		 * we can safely assume the target key will always be in slot 0
2836		 * on lower levels due to the invariants BTRFS' btree provides,
2837		 * namely that a btrfs_key_ptr entry always points to the
2838		 * lowest key in the child node, thus we can skip searching
2839		 * lower levels
2840		 */
2841		if (prev_cmp == 0) {
2842			slot = 0;
2843			ret = 0;
2844		} else {
2845			ret = btrfs_bin_search(b, key, &slot);
2846			prev_cmp = ret;
2847			if (ret < 0)
2848				goto done;
2849		}
2850
2851		if (level == 0) {
2852			p->slots[level] = slot;
2853			if (ins_len > 0 &&
2854			    btrfs_leaf_free_space(b) < ins_len) {
2855				if (write_lock_level < 1) {
2856					write_lock_level = 1;
2857					btrfs_release_path(p);
2858					goto again;
2859				}
2860
2861				btrfs_set_path_blocking(p);
2862				err = split_leaf(trans, root, key,
2863						 p, ins_len, ret == 0);
2864
2865				BUG_ON(err > 0);
2866				if (err) {
2867					ret = err;
2868					goto done;
2869				}
2870			}
2871			if (!p->search_for_split)
2872				unlock_up(p, level, lowest_unlock,
2873					  min_write_lock_level, NULL);
2874			goto done;
2875		}
2876		if (ret && slot > 0) {
2877			dec = 1;
2878			slot--;
2879		}
2880		p->slots[level] = slot;
2881		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2882					     &write_lock_level);
2883		if (err == -EAGAIN)
2884			goto again;
2885		if (err) {
2886			ret = err;
2887			goto done;
2888		}
2889		b = p->nodes[level];
2890		slot = p->slots[level];
2891
2892		/*
2893		 * Slot 0 is special, if we change the key we have to update
2894		 * the parent pointer which means we must have a write lock on
2895		 * the parent
2896		 */
2897		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2898			write_lock_level = level + 1;
2899			btrfs_release_path(p);
2900			goto again;
2901		}
2902
2903		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2904			  &write_lock_level);
2905
2906		if (level == lowest_level) {
2907			if (dec)
2908				p->slots[level]++;
2909			goto done;
2910		}
2911
2912		err = read_block_for_search(root, p, &b, level, slot, key);
2913		if (err == -EAGAIN)
2914			goto again;
2915		if (err) {
2916			ret = err;
2917			goto done;
2918		}
2919
2920		if (!p->skip_locking) {
2921			level = btrfs_header_level(b);
2922			if (level <= write_lock_level) {
2923				if (!btrfs_try_tree_write_lock(b)) {
2924					btrfs_set_path_blocking(p);
2925					btrfs_tree_lock(b);
2926				}
2927				p->locks[level] = BTRFS_WRITE_LOCK;
2928			} else {
2929				if (!btrfs_tree_read_lock_atomic(b)) {
2930					btrfs_set_path_blocking(p);
2931					__btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2932							       p->recurse);
2933				}
2934				p->locks[level] = BTRFS_READ_LOCK;
2935			}
2936			p->nodes[level] = b;
2937		}
2938	}
2939	ret = 1;
2940done:
2941	/*
2942	 * we don't really know what they plan on doing with the path
2943	 * from here on, so for now just mark it as blocking
2944	 */
2945	if (!p->leave_spinning)
2946		btrfs_set_path_blocking(p);
2947	if (ret < 0 && !p->skip_release_on_error)
2948		btrfs_release_path(p);
2949	return ret;
2950}
2951
2952/*
2953 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2954 * current state of the tree together with the operations recorded in the tree
2955 * modification log to search for the key in a previous version of this tree, as
2956 * denoted by the time_seq parameter.
2957 *
2958 * Naturally, there is no support for insert, delete or cow operations.
2959 *
2960 * The resulting path and return value will be set up as if we called
2961 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2962 */
2963int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2964			  struct btrfs_path *p, u64 time_seq)
2965{
2966	struct btrfs_fs_info *fs_info = root->fs_info;
2967	struct extent_buffer *b;
2968	int slot;
2969	int ret;
2970	int err;
2971	int level;
2972	int lowest_unlock = 1;
2973	u8 lowest_level = 0;
2974
2975	lowest_level = p->lowest_level;
2976	WARN_ON(p->nodes[0] != NULL);
2977
2978	if (p->search_commit_root) {
2979		BUG_ON(time_seq);
2980		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2981	}
2982
2983again:
2984	b = get_old_root(root, time_seq);
2985	if (!b) {
2986		ret = -EIO;
2987		goto done;
2988	}
2989	level = btrfs_header_level(b);
2990	p->locks[level] = BTRFS_READ_LOCK;
2991
2992	while (b) {
2993		int dec = 0;
2994
2995		level = btrfs_header_level(b);
2996		p->nodes[level] = b;
2997
2998		/*
2999		 * we have a lock on b and as long as we aren't changing
3000		 * the tree, there is no way to for the items in b to change.
3001		 * It is safe to drop the lock on our parent before we
3002		 * go through the expensive btree search on b.
3003		 */
3004		btrfs_unlock_up_safe(p, level + 1);
3005
3006		ret = btrfs_bin_search(b, key, &slot);
3007		if (ret < 0)
3008			goto done;
3009
3010		if (level == 0) {
3011			p->slots[level] = slot;
3012			unlock_up(p, level, lowest_unlock, 0, NULL);
3013			goto done;
3014		}
3015
3016		if (ret && slot > 0) {
3017			dec = 1;
3018			slot--;
3019		}
3020		p->slots[level] = slot;
3021		unlock_up(p, level, lowest_unlock, 0, NULL);
3022
3023		if (level == lowest_level) {
3024			if (dec)
3025				p->slots[level]++;
3026			goto done;
3027		}
3028
3029		err = read_block_for_search(root, p, &b, level, slot, key);
3030		if (err == -EAGAIN)
3031			goto again;
3032		if (err) {
3033			ret = err;
3034			goto done;
3035		}
3036
3037		level = btrfs_header_level(b);
3038		if (!btrfs_tree_read_lock_atomic(b)) {
3039			btrfs_set_path_blocking(p);
3040			btrfs_tree_read_lock(b);
3041		}
3042		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3043		if (!b) {
3044			ret = -ENOMEM;
3045			goto done;
3046		}
3047		p->locks[level] = BTRFS_READ_LOCK;
3048		p->nodes[level] = b;
3049	}
3050	ret = 1;
3051done:
3052	if (!p->leave_spinning)
3053		btrfs_set_path_blocking(p);
3054	if (ret < 0)
3055		btrfs_release_path(p);
3056
3057	return ret;
3058}
3059
3060/*
3061 * helper to use instead of search slot if no exact match is needed but
3062 * instead the next or previous item should be returned.
3063 * When find_higher is true, the next higher item is returned, the next lower
3064 * otherwise.
3065 * When return_any and find_higher are both true, and no higher item is found,
3066 * return the next lower instead.
3067 * When return_any is true and find_higher is false, and no lower item is found,
3068 * return the next higher instead.
3069 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3070 * < 0 on error
3071 */
3072int btrfs_search_slot_for_read(struct btrfs_root *root,
3073			       const struct btrfs_key *key,
3074			       struct btrfs_path *p, int find_higher,
3075			       int return_any)
3076{
3077	int ret;
3078	struct extent_buffer *leaf;
3079
3080again:
3081	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3082	if (ret <= 0)
3083		return ret;
3084	/*
3085	 * a return value of 1 means the path is at the position where the
3086	 * item should be inserted. Normally this is the next bigger item,
3087	 * but in case the previous item is the last in a leaf, path points
3088	 * to the first free slot in the previous leaf, i.e. at an invalid
3089	 * item.
3090	 */
3091	leaf = p->nodes[0];
3092
3093	if (find_higher) {
3094		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3095			ret = btrfs_next_leaf(root, p);
3096			if (ret <= 0)
3097				return ret;
3098			if (!return_any)
3099				return 1;
3100			/*
3101			 * no higher item found, return the next
3102			 * lower instead
3103			 */
3104			return_any = 0;
3105			find_higher = 0;
3106			btrfs_release_path(p);
3107			goto again;
3108		}
3109	} else {
3110		if (p->slots[0] == 0) {
3111			ret = btrfs_prev_leaf(root, p);
3112			if (ret < 0)
3113				return ret;
3114			if (!ret) {
3115				leaf = p->nodes[0];
3116				if (p->slots[0] == btrfs_header_nritems(leaf))
3117					p->slots[0]--;
3118				return 0;
3119			}
3120			if (!return_any)
3121				return 1;
3122			/*
3123			 * no lower item found, return the next
3124			 * higher instead
3125			 */
3126			return_any = 0;
3127			find_higher = 1;
3128			btrfs_release_path(p);
3129			goto again;
3130		} else {
3131			--p->slots[0];
3132		}
3133	}
3134	return 0;
3135}
3136
3137/*
3138 * adjust the pointers going up the tree, starting at level
3139 * making sure the right key of each node is points to 'key'.
3140 * This is used after shifting pointers to the left, so it stops
3141 * fixing up pointers when a given leaf/node is not in slot 0 of the
3142 * higher levels
3143 *
3144 */
3145static void fixup_low_keys(struct btrfs_path *path,
3146			   struct btrfs_disk_key *key, int level)
3147{
3148	int i;
3149	struct extent_buffer *t;
3150	int ret;
3151
3152	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3153		int tslot = path->slots[i];
3154
3155		if (!path->nodes[i])
3156			break;
3157		t = path->nodes[i];
3158		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3159				GFP_ATOMIC);
3160		BUG_ON(ret < 0);
3161		btrfs_set_node_key(t, key, tslot);
3162		btrfs_mark_buffer_dirty(path->nodes[i]);
3163		if (tslot != 0)
3164			break;
3165	}
3166}
3167
3168/*
3169 * update item key.
3170 *
3171 * This function isn't completely safe. It's the caller's responsibility
3172 * that the new key won't break the order
3173 */
3174void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3175			     struct btrfs_path *path,
3176			     const struct btrfs_key *new_key)
3177{
3178	struct btrfs_disk_key disk_key;
3179	struct extent_buffer *eb;
3180	int slot;
3181
3182	eb = path->nodes[0];
3183	slot = path->slots[0];
3184	if (slot > 0) {
3185		btrfs_item_key(eb, &disk_key, slot - 1);
3186		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3187			btrfs_crit(fs_info,
3188		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3189				   slot, btrfs_disk_key_objectid(&disk_key),
3190				   btrfs_disk_key_type(&disk_key),
3191				   btrfs_disk_key_offset(&disk_key),
3192				   new_key->objectid, new_key->type,
3193				   new_key->offset);
3194			btrfs_print_leaf(eb);
3195			BUG();
3196		}
3197	}
3198	if (slot < btrfs_header_nritems(eb) - 1) {
3199		btrfs_item_key(eb, &disk_key, slot + 1);
3200		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3201			btrfs_crit(fs_info,
3202		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3203				   slot, btrfs_disk_key_objectid(&disk_key),
3204				   btrfs_disk_key_type(&disk_key),
3205				   btrfs_disk_key_offset(&disk_key),
3206				   new_key->objectid, new_key->type,
3207				   new_key->offset);
3208			btrfs_print_leaf(eb);
3209			BUG();
3210		}
3211	}
3212
3213	btrfs_cpu_key_to_disk(&disk_key, new_key);
3214	btrfs_set_item_key(eb, &disk_key, slot);
3215	btrfs_mark_buffer_dirty(eb);
3216	if (slot == 0)
3217		fixup_low_keys(path, &disk_key, 1);
3218}
3219
3220/*
3221 * Check key order of two sibling extent buffers.
3222 *
3223 * Return true if something is wrong.
3224 * Return false if everything is fine.
3225 *
3226 * Tree-checker only works inside one tree block, thus the following
3227 * corruption can not be detected by tree-checker:
3228 *
3229 * Leaf @left			| Leaf @right
3230 * --------------------------------------------------------------
3231 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3232 *
3233 * Key f6 in leaf @left itself is valid, but not valid when the next
3234 * key in leaf @right is 7.
3235 * This can only be checked at tree block merge time.
3236 * And since tree checker has ensured all key order in each tree block
3237 * is correct, we only need to bother the last key of @left and the first
3238 * key of @right.
3239 */
3240static bool check_sibling_keys(struct extent_buffer *left,
3241			       struct extent_buffer *right)
3242{
3243	struct btrfs_key left_last;
3244	struct btrfs_key right_first;
3245	int level = btrfs_header_level(left);
3246	int nr_left = btrfs_header_nritems(left);
3247	int nr_right = btrfs_header_nritems(right);
3248
3249	/* No key to check in one of the tree blocks */
3250	if (!nr_left || !nr_right)
3251		return false;
3252
3253	if (level) {
3254		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3255		btrfs_node_key_to_cpu(right, &right_first, 0);
3256	} else {
3257		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3258		btrfs_item_key_to_cpu(right, &right_first, 0);
3259	}
3260
3261	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3262		btrfs_crit(left->fs_info,
3263"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3264			   left_last.objectid, left_last.type,
3265			   left_last.offset, right_first.objectid,
3266			   right_first.type, right_first.offset);
3267		return true;
3268	}
3269	return false;
3270}
3271
3272/*
3273 * try to push data from one node into the next node left in the
3274 * tree.
3275 *
3276 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3277 * error, and > 0 if there was no room in the left hand block.
3278 */
3279static int push_node_left(struct btrfs_trans_handle *trans,
3280			  struct extent_buffer *dst,
3281			  struct extent_buffer *src, int empty)
3282{
3283	struct btrfs_fs_info *fs_info = trans->fs_info;
3284	int push_items = 0;
3285	int src_nritems;
3286	int dst_nritems;
3287	int ret = 0;
3288
3289	src_nritems = btrfs_header_nritems(src);
3290	dst_nritems = btrfs_header_nritems(dst);
3291	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3292	WARN_ON(btrfs_header_generation(src) != trans->transid);
3293	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3294
3295	if (!empty && src_nritems <= 8)
3296		return 1;
3297
3298	if (push_items <= 0)
3299		return 1;
3300
3301	if (empty) {
3302		push_items = min(src_nritems, push_items);
3303		if (push_items < src_nritems) {
3304			/* leave at least 8 pointers in the node if
3305			 * we aren't going to empty it
3306			 */
3307			if (src_nritems - push_items < 8) {
3308				if (push_items <= 8)
3309					return 1;
3310				push_items -= 8;
3311			}
3312		}
3313	} else
3314		push_items = min(src_nritems - 8, push_items);
3315
3316	/* dst is the left eb, src is the middle eb */
3317	if (check_sibling_keys(dst, src)) {
3318		ret = -EUCLEAN;
3319		btrfs_abort_transaction(trans, ret);
3320		return ret;
3321	}
3322	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3323	if (ret) {
3324		btrfs_abort_transaction(trans, ret);
3325		return ret;
3326	}
3327	copy_extent_buffer(dst, src,
3328			   btrfs_node_key_ptr_offset(dst_nritems),
3329			   btrfs_node_key_ptr_offset(0),
3330			   push_items * sizeof(struct btrfs_key_ptr));
3331
3332	if (push_items < src_nritems) {
3333		/*
3334		 * Don't call tree_mod_log_insert_move here, key removal was
3335		 * already fully logged by tree_mod_log_eb_copy above.
3336		 */
3337		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3338				      btrfs_node_key_ptr_offset(push_items),
3339				      (src_nritems - push_items) *
3340				      sizeof(struct btrfs_key_ptr));
3341	}
3342	btrfs_set_header_nritems(src, src_nritems - push_items);
3343	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3344	btrfs_mark_buffer_dirty(src);
3345	btrfs_mark_buffer_dirty(dst);
3346
3347	return ret;
3348}
3349
3350/*
3351 * try to push data from one node into the next node right in the
3352 * tree.
3353 *
3354 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3355 * error, and > 0 if there was no room in the right hand block.
3356 *
3357 * this will  only push up to 1/2 the contents of the left node over
3358 */
3359static int balance_node_right(struct btrfs_trans_handle *trans,
3360			      struct extent_buffer *dst,
3361			      struct extent_buffer *src)
3362{
3363	struct btrfs_fs_info *fs_info = trans->fs_info;
3364	int push_items = 0;
3365	int max_push;
3366	int src_nritems;
3367	int dst_nritems;
3368	int ret = 0;
3369
3370	WARN_ON(btrfs_header_generation(src) != trans->transid);
3371	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3372
3373	src_nritems = btrfs_header_nritems(src);
3374	dst_nritems = btrfs_header_nritems(dst);
3375	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3376	if (push_items <= 0)
3377		return 1;
3378
3379	if (src_nritems < 4)
3380		return 1;
3381
3382	max_push = src_nritems / 2 + 1;
3383	/* don't try to empty the node */
3384	if (max_push >= src_nritems)
3385		return 1;
3386
3387	if (max_push < push_items)
3388		push_items = max_push;
3389
3390	/* dst is the right eb, src is the middle eb */
3391	if (check_sibling_keys(src, dst)) {
3392		ret = -EUCLEAN;
3393		btrfs_abort_transaction(trans, ret);
3394		return ret;
3395	}
3396	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3397	BUG_ON(ret < 0);
3398	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3399				      btrfs_node_key_ptr_offset(0),
3400				      (dst_nritems) *
3401				      sizeof(struct btrfs_key_ptr));
3402
3403	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3404				   push_items);
3405	if (ret) {
3406		btrfs_abort_transaction(trans, ret);
3407		return ret;
3408	}
3409	copy_extent_buffer(dst, src,
3410			   btrfs_node_key_ptr_offset(0),
3411			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3412			   push_items * sizeof(struct btrfs_key_ptr));
3413
3414	btrfs_set_header_nritems(src, src_nritems - push_items);
3415	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3416
3417	btrfs_mark_buffer_dirty(src);
3418	btrfs_mark_buffer_dirty(dst);
3419
3420	return ret;
3421}
3422
3423/*
3424 * helper function to insert a new root level in the tree.
3425 * A new node is allocated, and a single item is inserted to
3426 * point to the existing root
3427 *
3428 * returns zero on success or < 0 on failure.
3429 */
3430static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3431			   struct btrfs_root *root,
3432			   struct btrfs_path *path, int level)
3433{
3434	struct btrfs_fs_info *fs_info = root->fs_info;
3435	u64 lower_gen;
3436	struct extent_buffer *lower;
3437	struct extent_buffer *c;
3438	struct extent_buffer *old;
3439	struct btrfs_disk_key lower_key;
3440	int ret;
3441
3442	BUG_ON(path->nodes[level]);
3443	BUG_ON(path->nodes[level-1] != root->node);
3444
3445	lower = path->nodes[level-1];
3446	if (level == 1)
3447		btrfs_item_key(lower, &lower_key, 0);
3448	else
3449		btrfs_node_key(lower, &lower_key, 0);
3450
3451	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3452					 root->node->start, 0,
3453					 BTRFS_NESTING_NEW_ROOT);
3454	if (IS_ERR(c))
3455		return PTR_ERR(c);
3456
3457	root_add_used(root, fs_info->nodesize);
3458
3459	btrfs_set_header_nritems(c, 1);
3460	btrfs_set_node_key(c, &lower_key, 0);
3461	btrfs_set_node_blockptr(c, 0, lower->start);
3462	lower_gen = btrfs_header_generation(lower);
3463	WARN_ON(lower_gen != trans->transid);
3464
3465	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3466
3467	btrfs_mark_buffer_dirty(c);
3468
3469	old = root->node;
3470	ret = tree_mod_log_insert_root(root->node, c, 0);
3471	BUG_ON(ret < 0);
3472	rcu_assign_pointer(root->node, c);
3473
3474	/* the super has an extra ref to root->node */
3475	free_extent_buffer(old);
3476
3477	add_root_to_dirty_list(root);
3478	atomic_inc(&c->refs);
3479	path->nodes[level] = c;
3480	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3481	path->slots[level] = 0;
3482	return 0;
3483}
3484
3485/*
3486 * worker function to insert a single pointer in a node.
3487 * the node should have enough room for the pointer already
3488 *
3489 * slot and level indicate where you want the key to go, and
3490 * blocknr is the block the key points to.
3491 */
3492static void insert_ptr(struct btrfs_trans_handle *trans,
3493		       struct btrfs_path *path,
3494		       struct btrfs_disk_key *key, u64 bytenr,
3495		       int slot, int level)
3496{
3497	struct extent_buffer *lower;
3498	int nritems;
3499	int ret;
3500
3501	BUG_ON(!path->nodes[level]);
3502	btrfs_assert_tree_locked(path->nodes[level]);
3503	lower = path->nodes[level];
3504	nritems = btrfs_header_nritems(lower);
3505	BUG_ON(slot > nritems);
3506	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3507	if (slot != nritems) {
3508		if (level) {
3509			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3510					nritems - slot);
3511			BUG_ON(ret < 0);
3512		}
3513		memmove_extent_buffer(lower,
3514			      btrfs_node_key_ptr_offset(slot + 1),
3515			      btrfs_node_key_ptr_offset(slot),
3516			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3517	}
3518	if (level) {
3519		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3520				GFP_NOFS);
3521		BUG_ON(ret < 0);
3522	}
3523	btrfs_set_node_key(lower, key, slot);
3524	btrfs_set_node_blockptr(lower, slot, bytenr);
3525	WARN_ON(trans->transid == 0);
3526	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3527	btrfs_set_header_nritems(lower, nritems + 1);
3528	btrfs_mark_buffer_dirty(lower);
3529}
3530
3531/*
3532 * split the node at the specified level in path in two.
3533 * The path is corrected to point to the appropriate node after the split
3534 *
3535 * Before splitting this tries to make some room in the node by pushing
3536 * left and right, if either one works, it returns right away.
3537 *
3538 * returns 0 on success and < 0 on failure
3539 */
3540static noinline int split_node(struct btrfs_trans_handle *trans,
3541			       struct btrfs_root *root,
3542			       struct btrfs_path *path, int level)
3543{
3544	struct btrfs_fs_info *fs_info = root->fs_info;
3545	struct extent_buffer *c;
3546	struct extent_buffer *split;
3547	struct btrfs_disk_key disk_key;
3548	int mid;
3549	int ret;
3550	u32 c_nritems;
3551
3552	c = path->nodes[level];
3553	WARN_ON(btrfs_header_generation(c) != trans->transid);
3554	if (c == root->node) {
3555		/*
3556		 * trying to split the root, lets make a new one
3557		 *
3558		 * tree mod log: We don't log_removal old root in
3559		 * insert_new_root, because that root buffer will be kept as a
3560		 * normal node. We are going to log removal of half of the
3561		 * elements below with tree_mod_log_eb_copy. We're holding a
3562		 * tree lock on the buffer, which is why we cannot race with
3563		 * other tree_mod_log users.
3564		 */
3565		ret = insert_new_root(trans, root, path, level + 1);
3566		if (ret)
3567			return ret;
3568	} else {
3569		ret = push_nodes_for_insert(trans, root, path, level);
3570		c = path->nodes[level];
3571		if (!ret && btrfs_header_nritems(c) <
3572		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3573			return 0;
3574		if (ret < 0)
3575			return ret;
3576	}
3577
3578	c_nritems = btrfs_header_nritems(c);
3579	mid = (c_nritems + 1) / 2;
3580	btrfs_node_key(c, &disk_key, mid);
3581
3582	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3583					     c->start, 0, BTRFS_NESTING_SPLIT);
3584	if (IS_ERR(split))
3585		return PTR_ERR(split);
3586
3587	root_add_used(root, fs_info->nodesize);
3588	ASSERT(btrfs_header_level(c) == level);
3589
3590	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3591	if (ret) {
3592		btrfs_tree_unlock(split);
3593		free_extent_buffer(split);
3594		btrfs_abort_transaction(trans, ret);
3595		return ret;
3596	}
3597	copy_extent_buffer(split, c,
3598			   btrfs_node_key_ptr_offset(0),
3599			   btrfs_node_key_ptr_offset(mid),
3600			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3601	btrfs_set_header_nritems(split, c_nritems - mid);
3602	btrfs_set_header_nritems(c, mid);
3603	ret = 0;
3604
3605	btrfs_mark_buffer_dirty(c);
3606	btrfs_mark_buffer_dirty(split);
3607
3608	insert_ptr(trans, path, &disk_key, split->start,
3609		   path->slots[level + 1] + 1, level + 1);
3610
3611	if (path->slots[level] >= mid) {
3612		path->slots[level] -= mid;
3613		btrfs_tree_unlock(c);
3614		free_extent_buffer(c);
3615		path->nodes[level] = split;
3616		path->slots[level + 1] += 1;
3617	} else {
3618		btrfs_tree_unlock(split);
3619		free_extent_buffer(split);
3620	}
3621	return ret;
3622}
3623
3624/*
3625 * how many bytes are required to store the items in a leaf.  start
3626 * and nr indicate which items in the leaf to check.  This totals up the
3627 * space used both by the item structs and the item data
3628 */
3629static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3630{
3631	struct btrfs_item *start_item;
3632	struct btrfs_item *end_item;
3633	int data_len;
3634	int nritems = btrfs_header_nritems(l);
3635	int end = min(nritems, start + nr) - 1;
3636
3637	if (!nr)
3638		return 0;
3639	start_item = btrfs_item_nr(start);
3640	end_item = btrfs_item_nr(end);
3641	data_len = btrfs_item_offset(l, start_item) +
3642		   btrfs_item_size(l, start_item);
3643	data_len = data_len - btrfs_item_offset(l, end_item);
3644	data_len += sizeof(struct btrfs_item) * nr;
3645	WARN_ON(data_len < 0);
3646	return data_len;
3647}
3648
3649/*
3650 * The space between the end of the leaf items and
3651 * the start of the leaf data.  IOW, how much room
3652 * the leaf has left for both items and data
3653 */
3654noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3655{
3656	struct btrfs_fs_info *fs_info = leaf->fs_info;
3657	int nritems = btrfs_header_nritems(leaf);
3658	int ret;
3659
3660	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3661	if (ret < 0) {
3662		btrfs_crit(fs_info,
3663			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3664			   ret,
3665			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3666			   leaf_space_used(leaf, 0, nritems), nritems);
3667	}
3668	return ret;
3669}
3670
3671/*
3672 * min slot controls the lowest index we're willing to push to the
3673 * right.  We'll push up to and including min_slot, but no lower
3674 */
3675static noinline int __push_leaf_right(struct btrfs_path *path,
3676				      int data_size, int empty,
3677				      struct extent_buffer *right,
3678				      int free_space, u32 left_nritems,
3679				      u32 min_slot)
3680{
3681	struct btrfs_fs_info *fs_info = right->fs_info;
3682	struct extent_buffer *left = path->nodes[0];
3683	struct extent_buffer *upper = path->nodes[1];
3684	struct btrfs_map_token token;
3685	struct btrfs_disk_key disk_key;
3686	int slot;
3687	u32 i;
3688	int push_space = 0;
3689	int push_items = 0;
3690	struct btrfs_item *item;
3691	u32 nr;
3692	u32 right_nritems;
3693	u32 data_end;
3694	u32 this_item_size;
3695
3696	if (empty)
3697		nr = 0;
3698	else
3699		nr = max_t(u32, 1, min_slot);
3700
3701	if (path->slots[0] >= left_nritems)
3702		push_space += data_size;
3703
3704	slot = path->slots[1];
3705	i = left_nritems - 1;
3706	while (i >= nr) {
3707		item = btrfs_item_nr(i);
3708
3709		if (!empty && push_items > 0) {
3710			if (path->slots[0] > i)
3711				break;
3712			if (path->slots[0] == i) {
3713				int space = btrfs_leaf_free_space(left);
3714
3715				if (space + push_space * 2 > free_space)
3716					break;
3717			}
3718		}
3719
3720		if (path->slots[0] == i)
3721			push_space += data_size;
3722
3723		this_item_size = btrfs_item_size(left, item);
3724		if (this_item_size + sizeof(*item) + push_space > free_space)
3725			break;
3726
3727		push_items++;
3728		push_space += this_item_size + sizeof(*item);
3729		if (i == 0)
3730			break;
3731		i--;
3732	}
3733
3734	if (push_items == 0)
3735		goto out_unlock;
3736
3737	WARN_ON(!empty && push_items == left_nritems);
3738
3739	/* push left to right */
3740	right_nritems = btrfs_header_nritems(right);
3741
3742	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3743	push_space -= leaf_data_end(left);
3744
3745	/* make room in the right data area */
3746	data_end = leaf_data_end(right);
3747	memmove_extent_buffer(right,
3748			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3749			      BTRFS_LEAF_DATA_OFFSET + data_end,
3750			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3751
3752	/* copy from the left data area */
3753	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3754		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3755		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3756		     push_space);
3757
3758	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3759			      btrfs_item_nr_offset(0),
3760			      right_nritems * sizeof(struct btrfs_item));
3761
3762	/* copy the items from left to right */
3763	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3764		   btrfs_item_nr_offset(left_nritems - push_items),
3765		   push_items * sizeof(struct btrfs_item));
3766
3767	/* update the item pointers */
3768	btrfs_init_map_token(&token, right);
3769	right_nritems += push_items;
3770	btrfs_set_header_nritems(right, right_nritems);
3771	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3772	for (i = 0; i < right_nritems; i++) {
3773		item = btrfs_item_nr(i);
3774		push_space -= btrfs_token_item_size(&token, item);
3775		btrfs_set_token_item_offset(&token, item, push_space);
3776	}
3777
3778	left_nritems -= push_items;
3779	btrfs_set_header_nritems(left, left_nritems);
3780
3781	if (left_nritems)
3782		btrfs_mark_buffer_dirty(left);
3783	else
3784		btrfs_clean_tree_block(left);
3785
3786	btrfs_mark_buffer_dirty(right);
3787
3788	btrfs_item_key(right, &disk_key, 0);
3789	btrfs_set_node_key(upper, &disk_key, slot + 1);
3790	btrfs_mark_buffer_dirty(upper);
3791
3792	/* then fixup the leaf pointer in the path */
3793	if (path->slots[0] >= left_nritems) {
3794		path->slots[0] -= left_nritems;
3795		if (btrfs_header_nritems(path->nodes[0]) == 0)
3796			btrfs_clean_tree_block(path->nodes[0]);
3797		btrfs_tree_unlock(path->nodes[0]);
3798		free_extent_buffer(path->nodes[0]);
3799		path->nodes[0] = right;
3800		path->slots[1] += 1;
3801	} else {
3802		btrfs_tree_unlock(right);
3803		free_extent_buffer(right);
3804	}
3805	return 0;
3806
3807out_unlock:
3808	btrfs_tree_unlock(right);
3809	free_extent_buffer(right);
3810	return 1;
3811}
3812
3813/*
3814 * push some data in the path leaf to the right, trying to free up at
3815 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3816 *
3817 * returns 1 if the push failed because the other node didn't have enough
3818 * room, 0 if everything worked out and < 0 if there were major errors.
3819 *
3820 * this will push starting from min_slot to the end of the leaf.  It won't
3821 * push any slot lower than min_slot
3822 */
3823static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3824			   *root, struct btrfs_path *path,
3825			   int min_data_size, int data_size,
3826			   int empty, u32 min_slot)
3827{
3828	struct extent_buffer *left = path->nodes[0];
3829	struct extent_buffer *right;
3830	struct extent_buffer *upper;
3831	int slot;
3832	int free_space;
3833	u32 left_nritems;
3834	int ret;
3835
3836	if (!path->nodes[1])
3837		return 1;
3838
3839	slot = path->slots[1];
3840	upper = path->nodes[1];
3841	if (slot >= btrfs_header_nritems(upper) - 1)
3842		return 1;
3843
3844	btrfs_assert_tree_locked(path->nodes[1]);
3845
3846	right = btrfs_read_node_slot(upper, slot + 1);
3847	/*
3848	 * slot + 1 is not valid or we fail to read the right node,
3849	 * no big deal, just return.
3850	 */
3851	if (IS_ERR(right))
3852		return 1;
3853
3854	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3855	btrfs_set_lock_blocking_write(right);
3856
3857	free_space = btrfs_leaf_free_space(right);
3858	if (free_space < data_size)
3859		goto out_unlock;
3860
3861	/* cow and double check */
3862	ret = btrfs_cow_block(trans, root, right, upper,
3863			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3864	if (ret)
3865		goto out_unlock;
3866
3867	free_space = btrfs_leaf_free_space(right);
3868	if (free_space < data_size)
3869		goto out_unlock;
3870
3871	left_nritems = btrfs_header_nritems(left);
3872	if (left_nritems == 0)
3873		goto out_unlock;
3874
3875	if (check_sibling_keys(left, right)) {
3876		ret = -EUCLEAN;
3877		btrfs_abort_transaction(trans, ret);
3878		btrfs_tree_unlock(right);
3879		free_extent_buffer(right);
3880		return ret;
3881	}
3882	if (path->slots[0] == left_nritems && !empty) {
3883		/* Key greater than all keys in the leaf, right neighbor has
3884		 * enough room for it and we're not emptying our leaf to delete
3885		 * it, therefore use right neighbor to insert the new item and
3886		 * no need to touch/dirty our left leaf. */
3887		btrfs_tree_unlock(left);
3888		free_extent_buffer(left);
3889		path->nodes[0] = right;
3890		path->slots[0] = 0;
3891		path->slots[1]++;
3892		return 0;
3893	}
3894
3895	return __push_leaf_right(path, min_data_size, empty,
3896				right, free_space, left_nritems, min_slot);
3897out_unlock:
3898	btrfs_tree_unlock(right);
3899	free_extent_buffer(right);
3900	return 1;
3901}
3902
3903/*
3904 * push some data in the path leaf to the left, trying to free up at
3905 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3906 *
3907 * max_slot can put a limit on how far into the leaf we'll push items.  The
3908 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3909 * items
3910 */
3911static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3912				     int empty, struct extent_buffer *left,
3913				     int free_space, u32 right_nritems,
3914				     u32 max_slot)
3915{
3916	struct btrfs_fs_info *fs_info = left->fs_info;
3917	struct btrfs_disk_key disk_key;
3918	struct extent_buffer *right = path->nodes[0];
3919	int i;
3920	int push_space = 0;
3921	int push_items = 0;
3922	struct btrfs_item *item;
3923	u32 old_left_nritems;
3924	u32 nr;
3925	int ret = 0;
3926	u32 this_item_size;
3927	u32 old_left_item_size;
3928	struct btrfs_map_token token;
3929
3930	if (empty)
3931		nr = min(right_nritems, max_slot);
3932	else
3933		nr = min(right_nritems - 1, max_slot);
3934
3935	for (i = 0; i < nr; i++) {
3936		item = btrfs_item_nr(i);
3937
3938		if (!empty && push_items > 0) {
3939			if (path->slots[0] < i)
3940				break;
3941			if (path->slots[0] == i) {
3942				int space = btrfs_leaf_free_space(right);
3943
3944				if (space + push_space * 2 > free_space)
3945					break;
3946			}
3947		}
3948
3949		if (path->slots[0] == i)
3950			push_space += data_size;
3951
3952		this_item_size = btrfs_item_size(right, item);
3953		if (this_item_size + sizeof(*item) + push_space > free_space)
3954			break;
3955
3956		push_items++;
3957		push_space += this_item_size + sizeof(*item);
3958	}
3959
3960	if (push_items == 0) {
3961		ret = 1;
3962		goto out;
3963	}
3964	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3965
3966	/* push data from right to left */
3967	copy_extent_buffer(left, right,
3968			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3969			   btrfs_item_nr_offset(0),
3970			   push_items * sizeof(struct btrfs_item));
3971
3972	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3973		     btrfs_item_offset_nr(right, push_items - 1);
3974
3975	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3976		     leaf_data_end(left) - push_space,
3977		     BTRFS_LEAF_DATA_OFFSET +
3978		     btrfs_item_offset_nr(right, push_items - 1),
3979		     push_space);
3980	old_left_nritems = btrfs_header_nritems(left);
3981	BUG_ON(old_left_nritems <= 0);
3982
3983	btrfs_init_map_token(&token, left);
3984	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3985	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3986		u32 ioff;
3987
3988		item = btrfs_item_nr(i);
3989
3990		ioff = btrfs_token_item_offset(&token, item);
3991		btrfs_set_token_item_offset(&token, item,
3992		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3993	}
3994	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3995
3996	/* fixup right node */
3997	if (push_items > right_nritems)
3998		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3999		       right_nritems);
4000
4001	if (push_items < right_nritems) {
4002		push_space = btrfs_item_offset_nr(right, push_items - 1) -
4003						  leaf_data_end(right);
4004		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
4005				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
4006				      BTRFS_LEAF_DATA_OFFSET +
4007				      leaf_data_end(right), push_space);
4008
4009		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
4010			      btrfs_item_nr_offset(push_items),
4011			     (btrfs_header_nritems(right) - push_items) *
4012			     sizeof(struct btrfs_item));
4013	}
4014
4015	btrfs_init_map_token(&token, right);
4016	right_nritems -= push_items;
4017	btrfs_set_header_nritems(right, right_nritems);
4018	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
4019	for (i = 0; i < right_nritems; i++) {
4020		item = btrfs_item_nr(i);
4021
4022		push_space = push_space - btrfs_token_item_size(&token, item);
4023		btrfs_set_token_item_offset(&token, item, push_space);
4024	}
4025
4026	btrfs_mark_buffer_dirty(left);
4027	if (right_nritems)
4028		btrfs_mark_buffer_dirty(right);
4029	else
4030		btrfs_clean_tree_block(right);
4031
4032	btrfs_item_key(right, &disk_key, 0);
4033	fixup_low_keys(path, &disk_key, 1);
4034
4035	/* then fixup the leaf pointer in the path */
4036	if (path->slots[0] < push_items) {
4037		path->slots[0] += old_left_nritems;
4038		btrfs_tree_unlock(path->nodes[0]);
4039		free_extent_buffer(path->nodes[0]);
4040		path->nodes[0] = left;
4041		path->slots[1] -= 1;
4042	} else {
4043		btrfs_tree_unlock(left);
4044		free_extent_buffer(left);
4045		path->slots[0] -= push_items;
4046	}
4047	BUG_ON(path->slots[0] < 0);
4048	return ret;
4049out:
4050	btrfs_tree_unlock(left);
4051	free_extent_buffer(left);
4052	return ret;
4053}
4054
4055/*
4056 * push some data in the path leaf to the left, trying to free up at
4057 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4058 *
4059 * max_slot can put a limit on how far into the leaf we'll push items.  The
4060 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4061 * items
4062 */
4063static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4064			  *root, struct btrfs_path *path, int min_data_size,
4065			  int data_size, int empty, u32 max_slot)
4066{
4067	struct extent_buffer *right = path->nodes[0];
4068	struct extent_buffer *left;
4069	int slot;
4070	int free_space;
4071	u32 right_nritems;
4072	int ret = 0;
4073
4074	slot = path->slots[1];
4075	if (slot == 0)
4076		return 1;
4077	if (!path->nodes[1])
4078		return 1;
4079
4080	right_nritems = btrfs_header_nritems(right);
4081	if (right_nritems == 0)
4082		return 1;
4083
4084	btrfs_assert_tree_locked(path->nodes[1]);
4085
4086	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
4087	/*
4088	 * slot - 1 is not valid or we fail to read the left node,
4089	 * no big deal, just return.
4090	 */
4091	if (IS_ERR(left))
4092		return 1;
4093
4094	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
4095	btrfs_set_lock_blocking_write(left);
4096
4097	free_space = btrfs_leaf_free_space(left);
4098	if (free_space < data_size) {
4099		ret = 1;
4100		goto out;
4101	}
4102
4103	/* cow and double check */
4104	ret = btrfs_cow_block(trans, root, left,
4105			      path->nodes[1], slot - 1, &left,
4106			      BTRFS_NESTING_LEFT_COW);
4107	if (ret) {
4108		/* we hit -ENOSPC, but it isn't fatal here */
4109		if (ret == -ENOSPC)
4110			ret = 1;
4111		goto out;
4112	}
4113
4114	free_space = btrfs_leaf_free_space(left);
4115	if (free_space < data_size) {
4116		ret = 1;
4117		goto out;
4118	}
4119
4120	if (check_sibling_keys(left, right)) {
4121		ret = -EUCLEAN;
4122		btrfs_abort_transaction(trans, ret);
4123		goto out;
4124	}
4125	return __push_leaf_left(path, min_data_size,
4126			       empty, left, free_space, right_nritems,
4127			       max_slot);
4128out:
4129	btrfs_tree_unlock(left);
4130	free_extent_buffer(left);
4131	return ret;
4132}
4133
4134/*
4135 * split the path's leaf in two, making sure there is at least data_size
4136 * available for the resulting leaf level of the path.
4137 */
4138static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4139				    struct btrfs_path *path,
4140				    struct extent_buffer *l,
4141				    struct extent_buffer *right,
4142				    int slot, int mid, int nritems)
4143{
4144	struct btrfs_fs_info *fs_info = trans->fs_info;
4145	int data_copy_size;
4146	int rt_data_off;
4147	int i;
4148	struct btrfs_disk_key disk_key;
4149	struct btrfs_map_token token;
4150
4151	nritems = nritems - mid;
4152	btrfs_set_header_nritems(right, nritems);
4153	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4154
4155	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4156			   btrfs_item_nr_offset(mid),
4157			   nritems * sizeof(struct btrfs_item));
4158
4159	copy_extent_buffer(right, l,
4160		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4161		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4162		     leaf_data_end(l), data_copy_size);
4163
4164	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4165
4166	btrfs_init_map_token(&token, right);
4167	for (i = 0; i < nritems; i++) {
4168		struct btrfs_item *item = btrfs_item_nr(i);
4169		u32 ioff;
4170
4171		ioff = btrfs_token_item_offset(&token, item);
4172		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4173	}
4174
4175	btrfs_set_header_nritems(l, mid);
4176	btrfs_item_key(right, &disk_key, 0);
4177	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4178
4179	btrfs_mark_buffer_dirty(right);
4180	btrfs_mark_buffer_dirty(l);
4181	BUG_ON(path->slots[0] != slot);
4182
4183	if (mid <= slot) {
4184		btrfs_tree_unlock(path->nodes[0]);
4185		free_extent_buffer(path->nodes[0]);
4186		path->nodes[0] = right;
4187		path->slots[0] -= mid;
4188		path->slots[1] += 1;
4189	} else {
4190		btrfs_tree_unlock(right);
4191		free_extent_buffer(right);
4192	}
4193
4194	BUG_ON(path->slots[0] < 0);
4195}
4196
4197/*
4198 * double splits happen when we need to insert a big item in the middle
4199 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4200 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4201 *          A                 B                 C
4202 *
4203 * We avoid this by trying to push the items on either side of our target
4204 * into the adjacent leaves.  If all goes well we can avoid the double split
4205 * completely.
4206 */
4207static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4208					  struct btrfs_root *root,
4209					  struct btrfs_path *path,
4210					  int data_size)
4211{
4212	int ret;
4213	int progress = 0;
4214	int slot;
4215	u32 nritems;
4216	int space_needed = data_size;
4217
4218	slot = path->slots[0];
4219	if (slot < btrfs_header_nritems(path->nodes[0]))
4220		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4221
4222	/*
4223	 * try to push all the items after our slot into the
4224	 * right leaf
4225	 */
4226	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4227	if (ret < 0)
4228		return ret;
4229
4230	if (ret == 0)
4231		progress++;
4232
4233	nritems = btrfs_header_nritems(path->nodes[0]);
4234	/*
4235	 * our goal is to get our slot at the start or end of a leaf.  If
4236	 * we've done so we're done
4237	 */
4238	if (path->slots[0] == 0 || path->slots[0] == nritems)
4239		return 0;
4240
4241	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4242		return 0;
4243
4244	/* try to push all the items before our slot into the next leaf */
4245	slot = path->slots[0];
4246	space_needed = data_size;
4247	if (slot > 0)
4248		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4249	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4250	if (ret < 0)
4251		return ret;
4252
4253	if (ret == 0)
4254		progress++;
4255
4256	if (progress)
4257		return 0;
4258	return 1;
4259}
4260
4261/*
4262 * split the path's leaf in two, making sure there is at least data_size
4263 * available for the resulting leaf level of the path.
4264 *
4265 * returns 0 if all went well and < 0 on failure.
4266 */
4267static noinline int split_leaf(struct btrfs_trans_handle *trans,
4268			       struct btrfs_root *root,
4269			       const struct btrfs_key *ins_key,
4270			       struct btrfs_path *path, int data_size,
4271			       int extend)
4272{
4273	struct btrfs_disk_key disk_key;
4274	struct extent_buffer *l;
4275	u32 nritems;
4276	int mid;
4277	int slot;
4278	struct extent_buffer *right;
4279	struct btrfs_fs_info *fs_info = root->fs_info;
4280	int ret = 0;
4281	int wret;
4282	int split;
4283	int num_doubles = 0;
4284	int tried_avoid_double = 0;
4285
4286	l = path->nodes[0];
4287	slot = path->slots[0];
4288	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4289	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4290		return -EOVERFLOW;
4291
4292	/* first try to make some room by pushing left and right */
4293	if (data_size && path->nodes[1]) {
4294		int space_needed = data_size;
4295
4296		if (slot < btrfs_header_nritems(l))
4297			space_needed -= btrfs_leaf_free_space(l);
4298
4299		wret = push_leaf_right(trans, root, path, space_needed,
4300				       space_needed, 0, 0);
4301		if (wret < 0)
4302			return wret;
4303		if (wret) {
4304			space_needed = data_size;
4305			if (slot > 0)
4306				space_needed -= btrfs_leaf_free_space(l);
4307			wret = push_leaf_left(trans, root, path, space_needed,
4308					      space_needed, 0, (u32)-1);
4309			if (wret < 0)
4310				return wret;
4311		}
4312		l = path->nodes[0];
4313
4314		/* did the pushes work? */
4315		if (btrfs_leaf_free_space(l) >= data_size)
4316			return 0;
4317	}
4318
4319	if (!path->nodes[1]) {
4320		ret = insert_new_root(trans, root, path, 1);
4321		if (ret)
4322			return ret;
4323	}
4324again:
4325	split = 1;
4326	l = path->nodes[0];
4327	slot = path->slots[0];
4328	nritems = btrfs_header_nritems(l);
4329	mid = (nritems + 1) / 2;
4330
4331	if (mid <= slot) {
4332		if (nritems == 1 ||
4333		    leaf_space_used(l, mid, nritems - mid) + data_size >
4334			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4335			if (slot >= nritems) {
4336				split = 0;
4337			} else {
4338				mid = slot;
4339				if (mid != nritems &&
4340				    leaf_space_used(l, mid, nritems - mid) +
4341				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4342					if (data_size && !tried_avoid_double)
4343						goto push_for_double;
4344					split = 2;
4345				}
4346			}
4347		}
4348	} else {
4349		if (leaf_space_used(l, 0, mid) + data_size >
4350			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4351			if (!extend && data_size && slot == 0) {
4352				split = 0;
4353			} else if ((extend || !data_size) && slot == 0) {
4354				mid = 1;
4355			} else {
4356				mid = slot;
4357				if (mid != nritems &&
4358				    leaf_space_used(l, mid, nritems - mid) +
4359				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4360					if (data_size && !tried_avoid_double)
4361						goto push_for_double;
4362					split = 2;
4363				}
4364			}
4365		}
4366	}
4367
4368	if (split == 0)
4369		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4370	else
4371		btrfs_item_key(l, &disk_key, mid);
4372
4373	/*
4374	 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
4375	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4376	 * subclasses, which is 8 at the time of this patch, and we've maxed it
4377	 * out.  In the future we could add a
4378	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4379	 * use BTRFS_NESTING_NEW_ROOT.
4380	 */
4381	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4382					     l->start, 0, num_doubles ?
4383					     BTRFS_NESTING_NEW_ROOT :
4384					     BTRFS_NESTING_SPLIT);
4385	if (IS_ERR(right))
4386		return PTR_ERR(right);
4387
4388	root_add_used(root, fs_info->nodesize);
4389
4390	if (split == 0) {
4391		if (mid <= slot) {
4392			btrfs_set_header_nritems(right, 0);
4393			insert_ptr(trans, path, &disk_key,
4394				   right->start, path->slots[1] + 1, 1);
4395			btrfs_tree_unlock(path->nodes[0]);
4396			free_extent_buffer(path->nodes[0]);
4397			path->nodes[0] = right;
4398			path->slots[0] = 0;
4399			path->slots[1] += 1;
4400		} else {
4401			btrfs_set_header_nritems(right, 0);
4402			insert_ptr(trans, path, &disk_key,
4403				   right->start, path->slots[1], 1);
4404			btrfs_tree_unlock(path->nodes[0]);
4405			free_extent_buffer(path->nodes[0]);
4406			path->nodes[0] = right;
4407			path->slots[0] = 0;
4408			if (path->slots[1] == 0)
4409				fixup_low_keys(path, &disk_key, 1);
4410		}
4411		/*
4412		 * We create a new leaf 'right' for the required ins_len and
4413		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4414		 * the content of ins_len to 'right'.
4415		 */
4416		return ret;
4417	}
4418
4419	copy_for_split(trans, path, l, right, slot, mid, nritems);
4420
4421	if (split == 2) {
4422		BUG_ON(num_doubles != 0);
4423		num_doubles++;
4424		goto again;
4425	}
4426
4427	return 0;
4428
4429push_for_double:
4430	push_for_double_split(trans, root, path, data_size);
4431	tried_avoid_double = 1;
4432	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4433		return 0;
4434	goto again;
4435}
4436
4437static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4438					 struct btrfs_root *root,
4439					 struct btrfs_path *path, int ins_len)
4440{
4441	struct btrfs_key key;
4442	struct extent_buffer *leaf;
4443	struct btrfs_file_extent_item *fi;
4444	u64 extent_len = 0;
4445	u32 item_size;
4446	int ret;
4447
4448	leaf = path->nodes[0];
4449	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4450
4451	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4452	       key.type != BTRFS_EXTENT_CSUM_KEY);
4453
4454	if (btrfs_leaf_free_space(leaf) >= ins_len)
4455		return 0;
4456
4457	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4458	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4459		fi = btrfs_item_ptr(leaf, path->slots[0],
4460				    struct btrfs_file_extent_item);
4461		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4462	}
4463	btrfs_release_path(path);
4464
4465	path->keep_locks = 1;
4466	path->search_for_split = 1;
4467	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468	path->search_for_split = 0;
4469	if (ret > 0)
4470		ret = -EAGAIN;
4471	if (ret < 0)
4472		goto err;
4473
4474	ret = -EAGAIN;
4475	leaf = path->nodes[0];
4476	/* if our item isn't there, return now */
4477	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4478		goto err;
4479
4480	/* the leaf has  changed, it now has room.  return now */
4481	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4482		goto err;
4483
4484	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4485		fi = btrfs_item_ptr(leaf, path->slots[0],
4486				    struct btrfs_file_extent_item);
4487		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4488			goto err;
4489	}
4490
4491	btrfs_set_path_blocking(path);
4492	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4493	if (ret)
4494		goto err;
4495
4496	path->keep_locks = 0;
4497	btrfs_unlock_up_safe(path, 1);
4498	return 0;
4499err:
4500	path->keep_locks = 0;
4501	return ret;
4502}
4503
4504static noinline int split_item(struct btrfs_path *path,
4505			       const struct btrfs_key *new_key,
4506			       unsigned long split_offset)
4507{
4508	struct extent_buffer *leaf;
4509	struct btrfs_item *item;
4510	struct btrfs_item *new_item;
4511	int slot;
4512	char *buf;
4513	u32 nritems;
4514	u32 item_size;
4515	u32 orig_offset;
4516	struct btrfs_disk_key disk_key;
4517
4518	leaf = path->nodes[0];
4519	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4520
4521	btrfs_set_path_blocking(path);
4522
4523	item = btrfs_item_nr(path->slots[0]);
4524	orig_offset = btrfs_item_offset(leaf, item);
4525	item_size = btrfs_item_size(leaf, item);
4526
4527	buf = kmalloc(item_size, GFP_NOFS);
4528	if (!buf)
4529		return -ENOMEM;
4530
4531	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4532			    path->slots[0]), item_size);
4533
4534	slot = path->slots[0] + 1;
4535	nritems = btrfs_header_nritems(leaf);
4536	if (slot != nritems) {
4537		/* shift the items */
4538		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4539				btrfs_item_nr_offset(slot),
4540				(nritems - slot) * sizeof(struct btrfs_item));
4541	}
4542
4543	btrfs_cpu_key_to_disk(&disk_key, new_key);
4544	btrfs_set_item_key(leaf, &disk_key, slot);
4545
4546	new_item = btrfs_item_nr(slot);
4547
4548	btrfs_set_item_offset(leaf, new_item, orig_offset);
4549	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4550
4551	btrfs_set_item_offset(leaf, item,
4552			      orig_offset + item_size - split_offset);
4553	btrfs_set_item_size(leaf, item, split_offset);
4554
4555	btrfs_set_header_nritems(leaf, nritems + 1);
4556
4557	/* write the data for the start of the original item */
4558	write_extent_buffer(leaf, buf,
4559			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4560			    split_offset);
4561
4562	/* write the data for the new item */
4563	write_extent_buffer(leaf, buf + split_offset,
4564			    btrfs_item_ptr_offset(leaf, slot),
4565			    item_size - split_offset);
4566	btrfs_mark_buffer_dirty(leaf);
4567
4568	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4569	kfree(buf);
4570	return 0;
4571}
4572
4573/*
4574 * This function splits a single item into two items,
4575 * giving 'new_key' to the new item and splitting the
4576 * old one at split_offset (from the start of the item).
4577 *
4578 * The path may be released by this operation.  After
4579 * the split, the path is pointing to the old item.  The
4580 * new item is going to be in the same node as the old one.
4581 *
4582 * Note, the item being split must be smaller enough to live alone on
4583 * a tree block with room for one extra struct btrfs_item
4584 *
4585 * This allows us to split the item in place, keeping a lock on the
4586 * leaf the entire time.
4587 */
4588int btrfs_split_item(struct btrfs_trans_handle *trans,
4589		     struct btrfs_root *root,
4590		     struct btrfs_path *path,
4591		     const struct btrfs_key *new_key,
4592		     unsigned long split_offset)
4593{
4594	int ret;
4595	ret = setup_leaf_for_split(trans, root, path,
4596				   sizeof(struct btrfs_item));
4597	if (ret)
4598		return ret;
4599
4600	ret = split_item(path, new_key, split_offset);
4601	return ret;
4602}
4603
4604/*
4605 * This function duplicate a item, giving 'new_key' to the new item.
4606 * It guarantees both items live in the same tree leaf and the new item
4607 * is contiguous with the original item.
4608 *
4609 * This allows us to split file extent in place, keeping a lock on the
4610 * leaf the entire time.
4611 */
4612int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4613			 struct btrfs_root *root,
4614			 struct btrfs_path *path,
4615			 const struct btrfs_key *new_key)
4616{
4617	struct extent_buffer *leaf;
4618	int ret;
4619	u32 item_size;
4620
4621	leaf = path->nodes[0];
4622	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4623	ret = setup_leaf_for_split(trans, root, path,
4624				   item_size + sizeof(struct btrfs_item));
4625	if (ret)
4626		return ret;
4627
4628	path->slots[0]++;
4629	setup_items_for_insert(root, path, new_key, &item_size, 1);
4630	leaf = path->nodes[0];
4631	memcpy_extent_buffer(leaf,
4632			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4633			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4634			     item_size);
4635	return 0;
4636}
4637
4638/*
4639 * make the item pointed to by the path smaller.  new_size indicates
4640 * how small to make it, and from_end tells us if we just chop bytes
4641 * off the end of the item or if we shift the item to chop bytes off
4642 * the front.
4643 */
4644void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4645{
4646	int slot;
4647	struct extent_buffer *leaf;
4648	struct btrfs_item *item;
4649	u32 nritems;
4650	unsigned int data_end;
4651	unsigned int old_data_start;
4652	unsigned int old_size;
4653	unsigned int size_diff;
4654	int i;
4655	struct btrfs_map_token token;
4656
4657	leaf = path->nodes[0];
4658	slot = path->slots[0];
4659
4660	old_size = btrfs_item_size_nr(leaf, slot);
4661	if (old_size == new_size)
4662		return;
4663
4664	nritems = btrfs_header_nritems(leaf);
4665	data_end = leaf_data_end(leaf);
4666
4667	old_data_start = btrfs_item_offset_nr(leaf, slot);
4668
4669	size_diff = old_size - new_size;
4670
4671	BUG_ON(slot < 0);
4672	BUG_ON(slot >= nritems);
4673
4674	/*
4675	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4676	 */
4677	/* first correct the data pointers */
4678	btrfs_init_map_token(&token, leaf);
4679	for (i = slot; i < nritems; i++) {
4680		u32 ioff;
4681		item = btrfs_item_nr(i);
4682
4683		ioff = btrfs_token_item_offset(&token, item);
4684		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4685	}
4686
4687	/* shift the data */
4688	if (from_end) {
4689		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4690			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4691			      data_end, old_data_start + new_size - data_end);
4692	} else {
4693		struct btrfs_disk_key disk_key;
4694		u64 offset;
4695
4696		btrfs_item_key(leaf, &disk_key, slot);
4697
4698		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4699			unsigned long ptr;
4700			struct btrfs_file_extent_item *fi;
4701
4702			fi = btrfs_item_ptr(leaf, slot,
4703					    struct btrfs_file_extent_item);
4704			fi = (struct btrfs_file_extent_item *)(
4705			     (unsigned long)fi - size_diff);
4706
4707			if (btrfs_file_extent_type(leaf, fi) ==
4708			    BTRFS_FILE_EXTENT_INLINE) {
4709				ptr = btrfs_item_ptr_offset(leaf, slot);
4710				memmove_extent_buffer(leaf, ptr,
4711				      (unsigned long)fi,
4712				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4713			}
4714		}
4715
4716		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4717			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4718			      data_end, old_data_start - data_end);
4719
4720		offset = btrfs_disk_key_offset(&disk_key);
4721		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4722		btrfs_set_item_key(leaf, &disk_key, slot);
4723		if (slot == 0)
4724			fixup_low_keys(path, &disk_key, 1);
4725	}
4726
4727	item = btrfs_item_nr(slot);
4728	btrfs_set_item_size(leaf, item, new_size);
4729	btrfs_mark_buffer_dirty(leaf);
4730
4731	if (btrfs_leaf_free_space(leaf) < 0) {
4732		btrfs_print_leaf(leaf);
4733		BUG();
4734	}
4735}
4736
4737/*
4738 * make the item pointed to by the path bigger, data_size is the added size.
4739 */
4740void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4741{
4742	int slot;
4743	struct extent_buffer *leaf;
4744	struct btrfs_item *item;
4745	u32 nritems;
4746	unsigned int data_end;
4747	unsigned int old_data;
4748	unsigned int old_size;
4749	int i;
4750	struct btrfs_map_token token;
4751
4752	leaf = path->nodes[0];
4753
4754	nritems = btrfs_header_nritems(leaf);
4755	data_end = leaf_data_end(leaf);
4756
4757	if (btrfs_leaf_free_space(leaf) < data_size) {
4758		btrfs_print_leaf(leaf);
4759		BUG();
4760	}
4761	slot = path->slots[0];
4762	old_data = btrfs_item_end_nr(leaf, slot);
4763
4764	BUG_ON(slot < 0);
4765	if (slot >= nritems) {
4766		btrfs_print_leaf(leaf);
4767		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4768			   slot, nritems);
4769		BUG();
4770	}
4771
4772	/*
4773	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4774	 */
4775	/* first correct the data pointers */
4776	btrfs_init_map_token(&token, leaf);
4777	for (i = slot; i < nritems; i++) {
4778		u32 ioff;
4779		item = btrfs_item_nr(i);
4780
4781		ioff = btrfs_token_item_offset(&token, item);
4782		btrfs_set_token_item_offset(&token, item, ioff - data_size);
4783	}
4784
4785	/* shift the data */
4786	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4787		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4788		      data_end, old_data - data_end);
4789
4790	data_end = old_data;
4791	old_size = btrfs_item_size_nr(leaf, slot);
4792	item = btrfs_item_nr(slot);
4793	btrfs_set_item_size(leaf, item, old_size + data_size);
4794	btrfs_mark_buffer_dirty(leaf);
4795
4796	if (btrfs_leaf_free_space(leaf) < 0) {
4797		btrfs_print_leaf(leaf);
4798		BUG();
4799	}
4800}
4801
4802/**
4803 * setup_items_for_insert - Helper called before inserting one or more items
4804 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4805 * in a function that doesn't call btrfs_search_slot
4806 *
4807 * @root:	root we are inserting items to
4808 * @path:	points to the leaf/slot where we are going to insert new items
4809 * @cpu_key:	array of keys for items to be inserted
4810 * @data_size:	size of the body of each item we are going to insert
4811 * @nr:		size of @cpu_key/@data_size arrays
4812 */
4813void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4814			    const struct btrfs_key *cpu_key, u32 *data_size,
4815			    int nr)
4816{
4817	struct btrfs_fs_info *fs_info = root->fs_info;
4818	struct btrfs_item *item;
4819	int i;
4820	u32 nritems;
4821	unsigned int data_end;
4822	struct btrfs_disk_key disk_key;
4823	struct extent_buffer *leaf;
4824	int slot;
4825	struct btrfs_map_token token;
4826	u32 total_size;
4827	u32 total_data = 0;
4828
4829	for (i = 0; i < nr; i++)
4830		total_data += data_size[i];
4831	total_size = total_data + (nr * sizeof(struct btrfs_item));
4832
4833	if (path->slots[0] == 0) {
4834		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4835		fixup_low_keys(path, &disk_key, 1);
4836	}
4837	btrfs_unlock_up_safe(path, 1);
4838
4839	leaf = path->nodes[0];
4840	slot = path->slots[0];
4841
4842	nritems = btrfs_header_nritems(leaf);
4843	data_end = leaf_data_end(leaf);
4844
4845	if (btrfs_leaf_free_space(leaf) < total_size) {
4846		btrfs_print_leaf(leaf);
4847		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4848			   total_size, btrfs_leaf_free_space(leaf));
4849		BUG();
4850	}
4851
4852	btrfs_init_map_token(&token, leaf);
4853	if (slot != nritems) {
4854		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4855
4856		if (old_data < data_end) {
4857			btrfs_print_leaf(leaf);
4858			btrfs_crit(fs_info,
4859		"item at slot %d with data offset %u beyond data end of leaf %u",
4860				   slot, old_data, data_end);
4861			BUG();
4862		}
4863		/*
4864		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4865		 */
4866		/* first correct the data pointers */
4867		for (i = slot; i < nritems; i++) {
4868			u32 ioff;
4869
4870			item = btrfs_item_nr(i);
4871			ioff = btrfs_token_item_offset(&token, item);
4872			btrfs_set_token_item_offset(&token, item,
4873						    ioff - total_data);
4874		}
4875		/* shift the items */
4876		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4877			      btrfs_item_nr_offset(slot),
4878			      (nritems - slot) * sizeof(struct btrfs_item));
4879
4880		/* shift the data */
4881		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4882			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4883			      data_end, old_data - data_end);
4884		data_end = old_data;
4885	}
4886
4887	/* setup the item for the new data */
4888	for (i = 0; i < nr; i++) {
4889		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4890		btrfs_set_item_key(leaf, &disk_key, slot + i);
4891		item = btrfs_item_nr(slot + i);
4892		data_end -= data_size[i];
4893		btrfs_set_token_item_offset(&token, item, data_end);
4894		btrfs_set_token_item_size(&token, item, data_size[i]);
4895	}
4896
4897	btrfs_set_header_nritems(leaf, nritems + nr);
4898	btrfs_mark_buffer_dirty(leaf);
4899
4900	if (btrfs_leaf_free_space(leaf) < 0) {
4901		btrfs_print_leaf(leaf);
4902		BUG();
4903	}
4904}
4905
4906/*
4907 * Given a key and some data, insert items into the tree.
4908 * This does all the path init required, making room in the tree if needed.
4909 */
4910int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4911			    struct btrfs_root *root,
4912			    struct btrfs_path *path,
4913			    const struct btrfs_key *cpu_key, u32 *data_size,
4914			    int nr)
4915{
4916	int ret = 0;
4917	int slot;
4918	int i;
4919	u32 total_size = 0;
4920	u32 total_data = 0;
4921
4922	for (i = 0; i < nr; i++)
4923		total_data += data_size[i];
4924
4925	total_size = total_data + (nr * sizeof(struct btrfs_item));
4926	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4927	if (ret == 0)
4928		return -EEXIST;
4929	if (ret < 0)
4930		return ret;
4931
4932	slot = path->slots[0];
4933	BUG_ON(slot < 0);
4934
4935	setup_items_for_insert(root, path, cpu_key, data_size, nr);
4936	return 0;
4937}
4938
4939/*
4940 * Given a key and some data, insert an item into the tree.
4941 * This does all the path init required, making room in the tree if needed.
4942 */
4943int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4944		      const struct btrfs_key *cpu_key, void *data,
4945		      u32 data_size)
4946{
4947	int ret = 0;
4948	struct btrfs_path *path;
4949	struct extent_buffer *leaf;
4950	unsigned long ptr;
4951
4952	path = btrfs_alloc_path();
4953	if (!path)
4954		return -ENOMEM;
4955	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4956	if (!ret) {
4957		leaf = path->nodes[0];
4958		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4959		write_extent_buffer(leaf, data, ptr, data_size);
4960		btrfs_mark_buffer_dirty(leaf);
4961	}
4962	btrfs_free_path(path);
4963	return ret;
4964}
4965
4966/*
4967 * delete the pointer from a given node.
4968 *
4969 * the tree should have been previously balanced so the deletion does not
4970 * empty a node.
4971 */
4972static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4973		    int level, int slot)
4974{
4975	struct extent_buffer *parent = path->nodes[level];
4976	u32 nritems;
4977	int ret;
4978
4979	nritems = btrfs_header_nritems(parent);
4980	if (slot != nritems - 1) {
4981		if (level) {
4982			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4983					nritems - slot - 1);
4984			BUG_ON(ret < 0);
4985		}
4986		memmove_extent_buffer(parent,
4987			      btrfs_node_key_ptr_offset(slot),
4988			      btrfs_node_key_ptr_offset(slot + 1),
4989			      sizeof(struct btrfs_key_ptr) *
4990			      (nritems - slot - 1));
4991	} else if (level) {
4992		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4993				GFP_NOFS);
4994		BUG_ON(ret < 0);
4995	}
4996
4997	nritems--;
4998	btrfs_set_header_nritems(parent, nritems);
4999	if (nritems == 0 && parent == root->node) {
5000		BUG_ON(btrfs_header_level(root->node) != 1);
5001		/* just turn the root into a leaf and break */
5002		btrfs_set_header_level(root->node, 0);
5003	} else if (slot == 0) {
5004		struct btrfs_disk_key disk_key;
5005
5006		btrfs_node_key(parent, &disk_key, 0);
5007		fixup_low_keys(path, &disk_key, level + 1);
5008	}
5009	btrfs_mark_buffer_dirty(parent);
5010}
5011
5012/*
5013 * a helper function to delete the leaf pointed to by path->slots[1] and
5014 * path->nodes[1].
5015 *
5016 * This deletes the pointer in path->nodes[1] and frees the leaf
5017 * block extent.  zero is returned if it all worked out, < 0 otherwise.
5018 *
5019 * The path must have already been setup for deleting the leaf, including
5020 * all the proper balancing.  path->nodes[1] must be locked.
5021 */
5022static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
5023				    struct btrfs_root *root,
5024				    struct btrfs_path *path,
5025				    struct extent_buffer *leaf)
5026{
5027	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
5028	del_ptr(root, path, 1, path->slots[1]);
5029
5030	/*
5031	 * btrfs_free_extent is expensive, we want to make sure we
5032	 * aren't holding any locks when we call it
5033	 */
5034	btrfs_unlock_up_safe(path, 0);
5035
5036	root_sub_used(root, leaf->len);
5037
5038	atomic_inc(&leaf->refs);
5039	btrfs_free_tree_block(trans, root, leaf, 0, 1);
5040	free_extent_buffer_stale(leaf);
5041}
5042/*
5043 * delete the item at the leaf level in path.  If that empties
5044 * the leaf, remove it from the tree
5045 */
5046int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5047		    struct btrfs_path *path, int slot, int nr)
5048{
5049	struct btrfs_fs_info *fs_info = root->fs_info;
5050	struct extent_buffer *leaf;
5051	struct btrfs_item *item;
5052	u32 last_off;
5053	u32 dsize = 0;
5054	int ret = 0;
5055	int wret;
5056	int i;
5057	u32 nritems;
5058
5059	leaf = path->nodes[0];
5060	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5061
5062	for (i = 0; i < nr; i++)
5063		dsize += btrfs_item_size_nr(leaf, slot + i);
5064
5065	nritems = btrfs_header_nritems(leaf);
5066
5067	if (slot + nr != nritems) {
5068		int data_end = leaf_data_end(leaf);
5069		struct btrfs_map_token token;
5070
5071		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5072			      data_end + dsize,
5073			      BTRFS_LEAF_DATA_OFFSET + data_end,
5074			      last_off - data_end);
5075
5076		btrfs_init_map_token(&token, leaf);
5077		for (i = slot + nr; i < nritems; i++) {
5078			u32 ioff;
5079
5080			item = btrfs_item_nr(i);
5081			ioff = btrfs_token_item_offset(&token, item);
5082			btrfs_set_token_item_offset(&token, item, ioff + dsize);
5083		}
5084
5085		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5086			      btrfs_item_nr_offset(slot + nr),
5087			      sizeof(struct btrfs_item) *
5088			      (nritems - slot - nr));
5089	}
5090	btrfs_set_header_nritems(leaf, nritems - nr);
5091	nritems -= nr;
5092
5093	/* delete the leaf if we've emptied it */
5094	if (nritems == 0) {
5095		if (leaf == root->node) {
5096			btrfs_set_header_level(leaf, 0);
5097		} else {
5098			btrfs_set_path_blocking(path);
5099			btrfs_clean_tree_block(leaf);
5100			btrfs_del_leaf(trans, root, path, leaf);
5101		}
5102	} else {
5103		int used = leaf_space_used(leaf, 0, nritems);
5104		if (slot == 0) {
5105			struct btrfs_disk_key disk_key;
5106
5107			btrfs_item_key(leaf, &disk_key, 0);
5108			fixup_low_keys(path, &disk_key, 1);
5109		}
5110
5111		/* delete the leaf if it is mostly empty */
5112		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5113			/* push_leaf_left fixes the path.
5114			 * make sure the path still points to our leaf
5115			 * for possible call to del_ptr below
5116			 */
5117			slot = path->slots[1];
5118			atomic_inc(&leaf->refs);
5119
5120			btrfs_set_path_blocking(path);
5121			wret = push_leaf_left(trans, root, path, 1, 1,
5122					      1, (u32)-1);
5123			if (wret < 0 && wret != -ENOSPC)
5124				ret = wret;
5125
5126			if (path->nodes[0] == leaf &&
5127			    btrfs_header_nritems(leaf)) {
5128				wret = push_leaf_right(trans, root, path, 1,
5129						       1, 1, 0);
5130				if (wret < 0 && wret != -ENOSPC)
5131					ret = wret;
5132			}
5133
5134			if (btrfs_header_nritems(leaf) == 0) {
5135				path->slots[1] = slot;
5136				btrfs_del_leaf(trans, root, path, leaf);
5137				free_extent_buffer(leaf);
5138				ret = 0;
5139			} else {
5140				/* if we're still in the path, make sure
5141				 * we're dirty.  Otherwise, one of the
5142				 * push_leaf functions must have already
5143				 * dirtied this buffer
5144				 */
5145				if (path->nodes[0] == leaf)
5146					btrfs_mark_buffer_dirty(leaf);
5147				free_extent_buffer(leaf);
5148			}
5149		} else {
5150			btrfs_mark_buffer_dirty(leaf);
5151		}
5152	}
5153	return ret;
5154}
5155
5156/*
5157 * search the tree again to find a leaf with lesser keys
5158 * returns 0 if it found something or 1 if there are no lesser leaves.
5159 * returns < 0 on io errors.
5160 *
5161 * This may release the path, and so you may lose any locks held at the
5162 * time you call it.
5163 */
5164int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5165{
5166	struct btrfs_key key;
5167	struct btrfs_key orig_key;
5168	struct btrfs_disk_key found_key;
5169	int ret;
5170
5171	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5172	orig_key = key;
5173
5174	if (key.offset > 0) {
5175		key.offset--;
5176	} else if (key.type > 0) {
5177		key.type--;
5178		key.offset = (u64)-1;
5179	} else if (key.objectid > 0) {
5180		key.objectid--;
5181		key.type = (u8)-1;
5182		key.offset = (u64)-1;
5183	} else {
5184		return 1;
5185	}
5186
5187	btrfs_release_path(path);
5188	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5189	if (ret <= 0)
5190		return ret;
5191
5192	/*
5193	 * Previous key not found. Even if we were at slot 0 of the leaf we had
5194	 * before releasing the path and calling btrfs_search_slot(), we now may
5195	 * be in a slot pointing to the same original key - this can happen if
5196	 * after we released the path, one of more items were moved from a
5197	 * sibling leaf into the front of the leaf we had due to an insertion
5198	 * (see push_leaf_right()).
5199	 * If we hit this case and our slot is > 0 and just decrement the slot
5200	 * so that the caller does not process the same key again, which may or
5201	 * may not break the caller, depending on its logic.
5202	 */
5203	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5204		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
5205		ret = comp_keys(&found_key, &orig_key);
5206		if (ret == 0) {
5207			if (path->slots[0] > 0) {
5208				path->slots[0]--;
5209				return 0;
5210			}
5211			/*
5212			 * At slot 0, same key as before, it means orig_key is
5213			 * the lowest, leftmost, key in the tree. We're done.
5214			 */
5215			return 1;
5216		}
5217	}
5218
5219	btrfs_item_key(path->nodes[0], &found_key, 0);
5220	ret = comp_keys(&found_key, &key);
5221	/*
5222	 * We might have had an item with the previous key in the tree right
5223	 * before we released our path. And after we released our path, that
5224	 * item might have been pushed to the first slot (0) of the leaf we
5225	 * were holding due to a tree balance. Alternatively, an item with the
5226	 * previous key can exist as the only element of a leaf (big fat item).
5227	 * Therefore account for these 2 cases, so that our callers (like
5228	 * btrfs_previous_item) don't miss an existing item with a key matching
5229	 * the previous key we computed above.
5230	 */
5231	if (ret <= 0)
5232		return 0;
5233	return 1;
5234}
5235
5236/*
5237 * A helper function to walk down the tree starting at min_key, and looking
5238 * for nodes or leaves that are have a minimum transaction id.
5239 * This is used by the btree defrag code, and tree logging
5240 *
5241 * This does not cow, but it does stuff the starting key it finds back
5242 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5243 * key and get a writable path.
5244 *
5245 * This honors path->lowest_level to prevent descent past a given level
5246 * of the tree.
5247 *
5248 * min_trans indicates the oldest transaction that you are interested
5249 * in walking through.  Any nodes or leaves older than min_trans are
5250 * skipped over (without reading them).
5251 *
5252 * returns zero if something useful was found, < 0 on error and 1 if there
5253 * was nothing in the tree that matched the search criteria.
5254 */
5255int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5256			 struct btrfs_path *path,
5257			 u64 min_trans)
5258{
5259	struct extent_buffer *cur;
5260	struct btrfs_key found_key;
5261	int slot;
5262	int sret;
5263	u32 nritems;
5264	int level;
5265	int ret = 1;
5266	int keep_locks = path->keep_locks;
5267
5268	path->keep_locks = 1;
5269again:
5270	cur = btrfs_read_lock_root_node(root);
5271	level = btrfs_header_level(cur);
5272	WARN_ON(path->nodes[level]);
5273	path->nodes[level] = cur;
5274	path->locks[level] = BTRFS_READ_LOCK;
5275
5276	if (btrfs_header_generation(cur) < min_trans) {
5277		ret = 1;
5278		goto out;
5279	}
5280	while (1) {
5281		nritems = btrfs_header_nritems(cur);
5282		level = btrfs_header_level(cur);
5283		sret = btrfs_bin_search(cur, min_key, &slot);
5284		if (sret < 0) {
5285			ret = sret;
5286			goto out;
5287		}
5288
5289		/* at the lowest level, we're done, setup the path and exit */
5290		if (level == path->lowest_level) {
5291			if (slot >= nritems)
5292				goto find_next_key;
5293			ret = 0;
5294			path->slots[level] = slot;
5295			btrfs_item_key_to_cpu(cur, &found_key, slot);
5296			goto out;
5297		}
5298		if (sret && slot > 0)
5299			slot--;
5300		/*
5301		 * check this node pointer against the min_trans parameters.
5302		 * If it is too old, skip to the next one.
5303		 */
5304		while (slot < nritems) {
5305			u64 gen;
5306
5307			gen = btrfs_node_ptr_generation(cur, slot);
5308			if (gen < min_trans) {
5309				slot++;
5310				continue;
5311			}
5312			break;
5313		}
5314find_next_key:
5315		/*
5316		 * we didn't find a candidate key in this node, walk forward
5317		 * and find another one
5318		 */
5319		if (slot >= nritems) {
5320			path->slots[level] = slot;
5321			btrfs_set_path_blocking(path);
5322			sret = btrfs_find_next_key(root, path, min_key, level,
5323						  min_trans);
5324			if (sret == 0) {
5325				btrfs_release_path(path);
5326				goto again;
5327			} else {
5328				goto out;
5329			}
5330		}
5331		/* save our key for returning back */
5332		btrfs_node_key_to_cpu(cur, &found_key, slot);
5333		path->slots[level] = slot;
5334		if (level == path->lowest_level) {
5335			ret = 0;
5336			goto out;
5337		}
5338		btrfs_set_path_blocking(path);
5339		cur = btrfs_read_node_slot(cur, slot);
5340		if (IS_ERR(cur)) {
5341			ret = PTR_ERR(cur);
5342			goto out;
5343		}
5344
5345		btrfs_tree_read_lock(cur);
5346
5347		path->locks[level - 1] = BTRFS_READ_LOCK;
5348		path->nodes[level - 1] = cur;
5349		unlock_up(path, level, 1, 0, NULL);
5350	}
5351out:
5352	path->keep_locks = keep_locks;
5353	if (ret == 0) {
5354		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5355		btrfs_set_path_blocking(path);
5356		memcpy(min_key, &found_key, sizeof(found_key));
5357	}
5358	return ret;
5359}
5360
5361/*
5362 * this is similar to btrfs_next_leaf, but does not try to preserve
5363 * and fixup the path.  It looks for and returns the next key in the
5364 * tree based on the current path and the min_trans parameters.
5365 *
5366 * 0 is returned if another key is found, < 0 if there are any errors
5367 * and 1 is returned if there are no higher keys in the tree
5368 *
5369 * path->keep_locks should be set to 1 on the search made before
5370 * calling this function.
5371 */
5372int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5373			struct btrfs_key *key, int level, u64 min_trans)
5374{
5375	int slot;
5376	struct extent_buffer *c;
5377
5378	WARN_ON(!path->keep_locks && !path->skip_locking);
5379	while (level < BTRFS_MAX_LEVEL) {
5380		if (!path->nodes[level])
5381			return 1;
5382
5383		slot = path->slots[level] + 1;
5384		c = path->nodes[level];
5385next:
5386		if (slot >= btrfs_header_nritems(c)) {
5387			int ret;
5388			int orig_lowest;
5389			struct btrfs_key cur_key;
5390			if (level + 1 >= BTRFS_MAX_LEVEL ||
5391			    !path->nodes[level + 1])
5392				return 1;
5393
5394			if (path->locks[level + 1] || path->skip_locking) {
5395				level++;
5396				continue;
5397			}
5398
5399			slot = btrfs_header_nritems(c) - 1;
5400			if (level == 0)
5401				btrfs_item_key_to_cpu(c, &cur_key, slot);
5402			else
5403				btrfs_node_key_to_cpu(c, &cur_key, slot);
5404
5405			orig_lowest = path->lowest_level;
5406			btrfs_release_path(path);
5407			path->lowest_level = level;
5408			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5409						0, 0);
5410			path->lowest_level = orig_lowest;
5411			if (ret < 0)
5412				return ret;
5413
5414			c = path->nodes[level];
5415			slot = path->slots[level];
5416			if (ret == 0)
5417				slot++;
5418			goto next;
5419		}
5420
5421		if (level == 0)
5422			btrfs_item_key_to_cpu(c, key, slot);
5423		else {
5424			u64 gen = btrfs_node_ptr_generation(c, slot);
5425
5426			if (gen < min_trans) {
5427				slot++;
5428				goto next;
5429			}
5430			btrfs_node_key_to_cpu(c, key, slot);
5431		}
5432		return 0;
5433	}
5434	return 1;
5435}
5436
5437/*
5438 * search the tree again to find a leaf with greater keys
5439 * returns 0 if it found something or 1 if there are no greater leaves.
5440 * returns < 0 on io errors.
5441 */
5442int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5443{
5444	return btrfs_next_old_leaf(root, path, 0);
5445}
5446
5447int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5448			u64 time_seq)
5449{
5450	int slot;
5451	int level;
5452	struct extent_buffer *c;
5453	struct extent_buffer *next;
5454	struct btrfs_key key;
5455	u32 nritems;
5456	int ret;
5457	int old_spinning = path->leave_spinning;
5458	int next_rw_lock = 0;
5459
5460	nritems = btrfs_header_nritems(path->nodes[0]);
5461	if (nritems == 0)
5462		return 1;
5463
5464	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5465again:
5466	level = 1;
5467	next = NULL;
5468	next_rw_lock = 0;
5469	btrfs_release_path(path);
5470
5471	path->keep_locks = 1;
5472	path->leave_spinning = 1;
5473
5474	if (time_seq)
5475		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5476	else
5477		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5478	path->keep_locks = 0;
5479
5480	if (ret < 0)
5481		return ret;
5482
5483	nritems = btrfs_header_nritems(path->nodes[0]);
5484	/*
5485	 * by releasing the path above we dropped all our locks.  A balance
5486	 * could have added more items next to the key that used to be
5487	 * at the very end of the block.  So, check again here and
5488	 * advance the path if there are now more items available.
5489	 */
5490	if (nritems > 0 && path->slots[0] < nritems - 1) {
5491		if (ret == 0)
5492			path->slots[0]++;
5493		ret = 0;
5494		goto done;
5495	}
5496	/*
5497	 * So the above check misses one case:
5498	 * - after releasing the path above, someone has removed the item that
5499	 *   used to be at the very end of the block, and balance between leafs
5500	 *   gets another one with bigger key.offset to replace it.
5501	 *
5502	 * This one should be returned as well, or we can get leaf corruption
5503	 * later(esp. in __btrfs_drop_extents()).
5504	 *
5505	 * And a bit more explanation about this check,
5506	 * with ret > 0, the key isn't found, the path points to the slot
5507	 * where it should be inserted, so the path->slots[0] item must be the
5508	 * bigger one.
5509	 */
5510	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5511		ret = 0;
5512		goto done;
5513	}
5514
5515	while (level < BTRFS_MAX_LEVEL) {
5516		if (!path->nodes[level]) {
5517			ret = 1;
5518			goto done;
5519		}
5520
5521		slot = path->slots[level] + 1;
5522		c = path->nodes[level];
5523		if (slot >= btrfs_header_nritems(c)) {
5524			level++;
5525			if (level == BTRFS_MAX_LEVEL) {
5526				ret = 1;
5527				goto done;
5528			}
5529			continue;
5530		}
5531
5532		if (next) {
5533			btrfs_tree_unlock_rw(next, next_rw_lock);
5534			free_extent_buffer(next);
5535		}
5536
5537		next = c;
5538		next_rw_lock = path->locks[level];
5539		ret = read_block_for_search(root, path, &next, level,
5540					    slot, &key);
5541		if (ret == -EAGAIN)
5542			goto again;
5543
5544		if (ret < 0) {
5545			btrfs_release_path(path);
5546			goto done;
5547		}
5548
5549		if (!path->skip_locking) {
5550			ret = btrfs_try_tree_read_lock(next);
5551			if (!ret && time_seq) {
5552				/*
5553				 * If we don't get the lock, we may be racing
5554				 * with push_leaf_left, holding that lock while
5555				 * itself waiting for the leaf we've currently
5556				 * locked. To solve this situation, we give up
5557				 * on our lock and cycle.
5558				 */
5559				free_extent_buffer(next);
5560				btrfs_release_path(path);
5561				cond_resched();
5562				goto again;
5563			}
5564			if (!ret) {
5565				btrfs_set_path_blocking(path);
5566				__btrfs_tree_read_lock(next,
5567						       BTRFS_NESTING_RIGHT,
5568						       path->recurse);
5569			}
5570			next_rw_lock = BTRFS_READ_LOCK;
5571		}
5572		break;
5573	}
5574	path->slots[level] = slot;
5575	while (1) {
5576		level--;
5577		c = path->nodes[level];
5578		if (path->locks[level])
5579			btrfs_tree_unlock_rw(c, path->locks[level]);
5580
5581		free_extent_buffer(c);
5582		path->nodes[level] = next;
5583		path->slots[level] = 0;
5584		if (!path->skip_locking)
5585			path->locks[level] = next_rw_lock;
5586		if (!level)
5587			break;
5588
5589		ret = read_block_for_search(root, path, &next, level,
5590					    0, &key);
5591		if (ret == -EAGAIN)
5592			goto again;
5593
5594		if (ret < 0) {
5595			btrfs_release_path(path);
5596			goto done;
5597		}
5598
5599		if (!path->skip_locking) {
5600			ret = btrfs_try_tree_read_lock(next);
5601			if (!ret) {
5602				btrfs_set_path_blocking(path);
5603				__btrfs_tree_read_lock(next,
5604						       BTRFS_NESTING_RIGHT,
5605						       path->recurse);
5606			}
5607			next_rw_lock = BTRFS_READ_LOCK;
5608		}
5609	}
5610	ret = 0;
5611done:
5612	unlock_up(path, 0, 1, 0, NULL);
5613	path->leave_spinning = old_spinning;
5614	if (!old_spinning)
5615		btrfs_set_path_blocking(path);
5616
5617	return ret;
5618}
5619
5620/*
5621 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5622 * searching until it gets past min_objectid or finds an item of 'type'
5623 *
5624 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5625 */
5626int btrfs_previous_item(struct btrfs_root *root,
5627			struct btrfs_path *path, u64 min_objectid,
5628			int type)
5629{
5630	struct btrfs_key found_key;
5631	struct extent_buffer *leaf;
5632	u32 nritems;
5633	int ret;
5634
5635	while (1) {
5636		if (path->slots[0] == 0) {
5637			btrfs_set_path_blocking(path);
5638			ret = btrfs_prev_leaf(root, path);
5639			if (ret != 0)
5640				return ret;
5641		} else {
5642			path->slots[0]--;
5643		}
5644		leaf = path->nodes[0];
5645		nritems = btrfs_header_nritems(leaf);
5646		if (nritems == 0)
5647			return 1;
5648		if (path->slots[0] == nritems)
5649			path->slots[0]--;
5650
5651		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5652		if (found_key.objectid < min_objectid)
5653			break;
5654		if (found_key.type == type)
5655			return 0;
5656		if (found_key.objectid == min_objectid &&
5657		    found_key.type < type)
5658			break;
5659	}
5660	return 1;
5661}
5662
5663/*
5664 * search in extent tree to find a previous Metadata/Data extent item with
5665 * min objecitd.
5666 *
5667 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5668 */
5669int btrfs_previous_extent_item(struct btrfs_root *root,
5670			struct btrfs_path *path, u64 min_objectid)
5671{
5672	struct btrfs_key found_key;
5673	struct extent_buffer *leaf;
5674	u32 nritems;
5675	int ret;
5676
5677	while (1) {
5678		if (path->slots[0] == 0) {
5679			btrfs_set_path_blocking(path);
5680			ret = btrfs_prev_leaf(root, path);
5681			if (ret != 0)
5682				return ret;
5683		} else {
5684			path->slots[0]--;
5685		}
5686		leaf = path->nodes[0];
5687		nritems = btrfs_header_nritems(leaf);
5688		if (nritems == 0)
5689			return 1;
5690		if (path->slots[0] == nritems)
5691			path->slots[0]--;
5692
5693		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5694		if (found_key.objectid < min_objectid)
5695			break;
5696		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5697		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5698			return 0;
5699		if (found_key.objectid == min_objectid &&
5700		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5701			break;
5702	}
5703	return 1;
5704}
5705