xref: /kernel/linux/linux-5.10/fs/btrfs/extent-tree.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/sched.h>
7#include <linux/sched/signal.h>
8#include <linux/pagemap.h>
9#include <linux/writeback.h>
10#include <linux/blkdev.h>
11#include <linux/sort.h>
12#include <linux/rcupdate.h>
13#include <linux/kthread.h>
14#include <linux/slab.h>
15#include <linux/ratelimit.h>
16#include <linux/percpu_counter.h>
17#include <linux/lockdep.h>
18#include <linux/crc32c.h>
19#include "misc.h"
20#include "tree-log.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "volumes.h"
24#include "raid56.h"
25#include "locking.h"
26#include "free-space-cache.h"
27#include "free-space-tree.h"
28#include "sysfs.h"
29#include "qgroup.h"
30#include "ref-verify.h"
31#include "space-info.h"
32#include "block-rsv.h"
33#include "delalloc-space.h"
34#include "block-group.h"
35#include "discard.h"
36#include "rcu-string.h"
37
38#undef SCRAMBLE_DELAYED_REFS
39
40
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42			       struct btrfs_delayed_ref_node *node, u64 parent,
43			       u64 root_objectid, u64 owner_objectid,
44			       u64 owner_offset, int refs_to_drop,
45			       struct btrfs_delayed_extent_op *extra_op);
46static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
47				    struct extent_buffer *leaf,
48				    struct btrfs_extent_item *ei);
49static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
50				      u64 parent, u64 root_objectid,
51				      u64 flags, u64 owner, u64 offset,
52				      struct btrfs_key *ins, int ref_mod);
53static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
54				     struct btrfs_delayed_ref_node *node,
55				     struct btrfs_delayed_extent_op *extent_op);
56static int find_next_key(struct btrfs_path *path, int level,
57			 struct btrfs_key *key);
58
59static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
60{
61	return (cache->flags & bits) == bits;
62}
63
64int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
65			      u64 start, u64 num_bytes)
66{
67	u64 end = start + num_bytes - 1;
68	set_extent_bits(&fs_info->excluded_extents, start, end,
69			EXTENT_UPTODATE);
70	return 0;
71}
72
73void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
74{
75	struct btrfs_fs_info *fs_info = cache->fs_info;
76	u64 start, end;
77
78	start = cache->start;
79	end = start + cache->length - 1;
80
81	clear_extent_bits(&fs_info->excluded_extents, start, end,
82			  EXTENT_UPTODATE);
83}
84
85/* simple helper to search for an existing data extent at a given offset */
86int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
87{
88	int ret;
89	struct btrfs_key key;
90	struct btrfs_path *path;
91
92	path = btrfs_alloc_path();
93	if (!path)
94		return -ENOMEM;
95
96	key.objectid = start;
97	key.offset = len;
98	key.type = BTRFS_EXTENT_ITEM_KEY;
99	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
100	btrfs_free_path(path);
101	return ret;
102}
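/*
 * Note (illustrative, not part of the original source): the helper above
 * simply returns the raw btrfs_search_slot() result, i.e. 0 when an
 * EXTENT_ITEM with exactly (start, len) exists in the extent tree, a
 * positive value when it does not, and a negative errno on failure.
 */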
103
104/*
105 * Helper function to look up the reference count and flags of a tree block.
106 *
107 * The head node for a delayed ref is used to store the sum of all the
108 * reference count modifications queued up in the rbtree. The head
109 * node may also store the extent flags to set. This way you can check
110 * what the reference count and extent flags would be once all of the
111 * queued delayed refs have been processed, without actually running them.
112 */
113int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
114			     struct btrfs_fs_info *fs_info, u64 bytenr,
115			     u64 offset, int metadata, u64 *refs, u64 *flags)
116{
117	struct btrfs_delayed_ref_head *head;
118	struct btrfs_delayed_ref_root *delayed_refs;
119	struct btrfs_path *path;
120	struct btrfs_extent_item *ei;
121	struct extent_buffer *leaf;
122	struct btrfs_key key;
123	u32 item_size;
124	u64 num_refs;
125	u64 extent_flags;
126	int ret;
127
128	/*
129	 * If we don't have skinny metadata, don't bother doing anything
130	 * different
131	 */
132	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
133		offset = fs_info->nodesize;
134		metadata = 0;
135	}
136
137	path = btrfs_alloc_path();
138	if (!path)
139		return -ENOMEM;
140
141	if (!trans) {
142		path->skip_locking = 1;
143		path->search_commit_root = 1;
144	}
145
146search_again:
147	key.objectid = bytenr;
148	key.offset = offset;
149	if (metadata)
150		key.type = BTRFS_METADATA_ITEM_KEY;
151	else
152		key.type = BTRFS_EXTENT_ITEM_KEY;
153
154	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
155	if (ret < 0)
156		goto out_free;
157
158	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
159		if (path->slots[0]) {
160			path->slots[0]--;
161			btrfs_item_key_to_cpu(path->nodes[0], &key,
162					      path->slots[0]);
163			if (key.objectid == bytenr &&
164			    key.type == BTRFS_EXTENT_ITEM_KEY &&
165			    key.offset == fs_info->nodesize)
166				ret = 0;
167		}
168	}
169
170	if (ret == 0) {
171		leaf = path->nodes[0];
172		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
173		if (item_size >= sizeof(*ei)) {
174			ei = btrfs_item_ptr(leaf, path->slots[0],
175					    struct btrfs_extent_item);
176			num_refs = btrfs_extent_refs(leaf, ei);
177			extent_flags = btrfs_extent_flags(leaf, ei);
178		} else {
179			ret = -EINVAL;
180			btrfs_print_v0_err(fs_info);
181			if (trans)
182				btrfs_abort_transaction(trans, ret);
183			else
184				btrfs_handle_fs_error(fs_info, ret, NULL);
185
186			goto out_free;
187		}
188
189		BUG_ON(num_refs == 0);
190	} else {
191		num_refs = 0;
192		extent_flags = 0;
193		ret = 0;
194	}
195
196	if (!trans)
197		goto out;
198
199	delayed_refs = &trans->transaction->delayed_refs;
200	spin_lock(&delayed_refs->lock);
201	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
202	if (head) {
203		if (!mutex_trylock(&head->mutex)) {
204			refcount_inc(&head->refs);
205			spin_unlock(&delayed_refs->lock);
206
207			btrfs_release_path(path);
208
209			/*
210			 * Mutex was contended, block until it's released and try
211			 * Mutex was contended; block until it's released and try
212			 */
213			mutex_lock(&head->mutex);
214			mutex_unlock(&head->mutex);
215			btrfs_put_delayed_ref_head(head);
216			goto search_again;
217		}
218		spin_lock(&head->lock);
219		if (head->extent_op && head->extent_op->update_flags)
220			extent_flags |= head->extent_op->flags_to_set;
221		else
222			BUG_ON(num_refs == 0);
223
224		num_refs += head->ref_mod;
225		spin_unlock(&head->lock);
226		mutex_unlock(&head->mutex);
227	}
228	spin_unlock(&delayed_refs->lock);
229out:
230	WARN_ON(num_refs == 0);
231	if (refs)
232		*refs = num_refs;
233	if (flags)
234		*flags = extent_flags;
235out_free:
236	btrfs_free_path(path);
237	return ret;
238}
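/*
 * Example (illustrative, not part of the original source): if the on-disk
 * extent item records refs == 2 and the delayed ref head for this bytenr
 * carries a queued ref_mod of -1, the function reports refs == 1, i.e. the
 * value the extent would have once the queued delayed refs are run.
 */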
239
240/*
241 * Back reference rules.  Back refs have three main goals:
242 *
243 * 1) differentiate between all holders of references to an extent so that
244 *    when a reference is dropped we can make sure it was a valid reference
245 *    before freeing the extent.
246 *
247 * 2) Provide enough information to quickly find the holders of an extent
248 *    if we notice a given block is corrupted or bad.
249 *
250 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
251 *    maintenance.  This is actually the same as #2, but with a slightly
252 *    different use case.
253 *
254 * There are two kinds of back refs. The implicit back refs is optimized
255 * for pointers in non-shared tree blocks. For a given pointer in a block,
256 * back refs of this kind provide information about the block's owner tree
257 * and the pointer's key. This information allows us to find the block by
258 * b-tree searching. The full back refs is for pointers in tree blocks not
259 * referenced by their owner trees. The location of the tree block is
260 * recorded in the back refs. Actually the full back refs is generic, and can
261 * be used in all cases where the implicit back refs is used. The major
262 * shortcoming of the full back refs is its overhead. Every time a tree block
263 * gets COWed, we have to update the back ref entries for all pointers in it.
264 *
265 * For a newly allocated tree block, we use implicit back refs for
266 * pointers in it. This means most tree related operations only involve
267 * implicit back refs. For a tree block created in an old transaction, the
268 * only way to drop a reference to it is to COW it. So we can detect the
269 * event that a tree block loses its owner tree's reference and do the
270 * back refs conversion.
271 *
272 * When a tree block is COWed through a tree, there are four cases:
273 *
274 * The reference count of the block is one and the tree is the block's
275 * owner tree. Nothing to do in this case.
276 *
277 * The reference count of the block is one and the tree is not the
278 * block's owner tree. In this case, full back refs is used for pointers
279 * in the block. Remove these full back refs, add implicit back refs for
280 * every pointer in the new block.
281 *
282 * The reference count of the block is greater than one and the tree is
283 * the block's owner tree. In this case, implicit back refs is used for
284 * pointers in the block. Add full back refs for every pointer in the
285 * block, increase lower level extents' reference counts. The original
286 * implicit back refs are carried over to the new block.
287 *
288 * The reference count of the block is greater than one and the tree is
289 * not the block's owner tree. Add implicit back refs for every pointer in
290 * the new block, increase lower level extents' reference count.
291 *
292 * Back Reference Key composing:
293 *
294 * The key objectid corresponds to the first byte in the extent,
295 * The key type is used to differentiate between types of back refs.
296 * There are different meanings of the key offset for different types
297 * of back refs.
298 *
299 * File extents can be referenced by:
300 *
301 * - multiple snapshots, subvolumes, or different generations in one subvol
302 * - different files inside a single subvolume
303 * - different offsets inside a file (bookend extents in file.c)
304 *
305 * The extent ref structure for the implicit back refs has fields for:
306 *
307 * - Objectid of the subvolume root
308 * - objectid of the file holding the reference
309 * - original offset in the file
310 * - how many bookend extents
311 *
312 * The key offset for the implicit back refs is a hash of the first
313 * three fields.
314 *
315 * The extent ref structure for the full back refs has a field for:
316 *
317 * - number of pointers in the tree leaf
318 *
319 * The key offset for the full back refs is the first byte of
320 * the tree leaf.
321 *
322 * When a file extent is allocated, the implicit back refs is used
323 * and the fields are filled in:
324 *
325 *     (root_key.objectid, inode objectid, offset in file, 1)
326 *
327 * When a file extent is removed during file truncation, we find the
328 * corresponding implicit back refs and check the following fields:
329 *
330 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
331 *
332 * Btree extents can be referenced by:
333 *
334 * - Different subvolumes
335 *
336 * Both the implicit back refs and the full back refs for tree blocks
337 * only consist of a key. The key offset for the implicit back refs is the
338 * objectid of the block's owner tree. The key offset for the full back refs
339 * is the first byte of the parent block.
340 *
341 * When the implicit back refs is used, information about the lowest key and
342 * level of the tree block is required. This information is stored in
343 * the tree block info structure.
344 */
345
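/*
 * Illustrative sketch (not part of the original source): the key shapes
 * described above, as they are built by lookup_extent_data_ref() and
 * lookup_tree_block_ref() further down.
 *
 *   implicit data back ref:
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash of (root objectid, inode objectid, file offset))
 *
 *   full (shared) data back ref:
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent block bytenr)
 *
 *   implicit tree block back ref:
 *     (extent bytenr, BTRFS_TREE_BLOCK_REF_KEY, owner root objectid)
 *
 *   full (shared) tree block back ref:
 *     (extent bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent block bytenr)
 */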
346/*
347 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
348 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
349 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
350 */
351int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
352				     struct btrfs_extent_inline_ref *iref,
353				     enum btrfs_inline_ref_type is_data)
354{
355	int type = btrfs_extent_inline_ref_type(eb, iref);
356	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
357
358	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
359	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
360	    type == BTRFS_SHARED_DATA_REF_KEY ||
361	    type == BTRFS_EXTENT_DATA_REF_KEY) {
362		if (is_data == BTRFS_REF_TYPE_BLOCK) {
363			if (type == BTRFS_TREE_BLOCK_REF_KEY)
364				return type;
365			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
366				ASSERT(eb->fs_info);
367				/*
368				 * Every shared ref has a parent tree block,
369				 * which must be aligned to the sector size.
370				 */
371				if (offset &&
372				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
373					return type;
374			}
375		} else if (is_data == BTRFS_REF_TYPE_DATA) {
376			if (type == BTRFS_EXTENT_DATA_REF_KEY)
377				return type;
378			if (type == BTRFS_SHARED_DATA_REF_KEY) {
379				ASSERT(eb->fs_info);
380				/*
381				 * Every shared ref has a parent tree block,
382				 * which must be aligned to the sector size.
383				 */
384				if (offset &&
385				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
386					return type;
387			}
388		} else {
389			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
390			return type;
391		}
392	}
393
394	btrfs_print_leaf((struct extent_buffer *)eb);
395	btrfs_err(eb->fs_info,
396		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
397		  eb->start, (unsigned long)iref, type);
398	WARN_ON(1);
399
400	return BTRFS_REF_TYPE_INVALID;
401}
402
403u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
404{
405	u32 high_crc = ~(u32)0;
406	u32 low_crc = ~(u32)0;
407	__le64 lenum;
408
409	lenum = cpu_to_le64(root_objectid);
410	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
411	lenum = cpu_to_le64(owner);
412	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
413	lenum = cpu_to_le64(offset);
414	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
415
416	return ((u64)high_crc << 31) ^ (u64)low_crc;
417}
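/*
 * Note (illustrative, not part of the original source): the value returned
 * above is what fills the key offset of BTRFS_EXTENT_DATA_REF_KEY items and
 * of inline extent data refs; it chains crc32c over the root objectid (high
 * part) and over owner then offset (low part), combined as (high << 31) ^ low.
 */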
418
419static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
420				     struct btrfs_extent_data_ref *ref)
421{
422	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
423				    btrfs_extent_data_ref_objectid(leaf, ref),
424				    btrfs_extent_data_ref_offset(leaf, ref));
425}
426
427static int match_extent_data_ref(struct extent_buffer *leaf,
428				 struct btrfs_extent_data_ref *ref,
429				 u64 root_objectid, u64 owner, u64 offset)
430{
431	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
432	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
433	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
434		return 0;
435	return 1;
436}
437
438static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
439					   struct btrfs_path *path,
440					   u64 bytenr, u64 parent,
441					   u64 root_objectid,
442					   u64 owner, u64 offset)
443{
444	struct btrfs_root *root = trans->fs_info->extent_root;
445	struct btrfs_key key;
446	struct btrfs_extent_data_ref *ref;
447	struct extent_buffer *leaf;
448	u32 nritems;
449	int ret;
450	int recow;
451	int err = -ENOENT;
452
453	key.objectid = bytenr;
454	if (parent) {
455		key.type = BTRFS_SHARED_DATA_REF_KEY;
456		key.offset = parent;
457	} else {
458		key.type = BTRFS_EXTENT_DATA_REF_KEY;
459		key.offset = hash_extent_data_ref(root_objectid,
460						  owner, offset);
461	}
462again:
463	recow = 0;
464	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
465	if (ret < 0) {
466		err = ret;
467		goto fail;
468	}
469
470	if (parent) {
471		if (!ret)
472			return 0;
473		goto fail;
474	}
475
476	leaf = path->nodes[0];
477	nritems = btrfs_header_nritems(leaf);
478	while (1) {
479		if (path->slots[0] >= nritems) {
480			ret = btrfs_next_leaf(root, path);
481			if (ret < 0)
482				err = ret;
483			if (ret)
484				goto fail;
485
486			leaf = path->nodes[0];
487			nritems = btrfs_header_nritems(leaf);
488			recow = 1;
489		}
490
491		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
492		if (key.objectid != bytenr ||
493		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
494			goto fail;
495
496		ref = btrfs_item_ptr(leaf, path->slots[0],
497				     struct btrfs_extent_data_ref);
498
499		if (match_extent_data_ref(leaf, ref, root_objectid,
500					  owner, offset)) {
501			if (recow) {
502				btrfs_release_path(path);
503				goto again;
504			}
505			err = 0;
506			break;
507		}
508		path->slots[0]++;
509	}
510fail:
511	return err;
512}
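/*
 * Note (illustrative, not part of the original source): since the key offset
 * is only a hash of (root, owner, offset), unrelated refs can collide on the
 * same offset.  That is why the loop above keeps walking forward through the
 * EXTENT_DATA_REF items of this bytenr until match_extent_data_ref() hits,
 * and why insert_extent_data_ref() below bumps key.offset on -EEXIST until
 * it finds either a free slot or the already existing matching item.
 */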
513
514static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
515					   struct btrfs_path *path,
516					   u64 bytenr, u64 parent,
517					   u64 root_objectid, u64 owner,
518					   u64 offset, int refs_to_add)
519{
520	struct btrfs_root *root = trans->fs_info->extent_root;
521	struct btrfs_key key;
522	struct extent_buffer *leaf;
523	u32 size;
524	u32 num_refs;
525	int ret;
526
527	key.objectid = bytenr;
528	if (parent) {
529		key.type = BTRFS_SHARED_DATA_REF_KEY;
530		key.offset = parent;
531		size = sizeof(struct btrfs_shared_data_ref);
532	} else {
533		key.type = BTRFS_EXTENT_DATA_REF_KEY;
534		key.offset = hash_extent_data_ref(root_objectid,
535						  owner, offset);
536		size = sizeof(struct btrfs_extent_data_ref);
537	}
538
539	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
540	if (ret && ret != -EEXIST)
541		goto fail;
542
543	leaf = path->nodes[0];
544	if (parent) {
545		struct btrfs_shared_data_ref *ref;
546		ref = btrfs_item_ptr(leaf, path->slots[0],
547				     struct btrfs_shared_data_ref);
548		if (ret == 0) {
549			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
550		} else {
551			num_refs = btrfs_shared_data_ref_count(leaf, ref);
552			num_refs += refs_to_add;
553			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
554		}
555	} else {
556		struct btrfs_extent_data_ref *ref;
557		while (ret == -EEXIST) {
558			ref = btrfs_item_ptr(leaf, path->slots[0],
559					     struct btrfs_extent_data_ref);
560			if (match_extent_data_ref(leaf, ref, root_objectid,
561						  owner, offset))
562				break;
563			btrfs_release_path(path);
564			key.offset++;
565			ret = btrfs_insert_empty_item(trans, root, path, &key,
566						      size);
567			if (ret && ret != -EEXIST)
568				goto fail;
569
570			leaf = path->nodes[0];
571		}
572		ref = btrfs_item_ptr(leaf, path->slots[0],
573				     struct btrfs_extent_data_ref);
574		if (ret == 0) {
575			btrfs_set_extent_data_ref_root(leaf, ref,
576						       root_objectid);
577			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
578			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
579			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
580		} else {
581			num_refs = btrfs_extent_data_ref_count(leaf, ref);
582			num_refs += refs_to_add;
583			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
584		}
585	}
586	btrfs_mark_buffer_dirty(leaf);
587	ret = 0;
588fail:
589	btrfs_release_path(path);
590	return ret;
591}
592
593static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
594					   struct btrfs_path *path,
595					   int refs_to_drop, int *last_ref)
596{
597	struct btrfs_key key;
598	struct btrfs_extent_data_ref *ref1 = NULL;
599	struct btrfs_shared_data_ref *ref2 = NULL;
600	struct extent_buffer *leaf;
601	u32 num_refs = 0;
602	int ret = 0;
603
604	leaf = path->nodes[0];
605	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
606
607	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
608		ref1 = btrfs_item_ptr(leaf, path->slots[0],
609				      struct btrfs_extent_data_ref);
610		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
611	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
612		ref2 = btrfs_item_ptr(leaf, path->slots[0],
613				      struct btrfs_shared_data_ref);
614		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
615	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
616		btrfs_print_v0_err(trans->fs_info);
617		btrfs_abort_transaction(trans, -EINVAL);
618		return -EINVAL;
619	} else {
620		BUG();
621	}
622
623	BUG_ON(num_refs < refs_to_drop);
624	num_refs -= refs_to_drop;
625
626	if (num_refs == 0) {
627		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
628		*last_ref = 1;
629	} else {
630		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
631			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
632		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
633			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
634		btrfs_mark_buffer_dirty(leaf);
635	}
636	return ret;
637}
638
639static noinline u32 extent_data_ref_count(struct btrfs_path *path,
640					  struct btrfs_extent_inline_ref *iref)
641{
642	struct btrfs_key key;
643	struct extent_buffer *leaf;
644	struct btrfs_extent_data_ref *ref1;
645	struct btrfs_shared_data_ref *ref2;
646	u32 num_refs = 0;
647	int type;
648
649	leaf = path->nodes[0];
650	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
651
652	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
653	if (iref) {
654		/*
655		 * If type is invalid, we should have bailed out earlier than
656		 * this call.
657		 */
658		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
659		ASSERT(type != BTRFS_REF_TYPE_INVALID);
660		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
661			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
662			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
663		} else {
664			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
665			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
666		}
667	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
668		ref1 = btrfs_item_ptr(leaf, path->slots[0],
669				      struct btrfs_extent_data_ref);
670		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
671	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
672		ref2 = btrfs_item_ptr(leaf, path->slots[0],
673				      struct btrfs_shared_data_ref);
674		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
675	} else {
676		WARN_ON(1);
677	}
678	return num_refs;
679}
680
681static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
682					  struct btrfs_path *path,
683					  u64 bytenr, u64 parent,
684					  u64 root_objectid)
685{
686	struct btrfs_root *root = trans->fs_info->extent_root;
687	struct btrfs_key key;
688	int ret;
689
690	key.objectid = bytenr;
691	if (parent) {
692		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
693		key.offset = parent;
694	} else {
695		key.type = BTRFS_TREE_BLOCK_REF_KEY;
696		key.offset = root_objectid;
697	}
698
699	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
700	if (ret > 0)
701		ret = -ENOENT;
702	return ret;
703}
704
705static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
706					  struct btrfs_path *path,
707					  u64 bytenr, u64 parent,
708					  u64 root_objectid)
709{
710	struct btrfs_key key;
711	int ret;
712
713	key.objectid = bytenr;
714	if (parent) {
715		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
716		key.offset = parent;
717	} else {
718		key.type = BTRFS_TREE_BLOCK_REF_KEY;
719		key.offset = root_objectid;
720	}
721
722	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
723				      path, &key, 0);
724	btrfs_release_path(path);
725	return ret;
726}
727
728static inline int extent_ref_type(u64 parent, u64 owner)
729{
730	int type;
731	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
732		if (parent > 0)
733			type = BTRFS_SHARED_BLOCK_REF_KEY;
734		else
735			type = BTRFS_TREE_BLOCK_REF_KEY;
736	} else {
737		if (parent > 0)
738			type = BTRFS_SHARED_DATA_REF_KEY;
739		else
740			type = BTRFS_EXTENT_DATA_REF_KEY;
741	}
742	return type;
743}
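/*
 * Summary (illustrative, not part of the original source): metadata owners
 * (owner < BTRFS_FIRST_FREE_OBJECTID) map to BTRFS_SHARED_BLOCK_REF_KEY when
 * a parent is given and to BTRFS_TREE_BLOCK_REF_KEY otherwise; data owners
 * map to BTRFS_SHARED_DATA_REF_KEY and BTRFS_EXTENT_DATA_REF_KEY in the same
 * way, matching the back ref kinds described in the back reference comment
 * above.
 */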
744
745static int find_next_key(struct btrfs_path *path, int level,
746			 struct btrfs_key *key)
747
748{
749	for (; level < BTRFS_MAX_LEVEL; level++) {
750		if (!path->nodes[level])
751			break;
752		if (path->slots[level] + 1 >=
753		    btrfs_header_nritems(path->nodes[level]))
754			continue;
755		if (level == 0)
756			btrfs_item_key_to_cpu(path->nodes[level], key,
757					      path->slots[level] + 1);
758		else
759			btrfs_node_key_to_cpu(path->nodes[level], key,
760					      path->slots[level] + 1);
761		return 0;
762	}
763	return 1;
764}
765
766/*
767 * Look for an inline back ref. If the back ref is found, *ref_ret is set
768 * to the address of the inline back ref, and 0 is returned.
769 *
770 * If the back ref isn't found, *ref_ret is set to the address where it
771 * should be inserted, and -ENOENT is returned.
772 *
773 * If insert is true and there are too many inline back refs, the path
774 * points to the extent item, and -EAGAIN is returned.
775 *
776 * NOTE: inline back refs are ordered in the same way that back ref
777 *	 items in the tree are ordered.
778 */
779static noinline_for_stack
780int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
781				 struct btrfs_path *path,
782				 struct btrfs_extent_inline_ref **ref_ret,
783				 u64 bytenr, u64 num_bytes,
784				 u64 parent, u64 root_objectid,
785				 u64 owner, u64 offset, int insert)
786{
787	struct btrfs_fs_info *fs_info = trans->fs_info;
788	struct btrfs_root *root = fs_info->extent_root;
789	struct btrfs_key key;
790	struct extent_buffer *leaf;
791	struct btrfs_extent_item *ei;
792	struct btrfs_extent_inline_ref *iref;
793	u64 flags;
794	u64 item_size;
795	unsigned long ptr;
796	unsigned long end;
797	int extra_size;
798	int type;
799	int want;
800	int ret;
801	int err = 0;
802	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
803	int needed;
804
805	key.objectid = bytenr;
806	key.type = BTRFS_EXTENT_ITEM_KEY;
807	key.offset = num_bytes;
808
809	want = extent_ref_type(parent, owner);
810	if (insert) {
811		extra_size = btrfs_extent_inline_ref_size(want);
812		path->keep_locks = 1;
813	} else
814		extra_size = -1;
815
816	/*
817	 * Owner is our level, so we can just add one to get the level for the
818	 * block we are interested in.
819	 */
820	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
821		key.type = BTRFS_METADATA_ITEM_KEY;
822		key.offset = owner;
823	}
824
825again:
826	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
827	if (ret < 0) {
828		err = ret;
829		goto out;
830	}
831
832	/*
833	 * We may be a newly converted file system which still has the old fat
834	 * extent entries for metadata, so try and see if we have one of those.
835	 */
836	if (ret > 0 && skinny_metadata) {
837		skinny_metadata = false;
838		if (path->slots[0]) {
839			path->slots[0]--;
840			btrfs_item_key_to_cpu(path->nodes[0], &key,
841					      path->slots[0]);
842			if (key.objectid == bytenr &&
843			    key.type == BTRFS_EXTENT_ITEM_KEY &&
844			    key.offset == num_bytes)
845				ret = 0;
846		}
847		if (ret) {
848			key.objectid = bytenr;
849			key.type = BTRFS_EXTENT_ITEM_KEY;
850			key.offset = num_bytes;
851			btrfs_release_path(path);
852			goto again;
853		}
854	}
855
856	if (ret && !insert) {
857		err = -ENOENT;
858		goto out;
859	} else if (WARN_ON(ret)) {
860		btrfs_print_leaf(path->nodes[0]);
861		btrfs_err(fs_info,
862"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
863			  bytenr, num_bytes, parent, root_objectid, owner,
864			  offset);
865		err = -EIO;
866		goto out;
867	}
868
869	leaf = path->nodes[0];
870	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
871	if (unlikely(item_size < sizeof(*ei))) {
872		err = -EINVAL;
873		btrfs_print_v0_err(fs_info);
874		btrfs_abort_transaction(trans, err);
875		goto out;
876	}
877
878	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
879	flags = btrfs_extent_flags(leaf, ei);
880
881	ptr = (unsigned long)(ei + 1);
882	end = (unsigned long)ei + item_size;
883
884	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
885		ptr += sizeof(struct btrfs_tree_block_info);
886		BUG_ON(ptr > end);
887	}
888
889	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
890		needed = BTRFS_REF_TYPE_DATA;
891	else
892		needed = BTRFS_REF_TYPE_BLOCK;
893
894	err = -ENOENT;
895	while (1) {
896		if (ptr >= end) {
897			WARN_ON(ptr > end);
898			break;
899		}
900		iref = (struct btrfs_extent_inline_ref *)ptr;
901		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
902		if (type == BTRFS_REF_TYPE_INVALID) {
903			err = -EUCLEAN;
904			goto out;
905		}
906
907		if (want < type)
908			break;
909		if (want > type) {
910			ptr += btrfs_extent_inline_ref_size(type);
911			continue;
912		}
913
914		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
915			struct btrfs_extent_data_ref *dref;
916			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
917			if (match_extent_data_ref(leaf, dref, root_objectid,
918						  owner, offset)) {
919				err = 0;
920				break;
921			}
922			if (hash_extent_data_ref_item(leaf, dref) <
923			    hash_extent_data_ref(root_objectid, owner, offset))
924				break;
925		} else {
926			u64 ref_offset;
927			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
928			if (parent > 0) {
929				if (parent == ref_offset) {
930					err = 0;
931					break;
932				}
933				if (ref_offset < parent)
934					break;
935			} else {
936				if (root_objectid == ref_offset) {
937					err = 0;
938					break;
939				}
940				if (ref_offset < root_objectid)
941					break;
942			}
943		}
944		ptr += btrfs_extent_inline_ref_size(type);
945	}
946	if (err == -ENOENT && insert) {
947		if (item_size + extra_size >=
948		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
949			err = -EAGAIN;
950			goto out;
951		}
952		/*
953		 * To add a new inline back ref, we have to make sure
954		 * there is no corresponding back ref item.
955		 * For simplicity, we just do not add a new inline back
956		 * ref if there is any kind of item for this block.
957		 */
958		if (find_next_key(path, 0, &key) == 0 &&
959		    key.objectid == bytenr &&
960		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
961			err = -EAGAIN;
962			goto out;
963		}
964	}
965	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
966out:
967	if (insert) {
968		path->keep_locks = 0;
969		btrfs_unlock_up_safe(path, 1);
970	}
971	return err;
972}
973
974/*
975 * helper to add new inline back ref
976 */
977static noinline_for_stack
978void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
979				 struct btrfs_path *path,
980				 struct btrfs_extent_inline_ref *iref,
981				 u64 parent, u64 root_objectid,
982				 u64 owner, u64 offset, int refs_to_add,
983				 struct btrfs_delayed_extent_op *extent_op)
984{
985	struct extent_buffer *leaf;
986	struct btrfs_extent_item *ei;
987	unsigned long ptr;
988	unsigned long end;
989	unsigned long item_offset;
990	u64 refs;
991	int size;
992	int type;
993
994	leaf = path->nodes[0];
995	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
996	item_offset = (unsigned long)iref - (unsigned long)ei;
997
998	type = extent_ref_type(parent, owner);
999	size = btrfs_extent_inline_ref_size(type);
1000
1001	btrfs_extend_item(path, size);
1002
1003	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1004	refs = btrfs_extent_refs(leaf, ei);
1005	refs += refs_to_add;
1006	btrfs_set_extent_refs(leaf, ei, refs);
1007	if (extent_op)
1008		__run_delayed_extent_op(extent_op, leaf, ei);
1009
1010	ptr = (unsigned long)ei + item_offset;
1011	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1012	if (ptr < end - size)
1013		memmove_extent_buffer(leaf, ptr + size, ptr,
1014				      end - size - ptr);
1015
1016	iref = (struct btrfs_extent_inline_ref *)ptr;
1017	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1018	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1019		struct btrfs_extent_data_ref *dref;
1020		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1021		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1022		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1023		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1024		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1025	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1026		struct btrfs_shared_data_ref *sref;
1027		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1028		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1029		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1031		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1032	} else {
1033		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1034	}
1035	btrfs_mark_buffer_dirty(leaf);
1036}
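/*
 * Note (illustrative, not part of the original source): btrfs_extend_item()
 * above grows the extent item by 'size' bytes at its end; the memmove then
 * shifts every inline ref that sorts after the insertion point towards the
 * end, so the new ref can be written at 'ptr' while keeping the inline refs
 * in their sorted order.
 */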
1037
1038static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1039				 struct btrfs_path *path,
1040				 struct btrfs_extent_inline_ref **ref_ret,
1041				 u64 bytenr, u64 num_bytes, u64 parent,
1042				 u64 root_objectid, u64 owner, u64 offset)
1043{
1044	int ret;
1045
1046	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1047					   num_bytes, parent, root_objectid,
1048					   owner, offset, 0);
1049	if (ret != -ENOENT)
1050		return ret;
1051
1052	btrfs_release_path(path);
1053	*ref_ret = NULL;
1054
1055	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1056		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1057					    root_objectid);
1058	} else {
1059		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1060					     root_objectid, owner, offset);
1061	}
1062	return ret;
1063}
1064
1065/*
1066 * helper to update/remove inline back ref
1067 */
1068static noinline_for_stack
1069void update_inline_extent_backref(struct btrfs_path *path,
1070				  struct btrfs_extent_inline_ref *iref,
1071				  int refs_to_mod,
1072				  struct btrfs_delayed_extent_op *extent_op,
1073				  int *last_ref)
1074{
1075	struct extent_buffer *leaf = path->nodes[0];
1076	struct btrfs_extent_item *ei;
1077	struct btrfs_extent_data_ref *dref = NULL;
1078	struct btrfs_shared_data_ref *sref = NULL;
1079	unsigned long ptr;
1080	unsigned long end;
1081	u32 item_size;
1082	int size;
1083	int type;
1084	u64 refs;
1085
1086	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1087	refs = btrfs_extent_refs(leaf, ei);
1088	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1089	refs += refs_to_mod;
1090	btrfs_set_extent_refs(leaf, ei, refs);
1091	if (extent_op)
1092		__run_delayed_extent_op(extent_op, leaf, ei);
1093
1094	/*
1095	 * If type is invalid, we should have bailed out after
1096	 * lookup_inline_extent_backref().
1097	 */
1098	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1099	ASSERT(type != BTRFS_REF_TYPE_INVALID);
1100
1101	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1102		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1103		refs = btrfs_extent_data_ref_count(leaf, dref);
1104	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1105		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1106		refs = btrfs_shared_data_ref_count(leaf, sref);
1107	} else {
1108		refs = 1;
1109		BUG_ON(refs_to_mod != -1);
1110	}
1111
1112	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1113	refs += refs_to_mod;
1114
1115	if (refs > 0) {
1116		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1117			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1118		else
1119			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1120	} else {
1121		*last_ref = 1;
1122		size =  btrfs_extent_inline_ref_size(type);
1123		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1124		ptr = (unsigned long)iref;
1125		end = (unsigned long)ei + item_size;
1126		if (ptr + size < end)
1127			memmove_extent_buffer(leaf, ptr, ptr + size,
1128					      end - ptr - size);
1129		item_size -= size;
1130		btrfs_truncate_item(path, item_size, 1);
1131	}
1132	btrfs_mark_buffer_dirty(leaf);
1133}
1134
1135static noinline_for_stack
1136int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1137				 struct btrfs_path *path,
1138				 u64 bytenr, u64 num_bytes, u64 parent,
1139				 u64 root_objectid, u64 owner,
1140				 u64 offset, int refs_to_add,
1141				 struct btrfs_delayed_extent_op *extent_op)
1142{
1143	struct btrfs_extent_inline_ref *iref;
1144	int ret;
1145
1146	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1147					   num_bytes, parent, root_objectid,
1148					   owner, offset, 1);
1149	if (ret == 0) {
1150		/*
1151		 * We're adding refs to a tree block we already own; this
1152		 * should not happen at all.
1153		 */
1154		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1155			btrfs_crit(trans->fs_info,
1156"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1157				   bytenr, num_bytes, root_objectid);
1158			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1159				WARN_ON(1);
1160				btrfs_crit(trans->fs_info,
1161			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1162				btrfs_print_leaf(path->nodes[0]);
1163			}
1164			return -EUCLEAN;
1165		}
1166		update_inline_extent_backref(path, iref, refs_to_add,
1167					     extent_op, NULL);
1168	} else if (ret == -ENOENT) {
1169		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1170					    root_objectid, owner, offset,
1171					    refs_to_add, extent_op);
1172		ret = 0;
1173	}
1174	return ret;
1175}
1176
1177static int remove_extent_backref(struct btrfs_trans_handle *trans,
1178				 struct btrfs_path *path,
1179				 struct btrfs_extent_inline_ref *iref,
1180				 int refs_to_drop, int is_data, int *last_ref)
1181{
1182	int ret = 0;
1183
1184	BUG_ON(!is_data && refs_to_drop != 1);
1185	if (iref) {
1186		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1187					     last_ref);
1188	} else if (is_data) {
1189		ret = remove_extent_data_ref(trans, path, refs_to_drop,
1190					     last_ref);
1191	} else {
1192		*last_ref = 1;
1193		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1194	}
1195	return ret;
1196}
1197
1198static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1199			       u64 *discarded_bytes)
1200{
1201	int j, ret = 0;
1202	u64 bytes_left, end;
1203	u64 aligned_start = ALIGN(start, 1 << 9);
1204
1205	/* Adjust the range to be aligned to 512B sectors if necessary. */
1206	if (start != aligned_start) {
1207		len -= aligned_start - start;
1208		len = round_down(len, 1 << 9);
1209		start = aligned_start;
1210	}
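	/*
	 * Worked example (illustrative, not from the original source):
	 * start = 1000, len = 5000 gives aligned_start = 1024, so len becomes
	 * 4976 and is rounded down to 4608 (9 * 512); the unaligned head and
	 * tail of the range are simply not discarded.
	 */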
1211
1212	*discarded_bytes = 0;
1213
1214	if (!len)
1215		return 0;
1216
1217	end = start + len;
1218	bytes_left = len;
1219
1220	/* Skip any superblocks on this device. */
1221	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1222		u64 sb_start = btrfs_sb_offset(j);
1223		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1224		u64 size = sb_start - start;
1225
1226		if (!in_range(sb_start, start, bytes_left) &&
1227		    !in_range(sb_end, start, bytes_left) &&
1228		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1229			continue;
1230
1231		/*
1232		 * Superblock spans beginning of range.  Adjust start and
1233		 * try again.
1234		 */
1235		if (sb_start <= start) {
1236			start += sb_end - start;
1237			if (start > end) {
1238				bytes_left = 0;
1239				break;
1240			}
1241			bytes_left = end - start;
1242			continue;
1243		}
1244
1245		if (size) {
1246			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1247						   GFP_NOFS, 0);
1248			if (!ret)
1249				*discarded_bytes += size;
1250			else if (ret != -EOPNOTSUPP)
1251				return ret;
1252		}
1253
1254		start = sb_end;
1255		if (start > end) {
1256			bytes_left = 0;
1257			break;
1258		}
1259		bytes_left = end - start;
1260	}
1261
1262	if (bytes_left) {
1263		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1264					   GFP_NOFS, 0);
1265		if (!ret)
1266			*discarded_bytes += bytes_left;
1267	}
1268	return ret;
1269}
1270
1271int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1272			 u64 num_bytes, u64 *actual_bytes)
1273{
1274	int ret = 0;
1275	u64 discarded_bytes = 0;
1276	u64 end = bytenr + num_bytes;
1277	u64 cur = bytenr;
1278	struct btrfs_bio *bbio = NULL;
1279
1280
1281	/*
1282	 * Avoid races with device replace and make sure our bbio has devices
1283	 * associated with its stripes that don't go away while we are discarding.
1284	 */
1285	btrfs_bio_counter_inc_blocked(fs_info);
1286	while (cur < end) {
1287		struct btrfs_bio_stripe *stripe;
1288		int i;
1289
1290		num_bytes = end - cur;
1291		/* Tell the block device(s) that the sectors can be discarded */
1292		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1293				      &num_bytes, &bbio, 0);
1294		/*
1295		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1296		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1297		 * thus we can't continue anyway.
1298		 */
1299		if (ret < 0)
1300			goto out;
1301
1302		stripe = bbio->stripes;
1303		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1304			u64 bytes;
1305			struct request_queue *req_q;
1306			struct btrfs_device *device = stripe->dev;
1307
1308			if (!device->bdev) {
1309				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1310				continue;
1311			}
1312			req_q = bdev_get_queue(device->bdev);
1313			if (!blk_queue_discard(req_q))
1314				continue;
1315
1316			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1317				continue;
1318
1319			ret = btrfs_issue_discard(device->bdev,
1320						  stripe->physical,
1321						  stripe->length,
1322						  &bytes);
1323			if (!ret) {
1324				discarded_bytes += bytes;
1325			} else if (ret != -EOPNOTSUPP) {
1326				/*
1327				 * Logic errors or -ENOMEM, or -EIO, but
1328				 * unlikely to happen.
1329				 *
1330				 * And since there are two loops, explicitly
1331				 * go to out to avoid confusion.
1332				 */
1333				btrfs_put_bbio(bbio);
1334				goto out;
1335			}
1336
1337			/*
1338			 * Just in case we get back EOPNOTSUPP for some reason,
1339			 * just ignore the return value so we don't screw up
1340			 * people calling discard_extent.
1341			 */
1342			ret = 0;
1343		}
1344		btrfs_put_bbio(bbio);
1345		cur += num_bytes;
1346	}
1347out:
1348	btrfs_bio_counter_dec(fs_info);
1349
1350	if (actual_bytes)
1351		*actual_bytes = discarded_bytes;
1352
1353
1354	if (ret == -EOPNOTSUPP)
1355		ret = 0;
1356	return ret;
1357}
1358
1359/* Can return -ENOMEM */
1360int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1361			 struct btrfs_ref *generic_ref)
1362{
1363	struct btrfs_fs_info *fs_info = trans->fs_info;
1364	int ret;
1365
1366	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1367	       generic_ref->action);
1368	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1369	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1370
1371	if (generic_ref->type == BTRFS_REF_METADATA)
1372		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1373	else
1374		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1375
1376	btrfs_ref_tree_mod(fs_info, generic_ref);
1377
1378	return ret;
1379}
1380
1381/*
1382 * __btrfs_inc_extent_ref - insert backreference for a given extent
1383 *
1384 * The counterpart is in __btrfs_free_extent(), with examples and more
1385 * details on how it works.
1386 *
1387 * @trans:	    Handle of transaction
1388 *
1389 * @node:	    The delayed ref node used to get the bytenr/length for
1390 *		    extent whose references are incremented.
1391 *
1392 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1393 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1394 *		    bytenr of the parent block. Since new extents are always
1395 *		    created with indirect references, this will only be the case
1396 *		    when relocating a shared extent. In that case, root_objectid
1397 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1398 *		    be 0
1399 *
1400 * @root_objectid:  The id of the root where this modification has originated,
1401 *		    this can be either one of the well-known metadata trees or
1402 *		    the subvolume id which references this extent.
1403 *
1404 * @owner:	    For data extents it is the inode number of the owning file.
1405 *		    For metadata extents this parameter holds the level in the
1406 *		    tree of the extent.
1407 *
1408 * @offset:	    For metadata extents the offset is ignored and is currently
1409 *		    always passed as 0. For data extents it is the fileoffset
1410 *		    this extent belongs to.
1411 *
1412 * @refs_to_add:    Number of references to add
1413 *
1414 * @extent_op:      Pointer to a structure holding information necessary when
1415 *                  updating a tree block's flags
1416 *
1417 */
1418static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1419				  struct btrfs_delayed_ref_node *node,
1420				  u64 parent, u64 root_objectid,
1421				  u64 owner, u64 offset, int refs_to_add,
1422				  struct btrfs_delayed_extent_op *extent_op)
1423{
1424	struct btrfs_path *path;
1425	struct extent_buffer *leaf;
1426	struct btrfs_extent_item *item;
1427	struct btrfs_key key;
1428	u64 bytenr = node->bytenr;
1429	u64 num_bytes = node->num_bytes;
1430	u64 refs;
1431	int ret;
1432
1433	path = btrfs_alloc_path();
1434	if (!path)
1435		return -ENOMEM;
1436
1437	path->leave_spinning = 1;
1438	/* this will setup the path even if it fails to insert the back ref */
1439	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1440					   parent, root_objectid, owner,
1441					   offset, refs_to_add, extent_op);
1442	if ((ret < 0 && ret != -EAGAIN) || !ret)
1443		goto out;
1444
1445	/*
1446	 * Ok we had -EAGAIN which means we didn't have space to insert an
1447	 * inline extent ref, so just update the reference count and add a
1448	 * normal backref.
1449	 */
1450	leaf = path->nodes[0];
1451	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1452	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1453	refs = btrfs_extent_refs(leaf, item);
1454	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1455	if (extent_op)
1456		__run_delayed_extent_op(extent_op, leaf, item);
1457
1458	btrfs_mark_buffer_dirty(leaf);
1459	btrfs_release_path(path);
1460
1461	path->leave_spinning = 1;
1462	/* now insert the actual backref */
1463	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1464		BUG_ON(refs_to_add != 1);
1465		ret = insert_tree_block_ref(trans, path, bytenr, parent,
1466					    root_objectid);
1467	} else {
1468		ret = insert_extent_data_ref(trans, path, bytenr, parent,
1469					     root_objectid, owner, offset,
1470					     refs_to_add);
1471	}
1472	if (ret)
1473		btrfs_abort_transaction(trans, ret);
1474out:
1475	btrfs_free_path(path);
1476	return ret;
1477}
1478
1479static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1480				struct btrfs_delayed_ref_node *node,
1481				struct btrfs_delayed_extent_op *extent_op,
1482				int insert_reserved)
1483{
1484	int ret = 0;
1485	struct btrfs_delayed_data_ref *ref;
1486	struct btrfs_key ins;
1487	u64 parent = 0;
1488	u64 ref_root = 0;
1489	u64 flags = 0;
1490
1491	ins.objectid = node->bytenr;
1492	ins.offset = node->num_bytes;
1493	ins.type = BTRFS_EXTENT_ITEM_KEY;
1494
1495	ref = btrfs_delayed_node_to_data_ref(node);
1496	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1497
1498	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1499		parent = ref->parent;
1500	ref_root = ref->root;
1501
1502	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1503		if (extent_op)
1504			flags |= extent_op->flags_to_set;
1505		ret = alloc_reserved_file_extent(trans, parent, ref_root,
1506						 flags, ref->objectid,
1507						 ref->offset, &ins,
1508						 node->ref_mod);
1509	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1510		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1511					     ref->objectid, ref->offset,
1512					     node->ref_mod, extent_op);
1513	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1514		ret = __btrfs_free_extent(trans, node, parent,
1515					  ref_root, ref->objectid,
1516					  ref->offset, node->ref_mod,
1517					  extent_op);
1518	} else {
1519		BUG();
1520	}
1521	return ret;
1522}
1523
1524static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1525				    struct extent_buffer *leaf,
1526				    struct btrfs_extent_item *ei)
1527{
1528	u64 flags = btrfs_extent_flags(leaf, ei);
1529	if (extent_op->update_flags) {
1530		flags |= extent_op->flags_to_set;
1531		btrfs_set_extent_flags(leaf, ei, flags);
1532	}
1533
1534	if (extent_op->update_key) {
1535		struct btrfs_tree_block_info *bi;
1536		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1537		bi = (struct btrfs_tree_block_info *)(ei + 1);
1538		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1539	}
1540}
1541
1542static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1543				 struct btrfs_delayed_ref_head *head,
1544				 struct btrfs_delayed_extent_op *extent_op)
1545{
1546	struct btrfs_fs_info *fs_info = trans->fs_info;
1547	struct btrfs_key key;
1548	struct btrfs_path *path;
1549	struct btrfs_extent_item *ei;
1550	struct extent_buffer *leaf;
1551	u32 item_size;
1552	int ret;
1553	int err = 0;
1554	int metadata = !extent_op->is_data;
1555
1556	if (TRANS_ABORTED(trans))
1557		return 0;
1558
1559	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1560		metadata = 0;
1561
1562	path = btrfs_alloc_path();
1563	if (!path)
1564		return -ENOMEM;
1565
1566	key.objectid = head->bytenr;
1567
1568	if (metadata) {
1569		key.type = BTRFS_METADATA_ITEM_KEY;
1570		key.offset = extent_op->level;
1571	} else {
1572		key.type = BTRFS_EXTENT_ITEM_KEY;
1573		key.offset = head->num_bytes;
1574	}
1575
1576again:
1577	path->leave_spinning = 1;
1578	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
1579	if (ret < 0) {
1580		err = ret;
1581		goto out;
1582	}
1583	if (ret > 0) {
1584		if (metadata) {
1585			if (path->slots[0] > 0) {
1586				path->slots[0]--;
1587				btrfs_item_key_to_cpu(path->nodes[0], &key,
1588						      path->slots[0]);
1589				if (key.objectid == head->bytenr &&
1590				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1591				    key.offset == head->num_bytes)
1592					ret = 0;
1593			}
1594			if (ret > 0) {
1595				btrfs_release_path(path);
1596				metadata = 0;
1597
1598				key.objectid = head->bytenr;
1599				key.offset = head->num_bytes;
1600				key.type = BTRFS_EXTENT_ITEM_KEY;
1601				goto again;
1602			}
1603		} else {
1604			err = -EIO;
1605			goto out;
1606		}
1607	}
1608
1609	leaf = path->nodes[0];
1610	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1611
1612	if (unlikely(item_size < sizeof(*ei))) {
1613		err = -EINVAL;
1614		btrfs_print_v0_err(fs_info);
1615		btrfs_abort_transaction(trans, err);
1616		goto out;
1617	}
1618
1619	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1620	__run_delayed_extent_op(extent_op, leaf, ei);
1621
1622	btrfs_mark_buffer_dirty(leaf);
1623out:
1624	btrfs_free_path(path);
1625	return err;
1626}
1627
1628static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1629				struct btrfs_delayed_ref_node *node,
1630				struct btrfs_delayed_extent_op *extent_op,
1631				int insert_reserved)
1632{
1633	int ret = 0;
1634	struct btrfs_delayed_tree_ref *ref;
1635	u64 parent = 0;
1636	u64 ref_root = 0;
1637
1638	ref = btrfs_delayed_node_to_tree_ref(node);
1639	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1640
1641	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1642		parent = ref->parent;
1643	ref_root = ref->root;
1644
1645	if (unlikely(node->ref_mod != 1)) {
1646		btrfs_err(trans->fs_info,
1647	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1648			  node->bytenr, node->ref_mod, node->action, ref_root,
1649			  parent);
1650		return -EUCLEAN;
1651	}
1652	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1653		BUG_ON(!extent_op || !extent_op->update_flags);
1654		ret = alloc_reserved_tree_block(trans, node, extent_op);
1655	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1656		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1657					     ref->level, 0, 1, extent_op);
1658	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1659		ret = __btrfs_free_extent(trans, node, parent, ref_root,
1660					  ref->level, 0, 1, extent_op);
1661	} else {
1662		BUG();
1663	}
1664	return ret;
1665}
1666
1667/* helper function to actually process a single delayed ref entry */
1668static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1669			       struct btrfs_delayed_ref_node *node,
1670			       struct btrfs_delayed_extent_op *extent_op,
1671			       int insert_reserved)
1672{
1673	int ret = 0;
1674
1675	if (TRANS_ABORTED(trans)) {
1676		if (insert_reserved)
1677			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1678		return 0;
1679	}
1680
1681	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1682	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1683		ret = run_delayed_tree_ref(trans, node, extent_op,
1684					   insert_reserved);
1685	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1686		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1687		ret = run_delayed_data_ref(trans, node, extent_op,
1688					   insert_reserved);
1689	else
1690		BUG();
1691	if (ret && insert_reserved)
1692		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1693	if (ret < 0)
1694		btrfs_err(trans->fs_info,
1695"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1696			  node->bytenr, node->num_bytes, node->type,
1697			  node->action, node->ref_mod, ret);
1698	return ret;
1699}
1700
1701static inline struct btrfs_delayed_ref_node *
1702select_delayed_ref(struct btrfs_delayed_ref_head *head)
1703{
1704	struct btrfs_delayed_ref_node *ref;
1705
1706	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1707		return NULL;
1708
1709	/*
1710	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1711	 * This is to prevent a ref count from going down to zero, which deletes
1712	 * the extent item from the extent tree, when there still are references
1713	 * to add, which would fail because they would not find the extent item.
1714	 */
1715	if (!list_empty(&head->ref_add_list))
1716		return list_first_entry(&head->ref_add_list,
1717				struct btrfs_delayed_ref_node, add_list);
1718
1719	ref = rb_entry(rb_first_cached(&head->ref_tree),
1720		       struct btrfs_delayed_ref_node, ref_node);
1721	ASSERT(list_empty(&ref->add_list));
1722	return ref;
1723}
1724
1725static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1726				      struct btrfs_delayed_ref_head *head)
1727{
1728	spin_lock(&delayed_refs->lock);
1729	head->processing = 0;
1730	delayed_refs->num_heads_ready++;
1731	spin_unlock(&delayed_refs->lock);
1732	btrfs_delayed_ref_unlock(head);
1733}
1734
1735static struct btrfs_delayed_extent_op *cleanup_extent_op(
1736				struct btrfs_delayed_ref_head *head)
1737{
1738	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1739
1740	if (!extent_op)
1741		return NULL;
1742
1743	if (head->must_insert_reserved) {
1744		head->extent_op = NULL;
1745		btrfs_free_delayed_extent_op(extent_op);
1746		return NULL;
1747	}
1748	return extent_op;
1749}
1750
1751static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1752				     struct btrfs_delayed_ref_head *head)
1753{
1754	struct btrfs_delayed_extent_op *extent_op;
1755	int ret;
1756
1757	extent_op = cleanup_extent_op(head);
1758	if (!extent_op)
1759		return 0;
1760	head->extent_op = NULL;
1761	spin_unlock(&head->lock);
1762	ret = run_delayed_extent_op(trans, head, extent_op);
1763	btrfs_free_delayed_extent_op(extent_op);
1764	return ret ? ret : 1;
1765}
1766
1767void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1768				  struct btrfs_delayed_ref_root *delayed_refs,
1769				  struct btrfs_delayed_ref_head *head)
1770{
1771	int nr_items = 1;	/* Dropping this ref head update. */
1772
1773	/*
1774	 * We had csum deletions accounted for in our delayed refs rsv, so we need
1775	 * to drop the csum leaves for this update from our delayed_refs_rsv.
1776	 */
1777	if (head->total_ref_mod < 0 && head->is_data) {
1778		spin_lock(&delayed_refs->lock);
1779		delayed_refs->pending_csums -= head->num_bytes;
1780		spin_unlock(&delayed_refs->lock);
1781		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1782	}
1783
1784	/*
1785	 * We were dropping refs, or had a new ref and dropped it, and thus must
1786	 * adjust down our total_bytes_pinned; the space may or may not have
1787	 * been pinned and so is accounted for properly in the pinned space by
1788	 * now.
1789	 */
1790	if (head->total_ref_mod < 0 ||
1791	    (head->total_ref_mod == 0 && head->must_insert_reserved)) {
1792		u64 flags = btrfs_ref_head_to_space_flags(head);
1793
1794		btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
1795	}
1796
1797	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1798}
1799
1800static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1801			    struct btrfs_delayed_ref_head *head)
1802{
1803
1804	struct btrfs_fs_info *fs_info = trans->fs_info;
1805	struct btrfs_delayed_ref_root *delayed_refs;
1806	int ret;
1807
1808	delayed_refs = &trans->transaction->delayed_refs;
1809
1810	ret = run_and_cleanup_extent_op(trans, head);
1811	if (ret < 0) {
1812		unselect_delayed_ref_head(delayed_refs, head);
1813		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1814		return ret;
1815	} else if (ret) {
1816		return ret;
1817	}
1818
1819	/*
1820	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1821	 * and then re-check to make sure nobody got added.
1822	 */
1823	spin_unlock(&head->lock);
1824	spin_lock(&delayed_refs->lock);
1825	spin_lock(&head->lock);
1826	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1827		spin_unlock(&head->lock);
1828		spin_unlock(&delayed_refs->lock);
1829		return 1;
1830	}
1831	btrfs_delete_ref_head(delayed_refs, head);
1832	spin_unlock(&head->lock);
1833	spin_unlock(&delayed_refs->lock);
1834
1835	if (head->must_insert_reserved) {
1836		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1837		if (head->is_data) {
1838			ret = btrfs_del_csums(trans, fs_info->csum_root,
1839					      head->bytenr, head->num_bytes);
1840		}
1841	}
1842
1843	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1844
1845	trace_run_delayed_ref_head(fs_info, head, 0);
1846	btrfs_delayed_ref_unlock(head);
1847	btrfs_put_delayed_ref_head(head);
1848	return ret;
1849}
1850
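/*
 * Select the next delayed ref head to process and lock it.  Returns NULL when
 * there is nothing left to do, or ERR_PTR(-EAGAIN) if the head was freed
 * while we waited for its mutex and the caller should pick another one.
 */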
1851static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1852					struct btrfs_trans_handle *trans)
1853{
1854	struct btrfs_delayed_ref_root *delayed_refs =
1855		&trans->transaction->delayed_refs;
1856	struct btrfs_delayed_ref_head *head = NULL;
1857	int ret;
1858
1859	spin_lock(&delayed_refs->lock);
1860	head = btrfs_select_ref_head(delayed_refs);
1861	if (!head) {
1862		spin_unlock(&delayed_refs->lock);
1863		return head;
1864	}
1865
1866	/*
1867	 * Grab the lock that says we are going to process all the refs for
1868	 * this head
1869	 */
1870	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1871	spin_unlock(&delayed_refs->lock);
1872
1873	/*
1874	 * We may have dropped the spin lock to get the head mutex lock, and
1875	 * that might have given someone else time to free the head.  If that's
1876	 * true, it has been removed from our list and we can move on.
1877	 */
1878	if (ret == -EAGAIN)
1879		head = ERR_PTR(-EAGAIN);
1880
1881	return head;
1882}
1883
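/*
 * Run all refs currently queued on a locked head, one at a time, re-merging
 * the remaining refs after each one because the head's lock is dropped while
 * a ref is being run.  Called with both the head's mutex and spinlock held;
 * returns -EAGAIN if a ref is blocked by a pending tree mod log sequence.
 */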
1884static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1885				    struct btrfs_delayed_ref_head *locked_ref,
1886				    unsigned long *run_refs)
1887{
1888	struct btrfs_fs_info *fs_info = trans->fs_info;
1889	struct btrfs_delayed_ref_root *delayed_refs;
1890	struct btrfs_delayed_extent_op *extent_op;
1891	struct btrfs_delayed_ref_node *ref;
1892	int must_insert_reserved = 0;
1893	int ret;
1894
1895	delayed_refs = &trans->transaction->delayed_refs;
1896
1897	lockdep_assert_held(&locked_ref->mutex);
1898	lockdep_assert_held(&locked_ref->lock);
1899
1900	while ((ref = select_delayed_ref(locked_ref))) {
1901		if (ref->seq &&
1902		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1903			spin_unlock(&locked_ref->lock);
1904			unselect_delayed_ref_head(delayed_refs, locked_ref);
1905			return -EAGAIN;
1906		}
1907
1908		(*run_refs)++;
1909		ref->in_tree = 0;
1910		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1911		RB_CLEAR_NODE(&ref->ref_node);
1912		if (!list_empty(&ref->add_list))
1913			list_del(&ref->add_list);
1914		/*
1915		 * When we play the delayed ref, also correct the ref_mod on
1916		 * the head.
1917		 */
1918		switch (ref->action) {
1919		case BTRFS_ADD_DELAYED_REF:
1920		case BTRFS_ADD_DELAYED_EXTENT:
1921			locked_ref->ref_mod -= ref->ref_mod;
1922			break;
1923		case BTRFS_DROP_DELAYED_REF:
1924			locked_ref->ref_mod += ref->ref_mod;
1925			break;
1926		default:
1927			WARN_ON(1);
1928		}
1929		atomic_dec(&delayed_refs->num_entries);
1930
1931		/*
1932		 * Record the must_insert_reserved flag before we drop the
1933		 * spin lock.
1934		 */
1935		must_insert_reserved = locked_ref->must_insert_reserved;
1936		locked_ref->must_insert_reserved = 0;
1937
1938		extent_op = locked_ref->extent_op;
1939		locked_ref->extent_op = NULL;
1940		spin_unlock(&locked_ref->lock);
1941
1942		ret = run_one_delayed_ref(trans, ref, extent_op,
1943					  must_insert_reserved);
1944
1945		btrfs_free_delayed_extent_op(extent_op);
1946		if (ret) {
1947			unselect_delayed_ref_head(delayed_refs, locked_ref);
1948			btrfs_put_delayed_ref(ref);
1949			return ret;
1950		}
1951
1952		btrfs_put_delayed_ref(ref);
1953		cond_resched();
1954
1955		spin_lock(&locked_ref->lock);
1956		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1957	}
1958
1959	return 0;
1960}
1961
1962/*
1963 * Returns 0 on success or if called with an already aborted transaction.
1964 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1965 */
1966static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1967					     unsigned long nr)
1968{
1969	struct btrfs_fs_info *fs_info = trans->fs_info;
1970	struct btrfs_delayed_ref_root *delayed_refs;
1971	struct btrfs_delayed_ref_head *locked_ref = NULL;
1972	ktime_t start = ktime_get();
1973	int ret;
1974	unsigned long count = 0;
1975	unsigned long actual_count = 0;
1976
1977	delayed_refs = &trans->transaction->delayed_refs;
1978	do {
1979		if (!locked_ref) {
1980			locked_ref = btrfs_obtain_ref_head(trans);
1981			if (IS_ERR_OR_NULL(locked_ref)) {
1982				if (PTR_ERR(locked_ref) == -EAGAIN) {
1983					continue;
1984				} else {
1985					break;
1986				}
1987			}
1988			count++;
1989		}
1990		/*
1991		 * We need to try and merge add/drops of the same ref since we
1992		 * can run into issues with relocate dropping the implicit ref
1993		 * and then it being added back again before the drop can
1994		 * finish.  If we merged anything we need to re-loop so we can
1995		 * get a good ref.
1996		 * Or we can get node references of the same type that weren't
1997		 * merged when created due to bumps in the tree mod seq, and
1998		 * we need to merge them to prevent adding an inline extent
1999		 * backref before dropping it (triggering a BUG_ON at
2000		 * insert_inline_extent_backref()).
2001		 */
2002		spin_lock(&locked_ref->lock);
2003		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2004
2005		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2006						      &actual_count);
2007		if (ret < 0 && ret != -EAGAIN) {
2008			/*
2009			 * Error, btrfs_run_delayed_refs_for_head already
2010			 * unlocked everything so just bail out
2011			 */
2012			return ret;
2013		} else if (!ret) {
2014			/*
2015			 * Success, perform the usual cleanup of a processed
2016			 * head
2017			 */
2018			ret = cleanup_ref_head(trans, locked_ref);
2019			if (ret > 0) {
2020				/* We dropped our lock, we need to loop. */
2021				ret = 0;
2022				continue;
2023			} else if (ret) {
2024				return ret;
2025			}
2026		}
2027
2028		/*
2029		 * Either success case or btrfs_run_delayed_refs_for_head
2030		 * returned -EAGAIN, meaning we need to select another head
2031		 */
2032
2033		locked_ref = NULL;
2034		cond_resched();
2035	} while ((nr != -1 && count < nr) || locked_ref);
2036
2037	/*
2038	 * We don't want to include ref heads since we can have empty ref heads
2039	 * and those will drastically skew our runtime down since we just do
2040	 * accounting, no actual extent tree updates.
2041	 */
2042	if (actual_count > 0) {
2043		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2044		u64 avg;
2045
2046		/*
2047		 * We weigh the current average higher than our current runtime
2048		 * to avoid large swings in the average.
2049		 */
2050		spin_lock(&delayed_refs->lock);
2051		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2052		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
2053		spin_unlock(&delayed_refs->lock);
2054	}
2055	return 0;
2056}
2057
2058#ifdef SCRAMBLE_DELAYED_REFS
2059/*
2060 * Normally delayed refs get processed in ascending bytenr order. This
2061 * correlates in most cases to the order added. To expose dependencies on this
2062 * order, we start to process the tree in the middle instead of the beginning
2063 */
2064static u64 find_middle(struct rb_root *root)
2065{
2066	struct rb_node *n = root->rb_node;
2067	struct btrfs_delayed_ref_node *entry;
2068	int alt = 1;
2069	u64 middle;
2070	u64 first = 0, last = 0;
2071
2072	n = rb_first(root);
2073	if (n) {
2074		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2075		first = entry->bytenr;
2076	}
2077	n = rb_last(root);
2078	if (n) {
2079		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2080		last = entry->bytenr;
2081	}
2082	n = root->rb_node;
2083
2084	while (n) {
2085		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2086		WARN_ON(!entry->in_tree);
2087
2088		middle = entry->bytenr;
2089
2090		if (alt)
2091			n = n->rb_left;
2092		else
2093			n = n->rb_right;
2094
2095		alt = 1 - alt;
2096	}
2097	return middle;
2098}
2099#endif
2100
2101/*
2102 * Takes the number of bytes to be checksummed and figures out how many leaves it
2103 * would require to store the csums for that many bytes.
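 *
 * Rough example (assuming a 4 KiB sectorsize and 4-byte crc32c checksums):
 * 1 GiB of data needs 262144 checksums, and a single max-sized leaf item
 * holds on the order of a thousand of them or more, so at most a few
 * hundred leaves are needed.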
2104 */
2105u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2106{
2107	u64 csum_size;
2108	u64 num_csums_per_leaf;
2109	u64 num_csums;
2110
2111	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2112	num_csums_per_leaf = div64_u64(csum_size,
2113			(u64)btrfs_super_csum_size(fs_info->super_copy));
2114	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2115	num_csums += num_csums_per_leaf - 1;
2116	num_csums = div64_u64(num_csums, num_csums_per_leaf);
2117	return num_csums;
2118}
2119
2120/*
2121 * this starts processing the delayed reference count updates and
2122 * extent insertions we have queued up so far.  count can be
2123 * 0, which means to process everything in the tree at the start
2124 * of the run (but not newly added entries), or it can be some target
2125 * number you'd like to process.
2126 *
2127 * Returns 0 on success or if called with an aborted transaction
2128 * Returns <0 on error and aborts the transaction
2129 */
2130int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2131			   unsigned long count)
2132{
2133	struct btrfs_fs_info *fs_info = trans->fs_info;
2134	struct rb_node *node;
2135	struct btrfs_delayed_ref_root *delayed_refs;
2136	struct btrfs_delayed_ref_head *head;
2137	int ret;
2138	int run_all = count == (unsigned long)-1;
2139
2140	/* We'll clean this up in btrfs_cleanup_transaction */
2141	if (TRANS_ABORTED(trans))
2142		return 0;
2143
2144	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2145		return 0;
2146
2147	delayed_refs = &trans->transaction->delayed_refs;
2148	if (count == 0)
2149		count = atomic_read(&delayed_refs->num_entries) * 2;
2150
2151again:
2152#ifdef SCRAMBLE_DELAYED_REFS
2153	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2154#endif
2155	ret = __btrfs_run_delayed_refs(trans, count);
2156	if (ret < 0) {
2157		btrfs_abort_transaction(trans, ret);
2158		return ret;
2159	}
2160
2161	if (run_all) {
2162		btrfs_create_pending_block_groups(trans);
2163
2164		spin_lock(&delayed_refs->lock);
2165		node = rb_first_cached(&delayed_refs->href_root);
2166		if (!node) {
2167			spin_unlock(&delayed_refs->lock);
2168			goto out;
2169		}
2170		head = rb_entry(node, struct btrfs_delayed_ref_head,
2171				href_node);
2172		refcount_inc(&head->refs);
2173		spin_unlock(&delayed_refs->lock);
2174
2175		/* Mutex was contended, block until it's released and retry. */
2176		mutex_lock(&head->mutex);
2177		mutex_unlock(&head->mutex);
2178
2179		btrfs_put_delayed_ref_head(head);
2180		cond_resched();
2181		goto again;
2182	}
2183out:
2184	return 0;
2185}
2186
2187int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2188				struct extent_buffer *eb, u64 flags,
2189				int level, int is_data)
2190{
2191	struct btrfs_delayed_extent_op *extent_op;
2192	int ret;
2193
2194	extent_op = btrfs_alloc_delayed_extent_op();
2195	if (!extent_op)
2196		return -ENOMEM;
2197
2198	extent_op->flags_to_set = flags;
2199	extent_op->update_flags = true;
2200	extent_op->update_key = false;
2201	extent_op->is_data = is_data ? true : false;
2202	extent_op->level = level;
2203
2204	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2205	if (ret)
2206		btrfs_free_delayed_extent_op(extent_op);
2207	return ret;
2208}
2209
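/*
 * Look through the delayed refs of the running transaction for a reference to
 * @bytenr held by anything other than (root, objectid, offset).  Returns 1 if
 * such a cross reference exists, 0 if not, or -EAGAIN when the head's mutex
 * was contended and the caller should retry.
 */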
2210static noinline int check_delayed_ref(struct btrfs_root *root,
2211				      struct btrfs_path *path,
2212				      u64 objectid, u64 offset, u64 bytenr)
2213{
2214	struct btrfs_delayed_ref_head *head;
2215	struct btrfs_delayed_ref_node *ref;
2216	struct btrfs_delayed_data_ref *data_ref;
2217	struct btrfs_delayed_ref_root *delayed_refs;
2218	struct btrfs_transaction *cur_trans;
2219	struct rb_node *node;
2220	int ret = 0;
2221
2222	spin_lock(&root->fs_info->trans_lock);
2223	cur_trans = root->fs_info->running_transaction;
2224	if (cur_trans)
2225		refcount_inc(&cur_trans->use_count);
2226	spin_unlock(&root->fs_info->trans_lock);
2227	if (!cur_trans)
2228		return 0;
2229
2230	delayed_refs = &cur_trans->delayed_refs;
2231	spin_lock(&delayed_refs->lock);
2232	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2233	if (!head) {
2234		spin_unlock(&delayed_refs->lock);
2235		btrfs_put_transaction(cur_trans);
2236		return 0;
2237	}
2238
2239	if (!mutex_trylock(&head->mutex)) {
2240		refcount_inc(&head->refs);
2241		spin_unlock(&delayed_refs->lock);
2242
2243		btrfs_release_path(path);
2244
2245		/*
2246		 * Mutex was contended, block until it's released and let
2247		 * caller try again
2248		 */
2249		mutex_lock(&head->mutex);
2250		mutex_unlock(&head->mutex);
2251		btrfs_put_delayed_ref_head(head);
2252		btrfs_put_transaction(cur_trans);
2253		return -EAGAIN;
2254	}
2255	spin_unlock(&delayed_refs->lock);
2256
2257	spin_lock(&head->lock);
2258	/*
2259	 * XXX: We should replace this with a proper search function in the
2260	 * future.
2261	 */
2262	for (node = rb_first_cached(&head->ref_tree); node;
2263	     node = rb_next(node)) {
2264		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2265		/* If it's a shared ref we know a cross reference exists */
2266		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2267			ret = 1;
2268			break;
2269		}
2270
2271		data_ref = btrfs_delayed_node_to_data_ref(ref);
2272
2273		/*
2274		 * If our ref doesn't match the one we're currently looking at
2275		 * then we have a cross reference.
2276		 */
2277		if (data_ref->root != root->root_key.objectid ||
2278		    data_ref->objectid != objectid ||
2279		    data_ref->offset != offset) {
2280			ret = 1;
2281			break;
2282		}
2283	}
2284	spin_unlock(&head->lock);
2285	mutex_unlock(&head->mutex);
2286	btrfs_put_transaction(cur_trans);
2287	return ret;
2288}
2289
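/*
 * Check the committed extent tree for references to @bytenr other than a
 * single inline data ref owned by (root, objectid, offset).  Returns 0 when
 * the extent is exclusively owned, 1 when it is (or may be) shared, -ENOENT
 * when no matching extent item is found, or another negative errno.
 */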
2290static noinline int check_committed_ref(struct btrfs_root *root,
2291					struct btrfs_path *path,
2292					u64 objectid, u64 offset, u64 bytenr,
2293					bool strict)
2294{
2295	struct btrfs_fs_info *fs_info = root->fs_info;
2296	struct btrfs_root *extent_root = fs_info->extent_root;
2297	struct extent_buffer *leaf;
2298	struct btrfs_extent_data_ref *ref;
2299	struct btrfs_extent_inline_ref *iref;
2300	struct btrfs_extent_item *ei;
2301	struct btrfs_key key;
2302	u32 item_size;
2303	int type;
2304	int ret;
2305
2306	key.objectid = bytenr;
2307	key.offset = (u64)-1;
2308	key.type = BTRFS_EXTENT_ITEM_KEY;
2309
2310	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2311	if (ret < 0)
2312		goto out;
2313	BUG_ON(ret == 0); /* Corruption */
2314
2315	ret = -ENOENT;
2316	if (path->slots[0] == 0)
2317		goto out;
2318
2319	path->slots[0]--;
2320	leaf = path->nodes[0];
2321	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2322
2323	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2324		goto out;
2325
2326	ret = 1;
2327	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2328	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2329
2330	/* If extent item has more than 1 inline ref then it's shared */
2331	if (item_size != sizeof(*ei) +
2332	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2333		goto out;
2334
2335	/*
2336	 * If extent created before last snapshot => it's shared unless the
2337	 * snapshot has been deleted. Use the heuristic if strict is false.
2338	 */
2339	if (!strict &&
2340	    (btrfs_extent_generation(leaf, ei) <=
2341	     btrfs_root_last_snapshot(&root->root_item)))
2342		goto out;
2343
2344	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2345
2346	/* If this extent has SHARED_DATA_REF then it's shared */
2347	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2348	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2349		goto out;
2350
2351	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2352	if (btrfs_extent_refs(leaf, ei) !=
2353	    btrfs_extent_data_ref_count(leaf, ref) ||
2354	    btrfs_extent_data_ref_root(leaf, ref) !=
2355	    root->root_key.objectid ||
2356	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2357	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2358		goto out;
2359
2360	ret = 0;
2361out:
2362	return ret;
2363}
2364
2365int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2366			  u64 bytenr, bool strict)
2367{
2368	struct btrfs_path *path;
2369	int ret;
2370
2371	path = btrfs_alloc_path();
2372	if (!path)
2373		return -ENOMEM;
2374
2375	do {
2376		ret = check_committed_ref(root, path, objectid,
2377					  offset, bytenr, strict);
2378		if (ret && ret != -ENOENT)
2379			goto out;
2380
2381		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2382	} while (ret == -EAGAIN);
2383
2384out:
2385	btrfs_free_path(path);
2386	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2387		WARN_ON(ret > 0);
2388	return ret;
2389}
2390
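/*
 * Add or drop (depending on @inc) one reference for every extent that the
 * given tree block points to: the file extents of a leaf or the child blocks
 * of a node.  @full_backref selects shared (parent based) backrefs.
 */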
2391static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2392			   struct btrfs_root *root,
2393			   struct extent_buffer *buf,
2394			   int full_backref, int inc)
2395{
2396	struct btrfs_fs_info *fs_info = root->fs_info;
2397	u64 bytenr;
2398	u64 num_bytes;
2399	u64 parent;
2400	u64 ref_root;
2401	u32 nritems;
2402	struct btrfs_key key;
2403	struct btrfs_file_extent_item *fi;
2404	struct btrfs_ref generic_ref = { 0 };
2405	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2406	int i;
2407	int action;
2408	int level;
2409	int ret = 0;
2410
2411	if (btrfs_is_testing(fs_info))
2412		return 0;
2413
2414	ref_root = btrfs_header_owner(buf);
2415	nritems = btrfs_header_nritems(buf);
2416	level = btrfs_header_level(buf);
2417
2418	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2419		return 0;
2420
2421	if (full_backref)
2422		parent = buf->start;
2423	else
2424		parent = 0;
2425	if (inc)
2426		action = BTRFS_ADD_DELAYED_REF;
2427	else
2428		action = BTRFS_DROP_DELAYED_REF;
2429
2430	for (i = 0; i < nritems; i++) {
2431		if (level == 0) {
2432			btrfs_item_key_to_cpu(buf, &key, i);
2433			if (key.type != BTRFS_EXTENT_DATA_KEY)
2434				continue;
2435			fi = btrfs_item_ptr(buf, i,
2436					    struct btrfs_file_extent_item);
2437			if (btrfs_file_extent_type(buf, fi) ==
2438			    BTRFS_FILE_EXTENT_INLINE)
2439				continue;
2440			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2441			if (bytenr == 0)
2442				continue;
2443
2444			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2445			key.offset -= btrfs_file_extent_offset(buf, fi);
2446			btrfs_init_generic_ref(&generic_ref, action, bytenr,
2447					       num_bytes, parent);
2448			generic_ref.real_root = root->root_key.objectid;
2449			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2450					    key.offset);
2451			generic_ref.skip_qgroup = for_reloc;
2452			if (inc)
2453				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2454			else
2455				ret = btrfs_free_extent(trans, &generic_ref);
2456			if (ret)
2457				goto fail;
2458		} else {
2459			bytenr = btrfs_node_blockptr(buf, i);
2460			num_bytes = fs_info->nodesize;
2461			btrfs_init_generic_ref(&generic_ref, action, bytenr,
2462					       num_bytes, parent);
2463			generic_ref.real_root = root->root_key.objectid;
2464			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2465			generic_ref.skip_qgroup = for_reloc;
2466			if (inc)
2467				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2468			else
2469				ret = btrfs_free_extent(trans, &generic_ref);
2470			if (ret)
2471				goto fail;
2472		}
2473	}
2474	return 0;
2475fail:
2476	return ret;
2477}
2478
2479int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2480		  struct extent_buffer *buf, int full_backref)
2481{
2482	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2483}
2484
2485int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2486		  struct extent_buffer *buf, int full_backref)
2487{
2488	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2489}
2490
2491int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
2492{
2493	struct btrfs_block_group *block_group;
2494	int readonly = 0;
2495
2496	block_group = btrfs_lookup_block_group(fs_info, bytenr);
2497	if (!block_group || block_group->ro)
2498		readonly = 1;
2499	if (block_group)
2500		btrfs_put_block_group(block_group);
2501	return readonly;
2502}
2503
2504static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2505{
2506	struct btrfs_fs_info *fs_info = root->fs_info;
2507	u64 flags;
2508	u64 ret;
2509
2510	if (data)
2511		flags = BTRFS_BLOCK_GROUP_DATA;
2512	else if (root == fs_info->chunk_root)
2513		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2514	else
2515		flags = BTRFS_BLOCK_GROUP_METADATA;
2516
2517	ret = btrfs_get_alloc_profile(fs_info, flags);
2518	return ret;
2519}
2520
2521static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2522{
2523	struct btrfs_block_group *cache;
2524	u64 bytenr;
2525
2526	spin_lock(&fs_info->block_group_cache_lock);
2527	bytenr = fs_info->first_logical_byte;
2528	spin_unlock(&fs_info->block_group_cache_lock);
2529
2530	if (bytenr < (u64)-1)
2531		return bytenr;
2532
2533	cache = btrfs_lookup_first_block_group(fs_info, search_start);
2534	if (!cache)
2535		return 0;
2536
2537	bytenr = cache->start;
2538	btrfs_put_block_group(cache);
2539
2540	return bytenr;
2541}
2542
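/*
 * Account an extent range as pinned in its block group and space_info and
 * mark it dirty in the transaction's pinned_extents tree.  When @reserved is
 * set, the bytes are moved out of the reserved counters as well.
 */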
2543static int pin_down_extent(struct btrfs_trans_handle *trans,
2544			   struct btrfs_block_group *cache,
2545			   u64 bytenr, u64 num_bytes, int reserved)
2546{
2547	struct btrfs_fs_info *fs_info = cache->fs_info;
2548
2549	spin_lock(&cache->space_info->lock);
2550	spin_lock(&cache->lock);
2551	cache->pinned += num_bytes;
2552	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2553					     num_bytes);
2554	if (reserved) {
2555		cache->reserved -= num_bytes;
2556		cache->space_info->bytes_reserved -= num_bytes;
2557	}
2558	spin_unlock(&cache->lock);
2559	spin_unlock(&cache->space_info->lock);
2560
2561	__btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
2562	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2563			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2564	return 0;
2565}
2566
2567int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2568		     u64 bytenr, u64 num_bytes, int reserved)
2569{
2570	struct btrfs_block_group *cache;
2571
2572	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2573	BUG_ON(!cache); /* Logic error */
2574
2575	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2576
2577	btrfs_put_block_group(cache);
2578	return 0;
2579}
2580
2581/*
2582 * This function must be called within a transaction.
2583 */
2584int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2585				    u64 bytenr, u64 num_bytes)
2586{
2587	struct btrfs_block_group *cache;
2588	int ret;
2589
2590	btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
2591
2592	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2593	if (!cache)
2594		return -EINVAL;
2595
2596	/*
2597	 * pull in the free space cache (if any) so that our pin
2598	 * removes the free space from the cache.  We have load_only set
2599	 * to one because the slow code to read in the free extents does check
2600	 * the pinned extents.
2601	 */
2602	btrfs_cache_block_group(cache, 1);
2603
2604	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2605
2606	/* remove us from the free space cache (if we're there at all) */
2607	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2608	btrfs_put_block_group(cache);
2609	return ret;
2610}
2611
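/*
 * Keep a logged extent out of the allocator's hands: remove it from the free
 * space cache for the parts of the block group that are already cached, and
 * record the rest as excluded so the caching thread will skip it.
 */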
2612static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2613				   u64 start, u64 num_bytes)
2614{
2615	int ret;
2616	struct btrfs_block_group *block_group;
2617	struct btrfs_caching_control *caching_ctl;
2618
2619	block_group = btrfs_lookup_block_group(fs_info, start);
2620	if (!block_group)
2621		return -EINVAL;
2622
2623	btrfs_cache_block_group(block_group, 0);
2624	caching_ctl = btrfs_get_caching_control(block_group);
2625
2626	if (!caching_ctl) {
2627		/* Logic error */
2628		BUG_ON(!btrfs_block_group_done(block_group));
2629		ret = btrfs_remove_free_space(block_group, start, num_bytes);
2630	} else {
2631		mutex_lock(&caching_ctl->mutex);
2632
2633		if (start >= caching_ctl->progress) {
2634			ret = btrfs_add_excluded_extent(fs_info, start,
2635							num_bytes);
2636		} else if (start + num_bytes <= caching_ctl->progress) {
2637			ret = btrfs_remove_free_space(block_group,
2638						      start, num_bytes);
2639		} else {
2640			num_bytes = caching_ctl->progress - start;
2641			ret = btrfs_remove_free_space(block_group,
2642						      start, num_bytes);
2643			if (ret)
2644				goto out_lock;
2645
2646			num_bytes = (start + num_bytes) -
2647				caching_ctl->progress;
2648			start = caching_ctl->progress;
2649			ret = btrfs_add_excluded_extent(fs_info, start,
2650							num_bytes);
2651		}
2652out_lock:
2653		mutex_unlock(&caching_ctl->mutex);
2654		btrfs_put_caching_control(caching_ctl);
2655	}
2656	btrfs_put_block_group(block_group);
2657	return ret;
2658}
2659
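/*
 * Walk a log tree leaf and exclude every data extent it references, so that
 * log replay cannot allocate over them.  Only needed for filesystems with
 * mixed block groups.
 */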
2660int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2661{
2662	struct btrfs_fs_info *fs_info = eb->fs_info;
2663	struct btrfs_file_extent_item *item;
2664	struct btrfs_key key;
2665	int found_type;
2666	int i;
2667	int ret = 0;
2668
2669	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2670		return 0;
2671
2672	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2673		btrfs_item_key_to_cpu(eb, &key, i);
2674		if (key.type != BTRFS_EXTENT_DATA_KEY)
2675			continue;
2676		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2677		found_type = btrfs_file_extent_type(eb, item);
2678		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2679			continue;
2680		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2681			continue;
2682		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2683		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2684		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2685		if (ret)
2686			break;
2687	}
2688
2689	return ret;
2690}
2691
2692static void
2693btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2694{
2695	atomic_inc(&bg->reservations);
2696}
2697
2698/*
2699 * Returns the free cluster for the given space info and sets empty_cluster to
2700 * what it should be based on the mount options.
2701 */
2702static struct btrfs_free_cluster *
2703fetch_cluster_info(struct btrfs_fs_info *fs_info,
2704		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2705{
2706	struct btrfs_free_cluster *ret = NULL;
2707
2708	*empty_cluster = 0;
2709	if (btrfs_mixed_space_info(space_info))
2710		return ret;
2711
2712	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2713		ret = &fs_info->meta_alloc_cluster;
2714		if (btrfs_test_opt(fs_info, SSD))
2715			*empty_cluster = SZ_2M;
2716		else
2717			*empty_cluster = SZ_64K;
2718	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2719		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2720		*empty_cluster = SZ_2M;
2721		ret = &fs_info->data_alloc_cluster;
2722	}
2723
2724	return ret;
2725}
2726
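/*
 * Unpin a byte range at transaction commit time: walk the affected block
 * groups, drop their pinned counters, optionally hand the space back to the
 * free space cache and top up the global block reserve and pending tickets.
 */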
2727static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2728			      u64 start, u64 end,
2729			      const bool return_free_space)
2730{
2731	struct btrfs_block_group *cache = NULL;
2732	struct btrfs_space_info *space_info;
2733	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2734	struct btrfs_free_cluster *cluster = NULL;
2735	u64 len;
2736	u64 total_unpinned = 0;
2737	u64 empty_cluster = 0;
2738	bool readonly;
2739
2740	while (start <= end) {
2741		readonly = false;
2742		if (!cache ||
2743		    start >= cache->start + cache->length) {
2744			if (cache)
2745				btrfs_put_block_group(cache);
2746			total_unpinned = 0;
2747			cache = btrfs_lookup_block_group(fs_info, start);
2748			BUG_ON(!cache); /* Logic error */
2749
2750			cluster = fetch_cluster_info(fs_info,
2751						     cache->space_info,
2752						     &empty_cluster);
2753			empty_cluster <<= 1;
2754		}
2755
2756		len = cache->start + cache->length - start;
2757		len = min(len, end + 1 - start);
2758
2759		if (start < cache->last_byte_to_unpin && return_free_space) {
2760			u64 add_len = min(len, cache->last_byte_to_unpin - start);
2761
2762			btrfs_add_free_space(cache, start, add_len);
2763		}
2764
2765		start += len;
2766		total_unpinned += len;
2767		space_info = cache->space_info;
2768
2769		/*
2770		 * If this space cluster has been marked as fragmented and we've
2771		 * unpinned enough in this block group to potentially allow a
2772		 * cluster to be created inside of it, go ahead and clear the
2773		 * fragmented check.
2774		 */
2775		if (cluster && cluster->fragmented &&
2776		    total_unpinned > empty_cluster) {
2777			spin_lock(&cluster->lock);
2778			cluster->fragmented = 0;
2779			spin_unlock(&cluster->lock);
2780		}
2781
2782		spin_lock(&space_info->lock);
2783		spin_lock(&cache->lock);
2784		cache->pinned -= len;
2785		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2786		space_info->max_extent_size = 0;
2787		__btrfs_mod_total_bytes_pinned(space_info, -len);
2788		if (cache->ro) {
2789			space_info->bytes_readonly += len;
2790			readonly = true;
2791		}
2792		spin_unlock(&cache->lock);
2793		if (!readonly && return_free_space &&
2794		    global_rsv->space_info == space_info) {
2795			u64 to_add = len;
2796
2797			spin_lock(&global_rsv->lock);
2798			if (!global_rsv->full) {
2799				to_add = min(len, global_rsv->size -
2800					     global_rsv->reserved);
2801				global_rsv->reserved += to_add;
2802				btrfs_space_info_update_bytes_may_use(fs_info,
2803						space_info, to_add);
2804				if (global_rsv->reserved >= global_rsv->size)
2805					global_rsv->full = 1;
2806				len -= to_add;
2807			}
2808			spin_unlock(&global_rsv->lock);
2809		}
2810		/* Add to any tickets we may have */
2811		if (!readonly && return_free_space && len)
2812			btrfs_try_granting_tickets(fs_info, space_info);
2813		spin_unlock(&space_info->lock);
2814	}
2815
2816	if (cache)
2817		btrfs_put_block_group(cache);
2818	return 0;
2819}
2820
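/*
 * Called when a transaction commits: unpin (and optionally discard) all
 * extents pinned during the transaction and discard the block groups that
 * were fully deleted by it.
 */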
2821int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2822{
2823	struct btrfs_fs_info *fs_info = trans->fs_info;
2824	struct btrfs_block_group *block_group, *tmp;
2825	struct list_head *deleted_bgs;
2826	struct extent_io_tree *unpin;
2827	u64 start;
2828	u64 end;
2829	int ret;
2830
2831	unpin = &trans->transaction->pinned_extents;
2832
2833	while (!TRANS_ABORTED(trans)) {
2834		struct extent_state *cached_state = NULL;
2835
2836		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2837		ret = find_first_extent_bit(unpin, 0, &start, &end,
2838					    EXTENT_DIRTY, &cached_state);
2839		if (ret) {
2840			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2841			break;
2842		}
2843		if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
2844			clear_extent_bits(&fs_info->excluded_extents, start,
2845					  end, EXTENT_UPTODATE);
2846
2847		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2848			ret = btrfs_discard_extent(fs_info, start,
2849						   end + 1 - start, NULL);
2850
2851		clear_extent_dirty(unpin, start, end, &cached_state);
2852		unpin_extent_range(fs_info, start, end, true);
2853		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2854		free_extent_state(cached_state);
2855		cond_resched();
2856	}
2857
2858	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2859		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2860		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2861	}
2862
2863	/*
2864	 * Transaction is finished.  We don't need the lock anymore.  We
2865	 * do need to clean up the block groups in case of a transaction
2866	 * abort.
2867	 */
2868	deleted_bgs = &trans->transaction->deleted_bgs;
2869	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2870		u64 trimmed = 0;
2871
2872		ret = -EROFS;
2873		if (!TRANS_ABORTED(trans))
2874			ret = btrfs_discard_extent(fs_info,
2875						   block_group->start,
2876						   block_group->length,
2877						   &trimmed);
2878
2879		list_del_init(&block_group->bg_list);
2880		btrfs_unfreeze_block_group(block_group);
2881		btrfs_put_block_group(block_group);
2882
2883		if (ret) {
2884			const char *errstr = btrfs_decode_error(ret);
2885			btrfs_warn(fs_info,
2886			   "discard failed while removing blockgroup: errno=%d %s",
2887				   ret, errstr);
2888		}
2889	}
2890
2891	return 0;
2892}
2893
2894/*
2895 * Drop one or more refs of @node.
2896 *
2897 * 1. Locate the extent refs.
2898 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2899 *    Locate it, then reduce the refs number or remove the ref line completely.
2900 *
2901 * 2. Update the refs count in EXTENT/METADATA_ITEM
2902 *
2903 * Inline backref case:
2904 *
2905 * in extent tree we have:
2906 *
2907 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2908 *		refs 2 gen 6 flags DATA
2909 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
2910 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
2911 *
2912 * This function gets called with:
2913 *
2914 *    node->bytenr = 13631488
2915 *    node->num_bytes = 1048576
2916 *    root_objectid = FS_TREE
2917 *    owner_objectid = 257
2918 *    owner_offset = 0
2919 *    refs_to_drop = 1
2920 *
2921 * Then we should get something like:
2922 *
2923 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2924 *		refs 1 gen 6 flags DATA
2925 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
2926 *
2927 * Keyed backref case:
2928 *
2929 * in extent tree we have:
2930 *
2931 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2932 *		refs 754 gen 6 flags DATA
2933 *	[...]
2934 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2935 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
2936 *
2937 * This function gets called with:
2938 *
2939 *    node->bytenr = 13631488
2940 *    node->num_bytes = 1048576
2941 *    root_objectid = FS_TREE
2942 *    owner_objectid = 866
2943 *    owner_offset = 0
2944 *    refs_to_drop = 1
2945 *
2946 * Then we should get something like:
2947 *
2948 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2949 *		refs 753 gen 6 flags DATA
2950 *
2951 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2952 */
2953static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2954			       struct btrfs_delayed_ref_node *node, u64 parent,
2955			       u64 root_objectid, u64 owner_objectid,
2956			       u64 owner_offset, int refs_to_drop,
2957			       struct btrfs_delayed_extent_op *extent_op)
2958{
2959	struct btrfs_fs_info *info = trans->fs_info;
2960	struct btrfs_key key;
2961	struct btrfs_path *path;
2962	struct btrfs_root *extent_root = info->extent_root;
2963	struct extent_buffer *leaf;
2964	struct btrfs_extent_item *ei;
2965	struct btrfs_extent_inline_ref *iref;
2966	int ret;
2967	int is_data;
2968	int extent_slot = 0;
2969	int found_extent = 0;
2970	int num_to_del = 1;
2971	u32 item_size;
2972	u64 refs;
2973	u64 bytenr = node->bytenr;
2974	u64 num_bytes = node->num_bytes;
2975	int last_ref = 0;
2976	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2977
2978	path = btrfs_alloc_path();
2979	if (!path)
2980		return -ENOMEM;
2981
2982	path->leave_spinning = 1;
2983
2984	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2985
2986	if (!is_data && refs_to_drop != 1) {
2987		btrfs_crit(info,
2988"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2989			   node->bytenr, refs_to_drop);
2990		ret = -EINVAL;
2991		btrfs_abort_transaction(trans, ret);
2992		goto out;
2993	}
2994
2995	if (is_data)
2996		skinny_metadata = false;
2997
2998	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2999				    parent, root_objectid, owner_objectid,
3000				    owner_offset);
3001	if (ret == 0) {
3002		/*
3003		 * Either the inline backref or the SHARED_DATA_REF/
3004		 * SHARED_BLOCK_REF is found
3005		 *
3006		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3007		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3008		 */
3009		extent_slot = path->slots[0];
3010		while (extent_slot >= 0) {
3011			btrfs_item_key_to_cpu(path->nodes[0], &key,
3012					      extent_slot);
3013			if (key.objectid != bytenr)
3014				break;
3015			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3016			    key.offset == num_bytes) {
3017				found_extent = 1;
3018				break;
3019			}
3020			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3021			    key.offset == owner_objectid) {
3022				found_extent = 1;
3023				break;
3024			}
3025
3026			/* Quick path didn't find the EXTENT/METADATA_ITEM */
3027			if (path->slots[0] - extent_slot > 5)
3028				break;
3029			extent_slot--;
3030		}
3031
3032		if (!found_extent) {
3033			if (iref) {
3034				btrfs_crit(info,
3035"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3036				btrfs_abort_transaction(trans, -EUCLEAN);
3037				goto err_dump;
3038			}
3039			/* Must be SHARED_* item, remove the backref first */
3040			ret = remove_extent_backref(trans, path, NULL,
3041						    refs_to_drop,
3042						    is_data, &last_ref);
3043			if (ret) {
3044				btrfs_abort_transaction(trans, ret);
3045				goto out;
3046			}
3047			btrfs_release_path(path);
3048			path->leave_spinning = 1;
3049
3050			/* Slow path to locate EXTENT/METADATA_ITEM */
3051			key.objectid = bytenr;
3052			key.type = BTRFS_EXTENT_ITEM_KEY;
3053			key.offset = num_bytes;
3054
3055			if (!is_data && skinny_metadata) {
3056				key.type = BTRFS_METADATA_ITEM_KEY;
3057				key.offset = owner_objectid;
3058			}
3059
3060			ret = btrfs_search_slot(trans, extent_root,
3061						&key, path, -1, 1);
3062			if (ret > 0 && skinny_metadata && path->slots[0]) {
3063				/*
3064				 * Couldn't find our skinny metadata item,
3065				 * see if we have ye olde extent item.
3066				 */
3067				path->slots[0]--;
3068				btrfs_item_key_to_cpu(path->nodes[0], &key,
3069						      path->slots[0]);
3070				if (key.objectid == bytenr &&
3071				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3072				    key.offset == num_bytes)
3073					ret = 0;
3074			}
3075
3076			if (ret > 0 && skinny_metadata) {
3077				skinny_metadata = false;
3078				key.objectid = bytenr;
3079				key.type = BTRFS_EXTENT_ITEM_KEY;
3080				key.offset = num_bytes;
3081				btrfs_release_path(path);
3082				ret = btrfs_search_slot(trans, extent_root,
3083							&key, path, -1, 1);
3084			}
3085
3086			if (ret) {
3087				btrfs_err(info,
3088					  "umm, got %d back from search, was looking for %llu",
3089					  ret, bytenr);
3090				if (ret > 0)
3091					btrfs_print_leaf(path->nodes[0]);
3092			}
3093			if (ret < 0) {
3094				btrfs_abort_transaction(trans, ret);
3095				goto out;
3096			}
3097			extent_slot = path->slots[0];
3098		}
3099	} else if (WARN_ON(ret == -ENOENT)) {
3100		btrfs_print_leaf(path->nodes[0]);
3101		btrfs_err(info,
3102			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3103			bytenr, parent, root_objectid, owner_objectid,
3104			owner_offset);
3105		btrfs_abort_transaction(trans, ret);
3106		goto out;
3107	} else {
3108		btrfs_abort_transaction(trans, ret);
3109		goto out;
3110	}
3111
3112	leaf = path->nodes[0];
3113	item_size = btrfs_item_size_nr(leaf, extent_slot);
3114	if (unlikely(item_size < sizeof(*ei))) {
3115		ret = -EINVAL;
3116		btrfs_print_v0_err(info);
3117		btrfs_abort_transaction(trans, ret);
3118		goto out;
3119	}
3120	ei = btrfs_item_ptr(leaf, extent_slot,
3121			    struct btrfs_extent_item);
3122	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3123	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3124		struct btrfs_tree_block_info *bi;
3125		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3126			btrfs_crit(info,
3127"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3128				   key.objectid, key.type, key.offset,
3129				   owner_objectid, item_size,
3130				   sizeof(*ei) + sizeof(*bi));
3131			btrfs_abort_transaction(trans, -EUCLEAN);
3132			goto err_dump;
3133		}
3134		bi = (struct btrfs_tree_block_info *)(ei + 1);
3135		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3136	}
3137
3138	refs = btrfs_extent_refs(leaf, ei);
3139	if (refs < refs_to_drop) {
3140		btrfs_crit(info,
3141		"trying to drop %d refs but we only have %llu for bytenr %llu",
3142			  refs_to_drop, refs, bytenr);
3143		btrfs_abort_transaction(trans, -EUCLEAN);
3144		goto err_dump;
3145	}
3146	refs -= refs_to_drop;
3147
3148	if (refs > 0) {
3149		if (extent_op)
3150			__run_delayed_extent_op(extent_op, leaf, ei);
3151		/*
3152		 * In the case of inline back ref, reference count will
3153		 * be updated by remove_extent_backref
3154		 */
3155		if (iref) {
3156			if (!found_extent) {
3157				btrfs_crit(info,
3158"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3159				btrfs_abort_transaction(trans, -EUCLEAN);
3160				goto err_dump;
3161			}
3162		} else {
3163			btrfs_set_extent_refs(leaf, ei, refs);
3164			btrfs_mark_buffer_dirty(leaf);
3165		}
3166		if (found_extent) {
3167			ret = remove_extent_backref(trans, path, iref,
3168						    refs_to_drop, is_data,
3169						    &last_ref);
3170			if (ret) {
3171				btrfs_abort_transaction(trans, ret);
3172				goto out;
3173			}
3174		}
3175	} else {
3176		/* In this branch refs == 1 */
3177		if (found_extent) {
3178			if (is_data && refs_to_drop !=
3179			    extent_data_ref_count(path, iref)) {
3180				btrfs_crit(info,
3181		"invalid refs_to_drop, current refs %u refs_to_drop %u",
3182					   extent_data_ref_count(path, iref),
3183					   refs_to_drop);
3184				btrfs_abort_transaction(trans, -EUCLEAN);
3185				goto err_dump;
3186			}
3187			if (iref) {
3188				if (path->slots[0] != extent_slot) {
3189					btrfs_crit(info,
3190"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3191						   key.objectid, key.type,
3192						   key.offset);
3193					btrfs_abort_transaction(trans, -EUCLEAN);
3194					goto err_dump;
3195				}
3196			} else {
3197				/*
3198				 * No inline ref, we must be at SHARED_* item,
3199				 * No inline ref, so we must be at a SHARED_* item,
3200				 * and since it's a single ref the layout must be:
3201				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3202				 */
3203				if (path->slots[0] != extent_slot + 1) {
3204					btrfs_crit(info,
3205	"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3206					btrfs_abort_transaction(trans, -EUCLEAN);
3207					goto err_dump;
3208				}
3209				path->slots[0] = extent_slot;
3210				num_to_del = 2;
3211			}
3212		}
3213
3214		last_ref = 1;
3215		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3216				      num_to_del);
3217		if (ret) {
3218			btrfs_abort_transaction(trans, ret);
3219			goto out;
3220		}
3221		btrfs_release_path(path);
3222
3223		if (is_data) {
3224			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
3225					      num_bytes);
3226			if (ret) {
3227				btrfs_abort_transaction(trans, ret);
3228				goto out;
3229			}
3230		}
3231
3232		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3233		if (ret) {
3234			btrfs_abort_transaction(trans, ret);
3235			goto out;
3236		}
3237
3238		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3239		if (ret) {
3240			btrfs_abort_transaction(trans, ret);
3241			goto out;
3242		}
3243	}
3244	btrfs_release_path(path);
3245
3246out:
3247	btrfs_free_path(path);
3248	return ret;
3249err_dump:
3250	/*
3251	 * Leaf dump can take up a lot of log buffer, so we only do full leaf
3252	 * dump for debug build.
3253	 */
3254	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3255		btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3256			   path->slots[0], extent_slot);
3257		btrfs_print_leaf(path->nodes[0]);
3258	}
3259
3260	btrfs_free_path(path);
3261	return -EUCLEAN;
3262}
3263
3264/*
3265 * when we free a block, it is possible (and likely) that we free the last
3266 * delayed ref for that extent as well.  This searches the delayed ref tree for
3267 * a given extent, and if there are no other delayed refs to be processed, it
3268 * removes it from the tree.
3269 */
3270static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3271				      u64 bytenr)
3272{
3273	struct btrfs_delayed_ref_head *head;
3274	struct btrfs_delayed_ref_root *delayed_refs;
3275	int ret = 0;
3276
3277	delayed_refs = &trans->transaction->delayed_refs;
3278	spin_lock(&delayed_refs->lock);
3279	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3280	if (!head)
3281		goto out_delayed_unlock;
3282
3283	spin_lock(&head->lock);
3284	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3285		goto out;
3286
3287	if (cleanup_extent_op(head) != NULL)
3288		goto out;
3289
3290	/*
3291	 * waiting for the lock here would deadlock.  If someone else has it
3292	 * locked, they are already in the process of dropping it anyway
3293	 */
3294	if (!mutex_trylock(&head->mutex))
3295		goto out;
3296
3297	btrfs_delete_ref_head(delayed_refs, head);
3298	head->processing = 0;
3299
3300	spin_unlock(&head->lock);
3301	spin_unlock(&delayed_refs->lock);
3302
3303	BUG_ON(head->extent_op);
3304	if (head->must_insert_reserved)
3305		ret = 1;
3306
3307	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3308	mutex_unlock(&head->mutex);
3309	btrfs_put_delayed_ref_head(head);
3310	return ret;
3311out:
3312	spin_unlock(&head->lock);
3313
3314out_delayed_unlock:
3315	spin_unlock(&delayed_refs->lock);
3316	return 0;
3317}
3318
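/*
 * Free a tree block: queue a delayed ref drop for it and, when this was the
 * last reference to a block allocated in the current transaction, either pin
 * it (if it was already written) or return the space directly to the free
 * space cache.
 */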
3319void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3320			   struct btrfs_root *root,
3321			   struct extent_buffer *buf,
3322			   u64 parent, int last_ref)
3323{
3324	struct btrfs_fs_info *fs_info = root->fs_info;
3325	struct btrfs_ref generic_ref = { 0 };
3326	int ret;
3327
3328	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3329			       buf->start, buf->len, parent);
3330	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3331			    root->root_key.objectid);
3332
3333	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3334		btrfs_ref_tree_mod(fs_info, &generic_ref);
3335		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3336		BUG_ON(ret); /* -ENOMEM */
3337	}
3338
3339	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3340		struct btrfs_block_group *cache;
3341
3342		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3343			ret = check_ref_cleanup(trans, buf->start);
3344			if (!ret)
3345				goto out;
3346		}
3347
3348		cache = btrfs_lookup_block_group(fs_info, buf->start);
3349
3350		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3351			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3352			btrfs_put_block_group(cache);
3353			goto out;
3354		}
3355
3356		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3357
3358		btrfs_add_free_space(cache, buf->start, buf->len);
3359		btrfs_free_reserved_bytes(cache, buf->len, 0);
3360		btrfs_put_block_group(cache);
3361		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3362	}
3363out:
3364	if (last_ref) {
3365		/*
3366		 * We're deleting the buffer, so clear the corrupt flag since it doesn't
3367		 * matter anymore.
3368		 */
3369		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3370	}
3371}
3372
3373/* Can return -ENOMEM */
3374int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3375{
3376	struct btrfs_fs_info *fs_info = trans->fs_info;
3377	int ret;
3378
3379	if (btrfs_is_testing(fs_info))
3380		return 0;
3381
3382	/*
3383	 * tree log blocks never actually go into the extent allocation
3384	 * tree, just update pinning info and exit early.
3385	 */
3386	if ((ref->type == BTRFS_REF_METADATA &&
3387	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3388	    (ref->type == BTRFS_REF_DATA &&
3389	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3390		/* unlocks the pinned mutex */
3391		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3392		ret = 0;
3393	} else if (ref->type == BTRFS_REF_METADATA) {
3394		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3395	} else {
3396		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3397	}
3398
3399	if (!((ref->type == BTRFS_REF_METADATA &&
3400	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3401	      (ref->type == BTRFS_REF_DATA &&
3402	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3403		btrfs_ref_tree_mod(fs_info, ref);
3404
3405	return ret;
3406}
3407
3408enum btrfs_loop_type {
3409	LOOP_CACHING_NOWAIT,
3410	LOOP_CACHING_WAIT,
3411	LOOP_ALLOC_CHUNK,
3412	LOOP_NO_EMPTY_SIZE,
3413};
3414
3415static inline void
3416btrfs_lock_block_group(struct btrfs_block_group *cache,
3417		       int delalloc)
3418{
3419	if (delalloc)
3420		down_read(&cache->data_rwsem);
3421}
3422
3423static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3424		       int delalloc)
3425{
3426	btrfs_get_block_group(cache);
3427	if (delalloc)
3428		down_read(&cache->data_rwsem);
3429}
3430
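/*
 * Take the cluster's refill_lock and, for delalloc allocations, the
 * data_rwsem of the block group currently backing the cluster.  Retries if
 * the cluster switches to another block group while we block on the rwsem.
 */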
3431static struct btrfs_block_group *btrfs_lock_cluster(
3432		   struct btrfs_block_group *block_group,
3433		   struct btrfs_free_cluster *cluster,
3434		   int delalloc)
3435	__acquires(&cluster->refill_lock)
3436{
3437	struct btrfs_block_group *used_bg = NULL;
3438
3439	spin_lock(&cluster->refill_lock);
3440	while (1) {
3441		used_bg = cluster->block_group;
3442		if (!used_bg)
3443			return NULL;
3444
3445		if (used_bg == block_group)
3446			return used_bg;
3447
3448		btrfs_get_block_group(used_bg);
3449
3450		if (!delalloc)
3451			return used_bg;
3452
3453		if (down_read_trylock(&used_bg->data_rwsem))
3454			return used_bg;
3455
3456		spin_unlock(&cluster->refill_lock);
3457
3458		/* We should only have one level of nesting. */
3459		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3460
3461		spin_lock(&cluster->refill_lock);
3462		if (used_bg == cluster->block_group)
3463			return used_bg;
3464
3465		up_read(&used_bg->data_rwsem);
3466		btrfs_put_block_group(used_bg);
3467	}
3468}
3469
3470static inline void
3471btrfs_release_block_group(struct btrfs_block_group *cache,
3472			 int delalloc)
3473{
3474	if (delalloc)
3475		up_read(&cache->data_rwsem);
3476	btrfs_put_block_group(cache);
3477}
3478
3479enum btrfs_extent_allocation_policy {
3480	BTRFS_EXTENT_ALLOC_CLUSTERED,
3481};
3482
3483/*
3484 * Structure used internally for find_free_extent() function.  Wraps needed
3485 * parameters.
3486 */
3487struct find_free_extent_ctl {
3488	/* Basic allocation info */
3489	u64 num_bytes;
3490	u64 empty_size;
3491	u64 flags;
3492	int delalloc;
3493
3494	/* Where to start the search inside the bg */
3495	u64 search_start;
3496
3497	/* For clustered allocation */
3498	u64 empty_cluster;
3499	struct btrfs_free_cluster *last_ptr;
3500	bool use_cluster;
3501
3502	bool have_caching_bg;
3503	bool orig_have_caching_bg;
3504
3505	/* RAID index, converted from flags */
3506	int index;
3507
3508	/*
3509	 * Current loop number, check find_free_extent_update_loop() for details
3510	 */
3511	int loop;
3512
3513	/*
3514	 * Whether we're refilling a cluster; if true we need to re-search the
3515	 * current block group but don't try to refill the cluster again.
3516	 */
3517	bool retry_clustered;
3518
3519	/*
3520	 * Whether we're updating the free space cache; if true we need to re-search
3521	 * the current block group but don't try updating the free space cache again.
3522	 */
3523	bool retry_unclustered;
3524
3525	/* If current block group is cached */
3526	int cached;
3527
3528	/* Max contiguous hole found */
3529	u64 max_extent_size;
3530
3531	/* Total free space from free space cache, not always contiguous */
3532	u64 total_free_space;
3533
3534	/* Found result */
3535	u64 found_offset;
3536
3537	/* Hint where to start looking for an empty space */
3538	u64 hint_byte;
3539
3540	/* Allocation policy */
3541	enum btrfs_extent_allocation_policy policy;
3542};
3543
3544
3545/*
3546 * Helper function for find_free_extent().
3547 *
3548 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
3549 * Return -EAGAIN to inform caller that we need to re-search this block group.
3550 * Return >0 to inform caller that we found nothing.
3551 * Return 0 when we have found a location and set ffe_ctl->found_offset.
3552 */
3553static int find_free_extent_clustered(struct btrfs_block_group *bg,
3554				      struct find_free_extent_ctl *ffe_ctl,
3555				      struct btrfs_block_group **cluster_bg_ret)
3556{
3557	struct btrfs_block_group *cluster_bg;
3558	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3559	u64 aligned_cluster;
3560	u64 offset;
3561	int ret;
3562
3563	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3564	if (!cluster_bg)
3565		goto refill_cluster;
3566	if (cluster_bg != bg && (cluster_bg->ro ||
3567	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
3568		goto release_cluster;
3569
3570	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3571			ffe_ctl->num_bytes, cluster_bg->start,
3572			&ffe_ctl->max_extent_size);
3573	if (offset) {
3574		/* We have a block, we're done */
3575		spin_unlock(&last_ptr->refill_lock);
3576		trace_btrfs_reserve_extent_cluster(cluster_bg,
3577				ffe_ctl->search_start, ffe_ctl->num_bytes);
3578		*cluster_bg_ret = cluster_bg;
3579		ffe_ctl->found_offset = offset;
3580		return 0;
3581	}
3582	WARN_ON(last_ptr->block_group != cluster_bg);
3583
3584release_cluster:
3585	/*
3586	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3587	 * let's just skip it and let the allocator find whatever block it can
3588	 * find. If we reach this point, we will have tried the cluster
3589	 * allocator plenty of times and not have found anything, so we are
3590	 * likely way too fragmented for the clustering stuff to find anything.
3591	 *
3592	 * However, if the cluster is taken from the current block group,
3593	 * release the cluster first, so that we stand a better chance of
3594	 * succeeding in the unclustered allocation.
3595	 */
3596	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3597		spin_unlock(&last_ptr->refill_lock);
3598		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3599		return -ENOENT;
3600	}
3601
3602	/* This cluster didn't work out, free it and start over */
3603	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3604
3605	if (cluster_bg != bg)
3606		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3607
3608refill_cluster:
3609	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3610		spin_unlock(&last_ptr->refill_lock);
3611		return -ENOENT;
3612	}
3613
3614	aligned_cluster = max_t(u64,
3615			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3616			bg->full_stripe_len);
3617	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3618			ffe_ctl->num_bytes, aligned_cluster);
3619	if (ret == 0) {
3620		/* Now pull our allocation out of this cluster */
3621		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3622				ffe_ctl->num_bytes, ffe_ctl->search_start,
3623				&ffe_ctl->max_extent_size);
3624		if (offset) {
3625			/* We found one, proceed */
3626			spin_unlock(&last_ptr->refill_lock);
3627			trace_btrfs_reserve_extent_cluster(bg,
3628					ffe_ctl->search_start,
3629					ffe_ctl->num_bytes);
3630			ffe_ctl->found_offset = offset;
3631			return 0;
3632		}
3633	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3634		   !ffe_ctl->retry_clustered) {
3635		spin_unlock(&last_ptr->refill_lock);
3636
3637		ffe_ctl->retry_clustered = true;
3638		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3639				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3640		return -EAGAIN;
3641	}
3642	/*
3643	 * At this point we either didn't find a cluster or we weren't able to
3644	 * allocate a block from our cluster.  Free the cluster we've been
3645	 * trying to use, and go to the next block group.
3646	 */
3647	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3648	spin_unlock(&last_ptr->refill_lock);
3649	return 1;
3650}
3651
3652/*
3653 * Return >0 to inform the caller that we found nothing
3654 * Return 0 when we found a free extent and set ffe_ctl->found_offset
3655 * Return -EAGAIN to inform the caller that we need to re-search this block group
3656 */
3657static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3658					struct find_free_extent_ctl *ffe_ctl)
3659{
3660	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3661	u64 offset;
3662
3663	/*
3664	 * We are doing an unclustered allocation, set the fragmented flag so
3665	 * we don't bother trying to set up a cluster again until we get more
3666	 * space.
3667	 */
3668	if (unlikely(last_ptr)) {
3669		spin_lock(&last_ptr->lock);
3670		last_ptr->fragmented = 1;
3671		spin_unlock(&last_ptr->lock);
3672	}
3673	if (ffe_ctl->cached) {
3674		struct btrfs_free_space_ctl *free_space_ctl;
3675
3676		free_space_ctl = bg->free_space_ctl;
3677		spin_lock(&free_space_ctl->tree_lock);
3678		if (free_space_ctl->free_space <
3679		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3680		    ffe_ctl->empty_size) {
3681			ffe_ctl->total_free_space = max_t(u64,
3682					ffe_ctl->total_free_space,
3683					free_space_ctl->free_space);
3684			spin_unlock(&free_space_ctl->tree_lock);
3685			return 1;
3686		}
3687		spin_unlock(&free_space_ctl->tree_lock);
3688	}
3689
3690	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3691			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3692			&ffe_ctl->max_extent_size);
3693
3694	/*
3695	 * If we didn't find a chunk, and we haven't failed on this block group
3696	 * before, and this block group is in the middle of caching and we are
3697	 * ok with waiting, then go ahead and wait for progress to be made, and
3698	 * set @retry_unclustered to true.
3699	 *
3700	 * If @retry_unclustered is true then we've already waited on this
3701	 * block group once and should move on to the next block group.
3702	 */
3703	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3704	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3705		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3706						      ffe_ctl->empty_size);
3707		ffe_ctl->retry_unclustered = true;
3708		return -EAGAIN;
3709	} else if (!offset) {
3710		return 1;
3711	}
3712	ffe_ctl->found_offset = offset;
3713	return 0;
3714}
3715
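/*
 * Try the cluster allocator first.  If it gives up on this block group
 * (-ENOENT), fall back to an unclustered search in the same group.
 */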
3716static int do_allocation_clustered(struct btrfs_block_group *block_group,
3717				   struct find_free_extent_ctl *ffe_ctl,
3718				   struct btrfs_block_group **bg_ret)
3719{
3720	int ret;
3721
3722	/* We want to try and use the cluster allocator, so let's look there */
3723	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3724		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3725		if (ret >= 0 || ret == -EAGAIN)
3726			return ret;
3727		/* ret == -ENOENT case falls through */
3728	}
3729
3730	return find_free_extent_unclustered(block_group, ffe_ctl);
3731}
3732
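/*
 * Dispatch the in-block-group search to the policy specific helper.  Only
 * BTRFS_EXTENT_ALLOC_CLUSTERED is handled here.
 */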
3733static int do_allocation(struct btrfs_block_group *block_group,
3734			 struct find_free_extent_ctl *ffe_ctl,
3735			 struct btrfs_block_group **bg_ret)
3736{
3737	switch (ffe_ctl->policy) {
3738	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3739		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3740	default:
3741		BUG();
3742	}
3743}
3744
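/*
 * Drop our reference on a block group we failed to allocate from and reset
 * the per-policy retry state before moving on to the next group.
 */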
3745static void release_block_group(struct btrfs_block_group *block_group,
3746				struct find_free_extent_ctl *ffe_ctl,
3747				int delalloc)
3748{
3749	switch (ffe_ctl->policy) {
3750	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3751		ffe_ctl->retry_clustered = false;
3752		ffe_ctl->retry_unclustered = false;
3753		break;
3754	default:
3755		BUG();
3756	}
3757
3758	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3759	       ffe_ctl->index);
3760	btrfs_release_block_group(block_group, delalloc);
3761}
3762
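/*
 * If we allocated without using the cluster, remember where we found the
 * extent in window_start so the next search can start near it.
 */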
3763static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3764				   struct btrfs_key *ins)
3765{
3766	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3767
3768	if (!ffe_ctl->use_cluster && last_ptr) {
3769		spin_lock(&last_ptr->lock);
3770		last_ptr->window_start = ins->objectid;
3771		spin_unlock(&last_ptr->lock);
3772	}
3773}
3774
3775static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3776			 struct btrfs_key *ins)
3777{
3778	switch (ffe_ctl->policy) {
3779	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3780		found_extent_clustered(ffe_ctl, ins);
3781		break;
3782	default:
3783		BUG();
3784	}
3785}
3786
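/*
 * Chunk allocation failed with ENOSPC.  For the clustered policy this is not
 * fatal: advance the loop to LOOP_NO_EMPTY_SIZE and keep searching.
 */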
3787static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3788{
3789	switch (ffe_ctl->policy) {
3790	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3791		/*
3792		 * If we can't allocate a new chunk, we've already looped through
3793		 * at least once, so move on to the NO_EMPTY_SIZE case.
3794		 */
3795		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3796		return 0;
3797	default:
3798		BUG();
3799	}
3800}
3801
3802/*
3803 * Return >0 means caller needs to re-search for free extent
3804 * Return 0 means we have the needed free extent.
3805 * Return <0 means we failed to locate any free extent.
3806 */
3807static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3808					struct btrfs_key *ins,
3809					struct find_free_extent_ctl *ffe_ctl,
3810					bool full_search)
3811{
3812	struct btrfs_root *root = fs_info->extent_root;
3813	int ret;
3814
3815	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3816	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3817		ffe_ctl->orig_have_caching_bg = true;
3818
3819	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3820	    ffe_ctl->have_caching_bg)
3821		return 1;
3822
3823	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3824		return 1;
3825
3826	if (ins->objectid) {
3827		found_extent(ffe_ctl, ins);
3828		return 0;
3829	}
3830
3831	/*
3832	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3833	 *			caching kthreads as we move along
3834	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3835	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3836	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3837	 *		       again
3838	 */
3839	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3840		ffe_ctl->index = 0;
3841		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3842			/*
3843			 * We want to skip the LOOP_CACHING_WAIT step if we
3844			 * don't have any uncached bgs and we've already done a
3845			 * full search through.
3846			 */
3847			if (ffe_ctl->orig_have_caching_bg || !full_search)
3848				ffe_ctl->loop = LOOP_CACHING_WAIT;
3849			else
3850				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3851		} else {
3852			ffe_ctl->loop++;
3853		}
3854
3855		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3856			struct btrfs_trans_handle *trans;
3857			int exist = 0;
3858
3859			trans = current->journal_info;
3860			if (trans)
3861				exist = 1;
3862			else
3863				trans = btrfs_join_transaction(root);
3864
3865			if (IS_ERR(trans)) {
3866				ret = PTR_ERR(trans);
3867				return ret;
3868			}
3869
3870			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3871						CHUNK_ALLOC_FORCE);
3872
3873			/* Do not bail out on ENOSPC since we can do more. */
3874			if (ret == -ENOSPC)
3875				ret = chunk_allocation_failed(ffe_ctl);
3876			else if (ret < 0)
3877				btrfs_abort_transaction(trans, ret);
3878			else
3879				ret = 0;
3880			if (!exist)
3881				btrfs_end_transaction(trans);
3882			if (ret)
3883				return ret;
3884		}
3885
3886		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3887			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
3888				return -ENOSPC;
3889
3890			/*
3891			 * Don't loop again if we already have no empty_size and
3892			 * no empty_cluster.
3893			 */
3894			if (ffe_ctl->empty_size == 0 &&
3895			    ffe_ctl->empty_cluster == 0)
3896				return -ENOSPC;
3897			ffe_ctl->empty_size = 0;
3898			ffe_ctl->empty_cluster = 0;
3899		}
3900		return 1;
3901	}
3902	return -ENOSPC;
3903}
3904
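/*
 * Set up the clustered allocation state: bail out early with ENOSPC if the
 * space info's recorded max_extent_size is too small for this allocation,
 * otherwise fetch the free cluster for this space info and use its
 * window_start as the search hint.
 */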
3905static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
3906					struct find_free_extent_ctl *ffe_ctl,
3907					struct btrfs_space_info *space_info,
3908					struct btrfs_key *ins)
3909{
3910	/*
3911	 * If our free space is heavily fragmented we may not be able to make
3912	 * big contiguous allocations, so instead of doing the expensive search
3913	 * for free space, simply return ENOSPC with our max_extent_size so we
3914	 * can go ahead and search for a more manageable chunk.
3915	 *
3916	 * If our max_extent_size is large enough for our allocation simply
3917	 * disable clustering since we will likely not be able to find enough
3918	 * space to create a cluster and induce latency trying.
3919	 */
3920	if (space_info->max_extent_size) {
3921		spin_lock(&space_info->lock);
3922		if (space_info->max_extent_size &&
3923		    ffe_ctl->num_bytes > space_info->max_extent_size) {
3924			ins->offset = space_info->max_extent_size;
3925			spin_unlock(&space_info->lock);
3926			return -ENOSPC;
3927		} else if (space_info->max_extent_size) {
3928			ffe_ctl->use_cluster = false;
3929		}
3930		spin_unlock(&space_info->lock);
3931	}
3932
3933	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
3934					       &ffe_ctl->empty_cluster);
3935	if (ffe_ctl->last_ptr) {
3936		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3937
3938		spin_lock(&last_ptr->lock);
3939		if (last_ptr->block_group)
3940			ffe_ctl->hint_byte = last_ptr->window_start;
3941		if (last_ptr->fragmented) {
3942			/*
3943			 * We still set window_start so we can keep track of the
3944			 * last place we found an allocation to try and save
3945			 * some time.
3946			 */
3947			ffe_ctl->hint_byte = last_ptr->window_start;
3948			ffe_ctl->use_cluster = false;
3949		}
3950		spin_unlock(&last_ptr->lock);
3951	}
3952
3953	return 0;
3954}
3955
3956static int prepare_allocation(struct btrfs_fs_info *fs_info,
3957			      struct find_free_extent_ctl *ffe_ctl,
3958			      struct btrfs_space_info *space_info,
3959			      struct btrfs_key *ins)
3960{
3961	switch (ffe_ctl->policy) {
3962	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3963		return prepare_allocation_clustered(fs_info, ffe_ctl,
3964						    space_info, ins);
3965	default:
3966		BUG();
3967	}
3968}
3969
3970/*
3971 * walks the btree of allocated extents and finds a hole of a given size.
3972 * The key ins is changed to record the hole:
3973 * ins->objectid == start position
3974 * ins->flags = BTRFS_EXTENT_ITEM_KEY
3975 * ins->offset == the size of the hole.
3976 * Any available blocks before search_start are skipped.
3977 *
3978 * If there is no suitable free space, we will record the max size of
3979 * If there is no suitable free space, we will record the size of the
3980 * largest free space extent found during the search.
3981 * The overall logic and call chain:
3982 *
3983 * find_free_extent()
3984 * |- Iterate through all block groups
3985 * |  |- Get a valid block group
3986 * |  |- Try to do clustered allocation in that block group
3987 * |  |- Try to do unclustered allocation in that block group
3988 * |  |- Check if the result is valid
3989 * |  |  |- If valid, then exit
3990 * |  |- Jump to next block group
3991 * |
3992 * |- Push harder to find free extents
3993 *    |- If not found, re-iterate all block groups
3994 */
3995static noinline int find_free_extent(struct btrfs_root *root,
3996				u64 ram_bytes, u64 num_bytes, u64 empty_size,
3997				u64 hint_byte_orig, struct btrfs_key *ins,
3998				u64 flags, int delalloc)
3999{
4000	struct btrfs_fs_info *fs_info = root->fs_info;
4001	int ret = 0;
4002	int cache_block_group_error = 0;
4003	struct btrfs_block_group *block_group = NULL;
4004	struct find_free_extent_ctl ffe_ctl = {0};
4005	struct btrfs_space_info *space_info;
4006	bool full_search = false;
4007
4008	WARN_ON(num_bytes < fs_info->sectorsize);
4009
4010	ffe_ctl.num_bytes = num_bytes;
4011	ffe_ctl.empty_size = empty_size;
4012	ffe_ctl.flags = flags;
4013	ffe_ctl.search_start = 0;
4014	ffe_ctl.delalloc = delalloc;
4015	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
4016	ffe_ctl.have_caching_bg = false;
4017	ffe_ctl.orig_have_caching_bg = false;
4018	ffe_ctl.found_offset = 0;
4019	ffe_ctl.hint_byte = hint_byte_orig;
4020	ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4021
4022	/* For clustered allocation */
4023	ffe_ctl.retry_clustered = false;
4024	ffe_ctl.retry_unclustered = false;
4025	ffe_ctl.last_ptr = NULL;
4026	ffe_ctl.use_cluster = true;
4027
4028	ins->type = BTRFS_EXTENT_ITEM_KEY;
4029	ins->objectid = 0;
4030	ins->offset = 0;
4031
4032	trace_find_free_extent(root, num_bytes, empty_size, flags);
4033
4034	space_info = btrfs_find_space_info(fs_info, flags);
4035	if (!space_info) {
4036		btrfs_err(fs_info, "No space info for %llu", flags);
4037		return -ENOSPC;
4038	}
4039
4040	ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
4041	if (ret < 0)
4042		return ret;
4043
4044	ffe_ctl.search_start = max(ffe_ctl.search_start,
4045				   first_logical_byte(fs_info, 0));
4046	ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
4047	if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4048		block_group = btrfs_lookup_block_group(fs_info,
4049						       ffe_ctl.search_start);
4050		/*
4051		 * we don't want to use the block group if it doesn't match our
4052		 * allocation bits, or if its not cached.
4053		 * allocation bits, or if it's not cached.
4054		 * However if we are re-searching with an ideal block group
4055		 * picked out then we don't care that the block group is cached.
4056		 */
4057		if (block_group && block_group_bits(block_group, flags) &&
4058		    block_group->cached != BTRFS_CACHE_NO) {
4059			down_read(&space_info->groups_sem);
4060			if (list_empty(&block_group->list) ||
4061			    block_group->ro) {
4062				/*
4063				 * someone is removing this block group,
4064				 * we can't jump into the have_block_group
4065				 * target because our list pointers are not
4066				 * valid
4067				 */
4068				btrfs_put_block_group(block_group);
4069				up_read(&space_info->groups_sem);
4070			} else {
4071				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
4072						block_group->flags);
4073				btrfs_lock_block_group(block_group, delalloc);
4074				goto have_block_group;
4075			}
4076		} else if (block_group) {
4077			btrfs_put_block_group(block_group);
4078		}
4079	}
4080search:
4081	ffe_ctl.have_caching_bg = false;
4082	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
4083	    ffe_ctl.index == 0)
4084		full_search = true;
4085	down_read(&space_info->groups_sem);
4086	list_for_each_entry(block_group,
4087			    &space_info->block_groups[ffe_ctl.index], list) {
4088		struct btrfs_block_group *bg_ret;
4089
4090		/* If the block group is read-only, we can skip it entirely. */
4091		if (unlikely(block_group->ro))
4092			continue;
4093
4094		btrfs_grab_block_group(block_group, delalloc);
4095		ffe_ctl.search_start = block_group->start;
4096
4097		/*
4098		 * this can happen if we end up cycling through all the
4099		 * raid types, but we want to make sure we only allocate
4100		 * for the proper type.
4101		 */
4102		if (!block_group_bits(block_group, flags)) {
4103			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4104				BTRFS_BLOCK_GROUP_RAID1_MASK |
4105				BTRFS_BLOCK_GROUP_RAID56_MASK |
4106				BTRFS_BLOCK_GROUP_RAID10;
4107
4108			/*
4109			 * if they asked for extra copies and this block group
4110			 * doesn't provide them, bail.  This does allow us to
4111			 * fill raid0 from raid1.
4112			 */
4113			if ((flags & extra) && !(block_group->flags & extra))
4114				goto loop;
4115
4116			/*
4117			 * This block group has different flags than we want.
4118			 * It's possible that we have MIXED_GROUP flag but no
4119			 * block group is mixed.  Just skip such a block group.
4120			 */
4121			btrfs_release_block_group(block_group, delalloc);
4122			continue;
4123		}
4124
4125have_block_group:
4126		ffe_ctl.cached = btrfs_block_group_done(block_group);
4127		if (unlikely(!ffe_ctl.cached)) {
4128			ffe_ctl.have_caching_bg = true;
4129			ret = btrfs_cache_block_group(block_group, 0);
4130
4131			/*
4132			 * If we get ENOMEM here or something else we want to
4133			 * try other block groups, because it may not be fatal.
4134			 * However if we can't find anything else we need to
4135			 * save our return here so that we return the actual
4136			 * error that caused problems, not ENOSPC.
4137			 */
4138			if (ret < 0) {
4139				if (!cache_block_group_error)
4140					cache_block_group_error = ret;
4141				ret = 0;
4142				goto loop;
4143			}
4144			ret = 0;
4145		}
4146
4147		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4148			if (!cache_block_group_error)
4149				cache_block_group_error = -EIO;
4150			goto loop;
4151		}
4152
4153		bg_ret = NULL;
4154		ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
4155		if (ret == 0) {
4156			if (bg_ret && bg_ret != block_group) {
4157				btrfs_release_block_group(block_group, delalloc);
4158				block_group = bg_ret;
4159			}
4160		} else if (ret == -EAGAIN) {
4161			goto have_block_group;
4162		} else if (ret > 0) {
4163			goto loop;
4164		}
4165
4166		/* Checks */
4167		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4168					     fs_info->stripesize);
4169
4170		/* move on to the next group */
4171		if (ffe_ctl.search_start + num_bytes >
4172		    block_group->start + block_group->length) {
4173			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4174					     num_bytes);
4175			goto loop;
4176		}
4177
4178		if (ffe_ctl.found_offset < ffe_ctl.search_start)
4179			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4180				ffe_ctl.search_start - ffe_ctl.found_offset);
4181
4182		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
4183				num_bytes, delalloc);
4184		if (ret == -EAGAIN) {
4185			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4186					     num_bytes);
4187			goto loop;
4188		}
4189		btrfs_inc_block_group_reservations(block_group);
4190
4191		/* we are all good, let's return */
4192		ins->objectid = ffe_ctl.search_start;
4193		ins->offset = num_bytes;
4194
4195		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4196					   num_bytes);
4197		btrfs_release_block_group(block_group, delalloc);
4198		break;
4199loop:
4200		release_block_group(block_group, &ffe_ctl, delalloc);
4201		cond_resched();
4202	}
4203	up_read(&space_info->groups_sem);
4204
4205	ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4206	if (ret > 0)
4207		goto search;
4208
4209	if (ret == -ENOSPC && !cache_block_group_error) {
4210		/*
4211		 * Use ffe_ctl->total_free_space as fallback if we can't find
4212		 * any contiguous hole.
4213		 */
4214		if (!ffe_ctl.max_extent_size)
4215			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4216		spin_lock(&space_info->lock);
4217		space_info->max_extent_size = ffe_ctl.max_extent_size;
4218		spin_unlock(&space_info->lock);
4219		ins->offset = ffe_ctl.max_extent_size;
4220	} else if (ret == -ENOSPC) {
4221		ret = cache_block_group_error;
4222	}
4223	return ret;
4224}
4225
4226/*
4227 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4228 *			  hole that is at least as big as @num_bytes.
4229 *
4230 * @root           -	The root that will contain this extent
4231 *
4232 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
4233 *			is used for accounting purposes. This value differs
4234 *			from @num_bytes only in the case of compressed extents.
4235 *
4236 * @num_bytes      -	Number of bytes to allocate on-disk.
4237 *
4238 * @min_alloc_size -	Indicates the minimum amount of space that the
4239 *			allocator should try to satisfy. In some cases
4240 *			@num_bytes may be larger than what is required and if
4241 *			the filesystem is fragmented then allocation fails.
4242 *			However, the presence of @min_alloc_size gives a
4243 *			chance to try and satisfy the smaller allocation.
4244 *
4245 * @empty_size     -	A hint that you plan on doing more COW. This is the
4246 *			size in bytes the allocator should try to find free
4247 *			next to the block it returns.  This is just a hint and
4248 *			may be ignored by the allocator.
4249 *
4250 * @hint_byte      -	Hint to the allocator to start searching above the byte
4251 *			address passed. It might be ignored.
4252 *
4253 * @ins            -	This key is modified to record the found hole. It will
4254 *			have the following values:
4255 *			ins->objectid == start position
4256 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
4257 *			ins->offset == the size of the hole.
4258 *
4259 * @is_data        -	Boolean flag indicating whether an extent is
4260 *			allocated for data (true) or metadata (false)
4261 *
4262 * @delalloc       -	Boolean flag indicating whether this allocation is for
4263 *			delalloc or not. If 'true' data_rwsem of block groups
4264 *			is going to be acquired.
4265 *
4266 *
4267 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4268 * case -ENOSPC is returned then @ins->offset will contain the size of the
4269 * largest available hole the allocator managed to find.
4270 */
4271int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4272			 u64 num_bytes, u64 min_alloc_size,
4273			 u64 empty_size, u64 hint_byte,
4274			 struct btrfs_key *ins, int is_data, int delalloc)
4275{
4276	struct btrfs_fs_info *fs_info = root->fs_info;
4277	bool final_tried = num_bytes == min_alloc_size;
4278	u64 flags;
4279	int ret;
4280
4281	flags = get_alloc_profile_by_root(root, is_data);
4282again:
4283	WARN_ON(num_bytes < fs_info->sectorsize);
4284	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
4285			       hint_byte, ins, flags, delalloc);
4286	if (!ret && !is_data) {
4287		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4288	} else if (ret == -ENOSPC) {
4289		if (!final_tried && ins->offset) {
4290			num_bytes = min(num_bytes >> 1, ins->offset);
4291			num_bytes = round_down(num_bytes,
4292					       fs_info->sectorsize);
4293			num_bytes = max(num_bytes, min_alloc_size);
4294			ram_bytes = num_bytes;
4295			if (num_bytes == min_alloc_size)
4296				final_tried = true;
4297			goto again;
4298		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4299			struct btrfs_space_info *sinfo;
4300
4301			sinfo = btrfs_find_space_info(fs_info, flags);
4302			btrfs_err(fs_info,
4303				  "allocation failed flags %llu, wanted %llu",
4304				  flags, num_bytes);
4305			if (sinfo)
4306				btrfs_dump_space_info(fs_info, sinfo,
4307						      num_bytes, 1);
4308		}
4309	}
4310
4311	return ret;
4312}
4313
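/*
 * Return a reserved but unused extent to its block group's free space cache
 * and release the reservation.
 */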
4314int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4315			       u64 start, u64 len, int delalloc)
4316{
4317	struct btrfs_block_group *cache;
4318
4319	cache = btrfs_lookup_block_group(fs_info, start);
4320	if (!cache) {
4321		btrfs_err(fs_info, "Unable to find block group for %llu",
4322			  start);
4323		return -ENOSPC;
4324	}
4325
4326	btrfs_add_free_space(cache, start, len);
4327	btrfs_free_reserved_bytes(cache, len, delalloc);
4328	trace_btrfs_reserved_extent_free(fs_info, start, len);
4329
4330	btrfs_put_block_group(cache);
4331	return 0;
4332}
4333
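/*
 * Pin down a reserved extent so the space cannot be reused until the running
 * transaction commits.
 */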
4334int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4335			      u64 len)
4336{
4337	struct btrfs_block_group *cache;
4338	int ret = 0;
4339
4340	cache = btrfs_lookup_block_group(trans->fs_info, start);
4341	if (!cache) {
4342		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4343			  start);
4344		return -ENOSPC;
4345	}
4346
4347	ret = pin_down_extent(trans, cache, start, len, 1);
4348	btrfs_put_block_group(cache);
4349	return ret;
4350}
4351
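/*
 * Insert the EXTENT_ITEM for a newly allocated data extent together with its
 * inline backref (shared or keyed data ref), then update the free space tree
 * and the block group accounting.
 */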
4352static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4353				      u64 parent, u64 root_objectid,
4354				      u64 flags, u64 owner, u64 offset,
4355				      struct btrfs_key *ins, int ref_mod)
4356{
4357	struct btrfs_fs_info *fs_info = trans->fs_info;
4358	int ret;
4359	struct btrfs_extent_item *extent_item;
4360	struct btrfs_extent_inline_ref *iref;
4361	struct btrfs_path *path;
4362	struct extent_buffer *leaf;
4363	int type;
4364	u32 size;
4365
4366	if (parent > 0)
4367		type = BTRFS_SHARED_DATA_REF_KEY;
4368	else
4369		type = BTRFS_EXTENT_DATA_REF_KEY;
4370
4371	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4372
4373	path = btrfs_alloc_path();
4374	if (!path)
4375		return -ENOMEM;
4376
4377	path->leave_spinning = 1;
4378	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4379				      ins, size);
4380	if (ret) {
4381		btrfs_free_path(path);
4382		return ret;
4383	}
4384
4385	leaf = path->nodes[0];
4386	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4387				     struct btrfs_extent_item);
4388	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4389	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4390	btrfs_set_extent_flags(leaf, extent_item,
4391			       flags | BTRFS_EXTENT_FLAG_DATA);
4392
4393	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4394	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4395	if (parent > 0) {
4396		struct btrfs_shared_data_ref *ref;
4397		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4398		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4399		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4400	} else {
4401		struct btrfs_extent_data_ref *ref;
4402		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4403		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4404		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4405		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4406		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4407	}
4408
4409	btrfs_mark_buffer_dirty(path->nodes[0]);
4410	btrfs_free_path(path);
4411
4412	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
4413	if (ret)
4414		return ret;
4415
4416	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
4417	if (ret) { /* -ENOENT, logic error */
4418		btrfs_err(fs_info, "update block group failed for %llu %llu",
4419			ins->objectid, ins->offset);
4420		BUG();
4421	}
4422	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
4423	return ret;
4424}
4425
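/*
 * Insert the extent item for a newly allocated tree block.  With the
 * SKINNY_METADATA feature a METADATA_ITEM keyed by level is used, otherwise a
 * full EXTENT_ITEM plus tree_block_info is written.
 */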
4426static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4427				     struct btrfs_delayed_ref_node *node,
4428				     struct btrfs_delayed_extent_op *extent_op)
4429{
4430	struct btrfs_fs_info *fs_info = trans->fs_info;
4431	int ret;
4432	struct btrfs_extent_item *extent_item;
4433	struct btrfs_key extent_key;
4434	struct btrfs_tree_block_info *block_info;
4435	struct btrfs_extent_inline_ref *iref;
4436	struct btrfs_path *path;
4437	struct extent_buffer *leaf;
4438	struct btrfs_delayed_tree_ref *ref;
4439	u32 size = sizeof(*extent_item) + sizeof(*iref);
4440	u64 num_bytes;
4441	u64 flags = extent_op->flags_to_set;
4442	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4443
4444	ref = btrfs_delayed_node_to_tree_ref(node);
4445
4446	extent_key.objectid = node->bytenr;
4447	if (skinny_metadata) {
4448		extent_key.offset = ref->level;
4449		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4450		num_bytes = fs_info->nodesize;
4451	} else {
4452		extent_key.offset = node->num_bytes;
4453		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4454		size += sizeof(*block_info);
4455		num_bytes = node->num_bytes;
4456	}
4457
4458	path = btrfs_alloc_path();
4459	if (!path)
4460		return -ENOMEM;
4461
4462	path->leave_spinning = 1;
4463	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4464				      &extent_key, size);
4465	if (ret) {
4466		btrfs_free_path(path);
4467		return ret;
4468	}
4469
4470	leaf = path->nodes[0];
4471	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4472				     struct btrfs_extent_item);
4473	btrfs_set_extent_refs(leaf, extent_item, 1);
4474	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4475	btrfs_set_extent_flags(leaf, extent_item,
4476			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4477
4478	if (skinny_metadata) {
4479		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4480	} else {
4481		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4482		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4483		btrfs_set_tree_block_level(leaf, block_info, ref->level);
4484		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4485	}
4486
4487	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4488		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4489		btrfs_set_extent_inline_ref_type(leaf, iref,
4490						 BTRFS_SHARED_BLOCK_REF_KEY);
4491		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4492	} else {
4493		btrfs_set_extent_inline_ref_type(leaf, iref,
4494						 BTRFS_TREE_BLOCK_REF_KEY);
4495		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4496	}
4497
4498	btrfs_mark_buffer_dirty(leaf);
4499	btrfs_free_path(path);
4500
4501	ret = remove_from_free_space_tree(trans, extent_key.objectid,
4502					  num_bytes);
4503	if (ret)
4504		return ret;
4505
4506	ret = btrfs_update_block_group(trans, extent_key.objectid,
4507				       fs_info->nodesize, 1);
4508	if (ret) { /* -ENOENT, logic error */
4509		btrfs_err(fs_info, "update block group failed for %llu %llu",
4510			extent_key.objectid, extent_key.offset);
4511		BUG();
4512	}
4513
4514	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
4515					  fs_info->nodesize);
4516	return ret;
4517}
4518
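/*
 * Queue the delayed ref that adds the initial reference for a freshly
 * reserved data extent belonging to @root.
 */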
4519int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4520				     struct btrfs_root *root, u64 owner,
4521				     u64 offset, u64 ram_bytes,
4522				     struct btrfs_key *ins)
4523{
4524	struct btrfs_ref generic_ref = { 0 };
4525
4526	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4527
4528	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4529			       ins->objectid, ins->offset, 0);
4530	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4531	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4532
4533	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4534}
4535
4536/*
4537 * this is used by the tree logging recovery code.  It records that
4538 * an extent has been allocated and makes sure to clear the free
4539 * space cache bits as well
4540 */
4541int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4542				   u64 root_objectid, u64 owner, u64 offset,
4543				   struct btrfs_key *ins)
4544{
4545	struct btrfs_fs_info *fs_info = trans->fs_info;
4546	int ret;
4547	struct btrfs_block_group *block_group;
4548	struct btrfs_space_info *space_info;
4549
4550	/*
4551	 * Mixed block groups will exclude before processing the log so we only
4552	 * need to do the exclude dance if this fs isn't mixed.
4553	 */
4554	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4555		ret = __exclude_logged_extent(fs_info, ins->objectid,
4556					      ins->offset);
4557		if (ret)
4558			return ret;
4559	}
4560
4561	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4562	if (!block_group)
4563		return -EINVAL;
4564
4565	space_info = block_group->space_info;
4566	spin_lock(&space_info->lock);
4567	spin_lock(&block_group->lock);
4568	space_info->bytes_reserved += ins->offset;
4569	block_group->reserved += ins->offset;
4570	spin_unlock(&block_group->lock);
4571	spin_unlock(&space_info->lock);
4572
4573	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4574					 offset, ins, 1);
4575	if (ret)
4576		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4577	btrfs_put_block_group(block_group);
4578	return ret;
4579}
4580
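/*
 * Set up a freshly allocated tree block: find/create and lock the extent
 * buffer, zero and fill in its header, and mark it dirty in the appropriate
 * extent io tree (log trees use dirty_log_pages).  Returns the buffer locked.
 */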
4581static struct extent_buffer *
4582btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4583		      u64 bytenr, int level, u64 owner,
4584		      enum btrfs_lock_nesting nest)
4585{
4586	struct btrfs_fs_info *fs_info = root->fs_info;
4587	struct extent_buffer *buf;
4588
4589	buf = btrfs_find_create_tree_block(fs_info, bytenr);
4590	if (IS_ERR(buf))
4591		return buf;
4592
4593	/*
4594	 * Extra safety check in case the extent tree is corrupted and the
4595	 * extent allocator chooses to use a tree block which is already used and
4596	 * locked.
4597	 */
4598	if (buf->lock_owner == current->pid) {
4599		btrfs_err_rl(fs_info,
4600"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4601			buf->start, btrfs_header_owner(buf), current->pid);
4602		free_extent_buffer(buf);
4603		return ERR_PTR(-EUCLEAN);
4604	}
4605
4606	btrfs_set_buffer_lockdep_class(owner, buf, level);
4607	__btrfs_tree_lock(buf, nest);
4608	btrfs_clean_tree_block(buf);
4609	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4610
4611	btrfs_set_lock_blocking_write(buf);
4612	set_extent_buffer_uptodate(buf);
4613
4614	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4615	btrfs_set_header_level(buf, level);
4616	btrfs_set_header_bytenr(buf, buf->start);
4617	btrfs_set_header_generation(buf, trans->transid);
4618	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4619	btrfs_set_header_owner(buf, owner);
4620	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4621	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4622	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4623		buf->log_index = root->log_transid % 2;
4624		/*
4625		 * we allow two log transactions at a time, so use different
4626		 * EXTENT bits to differentiate dirty pages.
4627		 */
4628		if (buf->log_index == 0)
4629			set_extent_dirty(&root->dirty_log_pages, buf->start,
4630					buf->start + buf->len - 1, GFP_NOFS);
4631		else
4632			set_extent_new(&root->dirty_log_pages, buf->start,
4633					buf->start + buf->len - 1);
4634	} else {
4635		buf->log_index = -1;
4636		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4637			 buf->start + buf->len - 1, GFP_NOFS);
4638	}
4639	trans->dirty = true;
4640	/* this returns a buffer locked for blocking */
4641	return buf;
4642}
4643
4644/*
4645 * finds a free extent and does all the dirty work required for allocation.
4646 * Returns the tree buffer or an ERR_PTR on error.
4647 */
4648struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4649					     struct btrfs_root *root,
4650					     u64 parent, u64 root_objectid,
4651					     const struct btrfs_disk_key *key,
4652					     int level, u64 hint,
4653					     u64 empty_size,
4654					     enum btrfs_lock_nesting nest)
4655{
4656	struct btrfs_fs_info *fs_info = root->fs_info;
4657	struct btrfs_key ins;
4658	struct btrfs_block_rsv *block_rsv;
4659	struct extent_buffer *buf;
4660	struct btrfs_delayed_extent_op *extent_op;
4661	struct btrfs_ref generic_ref = { 0 };
4662	u64 flags = 0;
4663	int ret;
4664	u32 blocksize = fs_info->nodesize;
4665	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4666
4667#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4668	if (btrfs_is_testing(fs_info)) {
4669		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4670					    level, root_objectid, nest);
4671		if (!IS_ERR(buf))
4672			root->alloc_bytenr += blocksize;
4673		return buf;
4674	}
4675#endif
4676
4677	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4678	if (IS_ERR(block_rsv))
4679		return ERR_CAST(block_rsv);
4680
4681	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4682				   empty_size, hint, &ins, 0, 0);
4683	if (ret)
4684		goto out_unuse;
4685
4686	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4687				    root_objectid, nest);
4688	if (IS_ERR(buf)) {
4689		ret = PTR_ERR(buf);
4690		goto out_free_reserved;
4691	}
4692
4693	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4694		if (parent == 0)
4695			parent = ins.objectid;
4696		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4697	} else
4698		BUG_ON(parent > 0);
4699
4700	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4701		extent_op = btrfs_alloc_delayed_extent_op();
4702		if (!extent_op) {
4703			ret = -ENOMEM;
4704			goto out_free_buf;
4705		}
4706		if (key)
4707			memcpy(&extent_op->key, key, sizeof(extent_op->key));
4708		else
4709			memset(&extent_op->key, 0, sizeof(extent_op->key));
4710		extent_op->flags_to_set = flags;
4711		extent_op->update_key = skinny_metadata ? false : true;
4712		extent_op->update_flags = true;
4713		extent_op->is_data = false;
4714		extent_op->level = level;
4715
4716		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4717				       ins.objectid, ins.offset, parent);
4718		generic_ref.real_root = root->root_key.objectid;
4719		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4720		btrfs_ref_tree_mod(fs_info, &generic_ref);
4721		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4722		if (ret)
4723			goto out_free_delayed;
4724	}
4725	return buf;
4726
4727out_free_delayed:
4728	btrfs_free_delayed_extent_op(extent_op);
4729out_free_buf:
4730	btrfs_tree_unlock(buf);
4731	free_extent_buffer(buf);
4732out_free_reserved:
4733	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4734out_unuse:
4735	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4736	return ERR_PTR(ret);
4737}
4738
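/*
 * State carried by the tree walking helpers below while dropping a subvolume
 * tree (btrfs_drop_snapshot): per-level reference counts and flags, plus the
 * current stage and progress keys.
 */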
4739struct walk_control {
4740	u64 refs[BTRFS_MAX_LEVEL];
4741	u64 flags[BTRFS_MAX_LEVEL];
4742	struct btrfs_key update_progress;
4743	struct btrfs_key drop_progress;
4744	int drop_level;
4745	int stage;
4746	int level;
4747	int shared_level;
4748	int update_ref;
4749	int keep_locks;
4750	int reada_slot;
4751	int reada_count;
4752	int restarted;
4753};
4754
4755#define DROP_REFERENCE	1
4756#define UPDATE_BACKREF	2
4757
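/*
 * Issue readahead for the children of the node we are walking so the blocks
 * we are about to visit are already cached; children the current stage will
 * not descend into are skipped.
 */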
4758static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4759				     struct btrfs_root *root,
4760				     struct walk_control *wc,
4761				     struct btrfs_path *path)
4762{
4763	struct btrfs_fs_info *fs_info = root->fs_info;
4764	u64 bytenr;
4765	u64 generation;
4766	u64 refs;
4767	u64 flags;
4768	u32 nritems;
4769	struct btrfs_key key;
4770	struct extent_buffer *eb;
4771	int ret;
4772	int slot;
4773	int nread = 0;
4774
4775	if (path->slots[wc->level] < wc->reada_slot) {
4776		wc->reada_count = wc->reada_count * 2 / 3;
4777		wc->reada_count = max(wc->reada_count, 2);
4778	} else {
4779		wc->reada_count = wc->reada_count * 3 / 2;
4780		wc->reada_count = min_t(int, wc->reada_count,
4781					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
4782	}
4783
4784	eb = path->nodes[wc->level];
4785	nritems = btrfs_header_nritems(eb);
4786
4787	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4788		if (nread >= wc->reada_count)
4789			break;
4790
4791		cond_resched();
4792		bytenr = btrfs_node_blockptr(eb, slot);
4793		generation = btrfs_node_ptr_generation(eb, slot);
4794
4795		if (slot == path->slots[wc->level])
4796			goto reada;
4797
4798		if (wc->stage == UPDATE_BACKREF &&
4799		    generation <= root->root_key.offset)
4800			continue;
4801
4802		/* We don't lock the tree block, it's OK to be racy here */
4803		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
4804					       wc->level - 1, 1, &refs,
4805					       &flags);
4806		/* We don't care about errors in readahead. */
4807		if (ret < 0)
4808			continue;
4809		BUG_ON(refs == 0);
4810
4811		if (wc->stage == DROP_REFERENCE) {
4812			if (refs == 1)
4813				goto reada;
4814
4815			if (wc->level == 1 &&
4816			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4817				continue;
4818			if (!wc->update_ref ||
4819			    generation <= root->root_key.offset)
4820				continue;
4821			btrfs_node_key_to_cpu(eb, &key, slot);
4822			ret = btrfs_comp_cpu_keys(&key,
4823						  &wc->update_progress);
4824			if (ret < 0)
4825				continue;
4826		} else {
4827			if (wc->level == 1 &&
4828			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4829				continue;
4830		}
4831reada:
4832		readahead_tree_block(fs_info, bytenr);
4833		nread++;
4834	}
4835	wc->reada_slot = slot;
4836}
4837
4838/*
4839 * helper to process tree block while walking down the tree.
4840 *
4841 * when wc->stage == UPDATE_BACKREF, this function updates
4842 * back refs for pointers in the block.
4843 *
4844 * NOTE: return value 1 means we should stop walking down.
4845 */
4846static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4847				   struct btrfs_root *root,
4848				   struct btrfs_path *path,
4849				   struct walk_control *wc, int lookup_info)
4850{
4851	struct btrfs_fs_info *fs_info = root->fs_info;
4852	int level = wc->level;
4853	struct extent_buffer *eb = path->nodes[level];
4854	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4855	int ret;
4856
4857	if (wc->stage == UPDATE_BACKREF &&
4858	    btrfs_header_owner(eb) != root->root_key.objectid)
4859		return 1;
4860
4861	/*
4862	 * when the reference count of a tree block is 1, it won't increase
4863	 * again. once the full backref flag is set, we never clear it.
4864	 */
4865	if (lookup_info &&
4866	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4867	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4868		BUG_ON(!path->locks[level]);
4869		ret = btrfs_lookup_extent_info(trans, fs_info,
4870					       eb->start, level, 1,
4871					       &wc->refs[level],
4872					       &wc->flags[level]);
4873		BUG_ON(ret == -ENOMEM);
4874		if (ret)
4875			return ret;
4876		BUG_ON(wc->refs[level] == 0);
4877	}
4878
4879	if (wc->stage == DROP_REFERENCE) {
4880		if (wc->refs[level] > 1)
4881			return 1;
4882
4883		if (path->locks[level] && !wc->keep_locks) {
4884			btrfs_tree_unlock_rw(eb, path->locks[level]);
4885			path->locks[level] = 0;
4886		}
4887		return 0;
4888	}
4889
4890	/* wc->stage == UPDATE_BACKREF */
4891	if (!(wc->flags[level] & flag)) {
4892		BUG_ON(!path->locks[level]);
4893		ret = btrfs_inc_ref(trans, root, eb, 1);
4894		BUG_ON(ret); /* -ENOMEM */
4895		ret = btrfs_dec_ref(trans, root, eb, 0);
4896		BUG_ON(ret); /* -ENOMEM */
4897		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
4898						  btrfs_header_level(eb), 0);
4899		BUG_ON(ret); /* -ENOMEM */
4900		wc->flags[level] |= flag;
4901	}
4902
4903	/*
4904	 * the block is shared by multiple trees, so it's not good to
4905	 * keep the tree lock
4906	 */
4907	if (path->locks[level] && level > 0) {
4908		btrfs_tree_unlock_rw(eb, path->locks[level]);
4909		path->locks[level] = 0;
4910	}
4911	return 0;
4912}
4913
4914/*
4915 * This is used to verify a ref exists for this root to deal with a bug where we
4916 * would have a drop_progress key that hadn't been updated properly.
4917 */
4918static int check_ref_exists(struct btrfs_trans_handle *trans,
4919			    struct btrfs_root *root, u64 bytenr, u64 parent,
4920			    int level)
4921{
4922	struct btrfs_path *path;
4923	struct btrfs_extent_inline_ref *iref;
4924	int ret;
4925
4926	path = btrfs_alloc_path();
4927	if (!path)
4928		return -ENOMEM;
4929
4930	ret = lookup_extent_backref(trans, path, &iref, bytenr,
4931				    root->fs_info->nodesize, parent,
4932				    root->root_key.objectid, level, 0);
4933	btrfs_free_path(path);
4934	if (ret == -ENOENT)
4935		return 0;
4936	if (ret < 0)
4937		return ret;
4938	return 1;
4939}
4940
4941/*
4942 * helper to process tree block pointer.
4943 *
4944 * when wc->stage == DROP_REFERENCE, this function checks the
4945 * reference count of the block pointed to. If the block is
4946 * shared and we need to update back refs for the subtree
4947 * rooted at the block, this function changes wc->stage to
4948 * UPDATE_BACKREF. If the block is shared and there is no
4949 * need to update back refs, this function drops the reference
4950 * to the block.
4951 *
4952 * NOTE: return value 1 means we should stop walking down.
4953 */
4954static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4955				 struct btrfs_root *root,
4956				 struct btrfs_path *path,
4957				 struct walk_control *wc, int *lookup_info)
4958{
4959	struct btrfs_fs_info *fs_info = root->fs_info;
4960	u64 bytenr;
4961	u64 generation;
4962	u64 parent;
4963	struct btrfs_key key;
4964	struct btrfs_key first_key;
4965	struct btrfs_ref ref = { 0 };
4966	struct extent_buffer *next;
4967	int level = wc->level;
4968	int reada = 0;
4969	int ret = 0;
4970	bool need_account = false;
4971
4972	generation = btrfs_node_ptr_generation(path->nodes[level],
4973					       path->slots[level]);
4974	/*
4975	 * if the lower level block was created before the snapshot
4976	 * was created, we know there is no need to update back refs
4977	 * for the subtree
4978	 */
4979	if (wc->stage == UPDATE_BACKREF &&
4980	    generation <= root->root_key.offset) {
4981		*lookup_info = 1;
4982		return 1;
4983	}
4984
4985	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4986	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
4987			      path->slots[level]);
4988
4989	next = find_extent_buffer(fs_info, bytenr);
4990	if (!next) {
4991		next = btrfs_find_create_tree_block(fs_info, bytenr);
4992		if (IS_ERR(next))
4993			return PTR_ERR(next);
4994
4995		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
4996					       level - 1);
4997		reada = 1;
4998	}
4999	btrfs_tree_lock(next);
5000	btrfs_set_lock_blocking_write(next);
5001
5002	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5003				       &wc->refs[level - 1],
5004				       &wc->flags[level - 1]);
5005	if (ret < 0)
5006		goto out_unlock;
5007
5008	if (unlikely(wc->refs[level - 1] == 0)) {
5009		btrfs_err(fs_info, "Missing references.");
5010		ret = -EIO;
5011		goto out_unlock;
5012	}
5013	*lookup_info = 0;
5014
5015	if (wc->stage == DROP_REFERENCE) {
5016		if (wc->refs[level - 1] > 1) {
5017			need_account = true;
5018			if (level == 1 &&
5019			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5020				goto skip;
5021
5022			if (!wc->update_ref ||
5023			    generation <= root->root_key.offset)
5024				goto skip;
5025
5026			btrfs_node_key_to_cpu(path->nodes[level], &key,
5027					      path->slots[level]);
5028			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5029			if (ret < 0)
5030				goto skip;
5031
5032			wc->stage = UPDATE_BACKREF;
5033			wc->shared_level = level - 1;
5034		}
5035	} else {
5036		if (level == 1 &&
5037		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5038			goto skip;
5039	}
5040
5041	if (!btrfs_buffer_uptodate(next, generation, 0)) {
5042		btrfs_tree_unlock(next);
5043		free_extent_buffer(next);
5044		next = NULL;
5045		*lookup_info = 1;
5046	}
5047
5048	if (!next) {
5049		if (reada && level == 1)
5050			reada_walk_down(trans, root, wc, path);
5051		next = read_tree_block(fs_info, bytenr, generation, level - 1,
5052				       &first_key);
5053		if (IS_ERR(next)) {
5054			return PTR_ERR(next);
5055		} else if (!extent_buffer_uptodate(next)) {
5056			free_extent_buffer(next);
5057			return -EIO;
5058		}
5059		btrfs_tree_lock(next);
5060		btrfs_set_lock_blocking_write(next);
5061	}
5062
5063	level--;
5064	ASSERT(level == btrfs_header_level(next));
5065	if (level != btrfs_header_level(next)) {
5066		btrfs_err(root->fs_info, "mismatched level");
5067		ret = -EIO;
5068		goto out_unlock;
5069	}
5070	path->nodes[level] = next;
5071	path->slots[level] = 0;
5072	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5073	wc->level = level;
5074	if (wc->level == 1)
5075		wc->reada_slot = 0;
5076	return 0;
5077skip:
5078	wc->refs[level - 1] = 0;
5079	wc->flags[level - 1] = 0;
5080	if (wc->stage == DROP_REFERENCE) {
5081		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5082			parent = path->nodes[level]->start;
5083		} else {
5084			ASSERT(root->root_key.objectid ==
5085			       btrfs_header_owner(path->nodes[level]));
5086			if (root->root_key.objectid !=
5087			    btrfs_header_owner(path->nodes[level])) {
5088				btrfs_err(root->fs_info,
5089						"mismatched block owner");
5090				ret = -EIO;
5091				goto out_unlock;
5092			}
5093			parent = 0;
5094		}
5095
5096		/*
5097		 * If we had a drop_progress we need to verify the refs are set
5098		 * as expected.  If we find our ref then we know that from here
5099		 * on out everything should be correct, and we can clear the
5100		 * ->restarted flag.
5101		 */
5102		if (wc->restarted) {
5103			ret = check_ref_exists(trans, root, bytenr, parent,
5104					       level - 1);
5105			if (ret < 0)
5106				goto out_unlock;
5107			if (ret == 0)
5108				goto no_delete;
5109			ret = 0;
5110			wc->restarted = 0;
5111		}
5112
5113		/*
5114		 * The reloc tree doesn't contribute to qgroup numbers, and we have
5115		 * already accounted them at merge time (replace_path),
5116		 * thus we can skip the expensive subtree trace here.
5117		 */
5118		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5119		    need_account) {
5120			ret = btrfs_qgroup_trace_subtree(trans, next,
5121							 generation, level - 1);
5122			if (ret) {
5123				btrfs_err_rl(fs_info,
5124					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5125					     ret);
5126			}
5127		}
5128
5129		/*
5130		 * We need to update the next key in our walk control so we can
5131		 * update the drop_progress key accordingly.  We don't care if
5132		 * find_next_key doesn't find a key because that means we're at
5133		 * the end and are going to clean up now.
5134		 */
5135		wc->drop_level = level;
5136		find_next_key(path, level, &wc->drop_progress);
5137
5138		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5139				       fs_info->nodesize, parent);
5140		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5141		ret = btrfs_free_extent(trans, &ref);
5142		if (ret)
5143			goto out_unlock;
5144	}
5145no_delete:
5146	*lookup_info = 1;
5147	ret = 1;
5148
5149out_unlock:
5150	btrfs_tree_unlock(next);
5151	free_extent_buffer(next);
5152
5153	return ret;
5154}
5155
5156/*
5157 * helper to process tree block while walking up the tree.
5158 *
5159 * when wc->stage == DROP_REFERENCE, this function drops
5160 * reference count on the block.
5161 *
5162 * when wc->stage == UPDATE_BACKREF, this function changes
5163 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5164 * to UPDATE_BACKREF previously while processing the block.
5165 *
5166 * NOTE: return value 1 means we should stop walking up.
5167 */
5168static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5169				 struct btrfs_root *root,
5170				 struct btrfs_path *path,
5171				 struct walk_control *wc)
5172{
5173	struct btrfs_fs_info *fs_info = root->fs_info;
5174	int ret;
5175	int level = wc->level;
5176	struct extent_buffer *eb = path->nodes[level];
5177	u64 parent = 0;
5178
5179	if (wc->stage == UPDATE_BACKREF) {
5180		BUG_ON(wc->shared_level < level);
5181		if (level < wc->shared_level)
5182			goto out;
5183
5184		ret = find_next_key(path, level + 1, &wc->update_progress);
5185		if (ret > 0)
5186			wc->update_ref = 0;
5187
5188		wc->stage = DROP_REFERENCE;
5189		wc->shared_level = -1;
5190		path->slots[level] = 0;
5191
5192		/*
5193		 * check reference count again if the block isn't locked.
5194		 * we should start walking down the tree again if reference
5195		 * count is one.
5196		 */
5197		if (!path->locks[level]) {
5198			BUG_ON(level == 0);
5199			btrfs_tree_lock(eb);
5200			btrfs_set_lock_blocking_write(eb);
5201			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5202
5203			ret = btrfs_lookup_extent_info(trans, fs_info,
5204						       eb->start, level, 1,
5205						       &wc->refs[level],
5206						       &wc->flags[level]);
5207			if (ret < 0) {
5208				btrfs_tree_unlock_rw(eb, path->locks[level]);
5209				path->locks[level] = 0;
5210				return ret;
5211			}
5212			BUG_ON(wc->refs[level] == 0);
5213			if (wc->refs[level] == 1) {
5214				btrfs_tree_unlock_rw(eb, path->locks[level]);
5215				path->locks[level] = 0;
5216				return 1;
5217			}
5218		}
5219	}
5220
5221	/* wc->stage == DROP_REFERENCE */
5222	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5223
5224	if (wc->refs[level] == 1) {
5225		if (level == 0) {
5226			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5227				ret = btrfs_dec_ref(trans, root, eb, 1);
5228			else
5229				ret = btrfs_dec_ref(trans, root, eb, 0);
5230			BUG_ON(ret); /* -ENOMEM */
5231			if (is_fstree(root->root_key.objectid)) {
5232				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5233				if (ret) {
5234					btrfs_err_rl(fs_info,
5235	"error %d accounting leaf items, quota is out of sync, rescan required",
5236					     ret);
5237				}
5238			}
5239		}
5240		/* make block locked assertion in btrfs_clean_tree_block happy */
5241		if (!path->locks[level] &&
5242		    btrfs_header_generation(eb) == trans->transid) {
5243			btrfs_tree_lock(eb);
5244			btrfs_set_lock_blocking_write(eb);
5245			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5246		}
5247		btrfs_clean_tree_block(eb);
5248	}
5249
5250	if (eb == root->node) {
5251		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5252			parent = eb->start;
5253		else if (root->root_key.objectid != btrfs_header_owner(eb))
5254			goto owner_mismatch;
5255	} else {
5256		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5257			parent = path->nodes[level + 1]->start;
5258		else if (root->root_key.objectid !=
5259			 btrfs_header_owner(path->nodes[level + 1]))
5260			goto owner_mismatch;
5261	}
5262
5263	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5264out:
5265	wc->refs[level] = 0;
5266	wc->flags[level] = 0;
5267	return 0;
5268
5269owner_mismatch:
5270	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5271		     btrfs_header_owner(eb), root->root_key.objectid);
5272	return -EUCLEAN;
5273}
5274
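/*
 * Walk down one branch of the tree, processing each level with
 * walk_down_proc() and do_walk_down() until we reach a leaf or a block we
 * must not descend into.
 */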
5275static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5276				   struct btrfs_root *root,
5277				   struct btrfs_path *path,
5278				   struct walk_control *wc)
5279{
5280	int level = wc->level;
5281	int lookup_info = 1;
5282	int ret;
5283
5284	while (level >= 0) {
5285		ret = walk_down_proc(trans, root, path, wc, lookup_info);
5286		if (ret > 0)
5287			break;
5288
5289		if (level == 0)
5290			break;
5291
5292		if (path->slots[level] >=
5293		    btrfs_header_nritems(path->nodes[level]))
5294			break;
5295
5296		ret = do_walk_down(trans, root, path, wc, &lookup_info);
5297		if (ret > 0) {
5298			path->slots[level]++;
5299			continue;
5300		} else if (ret < 0)
5301			return ret;
5302		level = wc->level;
5303	}
5304	return 0;
5305}
5306
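/*
 * Walk back up the tree after a branch has been processed, dropping our
 * reference on each block via walk_up_proc(), until we either find the next
 * slot to walk down or pass @max_level.
 */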
5307static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5308				 struct btrfs_root *root,
5309				 struct btrfs_path *path,
5310				 struct walk_control *wc, int max_level)
5311{
5312	int level = wc->level;
5313	int ret;
5314
5315	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5316	while (level < max_level && path->nodes[level]) {
5317		wc->level = level;
5318		if (path->slots[level] + 1 <
5319		    btrfs_header_nritems(path->nodes[level])) {
5320			path->slots[level]++;
5321			return 0;
5322		} else {
5323			ret = walk_up_proc(trans, root, path, wc);
5324			if (ret > 0)
5325				return 0;
5326			if (ret < 0)
5327				return ret;
5328
5329			if (path->locks[level]) {
5330				btrfs_tree_unlock_rw(path->nodes[level],
5331						     path->locks[level]);
5332				path->locks[level] = 0;
5333			}
5334			free_extent_buffer(path->nodes[level]);
5335			path->nodes[level] = NULL;
5336			level++;
5337		}
5338	}
5339	return 1;
5340}
5341
5342/*
5343 * drop a subvolume tree.
5344 *
5345 * this function traverses the tree freeing any blocks that are only
5346 * referenced by the tree.
5347 *
5348 * when a shared tree block is found, this function decreases its
5349 * reference count by one. if update_ref is true, this function
5350 * also makes sure backrefs for the shared block and all lower level
5351 * blocks are properly updated.
5352 *
5353 * If called with for_reloc == 0, may exit early with -EAGAIN
5354 */
5355int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5356{
5357	struct btrfs_fs_info *fs_info = root->fs_info;
5358	struct btrfs_path *path;
5359	struct btrfs_trans_handle *trans;
5360	struct btrfs_root *tree_root = fs_info->tree_root;
5361	struct btrfs_root_item *root_item = &root->root_item;
5362	struct walk_control *wc;
5363	struct btrfs_key key;
5364	int err = 0;
5365	int ret;
5366	int level;
5367	bool root_dropped = false;
5368
5369	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5370
5371	path = btrfs_alloc_path();
5372	if (!path) {
5373		err = -ENOMEM;
5374		goto out;
5375	}
5376
5377	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5378	if (!wc) {
5379		btrfs_free_path(path);
5380		err = -ENOMEM;
5381		goto out;
5382	}
5383
5384	/*
5385	 * Use join to avoid potential EINTR from transaction start. See
5386	 * wait_reserve_ticket and the whole reservation callchain.
5387	 */
5388	if (for_reloc)
5389		trans = btrfs_join_transaction(tree_root);
5390	else
5391		trans = btrfs_start_transaction(tree_root, 0);
5392	if (IS_ERR(trans)) {
5393		err = PTR_ERR(trans);
5394		goto out_free;
5395	}
5396
5397	err = btrfs_run_delayed_items(trans);
5398	if (err)
5399		goto out_end_trans;
5400
5401	/*
5402	 * This will help us catch people modifying the fs tree while we're
5403	 * dropping it.  It is unsafe to mess with the fs tree while it's being
5404	 * dropped, because we unlock the root node and parent nodes as we walk
5405	 * down the tree, assuming nothing will change.  If something does change
5406	 * then we'll be working with stale information and may drop references
5407	 * to blocks we've already dropped.
5408	 */
5409	set_bit(BTRFS_ROOT_DELETING, &root->state);
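	/*
	 * A zero drop_progress key means this is the first pass over this
	 * root, so start the walk at the root node.  Otherwise resume from
	 * the key and level recorded by a previous, interrupted drop.
	 */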
5410	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5411		level = btrfs_header_level(root->node);
5412		path->nodes[level] = btrfs_lock_root_node(root);
5413		btrfs_set_lock_blocking_write(path->nodes[level]);
5414		path->slots[level] = 0;
5415		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5416		memset(&wc->update_progress, 0,
5417		       sizeof(wc->update_progress));
5418	} else {
5419		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5420		memcpy(&wc->update_progress, &key,
5421		       sizeof(wc->update_progress));
5422
5423		level = root_item->drop_level;
5424		BUG_ON(level == 0);
5425		path->lowest_level = level;
5426		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5427		path->lowest_level = 0;
5428		if (ret < 0) {
5429			err = ret;
5430			goto out_end_trans;
5431		}
5432		WARN_ON(ret > 0);
5433
5434		/*
5435		 * unlock our path, this is safe because only this
5436		 * function is allowed to delete this snapshot
5437		 */
5438		btrfs_unlock_up_safe(path, 0);
5439
5440		level = btrfs_header_level(root->node);
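		/*
		 * Walk from the root down to drop_level, looking up the
		 * reference count and flags of each node on the way.  Nodes
		 * above drop_level are expected to have exactly one
		 * reference and are unlocked again once checked.
		 */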
5441		while (1) {
5442			btrfs_tree_lock(path->nodes[level]);
5443			btrfs_set_lock_blocking_write(path->nodes[level]);
5444			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5445
5446			ret = btrfs_lookup_extent_info(trans, fs_info,
5447						path->nodes[level]->start,
5448						level, 1, &wc->refs[level],
5449						&wc->flags[level]);
5450			if (ret < 0) {
5451				err = ret;
5452				goto out_end_trans;
5453			}
5454			BUG_ON(wc->refs[level] == 0);
5455
5456			if (level == root_item->drop_level)
5457				break;
5458
5459			btrfs_tree_unlock(path->nodes[level]);
5460			path->locks[level] = 0;
5461			WARN_ON(wc->refs[level] != 1);
5462			level--;
5463		}
5464	}
5465
5466	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5467	wc->level = level;
5468	wc->shared_level = -1;
5469	wc->stage = DROP_REFERENCE;
5470	wc->update_ref = update_ref;
5471	wc->keep_locks = 0;
5472	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5473
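	/*
	 * Main drop loop: alternate walking down the tree, dropping
	 * references and freeing blocks, with walking back up.  A positive
	 * return from walk_up_tree() means the whole tree has been
	 * processed.  drop_progress/drop_level in the root item record how
	 * far we got, so an interrupted drop can be resumed later.
	 */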
5474	while (1) {
5475
5476		ret = walk_down_tree(trans, root, path, wc);
5477		if (ret < 0) {
5478			err = ret;
5479			break;
5480		}
5481
5482		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5483		if (ret < 0) {
5484			err = ret;
5485			break;
5486		}
5487
5488		if (ret > 0) {
5489			BUG_ON(wc->stage != DROP_REFERENCE);
5490			break;
5491		}
5492
5493		if (wc->stage == DROP_REFERENCE) {
5494			wc->drop_level = wc->level;
5495			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5496					      &wc->drop_progress,
5497					      path->slots[wc->drop_level]);
5498		}
5499		btrfs_cpu_key_to_disk(&root_item->drop_progress,
5500				      &wc->drop_progress);
5501		root_item->drop_level = wc->drop_level;
5502
5503		BUG_ON(wc->level == 0);
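		/*
		 * If the transaction has grown large enough, or the cleaner
		 * thread asked us to yield, write out the updated root item
		 * and restart with a fresh transaction (or bail out with
		 * -EAGAIN when not dropping for relocation).
		 */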
5504		if (btrfs_should_end_transaction(trans) ||
5505		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5506			ret = btrfs_update_root(trans, tree_root,
5507						&root->root_key,
5508						root_item);
5509			if (ret) {
5510				btrfs_abort_transaction(trans, ret);
5511				err = ret;
5512				goto out_end_trans;
5513			}
5514
5515			btrfs_end_transaction_throttle(trans);
5516			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5517				btrfs_debug(fs_info,
5518					    "drop snapshot early exit");
5519				err = -EAGAIN;
5520				goto out_free;
5521			}
5522
5523		       /*
5524			* Use join to avoid potential EINTR from transaction
5525			* start. See wait_reserve_ticket and the whole
5526			* reservation callchain.
5527			*/
5528			if (for_reloc)
5529				trans = btrfs_join_transaction(tree_root);
5530			else
5531				trans = btrfs_start_transaction(tree_root, 0);
5532			if (IS_ERR(trans)) {
5533				err = PTR_ERR(trans);
5534				goto out_free;
5535			}
5536		}
5537	}
5538	btrfs_release_path(path);
5539	if (err)
5540		goto out_end_trans;
5541
5542	ret = btrfs_del_root(trans, &root->root_key);
5543	if (ret) {
5544		btrfs_abort_transaction(trans, ret);
5545		err = ret;
5546		goto out_end_trans;
5547	}
5548
5549	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5550		ret = btrfs_find_root(tree_root, &root->root_key, path,
5551				      NULL, NULL);
5552		if (ret < 0) {
5553			btrfs_abort_transaction(trans, ret);
5554			err = ret;
5555			goto out_end_trans;
5556		} else if (ret > 0) {
5557			/* if we fail to delete the orphan item this time
5558			 * around, it'll get picked up the next time.
5559			 *
5560			 * The most common failure here is just -ENOENT.
5561			 */
5562			btrfs_del_orphan_item(trans, tree_root,
5563					      root->root_key.objectid);
5564		}
5565	}
5566
5567	/*
5568	 * This subvolume is going to be completely dropped, and won't be
5569	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5570	 * commit transaction time.  So free it here manually.
5571	 */
5572	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5573	btrfs_qgroup_free_meta_all_pertrans(root);
5574
5575	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5576		btrfs_add_dropped_root(trans, root);
5577	else
5578		btrfs_put_root(root);
5579	root_dropped = true;
5580out_end_trans:
5581	btrfs_end_transaction_throttle(trans);
5582out_free:
5583	kfree(wc);
5584	btrfs_free_path(path);
5585out:
5586	/*
5587	 * So if we need to stop dropping the snapshot for whatever reason we
5588	 * need to make sure to add it back to the dead root list so that we
5589	 * keep trying to do the work later.  This also cleans up roots if we
5590	 * don't have it in the radix (like when we recover after a power fail
5591	 * or unmount) so we don't leak memory.
5592	 */
5593	if (!for_reloc && !root_dropped)
5594		btrfs_add_dead_root(root);
5595	return err;
5596}
5597
5598/*
5599 * drop subtree rooted at tree block 'node'.
5600 *
5601 * NOTE: this function will unlock and release tree block 'node'.
5602 * Only used by relocation code.
5603 */
5604int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5605			struct btrfs_root *root,
5606			struct extent_buffer *node,
5607			struct extent_buffer *parent)
5608{
5609	struct btrfs_fs_info *fs_info = root->fs_info;
5610	struct btrfs_path *path;
5611	struct walk_control *wc;
5612	int level;
5613	int parent_level;
5614	int ret = 0;
5615	int wret;
5616
5617	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5618
5619	path = btrfs_alloc_path();
5620	if (!path)
5621		return -ENOMEM;
5622
5623	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5624	if (!wc) {
5625		btrfs_free_path(path);
5626		return -ENOMEM;
5627	}
5628
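	/*
	 * Put 'parent' on the path with its slot pointing past the last
	 * item so the walk never descends into any other children of
	 * 'parent'; the walk itself starts at 'node'.
	 */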
5629	btrfs_assert_tree_locked(parent);
5630	parent_level = btrfs_header_level(parent);
5631	atomic_inc(&parent->refs);
5632	path->nodes[parent_level] = parent;
5633	path->slots[parent_level] = btrfs_header_nritems(parent);
5634
5635	btrfs_assert_tree_locked(node);
5636	level = btrfs_header_level(node);
5637	path->nodes[level] = node;
5638	path->slots[level] = 0;
5639	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5640
5641	wc->refs[parent_level] = 1;
5642	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5643	wc->level = level;
5644	wc->shared_level = -1;
5645	wc->stage = DROP_REFERENCE;
5646	wc->update_ref = 0;
5647	wc->keep_locks = 1;
5648	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5649
5650	while (1) {
5651		wret = walk_down_tree(trans, root, path, wc);
5652		if (wret < 0) {
5653			ret = wret;
5654			break;
5655		}
5656
5657		wret = walk_up_tree(trans, root, path, wc, parent_level);
5658		if (wret < 0)
5659			ret = wret;
5660		if (wret != 0)
5661			break;
5662	}
5663
5664	kfree(wc);
5665	btrfs_free_path(path);
5666	return ret;
5667}
5668
5669/*
5670 * Helper to account for the unused space of all the read-only block groups
5671 * in the space_info.  Takes mirrors into account.
5672 */
5673u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
5674{
5675	struct btrfs_block_group *block_group;
5676	u64 free_bytes = 0;
5677	int factor;
5678
5679	/* It's df, we don't care if it's racy */
5680	if (list_empty(&sinfo->ro_bgs))
5681		return 0;
5682
5683	spin_lock(&sinfo->lock);
5684	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
5685		spin_lock(&block_group->lock);
5686
5687		if (!block_group->ro) {
5688			spin_unlock(&block_group->lock);
5689			continue;
5690		}
5691
5692		factor = btrfs_bg_type_to_factor(block_group->flags);
5693		free_bytes += (block_group->length -
5694			       block_group->used) * factor;
5695
5696		spin_unlock(&block_group->lock);
5697	}
5698	spin_unlock(&sinfo->lock);
5699
5700	return free_bytes;
5701}
5702
5703int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5704				   u64 start, u64 end)
5705{
5706	return unpin_extent_range(fs_info, start, end, false);
5707}
5708
5709/*
5710 * It used to be that old block groups would be left around forever.
5711 * Iterating over them would be enough to trim unused space.  Since we
5712 * now automatically remove them, we also need to iterate over unallocated
5713 * space.
5714 *
5715 * We don't want a transaction for this since the discard may take a
5716 * substantial amount of time.  We don't require that a transaction be
5717 * running, but we do need to take a running transaction into account
5718 * to ensure that we're not discarding chunks that were released or
5719 * allocated in the current transaction.
5720 *
5721 * Holding the chunks lock will prevent other threads from allocating
5722 * or releasing chunks, but it won't prevent a running transaction
5723 * from committing and releasing the memory that the pending chunks
5724 * list head uses.  For that, we need to take a reference to the
5725 * transaction and hold the commit root sem.  We only need to hold
5726 * it while performing the free space search since we have already
5727 * held back allocations.
5728 */
5729static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5730{
5731	u64 start = SZ_1M, len = 0, end = 0;
5732	int ret;
5733
5734	*trimmed = 0;
5735
5736	/* Discard not supported = nothing to do. */
5737	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
5738		return 0;
5739
5740	/* Not writable = nothing to do. */
5741	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5742		return 0;
5743
5744	/* No free space = nothing to do. */
5745	if (device->total_bytes <= device->bytes_used)
5746		return 0;
5747
5748	ret = 0;
5749
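	/*
	 * Each iteration handles one contiguous range that is neither
	 * allocated to a chunk nor already trimmed: find it while holding
	 * the chunk mutex, discard it, and mark it CHUNK_TRIMMED so later
	 * fstrim runs skip it.
	 */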
5750	while (1) {
5751		struct btrfs_fs_info *fs_info = device->fs_info;
5752		u64 bytes;
5753
5754		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5755		if (ret)
5756			break;
5757
5758		find_first_clear_extent_bit(&device->alloc_state, start,
5759					    &start, &end,
5760					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
5761
5762		/* Check if there are any CHUNK_* bits left */
5763		if (start > device->total_bytes) {
5764			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5765			btrfs_warn_in_rcu(fs_info,
5766"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5767					  start, end - start + 1,
5768					  rcu_str_deref(device->name),
5769					  device->total_bytes);
5770			mutex_unlock(&fs_info->chunk_mutex);
5771			ret = 0;
5772			break;
5773		}
5774
5775		/* Ensure we skip the reserved area in the first 1M */
5776		start = max_t(u64, start, SZ_1M);
5777
5778		/*
5779		 * If find_first_clear_extent_bit finds a range that spans the
5780		 * end of the device, it will set end to -1.  In that case it's
5781		 * up to the caller to trim the value to the size of the device.
5782		 */
5783		end = min(end, device->total_bytes - 1);
5784
5785		len = end - start + 1;
5786
5787		/* We didn't find any extents */
5788		if (!len) {
5789			mutex_unlock(&fs_info->chunk_mutex);
5790			ret = 0;
5791			break;
5792		}
5793
5794		ret = btrfs_issue_discard(device->bdev, start, len,
5795					  &bytes);
5796		if (!ret)
5797			set_extent_bits(&device->alloc_state, start,
5798					start + bytes - 1,
5799					CHUNK_TRIMMED);
5800		mutex_unlock(&fs_info->chunk_mutex);
5801
5802		if (ret)
5803			break;
5804
5805		start += len;
5806		*trimmed += bytes;
5807
5808		if (fatal_signal_pending(current)) {
5809			ret = -ERESTARTSYS;
5810			break;
5811		}
5812
5813		cond_resched();
5814	}
5815
5816	return ret;
5817}
5818
5819/*
5820 * Trim the whole filesystem by:
5821 * 1) trimming the free space in each block group
5822 * 2) trimming the unallocated space on each device
5823 *
5824 * This will also continue trimming even if a block group or device encounters
5825 * an error.  The return value will be the last error, or 0 if nothing bad
5826 * happens.
5827 */
5828int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
5829{
5830	struct btrfs_block_group *cache = NULL;
5831	struct btrfs_device *device;
5832	struct list_head *devices;
5833	u64 group_trimmed;
5834	u64 range_end = U64_MAX;
5835	u64 start;
5836	u64 end;
5837	u64 trimmed = 0;
5838	u64 bg_failed = 0;
5839	u64 dev_failed = 0;
5840	int bg_ret = 0;
5841	int dev_ret = 0;
5842	int ret = 0;
5843
5844	/*
5845	 * Check range overflow if range->len is set.
5846	 * The default range->len is U64_MAX.
5847	 */
5848	if (range->len != U64_MAX &&
5849	    check_add_overflow(range->start, range->len, &range_end))
5850		return -EINVAL;
5851
5852	cache = btrfs_lookup_first_block_group(fs_info, range->start);
5853	for (; cache; cache = btrfs_next_block_group(cache)) {
5854		if (cache->start >= range_end) {
5855			btrfs_put_block_group(cache);
5856			break;
5857		}
5858
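		/* Trim only the part of the block group inside the requested range */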
5859		start = max(range->start, cache->start);
5860		end = min(range_end, cache->start + cache->length);
5861
5862		if (end - start >= range->minlen) {
5863			if (!btrfs_block_group_done(cache)) {
5864				ret = btrfs_cache_block_group(cache, 0);
5865				if (ret) {
5866					bg_failed++;
5867					bg_ret = ret;
5868					continue;
5869				}
5870				ret = btrfs_wait_block_group_cache_done(cache);
5871				if (ret) {
5872					bg_failed++;
5873					bg_ret = ret;
5874					continue;
5875				}
5876			}
5877			ret = btrfs_trim_block_group(cache,
5878						     &group_trimmed,
5879						     start,
5880						     end,
5881						     range->minlen);
5882
5883			trimmed += group_trimmed;
5884			if (ret) {
5885				bg_failed++;
5886				bg_ret = ret;
5887				continue;
5888			}
5889		}
5890	}
5891
5892	if (bg_failed)
5893		btrfs_warn(fs_info,
5894			"failed to trim %llu block group(s), last error %d",
5895			bg_failed, bg_ret);
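	/*
	 * Now trim the unallocated space on each device; missing devices
	 * have nothing to discard.
	 */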
5896	mutex_lock(&fs_info->fs_devices->device_list_mutex);
5897	devices = &fs_info->fs_devices->devices;
5898	list_for_each_entry(device, devices, dev_list) {
5899		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
5900			continue;
5901
5902		ret = btrfs_trim_free_extents(device, &group_trimmed);
5903		if (ret) {
5904			dev_failed++;
5905			dev_ret = ret;
5906			break;
5907		}
5908
5909		trimmed += group_trimmed;
5910	}
5911	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5912
5913	if (dev_failed)
5914		btrfs_warn(fs_info,
5915			"failed to trim %llu device(s), last error %d",
5916			dev_failed, dev_ret);
5917	range->len = trimmed;
5918	if (bg_ret)
5919		return bg_ret;
5920	return dev_ret;
5921}
5922