1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2009 Oracle.  All rights reserved.
4 */
5
6#include <linux/sched.h>
7#include <linux/slab.h>
8#include <linux/sort.h>
9#include "ctree.h"
10#include "delayed-ref.h"
11#include "transaction.h"
12#include "qgroup.h"
13#include "space-info.h"
14
15struct kmem_cache *btrfs_delayed_ref_head_cachep;
16struct kmem_cache *btrfs_delayed_tree_ref_cachep;
17struct kmem_cache *btrfs_delayed_data_ref_cachep;
18struct kmem_cache *btrfs_delayed_extent_op_cachep;
19/*
20 * delayed back reference update tracking.  For subvolume trees
21 * we queue up extent allocations and backref maintenance for
22 * delayed processing.   This avoids deep call chains where we
23 * add extents in the middle of btrfs_search_slot, and it allows
24 * us to buffer up frequently modified backrefs in an rb tree instead
25 * of hammering updates on the extent allocation tree.
26 */
27
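/*
 * Check whether the delayed ref reservations are running low.
 *
 * Returns true when what is actually reserved in the delayed_refs_rsv, plus
 * what could be borrowed from the global reserve, no longer exceeds the
 * delayed_refs_rsv size; see btrfs_should_throttle_delayed_refs() below for
 * how that is used as a throttling signal.
 */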
28bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
29{
30	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
31	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
32	bool ret = false;
33	u64 reserved;
34
35	spin_lock(&global_rsv->lock);
36	reserved = global_rsv->reserved;
37	spin_unlock(&global_rsv->lock);
38
39	/*
40	 * Since the global reserve is just kind of magic we don't really want
41	 * to rely on it to save our bacon, so if our size is more than the
42	 * delayed_refs_rsv and the global rsv then it's time to think about
43	 * bailing.
44	 */
45	spin_lock(&delayed_refs_rsv->lock);
46	reserved += delayed_refs_rsv->reserved;
47	if (delayed_refs_rsv->size >= reserved)
48		ret = true;
49	spin_unlock(&delayed_refs_rsv->lock);
50	return ret;
51}
52
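/*
 * Estimate whether the caller should throttle and help run delayed refs.
 *
 * The estimate is the number of queued delayed ref entries times the average
 * runtime of a single ref: a nonzero return means there is roughly a second
 * (or half a second) of backlog, or the delayed_refs_rsv is running low, and
 * the caller should throttle.
 *
 * Illustrative use only (a sketch, not taken from a real caller): a
 * transaction-side user could do something like
 *
 *	if (btrfs_should_throttle_delayed_refs(trans))
 *		btrfs_run_delayed_refs(trans, 0);
 *
 * to help work the backlog down before queueing more updates.
 */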
53int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
54{
55	u64 num_entries =
56		atomic_read(&trans->transaction->delayed_refs.num_entries);
57	u64 avg_runtime;
58	u64 val;
59
60	smp_mb();
61	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
62	val = num_entries * avg_runtime;
63	if (val >= NSEC_PER_SEC)
64		return 1;
65	if (val >= NSEC_PER_SEC / 2)
66		return 2;
67
68	return btrfs_check_space_for_delayed_refs(trans->fs_info);
69}
70
71/**
72 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
73 * @fs_info: the fs_info for our fs.
74 * @nr: the number of items to drop.
75 *
76 * This drops the delayed ref head's count from the delayed refs rsv and frees
77 * any excess reservation we had.
78 */
79void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
80{
81	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
82	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
83	u64 released = 0;
84
85	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
86	if (released)
87		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
88					      0, released, 0);
89}
90
91/*
92 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
93 * @trans: the trans that may have generated delayed refs
94 *
95 * This is to be called any time we may have adjusted trans->delayed_ref_updates;
96 * it will calculate the additional size and add it to the delayed_refs_rsv.
97 */
98void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
99{
100	struct btrfs_fs_info *fs_info = trans->fs_info;
101	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
102	u64 num_bytes;
103
104	if (!trans->delayed_ref_updates)
105		return;
106
107	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
108						    trans->delayed_ref_updates);
109	spin_lock(&delayed_rsv->lock);
110	delayed_rsv->size += num_bytes;
111	delayed_rsv->full = 0;
112	spin_unlock(&delayed_rsv->lock);
113	trans->delayed_ref_updates = 0;
114}
115
116/**
117 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
118 * @fs_info: the fs_info for our fs.
119 * @src: the source block rsv to transfer from.
120 * @num_bytes: the number of bytes to transfer.
121 *
122 * This transfers up to the num_bytes amount from the src rsv to the
123 * delayed_refs_rsv.  Any extra bytes are returned to the space info.
124 */
125void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
126				       struct btrfs_block_rsv *src,
127				       u64 num_bytes)
128{
129	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
130	u64 to_free = 0;
131
132	spin_lock(&src->lock);
133	src->reserved -= num_bytes;
134	src->size -= num_bytes;
135	spin_unlock(&src->lock);
136
137	spin_lock(&delayed_refs_rsv->lock);
138	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
139		u64 delta = delayed_refs_rsv->size -
140			delayed_refs_rsv->reserved;
141		if (num_bytes > delta) {
142			to_free = num_bytes - delta;
143			num_bytes = delta;
144		}
145	} else {
146		to_free = num_bytes;
147		num_bytes = 0;
148	}
149
150	if (num_bytes)
151		delayed_refs_rsv->reserved += num_bytes;
152	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
153		delayed_refs_rsv->full = 1;
154	spin_unlock(&delayed_refs_rsv->lock);
155
156	if (num_bytes)
157		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
158					      0, num_bytes, 1);
159	if (to_free)
160		btrfs_space_info_free_bytes_may_use(fs_info,
161				delayed_refs_rsv->space_info, to_free);
162}
163
164/**
165 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
166 * @fs_info: the fs_info for our fs.
167 * @flush: control how we can flush for this reservation.
168 *
169 * This will refill the delayed block_rsv with up to one item's worth of space
170 * and will return -ENOSPC if we can't make the reservation.
171 */
172int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
173				  enum btrfs_reserve_flush_enum flush)
174{
175	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
176	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
177	u64 num_bytes = 0;
178	int ret = -ENOSPC;
179
180	spin_lock(&block_rsv->lock);
181	if (block_rsv->reserved < block_rsv->size) {
182		num_bytes = block_rsv->size - block_rsv->reserved;
183		num_bytes = min(num_bytes, limit);
184	}
185	spin_unlock(&block_rsv->lock);
186
187	if (!num_bytes)
188		return 0;
189
190	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
191					   num_bytes, flush);
192	if (ret)
193		return ret;
194	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
195	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
196				      0, num_bytes, 1);
197	return 0;
198}
199
200/*
201 * compare two delayed tree backrefs with same bytenr and type
202 */
203static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
204			  struct btrfs_delayed_tree_ref *ref2)
205{
206	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
207		if (ref1->root < ref2->root)
208			return -1;
209		if (ref1->root > ref2->root)
210			return 1;
211	} else {
212		if (ref1->parent < ref2->parent)
213			return -1;
214		if (ref1->parent > ref2->parent)
215			return 1;
216	}
217	return 0;
218}
219
220/*
221 * compare two delayed data backrefs with same bytenr and type
222 */
223static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
224			  struct btrfs_delayed_data_ref *ref2)
225{
226	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
227		if (ref1->root < ref2->root)
228			return -1;
229		if (ref1->root > ref2->root)
230			return 1;
231		if (ref1->objectid < ref2->objectid)
232			return -1;
233		if (ref1->objectid > ref2->objectid)
234			return 1;
235		if (ref1->offset < ref2->offset)
236			return -1;
237		if (ref1->offset > ref2->offset)
238			return 1;
239	} else {
240		if (ref1->parent < ref2->parent)
241			return -1;
242		if (ref1->parent > ref2->parent)
243			return 1;
244	}
245	return 0;
246}
247
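/*
 * Full comparison of two delayed ref nodes with the same bytenr: order by ref
 * type first, then by the type specific fields compared above, and finally
 * (when check_seq is set) by sequence number, so older refs sort before newer
 * ones in the per-head rbtree.
 */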
248static int comp_refs(struct btrfs_delayed_ref_node *ref1,
249		     struct btrfs_delayed_ref_node *ref2,
250		     bool check_seq)
251{
252	int ret = 0;
253
254	if (ref1->type < ref2->type)
255		return -1;
256	if (ref1->type > ref2->type)
257		return 1;
258	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
259	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
260		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
261				     btrfs_delayed_node_to_tree_ref(ref2));
262	else
263		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
264				     btrfs_delayed_node_to_data_ref(ref2));
265	if (ret)
266		return ret;
267	if (check_seq) {
268		if (ref1->seq < ref2->seq)
269			return -1;
270		if (ref1->seq > ref2->seq)
271			return 1;
272	}
273	return 0;
274}
275
276/* insert a new ref head into the head ref rbtree */
277static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
278						   struct rb_node *node)
279{
280	struct rb_node **p = &root->rb_root.rb_node;
281	struct rb_node *parent_node = NULL;
282	struct btrfs_delayed_ref_head *entry;
283	struct btrfs_delayed_ref_head *ins;
284	u64 bytenr;
285	bool leftmost = true;
286
287	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
288	bytenr = ins->bytenr;
289	while (*p) {
290		parent_node = *p;
291		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
292				 href_node);
293
294		if (bytenr < entry->bytenr) {
295			p = &(*p)->rb_left;
296		} else if (bytenr > entry->bytenr) {
297			p = &(*p)->rb_right;
298			leftmost = false;
299		} else {
300			return entry;
301		}
302	}
303
304	rb_link_node(node, parent_node, p);
305	rb_insert_color_cached(node, root, leftmost);
306	return NULL;
307}
308
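/*
 * Insert a delayed ref node into a head's ref_tree, ordered by comp_refs()
 * including the sequence number.  If an identical ref already exists it is
 * returned instead, so the caller can merge the new ref into it.
 */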
309static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
310		struct btrfs_delayed_ref_node *ins)
311{
312	struct rb_node **p = &root->rb_root.rb_node;
313	struct rb_node *node = &ins->ref_node;
314	struct rb_node *parent_node = NULL;
315	struct btrfs_delayed_ref_node *entry;
316	bool leftmost = true;
317
318	while (*p) {
319		int comp;
320
321		parent_node = *p;
322		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
323				 ref_node);
324		comp = comp_refs(ins, entry, true);
325		if (comp < 0) {
326			p = &(*p)->rb_left;
327		} else if (comp > 0) {
328			p = &(*p)->rb_right;
329			leftmost = false;
330		} else {
331			return entry;
332		}
333	}
334
335	rb_link_node(node, parent_node, p);
336	rb_insert_color_cached(node, root, leftmost);
337	return NULL;
338}
339
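/* Return the ref head with the lowest bytenr, or NULL if the tree is empty. */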
340static struct btrfs_delayed_ref_head *find_first_ref_head(
341		struct btrfs_delayed_ref_root *dr)
342{
343	struct rb_node *n;
344	struct btrfs_delayed_ref_head *entry;
345
346	n = rb_first_cached(&dr->href_root);
347	if (!n)
348		return NULL;
349
350	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
351
352	return entry;
353}
354
355/*
356 * Find a head entry based on bytenr. This returns the delayed ref head if it
357 * was able to find one, or NULL if nothing was in that spot.  If return_bigger
358 * is given, the next bigger entry is returned if no exact match is found.
359 */
360static struct btrfs_delayed_ref_head *find_ref_head(
361		struct btrfs_delayed_ref_root *dr, u64 bytenr,
362		bool return_bigger)
363{
364	struct rb_root *root = &dr->href_root.rb_root;
365	struct rb_node *n;
366	struct btrfs_delayed_ref_head *entry;
367
368	n = root->rb_node;
369	entry = NULL;
370	while (n) {
371		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
372
373		if (bytenr < entry->bytenr)
374			n = n->rb_left;
375		else if (bytenr > entry->bytenr)
376			n = n->rb_right;
377		else
378			return entry;
379	}
380	if (entry && return_bigger) {
381		if (bytenr > entry->bytenr) {
382			n = rb_next(&entry->href_node);
383			if (!n)
384				return NULL;
385			entry = rb_entry(n, struct btrfs_delayed_ref_head,
386					 href_node);
387		}
388		return entry;
389	}
390	return NULL;
391}
392
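/*
 * Lock a delayed ref head while holding delayed_refs->lock.
 *
 * If the head's mutex is contended we take a reference, drop the spinlock and
 * sleep on the mutex, then re-take the spinlock.  If the head was removed
 * from the rbtree in the meantime, -EAGAIN is returned and the caller must
 * look the head up again.  An illustrative retry loop (a sketch, not copied
 * from a real caller) would be:
 *
 *	again:
 *		spin_lock(&delayed_refs->lock);
 *		head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 *		...
 *		if (btrfs_delayed_ref_lock(delayed_refs, head) == -EAGAIN) {
 *			spin_unlock(&delayed_refs->lock);
 *			goto again;
 *		}
 */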
393int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
394			   struct btrfs_delayed_ref_head *head)
395{
396	lockdep_assert_held(&delayed_refs->lock);
397	if (mutex_trylock(&head->mutex))
398		return 0;
399
400	refcount_inc(&head->refs);
401	spin_unlock(&delayed_refs->lock);
402
403	mutex_lock(&head->mutex);
404	spin_lock(&delayed_refs->lock);
405	if (RB_EMPTY_NODE(&head->href_node)) {
406		mutex_unlock(&head->mutex);
407		btrfs_put_delayed_ref_head(head);
408		return -EAGAIN;
409	}
410	btrfs_put_delayed_ref_head(head);
411	return 0;
412}
413
414static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
415				    struct btrfs_delayed_ref_root *delayed_refs,
416				    struct btrfs_delayed_ref_head *head,
417				    struct btrfs_delayed_ref_node *ref)
418{
419	lockdep_assert_held(&head->lock);
420	rb_erase_cached(&ref->ref_node, &head->ref_tree);
421	RB_CLEAR_NODE(&ref->ref_node);
422	if (!list_empty(&ref->add_list))
423		list_del(&ref->add_list);
424	ref->in_tree = 0;
425	btrfs_put_delayed_ref(ref);
426	atomic_dec(&delayed_refs->num_entries);
427}
428
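/*
 * Try to merge @ref with the refs that follow it in the head's ref tree.
 *
 * Only refs that compare equal (ignoring seq) and whose seq is older than
 * @seq, the oldest tree mod log sequence still in use, are merged: matching
 * actions add up their ref_mods, opposing actions cancel each other, and any
 * ref whose ref_mod reaches zero is dropped.  Returns true when the tree was
 * changed in a way that requires the caller to restart its iteration.
 */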
429static bool merge_ref(struct btrfs_trans_handle *trans,
430		      struct btrfs_delayed_ref_root *delayed_refs,
431		      struct btrfs_delayed_ref_head *head,
432		      struct btrfs_delayed_ref_node *ref,
433		      u64 seq)
434{
435	struct btrfs_delayed_ref_node *next;
436	struct rb_node *node = rb_next(&ref->ref_node);
437	bool done = false;
438
439	while (!done && node) {
440		int mod;
441
442		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
443		node = rb_next(node);
444		if (seq && next->seq >= seq)
445			break;
446		if (comp_refs(ref, next, false))
447			break;
448
449		if (ref->action == next->action) {
450			mod = next->ref_mod;
451		} else {
452			if (ref->ref_mod < next->ref_mod) {
453				swap(ref, next);
454				done = true;
455			}
456			mod = -next->ref_mod;
457		}
458
459		drop_delayed_ref(trans, delayed_refs, head, next);
460		ref->ref_mod += mod;
461		if (ref->ref_mod == 0) {
462			drop_delayed_ref(trans, delayed_refs, head, ref);
463			done = true;
464		} else {
465			/*
466			 * Can't have multiples of the same ref on a tree block.
467			 */
468			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
469				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
470		}
471	}
472
473	return done;
474}
475
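/*
 * Merge the mergeable refs of a head before they are run.
 *
 * Data heads are skipped, and refs at or above the oldest tree mod log
 * sequence still in use are left alone so that whoever holds that sequence
 * keeps seeing them.
 */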
476void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
477			      struct btrfs_delayed_ref_root *delayed_refs,
478			      struct btrfs_delayed_ref_head *head)
479{
480	struct btrfs_fs_info *fs_info = trans->fs_info;
481	struct btrfs_delayed_ref_node *ref;
482	struct rb_node *node;
483	u64 seq = 0;
484
485	lockdep_assert_held(&head->lock);
486
487	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
488		return;
489
490	/* We don't have too many refs to merge for data. */
491	if (head->is_data)
492		return;
493
494	read_lock(&fs_info->tree_mod_log_lock);
495	if (!list_empty(&fs_info->tree_mod_seq_list)) {
496		struct seq_list *elem;
497
498		elem = list_first_entry(&fs_info->tree_mod_seq_list,
499					struct seq_list, list);
500		seq = elem->seq;
501	}
502	read_unlock(&fs_info->tree_mod_log_lock);
503
504again:
505	for (node = rb_first_cached(&head->ref_tree); node;
506	     node = rb_next(node)) {
507		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
508		if (seq && ref->seq >= seq)
509			continue;
510		if (merge_ref(trans, delayed_refs, head, ref, seq))
511			goto again;
512	}
513}
514
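/*
 * Return 1 if a delayed ref with sequence number @seq must still be held
 * back, i.e. a tree mod log sequence at or below @seq is still in use.
 */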
515int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
516{
517	struct seq_list *elem;
518	int ret = 0;
519
520	read_lock(&fs_info->tree_mod_log_lock);
521	if (!list_empty(&fs_info->tree_mod_seq_list)) {
522		elem = list_first_entry(&fs_info->tree_mod_seq_list,
523					struct seq_list, list);
524		if (seq >= elem->seq) {
525			btrfs_debug(fs_info,
526				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
527				(u32)(seq >> 32), (u32)seq,
528				(u32)(elem->seq >> 32), (u32)elem->seq);
529			ret = 1;
530		}
531	}
532
533	read_unlock(&fs_info->tree_mod_log_lock);
534	return ret;
535}
536
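/*
 * Pick the next delayed ref head to process.
 *
 * Selection starts at ->run_delayed_start and wraps around to the beginning
 * of the rbtree at most once, skipping heads that are already being
 * processed.  The chosen head is marked as processing and
 * ->run_delayed_start is advanced past it, so successive calls walk the
 * whole set of heads.
 */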
537struct btrfs_delayed_ref_head *btrfs_select_ref_head(
538		struct btrfs_delayed_ref_root *delayed_refs)
539{
540	struct btrfs_delayed_ref_head *head;
541
542again:
543	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
544			     true);
545	if (!head && delayed_refs->run_delayed_start != 0) {
546		delayed_refs->run_delayed_start = 0;
547		head = find_first_ref_head(delayed_refs);
548	}
549	if (!head)
550		return NULL;
551
552	while (head->processing) {
553		struct rb_node *node;
554
555		node = rb_next(&head->href_node);
556		if (!node) {
557			if (delayed_refs->run_delayed_start == 0)
558				return NULL;
559			delayed_refs->run_delayed_start = 0;
560			goto again;
561		}
562		head = rb_entry(node, struct btrfs_delayed_ref_head,
563				href_node);
564	}
565
566	head->processing = 1;
567	WARN_ON(delayed_refs->num_heads_ready == 0);
568	delayed_refs->num_heads_ready--;
569	delayed_refs->run_delayed_start = head->bytenr +
570		head->num_bytes;
571	return head;
572}
573
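/*
 * Remove a ref head from the rbtree of its delayed ref root and update the
 * head and entry counters.  Both delayed_refs->lock and head->lock must be
 * held, as asserted below.
 */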
574void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
575			   struct btrfs_delayed_ref_head *head)
576{
577	lockdep_assert_held(&delayed_refs->lock);
578	lockdep_assert_held(&head->lock);
579
580	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
581	RB_CLEAR_NODE(&head->href_node);
582	atomic_dec(&delayed_refs->num_entries);
583	delayed_refs->num_heads--;
584	if (head->processing == 0)
585		delayed_refs->num_heads_ready--;
586}
587
588/*
589 * Helper to insert the ref_node into the head's ref rbtree, or to merge
590 * it with an existing equal node.
591 * Return 0 for insert.
592 * Return >0 for merge.
593 */
594static int insert_delayed_ref(struct btrfs_trans_handle *trans,
595			      struct btrfs_delayed_ref_root *root,
596			      struct btrfs_delayed_ref_head *href,
597			      struct btrfs_delayed_ref_node *ref)
598{
599	struct btrfs_delayed_ref_node *exist;
600	int mod;
601	int ret = 0;
602
603	spin_lock(&href->lock);
604	exist = tree_insert(&href->ref_tree, ref);
605	if (!exist)
606		goto inserted;
607
608	/* Now we are sure we can merge */
609	ret = 1;
610	if (exist->action == ref->action) {
611		mod = ref->ref_mod;
612	} else {
613		/* Need to change action */
614		if (exist->ref_mod < ref->ref_mod) {
615			exist->action = ref->action;
616			mod = -exist->ref_mod;
617			exist->ref_mod = ref->ref_mod;
618			if (ref->action == BTRFS_ADD_DELAYED_REF)
619				list_add_tail(&exist->add_list,
620					      &href->ref_add_list);
621			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
622				ASSERT(!list_empty(&exist->add_list));
623				list_del(&exist->add_list);
624			} else {
625				ASSERT(0);
626			}
627		} else
628			mod = -ref->ref_mod;
629	}
630	exist->ref_mod += mod;
631
632	/* remove existing tail if its ref_mod is zero */
633	if (exist->ref_mod == 0)
634		drop_delayed_ref(trans, root, href, exist);
635	spin_unlock(&href->lock);
636	return ret;
637inserted:
638	if (ref->action == BTRFS_ADD_DELAYED_REF)
639		list_add_tail(&ref->add_list, &href->ref_add_list);
640	atomic_inc(&root->num_entries);
641	spin_unlock(&href->lock);
642	return ret;
643}
644
645/*
646 * Helper function to update the accounting in the head ref.
647 * The existing and update head refs must have the same bytenr.
648 */
649static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
650			 struct btrfs_delayed_ref_head *existing,
651			 struct btrfs_delayed_ref_head *update)
652{
653	struct btrfs_delayed_ref_root *delayed_refs =
654		&trans->transaction->delayed_refs;
655	struct btrfs_fs_info *fs_info = trans->fs_info;
656	u64 flags = btrfs_ref_head_to_space_flags(existing);
657	int old_ref_mod;
658
659	BUG_ON(existing->is_data != update->is_data);
660
661	spin_lock(&existing->lock);
662	if (update->must_insert_reserved) {
663		/*
664		 * If the extent was freed and then reallocated before
665		 * the delayed ref entries were processed, we can end up
666		 * with an existing head ref without the
667		 * must_insert_reserved flag set.
668		 * Set it again here.
669		 */
670		existing->must_insert_reserved = update->must_insert_reserved;
671
672		/*
673		 * update the num_bytes so we make sure the accounting
674		 * is done correctly
675		 */
676		existing->num_bytes = update->num_bytes;
677
678	}
679
680	if (update->extent_op) {
681		if (!existing->extent_op) {
682			existing->extent_op = update->extent_op;
683		} else {
684			if (update->extent_op->update_key) {
685				memcpy(&existing->extent_op->key,
686				       &update->extent_op->key,
687				       sizeof(update->extent_op->key));
688				existing->extent_op->update_key = true;
689			}
690			if (update->extent_op->update_flags) {
691				existing->extent_op->flags_to_set |=
692					update->extent_op->flags_to_set;
693				existing->extent_op->update_flags = true;
694			}
695			btrfs_free_delayed_extent_op(update->extent_op);
696		}
697	}
698	/*
699	 * Update the reference mod on the head to reflect this new operation;
700	 * we only need the lock for this case because we could be processing
701	 * it currently.  For refs we just added we know we're ok.
702	 */
703	old_ref_mod = existing->total_ref_mod;
704	existing->ref_mod += update->ref_mod;
705	existing->total_ref_mod += update->ref_mod;
706
707	/*
708	 * If we are going from a positive ref mod to a negative or vice
709	 * versa we need to make sure to adjust pending_csums accordingly.
710	 */
711	if (existing->is_data) {
712		u64 csum_leaves =
713			btrfs_csum_bytes_to_leaves(fs_info,
714						   existing->num_bytes);
715
716		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
717			delayed_refs->pending_csums -= existing->num_bytes;
718			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
719		}
720		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
721			delayed_refs->pending_csums += existing->num_bytes;
722			trans->delayed_ref_updates += csum_leaves;
723		}
724	}
725
726	/*
727	 * This handles the following conditions:
728	 *
729	 * 1. We had a ref mod of 0 or more and went negative, indicating that
730	 *    we may be freeing space, so add our space to the
731	 *    total_bytes_pinned counter.
732	 * 2. We were negative and went to 0 or positive, so no longer can say
733	 *    that the space would be pinned, decrement our counter from the
734	 *    total_bytes_pinned counter.
735	 * 3. We are now at 0 and have ->must_insert_reserved set, which means
736	 *    this was a new allocation and then we dropped it, and thus must
737	 *    add our space to the total_bytes_pinned counter.
738	 */
739	if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
740		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
741	else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
742		btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
743	else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
744		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
745
746	spin_unlock(&existing->lock);
747}
748
749static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
750				  struct btrfs_qgroup_extent_record *qrecord,
751				  u64 bytenr, u64 num_bytes, u64 ref_root,
752				  u64 reserved, int action, bool is_data,
753				  bool is_system)
754{
755	int count_mod = 1;
756	int must_insert_reserved = 0;
757
758	/* If reserved is provided, it must be a data extent. */
759	BUG_ON(!is_data && reserved);
760
761	/*
762	 * The head node stores the sum of all the mods, so dropping a ref
763	 * should drop the sum in the head node by one.
764	 */
765	if (action == BTRFS_UPDATE_DELAYED_HEAD)
766		count_mod = 0;
767	else if (action == BTRFS_DROP_DELAYED_REF)
768		count_mod = -1;
769
770	/*
771	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
772	 * accounting when the extent is finally added, or if a later
773	 * modification deletes the delayed ref without ever inserting the
774	 * extent into the extent allocation tree.  ref->must_insert_reserved
775	 * is the flag used to record that accounting mods are required.
776	 *
777	 * Once we record must_insert_reserved, switch the action to
778	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
779	 */
780	if (action == BTRFS_ADD_DELAYED_EXTENT)
781		must_insert_reserved = 1;
782	else
783		must_insert_reserved = 0;
784
785	refcount_set(&head_ref->refs, 1);
786	head_ref->bytenr = bytenr;
787	head_ref->num_bytes = num_bytes;
788	head_ref->ref_mod = count_mod;
789	head_ref->must_insert_reserved = must_insert_reserved;
790	head_ref->is_data = is_data;
791	head_ref->is_system = is_system;
792	head_ref->ref_tree = RB_ROOT_CACHED;
793	INIT_LIST_HEAD(&head_ref->ref_add_list);
794	RB_CLEAR_NODE(&head_ref->href_node);
795	head_ref->processing = 0;
796	head_ref->total_ref_mod = count_mod;
797	spin_lock_init(&head_ref->lock);
798	mutex_init(&head_ref->mutex);
799
800	if (qrecord) {
801		if (ref_root && reserved) {
802			qrecord->data_rsv = reserved;
803			qrecord->data_rsv_refroot = ref_root;
804		}
805		qrecord->bytenr = bytenr;
806		qrecord->num_bytes = num_bytes;
807		qrecord->old_roots = NULL;
808	}
809}
810
811/*
812 * helper function to actually insert a head node into the rbtree.
813 * this does all the dirty work in terms of maintaining the correct
814 * overall modification count.
815 */
816static noinline struct btrfs_delayed_ref_head *
817add_delayed_ref_head(struct btrfs_trans_handle *trans,
818		     struct btrfs_delayed_ref_head *head_ref,
819		     struct btrfs_qgroup_extent_record *qrecord,
820		     int action, int *qrecord_inserted_ret)
821{
822	struct btrfs_delayed_ref_head *existing;
823	struct btrfs_delayed_ref_root *delayed_refs;
824	int qrecord_inserted = 0;
825
826	delayed_refs = &trans->transaction->delayed_refs;
827
828	/* Record qgroup extent info if provided */
829	if (qrecord) {
830		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
831					delayed_refs, qrecord))
832			kfree(qrecord);
833		else
834			qrecord_inserted = 1;
835	}
836
837	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
838
839	existing = htree_insert(&delayed_refs->href_root,
840				&head_ref->href_node);
841	if (existing) {
842		update_existing_head_ref(trans, existing, head_ref);
843		/*
844		 * we've updated the existing ref, free the newly
845		 * allocated ref
846		 */
847		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
848		head_ref = existing;
849	} else {
850		u64 flags = btrfs_ref_head_to_space_flags(head_ref);
851
852		if (head_ref->is_data && head_ref->ref_mod < 0) {
853			delayed_refs->pending_csums += head_ref->num_bytes;
854			trans->delayed_ref_updates +=
855				btrfs_csum_bytes_to_leaves(trans->fs_info,
856							   head_ref->num_bytes);
857		}
858		if (head_ref->ref_mod < 0)
859			btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
860						     head_ref->num_bytes);
861		delayed_refs->num_heads++;
862		delayed_refs->num_heads_ready++;
863		atomic_inc(&delayed_refs->num_entries);
864		trans->delayed_ref_updates++;
865	}
866	if (qrecord_inserted_ret)
867		*qrecord_inserted_ret = qrecord_inserted;
868
869	return head_ref;
870}
871
872/*
873 * init_delayed_ref_common - Initialize the structure which represents a
874 *			     modification to an extent.
875 *
876 * @fs_info:    The fs_info of the mounted filesystem.
877 *
878 * @ref:	The structure which is going to be initialized.
879 *
880 * @bytenr:	The logical address of the extent for which a modification is
881 *		going to be recorded.
882 *
883 * @num_bytes:  Size of the extent whose modification is being recorded.
884 *
885 * @ref_root:	The id of the root where this modification has originated, this
886 *		can be either one of the well-known metadata trees or the
887 *		subvolume id which references this extent.
888 *
889 * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
890 *		BTRFS_ADD_DELAYED_EXTENT
891 *
892 * @ref_type:	Holds the type of the extent which is being recorded, can be
893 *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
894 *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
895 *		BTRFS_EXTENT_DATA_REF_KEY when recording data extent
896 */
897static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
898				    struct btrfs_delayed_ref_node *ref,
899				    u64 bytenr, u64 num_bytes, u64 ref_root,
900				    int action, u8 ref_type)
901{
902	u64 seq = 0;
903
904	if (action == BTRFS_ADD_DELAYED_EXTENT)
905		action = BTRFS_ADD_DELAYED_REF;
906
907	if (is_fstree(ref_root))
908		seq = atomic64_read(&fs_info->tree_mod_seq);
909
910	refcount_set(&ref->refs, 1);
911	ref->bytenr = bytenr;
912	ref->num_bytes = num_bytes;
913	ref->ref_mod = 1;
914	ref->action = action;
915	ref->is_head = 0;
916	ref->in_tree = 1;
917	ref->seq = seq;
918	ref->type = ref_type;
919	RB_CLEAR_NODE(&ref->ref_node);
920	INIT_LIST_HEAD(&ref->add_list);
921}
922
923/*
924 * add a delayed tree ref.  This does all of the accounting required
925 * to make sure the delayed ref is eventually processed before this
926 * transaction commits.
927 */
928int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
929			       struct btrfs_ref *generic_ref,
930			       struct btrfs_delayed_extent_op *extent_op)
931{
932	struct btrfs_fs_info *fs_info = trans->fs_info;
933	struct btrfs_delayed_tree_ref *ref;
934	struct btrfs_delayed_ref_head *head_ref;
935	struct btrfs_delayed_ref_root *delayed_refs;
936	struct btrfs_qgroup_extent_record *record = NULL;
937	int qrecord_inserted;
938	bool is_system;
939	int action = generic_ref->action;
940	int level = generic_ref->tree_ref.level;
941	int ret;
942	u64 bytenr = generic_ref->bytenr;
943	u64 num_bytes = generic_ref->len;
944	u64 parent = generic_ref->parent;
945	u8 ref_type;
946
947	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
948
949	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
950	BUG_ON(extent_op && extent_op->is_data);
951	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
952	if (!ref)
953		return -ENOMEM;
954
955	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
956	if (!head_ref) {
957		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
958		return -ENOMEM;
959	}
960
961	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
962	    is_fstree(generic_ref->real_root) &&
963	    is_fstree(generic_ref->tree_ref.root) &&
964	    !generic_ref->skip_qgroup) {
965		record = kzalloc(sizeof(*record), GFP_NOFS);
966		if (!record) {
967			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
968			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
969			return -ENOMEM;
970		}
971	}
972
973	if (parent)
974		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
975	else
976		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
977
978	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
979				generic_ref->tree_ref.root, action, ref_type);
980	ref->root = generic_ref->tree_ref.root;
981	ref->parent = parent;
982	ref->level = level;
983
984	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
985			      generic_ref->tree_ref.root, 0, action, false,
986			      is_system);
987	head_ref->extent_op = extent_op;
988
989	delayed_refs = &trans->transaction->delayed_refs;
990	spin_lock(&delayed_refs->lock);
991
992	/*
993	 * insert both the head node and the new ref without dropping
994	 * the spin lock
995	 */
996	head_ref = add_delayed_ref_head(trans, head_ref, record,
997					action, &qrecord_inserted);
998
999	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1000	spin_unlock(&delayed_refs->lock);
1001
1002	/*
1003	 * Need to update the delayed_refs_rsv with any changes we may have
1004	 * made.
1005	 */
1006	btrfs_update_delayed_refs_rsv(trans);
1007
1008	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
1009				   action == BTRFS_ADD_DELAYED_EXTENT ?
1010				   BTRFS_ADD_DELAYED_REF : action);
1011	if (ret > 0)
1012		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
1013
1014	if (qrecord_inserted)
1015		btrfs_qgroup_trace_extent_post(fs_info, record);
1016
1017	return 0;
1018}
1019
1020/*
1021 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
1022 */
1023int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
1024			       struct btrfs_ref *generic_ref,
1025			       u64 reserved)
1026{
1027	struct btrfs_fs_info *fs_info = trans->fs_info;
1028	struct btrfs_delayed_data_ref *ref;
1029	struct btrfs_delayed_ref_head *head_ref;
1030	struct btrfs_delayed_ref_root *delayed_refs;
1031	struct btrfs_qgroup_extent_record *record = NULL;
1032	int qrecord_inserted;
1033	int action = generic_ref->action;
1034	int ret;
1035	u64 bytenr = generic_ref->bytenr;
1036	u64 num_bytes = generic_ref->len;
1037	u64 parent = generic_ref->parent;
1038	u64 ref_root = generic_ref->data_ref.ref_root;
1039	u64 owner = generic_ref->data_ref.ino;
1040	u64 offset = generic_ref->data_ref.offset;
1041	u8 ref_type;
1042
1043	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
1044	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
1045	if (!ref)
1046		return -ENOMEM;
1047
1048	if (parent)
1049		ref_type = BTRFS_SHARED_DATA_REF_KEY;
1050	else
1051		ref_type = BTRFS_EXTENT_DATA_REF_KEY;
1052	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
1053				ref_root, action, ref_type);
1054	ref->root = ref_root;
1055	ref->parent = parent;
1056	ref->objectid = owner;
1057	ref->offset = offset;
1058
1059
1060	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1061	if (!head_ref) {
1062		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1063		return -ENOMEM;
1064	}
1065
1066	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
1067	    is_fstree(ref_root) &&
1068	    is_fstree(generic_ref->real_root) &&
1069	    !generic_ref->skip_qgroup) {
1070		record = kzalloc(sizeof(*record), GFP_NOFS);
1071		if (!record) {
1072			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1073			kmem_cache_free(btrfs_delayed_ref_head_cachep,
1074					head_ref);
1075			return -ENOMEM;
1076		}
1077	}
1078
1079	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
1080			      reserved, action, true, false);
1081	head_ref->extent_op = NULL;
1082
1083	delayed_refs = &trans->transaction->delayed_refs;
1084	spin_lock(&delayed_refs->lock);
1085
1086	/*
1087	 * insert both the head node and the new ref without dropping
1088	 * the spin lock
1089	 */
1090	head_ref = add_delayed_ref_head(trans, head_ref, record,
1091					action, &qrecord_inserted);
1092
1093	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1094	spin_unlock(&delayed_refs->lock);
1095
1096	/*
1097	 * Need to update the delayed_refs_rsv with any changes we may have
1098	 * made.
1099	 */
1100	btrfs_update_delayed_refs_rsv(trans);
1101
1102	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
1103				   action == BTRFS_ADD_DELAYED_EXTENT ?
1104				   BTRFS_ADD_DELAYED_REF : action);
1105	if (ret > 0)
1106		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1107
1108
1109	if (qrecord_inserted)
1110		return btrfs_qgroup_trace_extent_post(fs_info, record);
1111	return 0;
1112}
1113
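/*
 * Queue a head-only update (BTRFS_UPDATE_DELAYED_HEAD) carrying an extent_op,
 * used to update the flags or the key of an existing extent item without
 * changing its reference count (this action contributes 0 to the head's
 * ref_mod).
 */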
1114int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
1115				u64 bytenr, u64 num_bytes,
1116				struct btrfs_delayed_extent_op *extent_op)
1117{
1118	struct btrfs_delayed_ref_head *head_ref;
1119	struct btrfs_delayed_ref_root *delayed_refs;
1120
1121	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1122	if (!head_ref)
1123		return -ENOMEM;
1124
1125	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
1126			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
1127			      false);
1128	head_ref->extent_op = extent_op;
1129
1130	delayed_refs = &trans->transaction->delayed_refs;
1131	spin_lock(&delayed_refs->lock);
1132
1133	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
1134			     NULL);
1135
1136	spin_unlock(&delayed_refs->lock);
1137
1138	/*
1139	 * Need to update the delayed_refs_rsv with any changes we may have
1140	 * made.
1141	 */
1142	btrfs_update_delayed_refs_rsv(trans);
1143	return 0;
1144}
1145
1146/*
1147 * This does a simple search for the head node for a given extent.  Returns the
1148 * head node if found, or NULL if not.
1149 */
1150struct btrfs_delayed_ref_head *
1151btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
1152{
1153	lockdep_assert_held(&delayed_refs->lock);
1154
1155	return find_ref_head(delayed_refs, bytenr, false);
1156}
1157
1158void __cold btrfs_delayed_ref_exit(void)
1159{
1160	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
1161	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
1162	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
1163	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
1164}
1165
1166int __init btrfs_delayed_ref_init(void)
1167{
1168	btrfs_delayed_ref_head_cachep = kmem_cache_create(
1169				"btrfs_delayed_ref_head",
1170				sizeof(struct btrfs_delayed_ref_head), 0,
1171				SLAB_MEM_SPREAD, NULL);
1172	if (!btrfs_delayed_ref_head_cachep)
1173		goto fail;
1174
1175	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
1176				"btrfs_delayed_tree_ref",
1177				sizeof(struct btrfs_delayed_tree_ref), 0,
1178				SLAB_MEM_SPREAD, NULL);
1179	if (!btrfs_delayed_tree_ref_cachep)
1180		goto fail;
1181
1182	btrfs_delayed_data_ref_cachep = kmem_cache_create(
1183				"btrfs_delayed_data_ref",
1184				sizeof(struct btrfs_delayed_data_ref), 0,
1185				SLAB_MEM_SPREAD, NULL);
1186	if (!btrfs_delayed_data_ref_cachep)
1187		goto fail;
1188
1189	btrfs_delayed_extent_op_cachep = kmem_cache_create(
1190				"btrfs_delayed_extent_op",
1191				sizeof(struct btrfs_delayed_extent_op), 0,
1192				SLAB_MEM_SPREAD, NULL);
1193	if (!btrfs_delayed_extent_op_cachep)
1194		goto fail;
1195
1196	return 0;
1197fail:
1198	btrfs_delayed_ref_exit();
1199	return -ENOMEM;
1200}
1201