1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2011 Fujitsu.  All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 */
6
7#include <linux/slab.h>
8#include <linux/iversion.h>
9#include <linux/sched/mm.h>
10#include "misc.h"
11#include "delayed-inode.h"
12#include "disk-io.h"
13#include "transaction.h"
14#include "ctree.h"
15#include "qgroup.h"
16#include "locking.h"
17
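/*
 * Tunables for the delayed item machinery: once the number of pending items
 * passes BTRFS_DELAYED_BACKGROUND the async flush work is kicked off, at
 * BTRFS_DELAYED_WRITEBACK callers of btrfs_balance_delayed_items() start
 * waiting for the backlog to shrink, and waiters are woken up roughly every
 * BTRFS_DELAYED_BATCH completed items (see finish_one_item()).
 */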
18#define BTRFS_DELAYED_WRITEBACK		512
19#define BTRFS_DELAYED_BACKGROUND	128
20#define BTRFS_DELAYED_BATCH		16
21
22static struct kmem_cache *delayed_node_cache;
23
24int __init btrfs_delayed_inode_init(void)
25{
26	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
27					sizeof(struct btrfs_delayed_node),
28					0,
29					SLAB_MEM_SPREAD,
30					NULL);
31	if (!delayed_node_cache)
32		return -ENOMEM;
33	return 0;
34}
35
36void __cold btrfs_delayed_inode_exit(void)
37{
38	kmem_cache_destroy(delayed_node_cache);
39}
40
41static inline void btrfs_init_delayed_node(
42				struct btrfs_delayed_node *delayed_node,
43				struct btrfs_root *root, u64 inode_id)
44{
45	delayed_node->root = root;
46	delayed_node->inode_id = inode_id;
47	refcount_set(&delayed_node->refs, 0);
48	delayed_node->ins_root = RB_ROOT_CACHED;
49	delayed_node->del_root = RB_ROOT_CACHED;
50	mutex_init(&delayed_node->mutex);
51	INIT_LIST_HEAD(&delayed_node->n_list);
52	INIT_LIST_HEAD(&delayed_node->p_list);
53}
54
55static inline int btrfs_is_continuous_delayed_item(
56					struct btrfs_delayed_item *item1,
57					struct btrfs_delayed_item *item2)
58{
59	if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
60	    item1->key.objectid == item2->key.objectid &&
61	    item1->key.type == item2->key.type &&
62	    item1->key.offset + 1 == item2->key.offset)
63		return 1;
64	return 0;
65}
66
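/*
 * Look up the delayed node of an inode, first via the pointer cached in the
 * btrfs inode and then via the root's radix tree.  A reference is taken on
 * the returned node; the caller must drop it with
 * btrfs_release_delayed_node().
 */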
67static struct btrfs_delayed_node *btrfs_get_delayed_node(
68		struct btrfs_inode *btrfs_inode)
69{
70	struct btrfs_root *root = btrfs_inode->root;
71	u64 ino = btrfs_ino(btrfs_inode);
72	struct btrfs_delayed_node *node;
73
74	node = READ_ONCE(btrfs_inode->delayed_node);
75	if (node) {
76		refcount_inc(&node->refs);
77		return node;
78	}
79
80	spin_lock(&root->inode_lock);
81	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
82
83	if (node) {
84		if (btrfs_inode->delayed_node) {
85			refcount_inc(&node->refs);	/* can be accessed */
86			BUG_ON(btrfs_inode->delayed_node != node);
87			spin_unlock(&root->inode_lock);
88			return node;
89		}
90
91		/*
92		 * It's possible that we're racing into the middle of removing
93		 * this node from the radix tree.  In this case, the refcount
94		 * was zero and it should never go back to one.  Just return
95		 * NULL like it was never in the radix at all; our release
96		 * function is in the process of removing it.
97		 *
98		 * Some implementations of refcount_inc refuse to bump the
99		 * refcount once it has hit zero.  If we don't do this dance
100		 * here, refcount_inc() may decide to just WARN_ONCE() instead
101		 * of actually bumping the refcount.
102		 *
103		 * If this node is properly in the radix, we want to bump the
104		 * refcount twice, once for the inode and once for this get
105		 * operation.
106		 */
107		if (refcount_inc_not_zero(&node->refs)) {
108			refcount_inc(&node->refs);
109			btrfs_inode->delayed_node = node;
110		} else {
111			node = NULL;
112		}
113
114		spin_unlock(&root->inode_lock);
115		return node;
116	}
117	spin_unlock(&root->inode_lock);
118
119	return NULL;
120}
121
122/* Will return either the node or PTR_ERR(-ENOMEM) */
123static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
124		struct btrfs_inode *btrfs_inode)
125{
126	struct btrfs_delayed_node *node;
127	struct btrfs_root *root = btrfs_inode->root;
128	u64 ino = btrfs_ino(btrfs_inode);
129	int ret;
130
131again:
132	node = btrfs_get_delayed_node(btrfs_inode);
133	if (node)
134		return node;
135
136	node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
137	if (!node)
138		return ERR_PTR(-ENOMEM);
139	btrfs_init_delayed_node(node, root, ino);
140
141	/* cached in the btrfs inode and can be accessed */
142	refcount_set(&node->refs, 2);
143
144	ret = radix_tree_preload(GFP_NOFS);
145	if (ret) {
146		kmem_cache_free(delayed_node_cache, node);
147		return ERR_PTR(ret);
148	}
149
150	spin_lock(&root->inode_lock);
151	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
152	if (ret == -EEXIST) {
153		spin_unlock(&root->inode_lock);
154		kmem_cache_free(delayed_node_cache, node);
155		radix_tree_preload_end();
156		goto again;
157	}
158	btrfs_inode->delayed_node = node;
159	spin_unlock(&root->inode_lock);
160	radix_tree_preload_end();
161
162	return node;
163}
164
165/*
166 * Call it when holding delayed_node->mutex
167 *
168 * If mod = 1, add this node into the prepared list.
169 */
170static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
171				     struct btrfs_delayed_node *node,
172				     int mod)
173{
174	spin_lock(&root->lock);
175	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
176		if (!list_empty(&node->p_list))
177			list_move_tail(&node->p_list, &root->prepare_list);
178		else if (mod)
179			list_add_tail(&node->p_list, &root->prepare_list);
180	} else {
181		list_add_tail(&node->n_list, &root->node_list);
182		list_add_tail(&node->p_list, &root->prepare_list);
183		refcount_inc(&node->refs);	/* inserted into list */
184		root->nodes++;
185		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
186	}
187	spin_unlock(&root->lock);
188}
189
190/* Call it when holding delayed_node->mutex */
191static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
192				       struct btrfs_delayed_node *node)
193{
194	spin_lock(&root->lock);
195	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
196		root->nodes--;
197		refcount_dec(&node->refs);	/* not in the list */
198		list_del_init(&node->n_list);
199		if (!list_empty(&node->p_list))
200			list_del_init(&node->p_list);
201		clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
202	}
203	spin_unlock(&root->lock);
204}
205
206static struct btrfs_delayed_node *btrfs_first_delayed_node(
207			struct btrfs_delayed_root *delayed_root)
208{
209	struct list_head *p;
210	struct btrfs_delayed_node *node = NULL;
211
212	spin_lock(&delayed_root->lock);
213	if (list_empty(&delayed_root->node_list))
214		goto out;
215
216	p = delayed_root->node_list.next;
217	node = list_entry(p, struct btrfs_delayed_node, n_list);
218	refcount_inc(&node->refs);
219out:
220	spin_unlock(&delayed_root->lock);
221
222	return node;
223}
224
225static struct btrfs_delayed_node *btrfs_next_delayed_node(
226						struct btrfs_delayed_node *node)
227{
228	struct btrfs_delayed_root *delayed_root;
229	struct list_head *p;
230	struct btrfs_delayed_node *next = NULL;
231
232	delayed_root = node->root->fs_info->delayed_root;
233	spin_lock(&delayed_root->lock);
234	if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
235		/* not in the list */
236		if (list_empty(&delayed_root->node_list))
237			goto out;
238		p = delayed_root->node_list.next;
239	} else if (list_is_last(&node->n_list, &delayed_root->node_list))
240		goto out;
241	else
242		p = node->n_list.next;
243
244	next = list_entry(p, struct btrfs_delayed_node, n_list);
245	refcount_inc(&next->refs);
246out:
247	spin_unlock(&delayed_root->lock);
248
249	return next;
250}
251
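/*
 * Drop a reference on a delayed node.  While the node still has pending
 * items it is kept (or re-queued) on the root's node list; dropping the last
 * reference removes it from the radix tree and frees it.
 */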
252static void __btrfs_release_delayed_node(
253				struct btrfs_delayed_node *delayed_node,
254				int mod)
255{
256	struct btrfs_delayed_root *delayed_root;
257
258	if (!delayed_node)
259		return;
260
261	delayed_root = delayed_node->root->fs_info->delayed_root;
262
263	mutex_lock(&delayed_node->mutex);
264	if (delayed_node->count)
265		btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
266	else
267		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
268	mutex_unlock(&delayed_node->mutex);
269
270	if (refcount_dec_and_test(&delayed_node->refs)) {
271		struct btrfs_root *root = delayed_node->root;
272
273		spin_lock(&root->inode_lock);
274		/*
275		 * Once our refcount goes to zero, nobody is allowed to bump it
276		 * back up.  We can delete it now.
277		 */
278		ASSERT(refcount_read(&delayed_node->refs) == 0);
279		radix_tree_delete(&root->delayed_nodes_tree,
280				  delayed_node->inode_id);
281		spin_unlock(&root->inode_lock);
282		kmem_cache_free(delayed_node_cache, delayed_node);
283	}
284}
285
286static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
287{
288	__btrfs_release_delayed_node(node, 0);
289}
290
291static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
292					struct btrfs_delayed_root *delayed_root)
293{
294	struct list_head *p;
295	struct btrfs_delayed_node *node = NULL;
296
297	spin_lock(&delayed_root->lock);
298	if (list_empty(&delayed_root->prepare_list))
299		goto out;
300
301	p = delayed_root->prepare_list.next;
302	list_del_init(p);
303	node = list_entry(p, struct btrfs_delayed_node, p_list);
304	refcount_inc(&node->refs);
305out:
306	spin_unlock(&delayed_root->lock);
307
308	return node;
309}
310
311static inline void btrfs_release_prepared_delayed_node(
312					struct btrfs_delayed_node *node)
313{
314	__btrfs_release_delayed_node(node, 1);
315}
316
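/*
 * Allocate a delayed item with room for @data_len bytes of item data placed
 * right after the struct.  The caller is responsible for filling in the key
 * and the data.
 */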
317static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
318{
319	struct btrfs_delayed_item *item;
320	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
321	if (item) {
322		item->data_len = data_len;
323		item->ins_or_del = 0;
324		item->bytes_reserved = 0;
325		item->delayed_node = NULL;
326		refcount_set(&item->refs, 1);
327	}
328	return item;
329}
330
/*
 * __btrfs_lookup_delayed_item - look up a delayed item by key
 * @root:	the rb-root of the delayed items tree (insertion or deletion)
 * @key:	the key to look up
 * @prev:	used to store the previous item if the right item isn't found
 * @next:	used to store the next item if the right item isn't found
 *
 * Note: if we don't find the right item, we return NULL and fill in the
 * previous and the next items instead.
 */
341static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
342				struct rb_root *root,
343				struct btrfs_key *key,
344				struct btrfs_delayed_item **prev,
345				struct btrfs_delayed_item **next)
346{
347	struct rb_node *node, *prev_node = NULL;
348	struct btrfs_delayed_item *delayed_item = NULL;
349	int ret = 0;
350
351	node = root->rb_node;
352
353	while (node) {
354		delayed_item = rb_entry(node, struct btrfs_delayed_item,
355					rb_node);
356		prev_node = node;
357		ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
358		if (ret < 0)
359			node = node->rb_right;
360		else if (ret > 0)
361			node = node->rb_left;
362		else
363			return delayed_item;
364	}
365
366	if (prev) {
367		if (!prev_node)
368			*prev = NULL;
369		else if (ret < 0)
370			*prev = delayed_item;
371		else if ((node = rb_prev(prev_node)) != NULL) {
372			*prev = rb_entry(node, struct btrfs_delayed_item,
373					 rb_node);
374		} else
375			*prev = NULL;
376	}
377
378	if (next) {
379		if (!prev_node)
380			*next = NULL;
381		else if (ret > 0)
382			*next = delayed_item;
383		else if ((node = rb_next(prev_node)) != NULL) {
384			*next = rb_entry(node, struct btrfs_delayed_item,
385					 rb_node);
386		} else
387			*next = NULL;
388	}
389	return NULL;
390}
391
392static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
393					struct btrfs_delayed_node *delayed_node,
394					struct btrfs_key *key)
395{
396	return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
397					   NULL, NULL);
398}
399
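/*
 * Link a delayed item into the insertion or deletion rb-tree of a delayed
 * node, ordered by key.  Returns -EEXIST if an item with the same key is
 * already queued.  For dir index insertions this also advances the node's
 * index_cnt.
 */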
400static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
401				    struct btrfs_delayed_item *ins,
402				    int action)
403{
404	struct rb_node **p, *node;
405	struct rb_node *parent_node = NULL;
406	struct rb_root_cached *root;
407	struct btrfs_delayed_item *item;
408	int cmp;
409	bool leftmost = true;
410
411	if (action == BTRFS_DELAYED_INSERTION_ITEM)
412		root = &delayed_node->ins_root;
413	else if (action == BTRFS_DELAYED_DELETION_ITEM)
414		root = &delayed_node->del_root;
415	else
416		BUG();
417	p = &root->rb_root.rb_node;
418	node = &ins->rb_node;
419
420	while (*p) {
421		parent_node = *p;
422		item = rb_entry(parent_node, struct btrfs_delayed_item,
423				 rb_node);
424
425		cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
426		if (cmp < 0) {
427			p = &(*p)->rb_right;
428			leftmost = false;
429		} else if (cmp > 0) {
430			p = &(*p)->rb_left;
431		} else {
432			return -EEXIST;
433		}
434	}
435
436	rb_link_node(node, parent_node, p);
437	rb_insert_color_cached(node, root, leftmost);
438	ins->delayed_node = delayed_node;
439	ins->ins_or_del = action;
440
441	if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
442	    action == BTRFS_DELAYED_INSERTION_ITEM &&
443	    ins->key.offset >= delayed_node->index_cnt)
444			delayed_node->index_cnt = ins->key.offset + 1;
445
446	delayed_node->count++;
447	atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
448	return 0;
449}
450
451static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
452					      struct btrfs_delayed_item *item)
453{
454	return __btrfs_add_delayed_item(node, item,
455					BTRFS_DELAYED_INSERTION_ITEM);
456}
457
458static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
459					     struct btrfs_delayed_item *item)
460{
461	return __btrfs_add_delayed_item(node, item,
462					BTRFS_DELAYED_DELETION_ITEM);
463}
464
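/*
 * Account one finished delayed item and wake up any waiter in
 * btrfs_balance_delayed_items(), either every BTRFS_DELAYED_BATCH items or
 * once the backlog drops below BTRFS_DELAYED_BACKGROUND.
 */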
465static void finish_one_item(struct btrfs_delayed_root *delayed_root)
466{
467	int seq = atomic_inc_return(&delayed_root->items_seq);
468
469	/* atomic_dec_return implies a barrier */
	if (atomic_dec_return(&delayed_root->items) <
	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)
472		cond_wake_up_nomb(&delayed_root->wait);
473}
474
475static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
476{
477	struct rb_root_cached *root;
478	struct btrfs_delayed_root *delayed_root;
479
480	/* Not associated with any delayed_node */
481	if (!delayed_item->delayed_node)
482		return;
483	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
484
485	BUG_ON(!delayed_root);
486	BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
487	       delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
488
489	if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
490		root = &delayed_item->delayed_node->ins_root;
491	else
492		root = &delayed_item->delayed_node->del_root;
493
494	rb_erase_cached(&delayed_item->rb_node, root);
495	delayed_item->delayed_node->count--;
496
497	finish_one_item(delayed_root);
498}
499
500static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
501{
502	if (item) {
503		__btrfs_remove_delayed_item(item);
504		if (refcount_dec_and_test(&item->refs))
505			kfree(item);
506	}
507}
508
509static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
510					struct btrfs_delayed_node *delayed_node)
511{
512	struct rb_node *p;
513	struct btrfs_delayed_item *item = NULL;
514
515	p = rb_first_cached(&delayed_node->ins_root);
516	if (p)
517		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
518
519	return item;
520}
521
522static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
523					struct btrfs_delayed_node *delayed_node)
524{
525	struct rb_node *p;
526	struct btrfs_delayed_item *item = NULL;
527
528	p = rb_first_cached(&delayed_node->del_root);
529	if (p)
530		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
531
532	return item;
533}
534
535static struct btrfs_delayed_item *__btrfs_next_delayed_item(
536						struct btrfs_delayed_item *item)
537{
538	struct rb_node *p;
539	struct btrfs_delayed_item *next = NULL;
540
541	p = rb_next(&item->rb_node);
542	if (p)
543		next = rb_entry(p, struct btrfs_delayed_item, rb_node);
544
545	return next;
546}
547
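/*
 * Reserve metadata space for a delayed item by migrating it from the
 * transaction's block rsv into the global delayed_block_rsv, so the space
 * stays held after the transaction handle is ended.  If the transaction
 * reserved no space, nothing is migrated.
 */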
548static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
549					       struct btrfs_root *root,
550					       struct btrfs_delayed_item *item)
551{
552	struct btrfs_block_rsv *src_rsv;
553	struct btrfs_block_rsv *dst_rsv;
554	struct btrfs_fs_info *fs_info = root->fs_info;
555	u64 num_bytes;
556	int ret;
557
558	if (!trans->bytes_reserved)
559		return 0;
560
561	src_rsv = trans->block_rsv;
562	dst_rsv = &fs_info->delayed_block_rsv;
563
564	num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
565
	/*
	 * Here we migrate the space rsv from the transaction rsv, since we
	 * have already reserved space when starting the transaction.  So
	 * there is no need to reserve qgroup space here.
	 */
571	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
572	if (!ret) {
573		trace_btrfs_space_reservation(fs_info, "delayed_item",
574					      item->key.objectid,
575					      num_bytes, 1);
576		item->bytes_reserved = num_bytes;
577	}
578
579	return ret;
580}
581
582static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
583						struct btrfs_delayed_item *item)
584{
585	struct btrfs_block_rsv *rsv;
586	struct btrfs_fs_info *fs_info = root->fs_info;
587
588	if (!item->bytes_reserved)
589		return;
590
591	rsv = &fs_info->delayed_block_rsv;
592	/*
593	 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
594	 * to release/reserve qgroup space.
595	 */
596	trace_btrfs_space_reservation(fs_info, "delayed_item",
597				      item->key.objectid, item->bytes_reserved,
598				      0);
599	btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
600}
601
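/*
 * Reserve metadata space for a delayed inode update.  Normally the space is
 * migrated from the transaction rsv; when the transaction reserved nothing
 * (and we are not under the delalloc rsv), fall back to an explicit
 * non-flushing reservation and convert -EAGAIN into -ENOSPC so that
 * btrfs_dirty_inode() knows what to do.
 */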
602static int btrfs_delayed_inode_reserve_metadata(
603					struct btrfs_trans_handle *trans,
604					struct btrfs_root *root,
605					struct btrfs_inode *inode,
606					struct btrfs_delayed_node *node)
607{
608	struct btrfs_fs_info *fs_info = root->fs_info;
609	struct btrfs_block_rsv *src_rsv;
610	struct btrfs_block_rsv *dst_rsv;
611	u64 num_bytes;
612	int ret;
613
614	src_rsv = trans->block_rsv;
615	dst_rsv = &fs_info->delayed_block_rsv;
616
617	num_bytes = btrfs_calc_metadata_size(fs_info, 1);
618
	/*
	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
	 * which, for speed, doesn't reserve space.  This is a problem since we
	 * still need to reserve space for this update, so try to reserve the
	 * space here.
	 *
	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
	 * we always reserve enough to update the inode item.
	 */
628	if (!src_rsv || (!trans->bytes_reserved &&
629			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
630		ret = btrfs_qgroup_reserve_meta(root, num_bytes,
631					  BTRFS_QGROUP_RSV_META_PREALLOC, true);
632		if (ret < 0)
633			return ret;
634		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
635					  BTRFS_RESERVE_NO_FLUSH);
636		/*
637		 * Since we're under a transaction reserve_metadata_bytes could
638		 * try to commit the transaction which will make it return
639		 * EAGAIN to make us stop the transaction we have, so return
640		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
641		 */
642		if (ret == -EAGAIN) {
643			ret = -ENOSPC;
644			btrfs_qgroup_free_meta_prealloc(root, num_bytes);
645		}
646		if (!ret) {
647			node->bytes_reserved = num_bytes;
648			trace_btrfs_space_reservation(fs_info,
649						      "delayed_inode",
650						      btrfs_ino(inode),
651						      num_bytes, 1);
652		} else {
653			btrfs_qgroup_free_meta_prealloc(root, num_bytes);
654		}
655		return ret;
656	}
657
658	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
659	if (!ret) {
660		trace_btrfs_space_reservation(fs_info, "delayed_inode",
661					      btrfs_ino(inode), num_bytes, 1);
662		node->bytes_reserved = num_bytes;
663	}
664
665	return ret;
666}
667
668static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
669						struct btrfs_delayed_node *node,
670						bool qgroup_free)
671{
672	struct btrfs_block_rsv *rsv;
673
674	if (!node->bytes_reserved)
675		return;
676
677	rsv = &fs_info->delayed_block_rsv;
678	trace_btrfs_space_reservation(fs_info, "delayed_inode",
679				      node->inode_id, node->bytes_reserved, 0);
680	btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
681	if (qgroup_free)
682		btrfs_qgroup_free_meta_prealloc(node->root,
683				node->bytes_reserved);
684	else
685		btrfs_qgroup_convert_reserved_meta(node->root,
686				node->bytes_reserved);
687	node->bytes_reserved = 0;
688}
689
/*
 * This helper inserts as many consecutive items as fit into the free space
 * of the same leaf.
 */
694static int btrfs_batch_insert_items(struct btrfs_root *root,
695				    struct btrfs_path *path,
696				    struct btrfs_delayed_item *item)
697{
698	struct btrfs_delayed_item *curr, *next;
699	int free_space;
700	int total_data_size = 0, total_size = 0;
701	struct extent_buffer *leaf;
702	char *data_ptr;
703	struct btrfs_key *keys;
704	u32 *data_size;
705	struct list_head head;
706	int slot;
707	int nitems;
708	int i;
709	int ret = 0;
710
711	BUG_ON(!path->nodes[0]);
712
713	leaf = path->nodes[0];
714	free_space = btrfs_leaf_free_space(leaf);
715	INIT_LIST_HEAD(&head);
716
717	next = item;
718	nitems = 0;
719
	/*
	 * Count the number of consecutive items that we can insert in a batch.
	 */
723	while (total_size + next->data_len + sizeof(struct btrfs_item) <=
724	       free_space) {
725		total_data_size += next->data_len;
726		total_size += next->data_len + sizeof(struct btrfs_item);
727		list_add_tail(&next->tree_list, &head);
728		nitems++;
729
730		curr = next;
731		next = __btrfs_next_delayed_item(curr);
732		if (!next)
733			break;
734
735		if (!btrfs_is_continuous_delayed_item(curr, next))
736			break;
737	}
738
739	if (!nitems) {
740		ret = 0;
741		goto out;
742	}
743
	/*
	 * We need to allocate some memory, which might cause the task to
	 * sleep, so we set all locked nodes in the path to blocking locks
	 * first.
	 */
749	btrfs_set_path_blocking(path);
750
751	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
752	if (!keys) {
753		ret = -ENOMEM;
754		goto out;
755	}
756
757	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
758	if (!data_size) {
759		ret = -ENOMEM;
760		goto error;
761	}
762
763	/* get keys of all the delayed items */
764	i = 0;
765	list_for_each_entry(next, &head, tree_list) {
766		keys[i] = next->key;
767		data_size[i] = next->data_len;
768		i++;
769	}
770
771	/* insert the keys of the items */
772	setup_items_for_insert(root, path, keys, data_size, nitems);
773
774	/* insert the dir index items */
775	slot = path->slots[0];
776	list_for_each_entry_safe(curr, next, &head, tree_list) {
777		data_ptr = btrfs_item_ptr(leaf, slot, char);
778		write_extent_buffer(leaf, &curr->data,
779				    (unsigned long)data_ptr,
780				    curr->data_len);
781		slot++;
782
783		btrfs_delayed_item_release_metadata(root, curr);
784
785		list_del(&curr->tree_list);
786		btrfs_release_delayed_item(curr);
787	}
788
789error:
790	kfree(data_size);
791	kfree(keys);
792out:
793	return ret;
794}
795
/*
 * This helper handles simple insertions that don't need to extend an item
 * for new data, such as directory index item and inode item insertions.
 */
800static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
801				     struct btrfs_root *root,
802				     struct btrfs_path *path,
803				     struct btrfs_delayed_item *delayed_item)
804{
805	struct extent_buffer *leaf;
806	unsigned int nofs_flag;
807	char *ptr;
808	int ret;
809
810	nofs_flag = memalloc_nofs_save();
811	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
812				      delayed_item->data_len);
813	memalloc_nofs_restore(nofs_flag);
814	if (ret < 0 && ret != -EEXIST)
815		return ret;
816
817	leaf = path->nodes[0];
818
819	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
820
821	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
822			    delayed_item->data_len);
823	btrfs_mark_buffer_dirty(leaf);
824
825	btrfs_delayed_item_release_metadata(root, delayed_item);
826	return 0;
827}
828
/*
 * We insert an item first, then, if there are consecutive items, we try to
 * insert them into the same leaf.
 */
833static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
834				      struct btrfs_path *path,
835				      struct btrfs_root *root,
836				      struct btrfs_delayed_node *node)
837{
838	struct btrfs_delayed_item *curr, *prev;
839	int ret = 0;
840
841do_again:
842	mutex_lock(&node->mutex);
843	curr = __btrfs_first_delayed_insertion_item(node);
844	if (!curr)
845		goto insert_end;
846
847	ret = btrfs_insert_delayed_item(trans, root, path, curr);
848	if (ret < 0) {
849		btrfs_release_path(path);
850		goto insert_end;
851	}
852
853	prev = curr;
854	curr = __btrfs_next_delayed_item(prev);
855	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
856		/* insert the continuous items into the same leaf */
857		path->slots[0]++;
858		btrfs_batch_insert_items(root, path, curr);
859	}
860	btrfs_release_delayed_item(prev);
861	btrfs_mark_buffer_dirty(path->nodes[0]);
862
863	btrfs_release_path(path);
864	mutex_unlock(&node->mutex);
865	goto do_again;
866
867insert_end:
868	mutex_unlock(&node->mutex);
869	return ret;
870}
871
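/*
 * Delete a batch of consecutive dir index items that all sit in the leaf the
 * path currently points to, then release the corresponding delayed items and
 * their metadata reservations.
 */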
872static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
873				    struct btrfs_root *root,
874				    struct btrfs_path *path,
875				    struct btrfs_delayed_item *item)
876{
877	struct btrfs_delayed_item *curr, *next;
878	struct extent_buffer *leaf;
879	struct btrfs_key key;
880	struct list_head head;
881	int nitems, i, last_item;
882	int ret = 0;
883
884	BUG_ON(!path->nodes[0]);
885
886	leaf = path->nodes[0];
887
888	i = path->slots[0];
889	last_item = btrfs_header_nritems(leaf) - 1;
890	if (i > last_item)
891		return -ENOENT;	/* FIXME: Is errno suitable? */
892
893	next = item;
894	INIT_LIST_HEAD(&head);
895	btrfs_item_key_to_cpu(leaf, &key, i);
896	nitems = 0;
	/*
	 * Count the number of dir index items that we can delete in a batch.
	 */
900	while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
901		list_add_tail(&next->tree_list, &head);
902		nitems++;
903
904		curr = next;
905		next = __btrfs_next_delayed_item(curr);
906		if (!next)
907			break;
908
909		if (!btrfs_is_continuous_delayed_item(curr, next))
910			break;
911
912		i++;
913		if (i > last_item)
914			break;
915		btrfs_item_key_to_cpu(leaf, &key, i);
916	}
917
918	if (!nitems)
919		return 0;
920
921	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
922	if (ret)
923		goto out;
924
925	list_for_each_entry_safe(curr, next, &head, tree_list) {
926		btrfs_delayed_item_release_metadata(root, curr);
927		list_del(&curr->tree_list);
928		btrfs_release_delayed_item(curr);
929	}
930
931out:
932	return ret;
933}
934
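/*
 * Process the deletion rb-tree of a delayed node: look up each queued key in
 * the fs tree and delete the matching items, batching consecutive dir index
 * deletions where possible.
 */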
935static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
936				      struct btrfs_path *path,
937				      struct btrfs_root *root,
938				      struct btrfs_delayed_node *node)
939{
940	struct btrfs_delayed_item *curr, *prev;
941	unsigned int nofs_flag;
942	int ret = 0;
943
944do_again:
945	mutex_lock(&node->mutex);
946	curr = __btrfs_first_delayed_deletion_item(node);
947	if (!curr)
948		goto delete_fail;
949
950	nofs_flag = memalloc_nofs_save();
951	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
952	memalloc_nofs_restore(nofs_flag);
953	if (ret < 0)
954		goto delete_fail;
955	else if (ret > 0) {
		/*
		 * We can't find the item that this delayed deletion item
		 * points to, so it is stale; just drop it.
		 */
960		prev = curr;
961		curr = __btrfs_next_delayed_item(prev);
962		btrfs_release_delayed_item(prev);
963		ret = 0;
964		btrfs_release_path(path);
965		if (curr) {
966			mutex_unlock(&node->mutex);
967			goto do_again;
968		} else
969			goto delete_fail;
970	}
971
972	btrfs_batch_delete_items(trans, root, path, curr);
973	btrfs_release_path(path);
974	mutex_unlock(&node->mutex);
975	goto do_again;
976
977delete_fail:
978	btrfs_release_path(path);
979	mutex_unlock(&node->mutex);
980	return ret;
981}
982
983static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
984{
985	struct btrfs_delayed_root *delayed_root;
986
987	if (delayed_node &&
988	    test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
989		BUG_ON(!delayed_node->root);
990		clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
991		delayed_node->count--;
992
993		delayed_root = delayed_node->root->fs_info->delayed_root;
994		finish_one_item(delayed_root);
995	}
996}
997
998static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
999{
1000	struct btrfs_delayed_root *delayed_root;
1001
1002	ASSERT(delayed_node->root);
1003	clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1004	delayed_node->count--;
1005
1006	delayed_root = delayed_node->root->fs_info->delayed_root;
1007	finish_one_item(delayed_root);
1008}
1009
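/*
 * Copy the delayed node's in-memory inode item over the inode item in the
 * fs tree and, if a delayed iref deletion was queued, delete the inode's
 * single ref item as well.  Expects the delayed node's mutex to be held by
 * the caller.
 */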
1010static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1011					struct btrfs_root *root,
1012					struct btrfs_path *path,
1013					struct btrfs_delayed_node *node)
1014{
1015	struct btrfs_fs_info *fs_info = root->fs_info;
1016	struct btrfs_key key;
1017	struct btrfs_inode_item *inode_item;
1018	struct extent_buffer *leaf;
1019	unsigned int nofs_flag;
1020	int mod;
1021	int ret;
1022
1023	key.objectid = node->inode_id;
1024	key.type = BTRFS_INODE_ITEM_KEY;
1025	key.offset = 0;
1026
1027	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1028		mod = -1;
1029	else
1030		mod = 1;
1031
1032	nofs_flag = memalloc_nofs_save();
1033	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
1034	memalloc_nofs_restore(nofs_flag);
1035	if (ret > 0)
1036		ret = -ENOENT;
1037	if (ret < 0)
1038		goto out;
1039
1040	leaf = path->nodes[0];
1041	inode_item = btrfs_item_ptr(leaf, path->slots[0],
1042				    struct btrfs_inode_item);
1043	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1044			    sizeof(struct btrfs_inode_item));
1045	btrfs_mark_buffer_dirty(leaf);
1046
1047	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1048		goto no_iref;
1049
1050	path->slots[0]++;
1051	if (path->slots[0] >= btrfs_header_nritems(leaf))
1052		goto search;
1053again:
1054	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1055	if (key.objectid != node->inode_id)
1056		goto out;
1057
1058	if (key.type != BTRFS_INODE_REF_KEY &&
1059	    key.type != BTRFS_INODE_EXTREF_KEY)
1060		goto out;
1061
	/*
	 * Delayed iref deletion is only used for an inode that has a single
	 * link, so there is only one iref.  The case where several irefs
	 * share the same item doesn't exist.
	 */
1067	btrfs_del_item(trans, root, path);
out:
	/* Only drop the iref accounting if an iref deletion was queued. */
	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		btrfs_release_delayed_iref(node);
1070no_iref:
1071	btrfs_release_path(path);
1072err_out:
1073	btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
1074	btrfs_release_delayed_inode(node);
1075
1076	/*
1077	 * If we fail to update the delayed inode we need to abort the
1078	 * transaction, because we could leave the inode with the improper
1079	 * counts behind.
1080	 */
1081	if (ret && ret != -ENOENT)
1082		btrfs_abort_transaction(trans, ret);
1083
1084	return ret;
1085
1086search:
1087	btrfs_release_path(path);
1088
1089	key.type = BTRFS_INODE_EXTREF_KEY;
1090	key.offset = -1;
1091
1092	nofs_flag = memalloc_nofs_save();
1093	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1094	memalloc_nofs_restore(nofs_flag);
1095	if (ret < 0)
1096		goto err_out;
1097	ASSERT(ret);
1098
1099	ret = 0;
1100	leaf = path->nodes[0];
1101	path->slots[0]--;
1102	goto again;
1103}
1104
1105static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1106					     struct btrfs_root *root,
1107					     struct btrfs_path *path,
1108					     struct btrfs_delayed_node *node)
1109{
1110	int ret;
1111
1112	mutex_lock(&node->mutex);
1113	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
1114		mutex_unlock(&node->mutex);
1115		return 0;
1116	}
1117
1118	ret = __btrfs_update_delayed_inode(trans, root, path, node);
1119	mutex_unlock(&node->mutex);
1120	return ret;
1121}
1122
1123static inline int
1124__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1125				   struct btrfs_path *path,
1126				   struct btrfs_delayed_node *node)
1127{
1128	int ret;
1129
1130	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1131	if (ret)
1132		return ret;
1133
1134	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1135	if (ret)
1136		return ret;
1137
1138	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1139	return ret;
1140}
1141
1142/*
1143 * Called when committing the transaction.
1144 * Returns 0 on success.
1145 * Returns < 0 on error and returns with an aborted transaction with any
1146 * outstanding delayed items cleaned up.
1147 */
1148static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
1149{
1150	struct btrfs_fs_info *fs_info = trans->fs_info;
1151	struct btrfs_delayed_root *delayed_root;
1152	struct btrfs_delayed_node *curr_node, *prev_node;
1153	struct btrfs_path *path;
1154	struct btrfs_block_rsv *block_rsv;
1155	int ret = 0;
1156	bool count = (nr > 0);
1157
1158	if (TRANS_ABORTED(trans))
1159		return -EIO;
1160
1161	path = btrfs_alloc_path();
1162	if (!path)
1163		return -ENOMEM;
1164	path->leave_spinning = 1;
1165
1166	block_rsv = trans->block_rsv;
1167	trans->block_rsv = &fs_info->delayed_block_rsv;
1168
1169	delayed_root = fs_info->delayed_root;
1170
1171	curr_node = btrfs_first_delayed_node(delayed_root);
1172	while (curr_node && (!count || (count && nr--))) {
1173		ret = __btrfs_commit_inode_delayed_items(trans, path,
1174							 curr_node);
1175		if (ret) {
1176			btrfs_abort_transaction(trans, ret);
1177			break;
1178		}
1179
1180		prev_node = curr_node;
1181		curr_node = btrfs_next_delayed_node(curr_node);
1182		/*
1183		 * See the comment below about releasing path before releasing
1184		 * node. If the commit of delayed items was successful the path
1185		 * should always be released, but in case of an error, it may
1186		 * point to locked extent buffers (a leaf at the very least).
1187		 */
1188		ASSERT(path->nodes[0] == NULL);
1189		btrfs_release_delayed_node(prev_node);
1190	}
1191
1192	/*
1193	 * Release the path to avoid a potential deadlock and lockdep splat when
1194	 * releasing the delayed node, as that requires taking the delayed node's
1195	 * mutex. If another task starts running delayed items before we take
1196	 * the mutex, it will first lock the mutex and then it may try to lock
1197	 * the same btree path (leaf).
1198	 */
1199	btrfs_free_path(path);
1200
1201	if (curr_node)
1202		btrfs_release_delayed_node(curr_node);
1203	trans->block_rsv = block_rsv;
1204
1205	return ret;
1206}
1207
1208int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
1209{
1210	return __btrfs_run_delayed_items(trans, -1);
1211}
1212
1213int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
1214{
1215	return __btrfs_run_delayed_items(trans, nr);
1216}
1217
1218int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1219				     struct btrfs_inode *inode)
1220{
1221	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1222	struct btrfs_path *path;
1223	struct btrfs_block_rsv *block_rsv;
1224	int ret;
1225
1226	if (!delayed_node)
1227		return 0;
1228
1229	mutex_lock(&delayed_node->mutex);
1230	if (!delayed_node->count) {
1231		mutex_unlock(&delayed_node->mutex);
1232		btrfs_release_delayed_node(delayed_node);
1233		return 0;
1234	}
1235	mutex_unlock(&delayed_node->mutex);
1236
1237	path = btrfs_alloc_path();
1238	if (!path) {
1239		btrfs_release_delayed_node(delayed_node);
1240		return -ENOMEM;
1241	}
1242	path->leave_spinning = 1;
1243
1244	block_rsv = trans->block_rsv;
1245	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1246
1247	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1248
1249	btrfs_release_delayed_node(delayed_node);
1250	btrfs_free_path(path);
1251	trans->block_rsv = block_rsv;
1252
1253	return ret;
1254}
1255
1256int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
1257{
1258	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1259	struct btrfs_trans_handle *trans;
1260	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1261	struct btrfs_path *path;
1262	struct btrfs_block_rsv *block_rsv;
1263	int ret;
1264
1265	if (!delayed_node)
1266		return 0;
1267
1268	mutex_lock(&delayed_node->mutex);
1269	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1270		mutex_unlock(&delayed_node->mutex);
1271		btrfs_release_delayed_node(delayed_node);
1272		return 0;
1273	}
1274	mutex_unlock(&delayed_node->mutex);
1275
1276	trans = btrfs_join_transaction(delayed_node->root);
1277	if (IS_ERR(trans)) {
1278		ret = PTR_ERR(trans);
1279		goto out;
1280	}
1281
1282	path = btrfs_alloc_path();
1283	if (!path) {
1284		ret = -ENOMEM;
1285		goto trans_out;
1286	}
1287	path->leave_spinning = 1;
1288
1289	block_rsv = trans->block_rsv;
1290	trans->block_rsv = &fs_info->delayed_block_rsv;
1291
1292	mutex_lock(&delayed_node->mutex);
1293	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
1294		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1295						   path, delayed_node);
1296	else
1297		ret = 0;
1298	mutex_unlock(&delayed_node->mutex);
1299
1300	btrfs_free_path(path);
1301	trans->block_rsv = block_rsv;
1302trans_out:
1303	btrfs_end_transaction(trans);
1304	btrfs_btree_balance_dirty(fs_info);
1305out:
1306	btrfs_release_delayed_node(delayed_node);
1307
1308	return ret;
1309}
1310
1311void btrfs_remove_delayed_node(struct btrfs_inode *inode)
1312{
1313	struct btrfs_delayed_node *delayed_node;
1314
1315	delayed_node = READ_ONCE(inode->delayed_node);
1316	if (!delayed_node)
1317		return;
1318
1319	inode->delayed_node = NULL;
1320	btrfs_release_delayed_node(delayed_node);
1321}
1322
1323struct btrfs_async_delayed_work {
1324	struct btrfs_delayed_root *delayed_root;
1325	int nr;
1326	struct btrfs_work work;
1327};
1328
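/*
 * Worker for the delayed_workers workqueue: commit prepared delayed nodes
 * until async_work->nr of them have been processed (or up to
 * BTRFS_DELAYED_WRITEBACK nodes when nr is 0), stopping early once the item
 * backlog falls below half of BTRFS_DELAYED_BACKGROUND.
 */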
1329static void btrfs_async_run_delayed_root(struct btrfs_work *work)
1330{
1331	struct btrfs_async_delayed_work *async_work;
1332	struct btrfs_delayed_root *delayed_root;
1333	struct btrfs_trans_handle *trans;
1334	struct btrfs_path *path;
1335	struct btrfs_delayed_node *delayed_node = NULL;
1336	struct btrfs_root *root;
1337	struct btrfs_block_rsv *block_rsv;
1338	int total_done = 0;
1339
1340	async_work = container_of(work, struct btrfs_async_delayed_work, work);
1341	delayed_root = async_work->delayed_root;
1342
1343	path = btrfs_alloc_path();
1344	if (!path)
1345		goto out;
1346
1347	do {
1348		if (atomic_read(&delayed_root->items) <
1349		    BTRFS_DELAYED_BACKGROUND / 2)
1350			break;
1351
1352		delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
1353		if (!delayed_node)
1354			break;
1355
1356		path->leave_spinning = 1;
1357		root = delayed_node->root;
1358
1359		trans = btrfs_join_transaction(root);
1360		if (IS_ERR(trans)) {
1361			btrfs_release_path(path);
1362			btrfs_release_prepared_delayed_node(delayed_node);
1363			total_done++;
1364			continue;
1365		}
1366
1367		block_rsv = trans->block_rsv;
1368		trans->block_rsv = &root->fs_info->delayed_block_rsv;
1369
1370		__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1371
1372		trans->block_rsv = block_rsv;
1373		btrfs_end_transaction(trans);
1374		btrfs_btree_balance_dirty_nodelay(root->fs_info);
1375
1376		btrfs_release_path(path);
1377		btrfs_release_prepared_delayed_node(delayed_node);
1378		total_done++;
1379
1380	} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
1381		 || total_done < async_work->nr);
1382
1383	btrfs_free_path(path);
1384out:
1385	wake_up(&delayed_root->wait);
1386	kfree(async_work);
1387}
1388
1389
1390static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1391				     struct btrfs_fs_info *fs_info, int nr)
1392{
1393	struct btrfs_async_delayed_work *async_work;
1394
1395	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
1396	if (!async_work)
1397		return -ENOMEM;
1398
1399	async_work->delayed_root = delayed_root;
1400	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
1401			NULL);
1402	async_work->nr = nr;
1403
1404	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
1405	return 0;
1406}
1407
1408void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
1409{
1410	WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
1411}
1412
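/*
 * Tell a waiter in btrfs_balance_delayed_items() whether it may stop
 * waiting: either a whole batch of items has been processed since it went to
 * sleep or the backlog has dropped below the background threshold.
 */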
1413static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
1414{
1415	int val = atomic_read(&delayed_root->items_seq);
1416
1417	if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
1418		return 1;
1419
1420	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1421		return 1;
1422
1423	return 0;
1424}
1425
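/*
 * Throttle producers of delayed items: kick the async worker once the
 * backlog passes BTRFS_DELAYED_BACKGROUND and, when it reaches
 * BTRFS_DELAYED_WRITEBACK, wait until enough items have been flushed.
 */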
1426void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
1427{
1428	struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
1429
1430	if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
1431		btrfs_workqueue_normal_congested(fs_info->delayed_workers))
1432		return;
1433
1434	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1435		int seq;
1436		int ret;
1437
1438		seq = atomic_read(&delayed_root->items_seq);
1439
1440		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
1441		if (ret)
1442			return;
1443
1444		wait_event_interruptible(delayed_root->wait,
1445					 could_end_wait(delayed_root, seq));
1446		return;
1447	}
1448
1449	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
1450}
1451
1452/* Will return 0 or -ENOMEM */
1453int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1454				   const char *name, int name_len,
1455				   struct btrfs_inode *dir,
1456				   struct btrfs_disk_key *disk_key, u8 type,
1457				   u64 index)
1458{
1459	struct btrfs_delayed_node *delayed_node;
1460	struct btrfs_delayed_item *delayed_item;
1461	struct btrfs_dir_item *dir_item;
1462	int ret;
1463
1464	delayed_node = btrfs_get_or_create_delayed_node(dir);
1465	if (IS_ERR(delayed_node))
1466		return PTR_ERR(delayed_node);
1467
1468	delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
1469	if (!delayed_item) {
1470		ret = -ENOMEM;
1471		goto release_node;
1472	}
1473
1474	delayed_item->key.objectid = btrfs_ino(dir);
1475	delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
1476	delayed_item->key.offset = index;
1477
1478	dir_item = (struct btrfs_dir_item *)delayed_item->data;
1479	dir_item->location = *disk_key;
1480	btrfs_set_stack_dir_transid(dir_item, trans->transid);
1481	btrfs_set_stack_dir_data_len(dir_item, 0);
1482	btrfs_set_stack_dir_name_len(dir_item, name_len);
1483	btrfs_set_stack_dir_type(dir_item, type);
1484	memcpy((char *)(dir_item + 1), name, name_len);
1485
1486	ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
	/*
	 * We reserved enough space when we started a new transaction, so
	 * reserving metadata here cannot fail.
	 */
1491	BUG_ON(ret);
1492
1493	mutex_lock(&delayed_node->mutex);
1494	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1495	if (unlikely(ret)) {
1496		btrfs_err(trans->fs_info,
1497			  "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
1498			  name_len, name, delayed_node->root->root_key.objectid,
1499			  delayed_node->inode_id, ret);
1500		BUG();
1501	}
1502	mutex_unlock(&delayed_node->mutex);
1503
1504release_node:
1505	btrfs_release_delayed_node(delayed_node);
1506	return ret;
1507}
1508
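/*
 * If a dir index insertion with this key is still queued on the delayed
 * node, cancel it instead of queueing a deletion item.  Returns 0 if a
 * matching insertion item was found and dropped, 1 otherwise.
 */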
1509static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
1510					       struct btrfs_delayed_node *node,
1511					       struct btrfs_key *key)
1512{
1513	struct btrfs_delayed_item *item;
1514
1515	mutex_lock(&node->mutex);
1516	item = __btrfs_lookup_delayed_insertion_item(node, key);
1517	if (!item) {
1518		mutex_unlock(&node->mutex);
1519		return 1;
1520	}
1521
1522	btrfs_delayed_item_release_metadata(node->root, item);
1523	btrfs_release_delayed_item(item);
1524	mutex_unlock(&node->mutex);
1525	return 0;
1526}
1527
1528int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1529				   struct btrfs_inode *dir, u64 index)
1530{
1531	struct btrfs_delayed_node *node;
1532	struct btrfs_delayed_item *item;
1533	struct btrfs_key item_key;
1534	int ret;
1535
1536	node = btrfs_get_or_create_delayed_node(dir);
1537	if (IS_ERR(node))
1538		return PTR_ERR(node);
1539
1540	item_key.objectid = btrfs_ino(dir);
1541	item_key.type = BTRFS_DIR_INDEX_KEY;
1542	item_key.offset = index;
1543
1544	ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
1545						  &item_key);
1546	if (!ret)
1547		goto end;
1548
1549	item = btrfs_alloc_delayed_item(0);
1550	if (!item) {
1551		ret = -ENOMEM;
1552		goto end;
1553	}
1554
1555	item->key = item_key;
1556
1557	ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
	/*
	 * We reserved enough space when we started a new transaction, so
	 * reserving metadata here cannot fail.
	 */
1562	if (ret < 0) {
1563		btrfs_err(trans->fs_info,
"metadata reservation failed for delayed dir item deletion, should have been reserved");
1565		btrfs_release_delayed_item(item);
1566		goto end;
1567	}
1568
1569	mutex_lock(&node->mutex);
1570	ret = __btrfs_add_delayed_deletion_item(node, item);
1571	if (unlikely(ret)) {
1572		btrfs_err(trans->fs_info,
1573			  "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
1574			  index, node->root->root_key.objectid,
1575			  node->inode_id, ret);
1576		btrfs_delayed_item_release_metadata(dir->root, item);
1577		btrfs_release_delayed_item(item);
1578	}
1579	mutex_unlock(&node->mutex);
1580end:
1581	btrfs_release_delayed_node(node);
1582	return ret;
1583}
1584
1585int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
1586{
1587	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1588
1589	if (!delayed_node)
1590		return -ENOENT;
1591
	/*
	 * Since we hold the i_mutex of this directory, it is impossible for a
	 * new directory index to be added to the delayed node or for index_cnt
	 * to be updated while we are here.  So we don't need to lock the
	 * delayed node.
	 */
1597	if (!delayed_node->index_cnt) {
1598		btrfs_release_delayed_node(delayed_node);
1599		return -EINVAL;
1600	}
1601
1602	inode->index_cnt = delayed_node->index_cnt;
1603	btrfs_release_delayed_node(delayed_node);
1604	return 0;
1605}
1606
1607bool btrfs_readdir_get_delayed_items(struct inode *inode,
1608				     struct list_head *ins_list,
1609				     struct list_head *del_list)
1610{
1611	struct btrfs_delayed_node *delayed_node;
1612	struct btrfs_delayed_item *item;
1613
1614	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
1615	if (!delayed_node)
1616		return false;
1617
1618	/*
1619	 * We can only do one readdir with delayed items at a time because of
1620	 * item->readdir_list.
1621	 */
1622	inode_unlock_shared(inode);
1623	inode_lock(inode);
1624
1625	mutex_lock(&delayed_node->mutex);
1626	item = __btrfs_first_delayed_insertion_item(delayed_node);
1627	while (item) {
1628		refcount_inc(&item->refs);
1629		list_add_tail(&item->readdir_list, ins_list);
1630		item = __btrfs_next_delayed_item(item);
1631	}
1632
1633	item = __btrfs_first_delayed_deletion_item(delayed_node);
1634	while (item) {
1635		refcount_inc(&item->refs);
1636		list_add_tail(&item->readdir_list, del_list);
1637		item = __btrfs_next_delayed_item(item);
1638	}
1639	mutex_unlock(&delayed_node->mutex);
	/*
	 * This delayed node is still cached in the btrfs inode, so refs
	 * must be > 1 now, and we don't need to check whether it is about
	 * to be freed.
	 *
	 * Besides that, this function is only used for readdir and we do
	 * not insert/delete delayed items during this period.  So we also
	 * don't need to requeue or dequeue this delayed node.
	 */
1649	refcount_dec(&delayed_node->refs);
1650
1651	return true;
1652}
1653
1654void btrfs_readdir_put_delayed_items(struct inode *inode,
1655				     struct list_head *ins_list,
1656				     struct list_head *del_list)
1657{
1658	struct btrfs_delayed_item *curr, *next;
1659
1660	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1661		list_del(&curr->readdir_list);
1662		if (refcount_dec_and_test(&curr->refs))
1663			kfree(curr);
1664	}
1665
1666	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1667		list_del(&curr->readdir_list);
1668		if (refcount_dec_and_test(&curr->refs))
1669			kfree(curr);
1670	}
1671
1672	/*
1673	 * The VFS is going to do up_read(), so we need to downgrade back to a
1674	 * read lock.
1675	 */
1676	downgrade_write(&inode->i_rwsem);
1677}
1678
1679int btrfs_should_delete_dir_index(struct list_head *del_list,
1680				  u64 index)
1681{
1682	struct btrfs_delayed_item *curr;
1683	int ret = 0;
1684
1685	list_for_each_entry(curr, del_list, readdir_list) {
1686		if (curr->key.offset > index)
1687			break;
1688		if (curr->key.offset == index) {
1689			ret = 1;
1690			break;
1691		}
1692	}
1693	return ret;
1694}
1695
/*
 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
 */
1700int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
1701				    struct list_head *ins_list)
1702{
1703	struct btrfs_dir_item *di;
1704	struct btrfs_delayed_item *curr, *next;
1705	struct btrfs_key location;
1706	char *name;
1707	int name_len;
1708	int over = 0;
1709	unsigned char d_type;
1710
1711	if (list_empty(ins_list))
1712		return 0;
1713
	/*
	 * Changing the data of the delayed items is impossible, so we
	 * don't need to lock them.  And since we hold the i_mutex of the
	 * directory, nobody can delete any directory index now.
	 */
1719	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1720		list_del(&curr->readdir_list);
1721
1722		if (curr->key.offset < ctx->pos) {
1723			if (refcount_dec_and_test(&curr->refs))
1724				kfree(curr);
1725			continue;
1726		}
1727
1728		ctx->pos = curr->key.offset;
1729
1730		di = (struct btrfs_dir_item *)curr->data;
1731		name = (char *)(di + 1);
1732		name_len = btrfs_stack_dir_name_len(di);
1733
1734		d_type = fs_ftype_to_dtype(di->type);
1735		btrfs_disk_key_to_cpu(&location, &di->location);
1736
1737		over = !dir_emit(ctx, name, name_len,
1738			       location.objectid, d_type);
1739
1740		if (refcount_dec_and_test(&curr->refs))
1741			kfree(curr);
1742
1743		if (over)
1744			return 1;
1745		ctx->pos++;
1746	}
1747	return 0;
1748}
1749
1750static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1751				  struct btrfs_inode_item *inode_item,
1752				  struct inode *inode)
1753{
1754	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
1755	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
1756	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1757	btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1758	btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
1759	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1760	btrfs_set_stack_inode_generation(inode_item,
1761					 BTRFS_I(inode)->generation);
1762	btrfs_set_stack_inode_sequence(inode_item,
1763				       inode_peek_iversion(inode));
1764	btrfs_set_stack_inode_transid(inode_item, trans->transid);
1765	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1766	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1767	btrfs_set_stack_inode_block_group(inode_item, 0);
1768
1769	btrfs_set_stack_timespec_sec(&inode_item->atime,
1770				     inode->i_atime.tv_sec);
1771	btrfs_set_stack_timespec_nsec(&inode_item->atime,
1772				      inode->i_atime.tv_nsec);
1773
1774	btrfs_set_stack_timespec_sec(&inode_item->mtime,
1775				     inode->i_mtime.tv_sec);
1776	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
1777				      inode->i_mtime.tv_nsec);
1778
1779	btrfs_set_stack_timespec_sec(&inode_item->ctime,
1780				     inode->i_ctime.tv_sec);
1781	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
1782				      inode->i_ctime.tv_nsec);
1783
1784	btrfs_set_stack_timespec_sec(&inode_item->otime,
1785				     BTRFS_I(inode)->i_otime.tv_sec);
1786	btrfs_set_stack_timespec_nsec(&inode_item->otime,
1787				     BTRFS_I(inode)->i_otime.tv_nsec);
1788}
1789
1790int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1791{
1792	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
1793	struct btrfs_delayed_node *delayed_node;
1794	struct btrfs_inode_item *inode_item;
1795
1796	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
1797	if (!delayed_node)
1798		return -ENOENT;
1799
1800	mutex_lock(&delayed_node->mutex);
1801	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1802		mutex_unlock(&delayed_node->mutex);
1803		btrfs_release_delayed_node(delayed_node);
1804		return -ENOENT;
1805	}
1806
1807	inode_item = &delayed_node->inode_item;
1808
1809	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
1810	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
1811	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
1812	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
1813			round_up(i_size_read(inode), fs_info->sectorsize));
1814	inode->i_mode = btrfs_stack_inode_mode(inode_item);
1815	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1816	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1817	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
	BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
1819
1820	inode_set_iversion_queried(inode,
1821				   btrfs_stack_inode_sequence(inode_item));
1822	inode->i_rdev = 0;
1823	*rdev = btrfs_stack_inode_rdev(inode_item);
1824	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
1825
1826	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
1827	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
1828
1829	inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
1830	inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
1831
1832	inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
1833	inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
1834
1835	BTRFS_I(inode)->i_otime.tv_sec =
1836		btrfs_stack_timespec_sec(&inode_item->otime);
1837	BTRFS_I(inode)->i_otime.tv_nsec =
1838		btrfs_stack_timespec_nsec(&inode_item->otime);
1839
1840	inode->i_generation = BTRFS_I(inode)->generation;
1841	BTRFS_I(inode)->index_cnt = (u64)-1;
1842
1843	mutex_unlock(&delayed_node->mutex);
1844	btrfs_release_delayed_node(delayed_node);
1845	return 0;
1846}
1847
1848int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1849			       struct btrfs_root *root, struct inode *inode)
1850{
1851	struct btrfs_delayed_node *delayed_node;
1852	int ret = 0;
1853
1854	delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode));
1855	if (IS_ERR(delayed_node))
1856		return PTR_ERR(delayed_node);
1857
1858	mutex_lock(&delayed_node->mutex);
1859	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1860		fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1861		goto release_node;
1862	}
1863
1864	ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode),
1865						   delayed_node);
1866	if (ret)
1867		goto release_node;
1868
1869	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1870	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1871	delayed_node->count++;
1872	atomic_inc(&root->fs_info->delayed_root->items);
1873release_node:
1874	mutex_unlock(&delayed_node->mutex);
1875	btrfs_release_delayed_node(delayed_node);
1876	return ret;
1877}
1878
1879int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
1880{
1881	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1882	struct btrfs_delayed_node *delayed_node;
1883
	/*
	 * We don't do delayed inode updates during log recovery because it
	 * leads to ENOSPC problems.  This means we also can't do delayed
	 * inode ref deletions.
	 */
1889	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
1890		return -EAGAIN;
1891
1892	delayed_node = btrfs_get_or_create_delayed_node(inode);
1893	if (IS_ERR(delayed_node))
1894		return PTR_ERR(delayed_node);
1895
	/*
	 * We don't reserve space for the inode ref deletion because:
	 * - We ONLY do async inode ref deletion for an inode that has only
	 *   one link (i_nlink == 1), which means there is only one inode ref.
	 *   In most cases the inode ref and the inode item are in the same
	 *   leaf, and we will deal with them at the same time.  Since we are
	 *   sure we will reserve the space for the inode item, it is
	 *   unnecessary to reserve space for the inode ref deletion.
	 * - If the inode ref and the inode item are not in the same leaf,
	 *   we also don't need to worry about ENOSPC, because we reserve
	 *   much more space for the inode update than it needs.
	 * - At worst, we can steal some space from the global reservation.
	 *   This is very rare.
	 */
1910	mutex_lock(&delayed_node->mutex);
1911	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1912		goto release_node;
1913
1914	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1915	delayed_node->count++;
1916	atomic_inc(&fs_info->delayed_root->items);
1917release_node:
1918	mutex_unlock(&delayed_node->mutex);
1919	btrfs_release_delayed_node(delayed_node);
1920	return 0;
1921}
1922
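/*
 * Throw away everything a delayed node still has queued: all insertion and
 * deletion items, a pending iref deletion and a pending inode update,
 * releasing their metadata reservations.  Used by the eviction and teardown
 * helpers below.
 */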
1923static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1924{
1925	struct btrfs_root *root = delayed_node->root;
1926	struct btrfs_fs_info *fs_info = root->fs_info;
1927	struct btrfs_delayed_item *curr_item, *prev_item;
1928
1929	mutex_lock(&delayed_node->mutex);
1930	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
1931	while (curr_item) {
1932		btrfs_delayed_item_release_metadata(root, curr_item);
1933		prev_item = curr_item;
1934		curr_item = __btrfs_next_delayed_item(prev_item);
1935		btrfs_release_delayed_item(prev_item);
1936	}
1937
1938	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
1939	while (curr_item) {
1940		btrfs_delayed_item_release_metadata(root, curr_item);
1941		prev_item = curr_item;
1942		curr_item = __btrfs_next_delayed_item(prev_item);
1943		btrfs_release_delayed_item(prev_item);
1944	}
1945
1946	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1947		btrfs_release_delayed_iref(delayed_node);
1948
1949	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1950		btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
1951		btrfs_release_delayed_inode(delayed_node);
1952	}
1953	mutex_unlock(&delayed_node->mutex);
1954}
1955
1956void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
1957{
1958	struct btrfs_delayed_node *delayed_node;
1959
1960	delayed_node = btrfs_get_delayed_node(inode);
1961	if (!delayed_node)
1962		return;
1963
1964	__btrfs_kill_delayed_node(delayed_node);
1965	btrfs_release_delayed_node(delayed_node);
1966}
1967
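/*
 * Kill the delayed nodes of a whole root, walking the radix tree in gangs of
 * up to ARRAY_SIZE(delayed_nodes) entries and skipping nodes that are already
 * being freed.
 */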
1968void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
1969{
1970	u64 inode_id = 0;
1971	struct btrfs_delayed_node *delayed_nodes[8];
1972	int i, n;
1973
1974	while (1) {
1975		spin_lock(&root->inode_lock);
1976		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
1977					   (void **)delayed_nodes, inode_id,
1978					   ARRAY_SIZE(delayed_nodes));
1979		if (!n) {
1980			spin_unlock(&root->inode_lock);
1981			break;
1982		}
1983
1984		inode_id = delayed_nodes[n - 1]->inode_id + 1;
1985		for (i = 0; i < n; i++) {
1986			/*
1987			 * Don't increase refs in case the node is dead and
1988			 * about to be removed from the tree in the loop below
1989			 */
1990			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
1991				delayed_nodes[i] = NULL;
1992		}
1993		spin_unlock(&root->inode_lock);
1994
1995		for (i = 0; i < n; i++) {
1996			if (!delayed_nodes[i])
1997				continue;
1998			__btrfs_kill_delayed_node(delayed_nodes[i]);
1999			btrfs_release_delayed_node(delayed_nodes[i]);
2000		}
2001	}
2002}
2003
2004void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
2005{
2006	struct btrfs_delayed_node *curr_node, *prev_node;
2007
2008	curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
2009	while (curr_node) {
2010		__btrfs_kill_delayed_node(curr_node);
2011
2012		prev_node = curr_node;
2013		curr_node = btrfs_next_delayed_node(curr_node);
2014		btrfs_release_delayed_node(prev_node);
2015	}
2016}
2017
2018