xref: /kernel/linux/linux-6.6/fs/btrfs/qgroup.c (revision 62306a36)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"

/*
 * Helpers to access qgroup reservations
 *
 * Callers should ensure the lock context (generally fs_info->qgroup_lock)
 * and the reservation type are valid.
 */

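/* Sum of all reservation types (data, meta pertrans and meta prealloc). */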
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

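/*
 * The old/new refcnt helpers below implement lazily reset reference counts:
 * instead of zeroing every qgroup's refcnt before each accounting run, the
 * global sequence number @seq is bumped, and any refcnt smaller than @seq is
 * treated as zero.  For example, with seq = 100 and qg->old_refcnt = 42 left
 * over from an earlier run, an update of +1 first snaps the count to 100 and
 * then increments it to 101, so the getter returns 101 - 100 = 1.
 */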
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}

/*
 * glue structure to represent the relations between qgroups.
 */
struct btrfs_qgroup_list {
	struct list_head next_group;
	struct list_head next_member;
	struct btrfs_qgroup *group;
	struct btrfs_qgroup *member;
};

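/*
 * A ulist node's aux field is a plain u64, so it is used to smuggle the
 * qgroup pointer through ulist walks; the two helpers below just cast back
 * and forth via uintptr_t.
 */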
static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
	return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
{
	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *n = fs_info->qgroup_tree.rb_node;
	struct btrfs_qgroup *qgroup;

	while (n) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		if (qgroup->qgroupid < qgroupid)
			n = n->rb_right;
		else if (qgroup->qgroupid > qgroupid)
			n = n->rb_left;
		else
			return qgroup;
	}
	return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  u64 qgroupid)
{
	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	while (*p) {
		parent = *p;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid < qgroupid)
			p = &(*p)->rb_right;
		else if (qgroup->qgroupid > qgroupid)
			p = &(*p)->rb_left;
		else
			return qgroup;
	}

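	/* GFP_ATOMIC, as we run under the qgroup_lock spinlock (see above). */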
	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
	if (!qgroup)
		return ERR_PTR(-ENOMEM);

	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);
	INIT_LIST_HEAD(&qgroup->iterator);

	rb_link_node(&qgroup->node, parent, p);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}

static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
			    struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(fs_info, qgroup);
	return 0;
}

/*
 * Add relation specified by two qgroups.
 *
 * Must be called with qgroup_lock held.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the qgroups is NULL
 *         <0       other errors
 */
static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{
	struct btrfs_qgroup_list *list;

	if (!member || !parent)
		return -ENOENT;

	list = kzalloc(sizeof(*list), GFP_ATOMIC);
	if (!list)
		return -ENOMEM;

	list->group = parent;
	list->member = member;
	list_add_tail(&list->next_group, &member->groups);
	list_add_tail(&list->next_member, &parent->members);

	return 0;
}

/*
 * Add relation specified by two qgroup ids.
 *
 * Must be called with qgroup_lock held.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the ids does not exist
 *         <0       other errors
 */
static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);

	return __add_relation_rb(member, parent);
}

/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif

static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
	fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
				  BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
				  BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}

/*
 * The full config is read in one go; this is only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
	if (!fs_info->qgroup_ulist) {
		ret = -ENOMEM;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;
	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
				 "old qgroup version, quota disabled");
				goto out;
			}
			if (btrfs_qgroup_status_generation(l, ptr) !=
			    fs_info->generation) {
				qgroup_mark_inconsistent(fs_info);
				btrfs_err(fs_info,
					"qgroup generation mismatch, marked as inconsistent");
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
									  ptr);
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
			btrfs_err(fs_info, "inconsistent qgroup config");
			qgroup_mark_inconsistent(fs_info);
		}
		if (!qgroup) {
			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				goto out;
			}
		}
		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
		if (ret < 0)
			goto out;

		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto next2;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		ret = add_relation_rb(fs_info, found_key.objectid,
				      found_key.offset);
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				"orphan qgroup relation 0x%llx->0x%llx",
				found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
		 ret >= 0)
		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

	if (ret < 0) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		btrfs_sysfs_del_qgroups(fs_info);
	}

	return ret < 0 ? ret : 0;
}

/*
 * Called in close_ctree() when quota is still enabled.  This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab qgroup
	 * lock.  And here we don't go post-order to provide a more
	 * user-friendly sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
				   btrfs_qgroup_level(qgroup->qgroupid),
				   btrfs_qgroup_subvolid(qgroup->qgroupid),
				   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}

/*
 * This is called from close_ctree(), open_ctree() or btrfs_quota_disable().
 * The first two are single-threaded paths, and for the third one we have
 * already set quota_root to NULL with qgroup_lock held, so it is safe to
 * clean up the in-memory structures without holding qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(fs_info, qgroup);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
	}
	/*
	 * We call btrfs_free_qgroup_config() both when unmounting the
	 * filesystem and when disabling quota, so we set qgroup_ulist
	 * to NULL here to avoid a double free.
	 */
	ulist_free(fs_info->qgroup_ulist);
	fs_info->qgroup_ulist = NULL;
	btrfs_sysfs_del_qgroups(fs_info);
}

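/*
 * A qgroup relation is stored on disk as a pair of mirrored items,
 * (member, parent) and (parent, member); see btrfs_add_qgroup_relation(),
 * which calls this helper once in each direction.
 */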
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

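/*
 * Create the on-disk items for one qgroup: an info item and a limit item,
 * both keyed as (0, <item type>, qgroupid).  All counters start at zero;
 * an existing pair is simply re-initialized (see the -EEXIST note below).
 */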
static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				 struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				  struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}

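/*
 * Persist the in-memory quota state (flags, generation and rescan progress)
 * into the single status item at key (0, BTRFS_QGROUP_STATUS_KEY, 0).
 */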
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				fs_info->qgroup_rescan_progress.objectid);

	btrfs_mark_buffer_dirty(trans, l);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.offset = 0;
	key.type = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete the leaves one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *ulist = NULL;
	int ret = 0;
	int slot;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to enable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
	 * and before setting BTRFS_FS_QUOTA_ENABLED.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info,
			  "qgroups are currently unsupported in extent tree v2");
		return -EINVAL;
	}

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ulist = ulist_alloc(GFP_KERNEL);
	if (!ulist) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we started the transaction lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for quota root item
	 * 1 for BTRFS_QGROUP_STATUS item
	 *
	 * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However those are not currently reserved since it
	 * would be a lot of overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	fs_info->qgroup_ulist = ulist;
	ulist = NULL;

	/*
	 * initially create the quota tree
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
				 struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	btrfs_mark_buffer_dirty(trans, leaf);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {

			/* Release locks on tree_root before we access quota_root */
			btrfs_release_path(path);

			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_search_slot_for_read(tree_root, &found_key,
							 path, 1, 0);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			if (ret > 0) {
				/*
				 * Shouldn't happen, but in case it does we
				 * don't need to do the btrfs_next_item, just
				 * continue.
				 */
				continue;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/*
	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
	 * a deadlock with tasks concurrently doing other qgroup operations,
	 * such as adding/removing qgroups or adding/deleting qgroup relations,
	 * because all qgroup operations first start or join a transaction and
	 * then lock the qgroup_ioctl_lock mutex.
	 * We are safe from a concurrent task trying to enable quotas by calling
	 * this function, since we are serialized by fs_info->subvol_sem.
	 */
	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	spin_unlock(&fs_info->qgroup_lock);

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	} else {
		/*
		 * We have set both BTRFS_FS_QUOTA_ENABLED and
		 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
		 * -EINPROGRESS. That can happen because someone started the
		 * rescan worker by calling quota rescan ioctl before we
		 * attempted to initialize the rescan worker. Failure due to
		 * quotas disabled in the meanwhile is not possible, because
		 * we are holding a write lock on fs_info->subvol_sem, which
		 * is also acquired when disabling quotas.
		 * Ignore such error, and any other error would need to undo
		 * everything we did in the transaction we just committed.
		 */
		ASSERT(ret == -EINPROGRESS);
		ret = 0;
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret)
		btrfs_put_root(quota_root);
out:
	if (ret) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		btrfs_sysfs_del_qgroups(fs_info);
	}
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	ulist_free(ulist);
	return ret;
}

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	/*
	 * We need to have subvol_sem write locked to prevent races with
	 * snapshot creation.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	/*
	 * Lock the cleaner mutex to prevent races with concurrent relocation,
	 * because relocation may be building backrefs for blocks of the quota
	 * root while we are deleting the root. This is like dropping fs roots
	 * of deleted snapshots/subvolumes, we need the same protection.
	 *
	 * This also prevents races between concurrent tasks trying to disable
	 * quotas, because we will unlock and relock qgroup_ioctl_lock across
	 * BTRFS_FS_QUOTA_ENABLED changes.
	 */
	mutex_lock(&fs_info->cleaner_mutex);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;

	/*
	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
	 * to lock that mutex while holding a transaction handle and the rescan
	 * worker needs to commit a transaction.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * Request qgroup rescan worker to complete and wait for it. This wait
	 * must be done before transaction start for quota disable since it may
	 * deadlock with transaction by the qgroup rescan worker.
	 */
	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	spin_lock(&fs_info->trans_lock);
	list_del(&quota_root->dirty_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_tree_lock(quota_root->node);
	btrfs_clear_buffer_dirty(trans, quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
			      quota_root->node, 0, 1);

	btrfs_put_root(quota_root);

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->cleaner_mutex);

	return ret;
}

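/*
 * Put a qgroup on fs_info->dirty_qgroups so its on-disk items get updated at
 * commit time (by btrfs_run_qgroups()).  Idempotent: a qgroup that is already
 * on the dirty list stays where it is.
 */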
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

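/*
 * The iterator helpers below build a temporary set of qgroups by threading
 * them onto a caller-provided list through the embedded 'iterator' list_head,
 * which avoids any memory allocation; the list_empty() check doubles as
 * duplicate detection.  The list must be drained with qgroup_iterator_clean()
 * before the qgroups can take part in another iteration.
 */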
static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
	if (!list_empty(&qgroup->iterator))
		return;

	list_add_tail(&qgroup->iterator, head);
}

static void qgroup_iterator_clean(struct list_head *head)
{
	while (!list_empty(head)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
		list_del_init(&qgroup->iterator);
	}
}

/*
 * The easy accounting case: we're updating a qgroup relationship whose child
 * qgroup only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same applies to qgroup reservation space, which must also be
 * added/removed for the parent.
 * Otherwise, when the child later releases reservation space, the parent
 * would underflow its reservation (for the relationship adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *glist;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup->rfer += sign * num_bytes;
	qgroup->rfer_cmpr += sign * num_bytes;

	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
	qgroup->excl += sign * num_bytes;
	qgroup->excl_cmpr += sign * num_bytes;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

	qgroup_dirty(fs_info, qgroup);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(glist, &qgroup->groups, next_group) {
		ret = ulist_add(tmp, glist->group->qgroupid,
				qgroup_to_aux(glist->group), GFP_ATOMIC);
		if (ret < 0)
			goto out;
	}

	/* Iterate all of the parents and adjust their reference counts */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		qgroup = unode_aux_to_qgroup(unode);
		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;
		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup->excl_cmpr += sign * num_bytes;
		qgroup_dirty(fs_info, qgroup);

		/* Add any parents of the parents */
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			ret = ulist_add(tmp, glist->group->qgroupid,
					qgroup_to_aux(glist->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 if a full rescan is needed
 * (the INCONSISTENT flag has been set in that case).
 * Return <0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;
	int err = 0;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = 0;
		err = __qgroup_excl_accounting(fs_info, tmp, dst,
					       qgroup, sign);
		if (err < 0) {
			ret = err;
			goto out;
		}
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}

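/*
 * Qgroup ids encode a hierarchy level in the top 16 bits and the subvolume
 * id in the lower 48 bits, e.g. "0/257" is the level 0 qgroup of subvolume
 * 257 and "1/100" is a level 1 group.  A member must always sit at a lower
 * level than its parent, hence the level check below.
 */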
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	unsigned int nofs_flag;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* Check if such a qgroup relation already exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = __add_relation_rb(member, parent);
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	unsigned int nofs_flag;
	int ret = 0;
	int ret2;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, just try to delete the
	 * dead relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* Check if such a qgroup relation exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	ulist_free(tmp);
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		goto out;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
{
	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
		ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we would want to clear the limit on this qgroup.
	 * To meet this requirement, we treat -1 (i.e. U64_MAX, as the limit
	 * fields are u64) as a special value that tells the kernel to clear
	 * the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		qgroup_mark_inconsistent(fs_info);
		btrfs_info(fs_info, "unable to update quota limit for %llu",
		       qgroupid);
	}

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

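/*
 * Insert a dirty extent record into the delayed refs rbtree, keyed by bytenr.
 * Returns 0 if the record was inserted, or 1 if a record for the same bytenr
 * already exists, in which case the caller is responsible for freeing its
 * record (only the data_rsv fields get merged into the existing entry).
 */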
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				struct btrfs_delayed_ref_root *delayed_refs,
				struct btrfs_qgroup_extent_record *record)
{
	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_qgroup_extent_record *entry;
	u64 bytenr = record->bytenr;

	lockdep_assert_held(&delayed_refs->lock);
	trace_btrfs_qgroup_trace_extent(fs_info, record);

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
				 node);
		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
		} else {
			if (record->data_rsv && !entry->data_rsv) {
				entry->data_rsv = record->data_rsv;
				entry->data_rsv_refroot =
					record->data_rsv_refroot;
			}
			return 1;
		}
	}

	rb_link_node(&record->node, parent_node, p);
	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
	return 0;
}

int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord)
{
	struct btrfs_backref_walk_ctx ctx = { 0 };
	int ret;

	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle. Often we are called when adding a data delayed
	 * reference from btrfs_truncate_inode_items() (truncating or unlinking),
	 * in which case we will be holding a write lock on extent buffer from a
	 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
	 * acquire fs_info->commit_root_sem, because that is a higher level lock
	 * that must be acquired before locking any extent buffers.
	 *
	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
	 * but we can't pass it a non-NULL transaction handle, because otherwise
	 * it would not use commit roots and would lock extent buffers, causing
	 * a deadlock if it ends up trying to read lock the same extent buffer
	 * that was previously write locked at btrfs_truncate_inode_items().
	 *
	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
	 * explicitly tell it to not acquire the commit_root_sem - if we are
	 * holding a transaction handle we don't need its protection.
	 */
	ASSERT(trans != NULL);

	if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
		return 0;

	ctx.bytenr = qrecord->bytenr;
	ctx.fs_info = trans->fs_info;

	ret = btrfs_find_all_roots(&ctx, true);
	if (ret < 0) {
		qgroup_mark_inconsistent(trans->fs_info);
		btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
			ret);
		return 0;
	}

	/*
	 * Here we don't need to take the lock of
	 * trans->transaction->delayed_refs, since the inserted qrecord won't
	 * be deleted; only qrecord->node may be modified (by a new qrecord
	 * insert).
	 *
	 * So modifying qrecord->old_roots is safe here.
	 */
	qrecord->old_roots = ctx.roots;
	return 0;
}

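/*
 * Convenience wrapper: allocate a record for [bytenr, bytenr + num_bytes),
 * try to insert it with btrfs_qgroup_trace_extent_nolock() and, if it was
 * new, resolve its old roots via btrfs_qgroup_trace_extent_post().
 */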
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
	    || bytenr == 0 || num_bytes == 0)
		return 0;
	record = kzalloc(sizeof(*record), GFP_NOFS);
	if (!record)
		return -ENOMEM;

	delayed_refs = &trans->transaction->delayed_refs;
	record->bytenr = bytenr;
	record->num_bytes = num_bytes;
	record->old_roots = NULL;

	spin_lock(&delayed_refs->lock);
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	spin_unlock(&delayed_refs->lock);
	if (ret > 0) {
		kfree(record);
		return 0;
	}
	return btrfs_qgroup_trace_extent_post(trans, record);
}

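/*
 * Trace every regular/prealloc data extent referenced by a leaf so qgroup
 * accounting can catch up; inline extents and holes (disk bytenr 0) occupy
 * no extent tree items of their own and are skipped.
 */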
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}

1966/*
1967 * Walk up the tree from the bottom, freeing leaves and any interior
1968 * nodes which have had all slots visited. If a node (leaf or
1969 * interior) is freed, the node above it will have its slot
1970 * incremented. The root node will never be freed.
1971 *
1972 * At the end of this function, we should have a path which has all
1973 * slots incremented to the next position for a search. If we need to
1974 * read a new node it will be NULL and the node above it will have the
1975 * correct slot selected for a later read.
1976 *
1977 * If we increment the root node's slot counter past the number of
1978 * elements, 1 is returned to signal completion of the search.
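 *
 * Worked example (hypothetical 2-level path): with root_level == 1 and
 * nodes[1] holding 3 items, reaching a leaf always frees it (level 0 is
 * never descended further), slots[0] resets to 0 and slots[1] goes from 0
 * to 1, which is still < 3, so we return 0 and the caller re-descends into
 * the second child.  Only once slots[1] reaches 3 do we return 1 to end
 * the search.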
1979 */
1980static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1981{
1982	int level = 0;
1983	int nr, slot;
1984	struct extent_buffer *eb;
1985
1986	if (root_level == 0)
1987		return 1;
1988
1989	while (level <= root_level) {
1990		eb = path->nodes[level];
1991		nr = btrfs_header_nritems(eb);
1992		path->slots[level]++;
1993		slot = path->slots[level];
1994		if (slot >= nr || level == 0) {
1995			/*
1996			 * Don't free the root -  we will detect this
1997			 * condition after our loop and return a
1998			 * positive value for caller to stop walking the tree.
1999			 */
2000			if (level != root_level) {
2001				btrfs_tree_unlock_rw(eb, path->locks[level]);
2002				path->locks[level] = 0;
2003
2004				free_extent_buffer(eb);
2005				path->nodes[level] = NULL;
2006				path->slots[level] = 0;
2007			}
2008		} else {
2009			/*
2010			 * We have a valid slot to walk back down
2011			 * from. Stop here so caller can process these
2012			 * new nodes.
2013			 */
2014			break;
2015		}
2016
2017		level++;
2018	}
2019
2020	eb = path->nodes[root_level];
2021	if (path->slots[root_level] >= btrfs_header_nritems(eb))
2022		return 1;
2023
2024	return 0;
2025}
2026
2027/*
2028 * Helper function to trace a subtree tree block swap.
2029 *
2030 * The swap will happen at the highest tree block, but there may be a lot of
2031 * tree blocks involved.
2032 *
2033 * For example:
2034 *  OO = Old tree blocks
2035 *  NN = New tree blocks allocated during balance
2036 *
2037 *           File tree (257)                  Reloc tree for 257
2038 * L2              OO                                NN
2039 *               /    \                            /    \
2040 * L1          OO      OO (a)                    OO      NN (a)
2041 *            / \     / \                       / \     / \
2042 * L0       OO   OO OO   OO                   OO   OO NN   NN
2043 *                  (b)  (c)                          (b)  (c)
2044 *
2045 * When calling qgroup_trace_extent_swap(), we will pass:
2046 * @src_eb = OO(a)
2047 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
2048 * @dst_level = 0
2049 * @root_level = 1
2050 *
2051 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
2052 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
2053 *
2054 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
2055 *
2056 * 1) Tree search from @src_eb
2057 *    It acts as a simplified btrfs_search_slot().
2058 *    The key for search can be extracted from @dst_path->nodes[dst_level]
2059 *    (first key).
2060 *
2061 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
2062 *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty;
2063 *    they should have been marked during the previous (@dst_level = 1) pass.
2064 *
2065 * 3) Mark file extents in leaves dirty
2066 *    We don't have a good way to pick out new file extents only.
2067 *    So we still follow the old method of scanning all file extents in
2068 *    the leaf.
2069 *
2070 * This function can free us from keeping two paths, thus later we only need
2071 * to care about how to iterate all new tree blocks in the reloc tree.
2072 */
2073static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
2074				    struct extent_buffer *src_eb,
2075				    struct btrfs_path *dst_path,
2076				    int dst_level, int root_level,
2077				    bool trace_leaf)
2078{
2079	struct btrfs_key key;
2080	struct btrfs_path *src_path;
2081	struct btrfs_fs_info *fs_info = trans->fs_info;
2082	u32 nodesize = fs_info->nodesize;
2083	int cur_level = root_level;
2084	int ret;
2085
2086	BUG_ON(dst_level > root_level);
2087	/* Level mismatch */
2088	if (btrfs_header_level(src_eb) != root_level)
2089		return -EINVAL;
2090
2091	src_path = btrfs_alloc_path();
2092	if (!src_path) {
2093		ret = -ENOMEM;
2094		goto out;
2095	}
2096
2097	if (dst_level)
2098		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2099	else
2100		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2101
2102	/* For src_path */
2103	atomic_inc(&src_eb->refs);
2104	src_path->nodes[root_level] = src_eb;
2105	src_path->slots[root_level] = dst_path->slots[root_level];
2106	src_path->locks[root_level] = 0;
2107
2108	/* A simplified version of btrfs_search_slot() */
2109	while (cur_level >= dst_level) {
2110		struct btrfs_key src_key;
2111		struct btrfs_key dst_key;
2112
2113		if (src_path->nodes[cur_level] == NULL) {
2114			struct extent_buffer *eb;
2115			int parent_slot;
2116
2117			eb = src_path->nodes[cur_level + 1];
2118			parent_slot = src_path->slots[cur_level + 1];
2119
2120			eb = btrfs_read_node_slot(eb, parent_slot);
2121			if (IS_ERR(eb)) {
2122				ret = PTR_ERR(eb);
2123				goto out;
2124			}
2125
2126			src_path->nodes[cur_level] = eb;
2127
2128			btrfs_tree_read_lock(eb);
2129			src_path->locks[cur_level] = BTRFS_READ_LOCK;
2130		}
2131
2132		src_path->slots[cur_level] = dst_path->slots[cur_level];
2133		if (cur_level) {
2134			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2135					&dst_key, dst_path->slots[cur_level]);
2136			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2137					&src_key, src_path->slots[cur_level]);
2138		} else {
2139			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2140					&dst_key, dst_path->slots[cur_level]);
2141			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2142					&src_key, src_path->slots[cur_level]);
2143		}
2144		/* Content mismatch, something went wrong */
2145		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2146			ret = -ENOENT;
2147			goto out;
2148		}
2149		cur_level--;
2150	}
2151
2152	/*
2153	 * Now both @dst_path and @src_path have been populated, record the tree
2154	 * blocks for qgroup accounting.
2155	 */
2156	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2157					nodesize);
2158	if (ret < 0)
2159		goto out;
2160	ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
2161					nodesize);
2162	if (ret < 0)
2163		goto out;
2164
2165	/* Record leaf file extents */
2166	if (dst_level == 0 && trace_leaf) {
2167		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2168		if (ret < 0)
2169			goto out;
2170		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2171	}
2172out:
2173	btrfs_free_path(src_path);
2174	return ret;
2175}
2176
2177/*
2178 * Helper function to do recursive generation-aware depth-first search, to
2179 * locate all new tree blocks in a subtree of reloc tree.
2180 *
2181 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2182 *         reloc tree
2183 * L2         NN (a)
2184 *          /    \
2185 * L1    OO        NN (b)
2186 *      /  \      /  \
2187 * L0  OO  OO    OO  NN
2188 *               (c) (d)
2189 * If we pass:
2190 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2191 * @cur_level = 1
2192 * @root_level = 1
2193 *
2194 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to
2195 * trace the above tree blocks along with their counterparts in the file
2196 * tree.  During the search, old tree blocks like OO(c) are skipped, as the
2197 * tree block swap won't affect them.
2198 */
2199static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2200					   struct extent_buffer *src_eb,
2201					   struct btrfs_path *dst_path,
2202					   int cur_level, int root_level,
2203					   u64 last_snapshot, bool trace_leaf)
2204{
2205	struct btrfs_fs_info *fs_info = trans->fs_info;
2206	struct extent_buffer *eb;
2207	bool need_cleanup = false;
2208	int ret = 0;
2209	int i;
2210
2211	/* Level sanity check */
2212	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2213	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2214	    root_level < cur_level) {
2215		btrfs_err_rl(fs_info,
2216			"%s: bad levels, cur_level=%d root_level=%d",
2217			__func__, cur_level, root_level);
2218		return -EUCLEAN;
2219	}
2220
2221	/* Read the tree block if needed */
2222	if (dst_path->nodes[cur_level] == NULL) {
2223		int parent_slot;
2224		u64 child_gen;
2225
2226		/*
2227		 * dst_path->nodes[root_level] must be initialized before
2228		 * calling this function.
2229		 */
2230		if (cur_level == root_level) {
2231			btrfs_err_rl(fs_info,
2232	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2233				__func__, root_level, root_level, cur_level);
2234			return -EUCLEAN;
2235		}
2236
2237		/*
2238		 * We need to get child blockptr/gen from parent before we can
2239		 * read it.
2240		 */
2241		eb = dst_path->nodes[cur_level + 1];
2242		parent_slot = dst_path->slots[cur_level + 1];
2243		child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2244
2245		/* This node is old, no need to trace */
2246		if (child_gen < last_snapshot)
2247			goto out;
2248
2249		eb = btrfs_read_node_slot(eb, parent_slot);
2250		if (IS_ERR(eb)) {
2251			ret = PTR_ERR(eb);
2252			goto out;
2253		}
2254
2255		dst_path->nodes[cur_level] = eb;
2256		dst_path->slots[cur_level] = 0;
2257
2258		btrfs_tree_read_lock(eb);
2259		dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2260		need_cleanup = true;
2261	}
2262
2263	/* Now record this tree block and its counter part for qgroups */
2264	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2265				       root_level, trace_leaf);
2266	if (ret < 0)
2267		goto cleanup;
2268
2269	eb = dst_path->nodes[cur_level];
2270
2271	if (cur_level > 0) {
2272		/* Iterate all child tree blocks */
2273		for (i = 0; i < btrfs_header_nritems(eb); i++) {
2274			/* Skip old tree blocks as they won't be swapped */
2275			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2276				continue;
2277			dst_path->slots[cur_level] = i;
2278
2279			/* Recursive call (at most 7 times) */
2280			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2281					dst_path, cur_level - 1, root_level,
2282					last_snapshot, trace_leaf);
2283			if (ret < 0)
2284				goto cleanup;
2285		}
2286	}
2287
2288cleanup:
2289	if (need_cleanup) {
2290		/* Clean up */
2291		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2292				     dst_path->locks[cur_level]);
2293		free_extent_buffer(dst_path->nodes[cur_level]);
2294		dst_path->nodes[cur_level] = NULL;
2295		dst_path->slots[cur_level] = 0;
2296		dst_path->locks[cur_level] = 0;
2297	}
2298out:
2299	return ret;
2300}
2301
2302static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2303				struct extent_buffer *src_eb,
2304				struct extent_buffer *dst_eb,
2305				u64 last_snapshot, bool trace_leaf)
2306{
2307	struct btrfs_fs_info *fs_info = trans->fs_info;
2308	struct btrfs_path *dst_path = NULL;
2309	int level;
2310	int ret;
2311
2312	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2313		return 0;
2314
2315	/* Wrong parameter order */
2316	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2317		btrfs_err_rl(fs_info,
2318		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2319			     btrfs_header_generation(src_eb),
2320			     btrfs_header_generation(dst_eb));
2321		return -EUCLEAN;
2322	}
2323
2324	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2325		ret = -EIO;
2326		goto out;
2327	}
2328
2329	level = btrfs_header_level(dst_eb);
2330	dst_path = btrfs_alloc_path();
2331	if (!dst_path) {
2332		ret = -ENOMEM;
2333		goto out;
2334	}
2335	/* For dst_path */
2336	atomic_inc(&dst_eb->refs);
2337	dst_path->nodes[level] = dst_eb;
2338	dst_path->slots[level] = 0;
2339	dst_path->locks[level] = 0;
2340
2341	/* Do the generation-aware depth-first search */
2342	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2343					      level, last_snapshot, trace_leaf);
2344	if (ret < 0)
2345		goto out;
2346	ret = 0;
2347
2348out:
2349	btrfs_free_path(dst_path);
2350	if (ret < 0)
2351		qgroup_mark_inconsistent(fs_info);
2352	return ret;
2353}
2354
2355int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2356			       struct extent_buffer *root_eb,
2357			       u64 root_gen, int root_level)
2358{
2359	struct btrfs_fs_info *fs_info = trans->fs_info;
2360	int ret = 0;
2361	int level;
2362	u8 drop_subptree_thres;
2363	struct extent_buffer *eb = root_eb;
2364	struct btrfs_path *path = NULL;
2365
2366	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2367	BUG_ON(root_eb == NULL);
2368
2369	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2370		return 0;
2371
2372	spin_lock(&fs_info->qgroup_lock);
2373	drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
2374	spin_unlock(&fs_info->qgroup_lock);
2375
2376	/*
2377	 * This function only gets called for snapshot drop.  If we hit a high
2378	 * node here, it means we are going to change ownership for quite a lot
2379	 * of extents, which will greatly slow down btrfs_commit_transaction().
2380	 *
2381	 * So if we find a high tree node here, we just skip the accounting and
2382	 * mark qgroups inconsistent.
2383	 */
2384	if (root_level >= drop_subptree_thres) {
2385		qgroup_mark_inconsistent(fs_info);
2386		return 0;
2387	}
2388
2389	if (!extent_buffer_uptodate(root_eb)) {
2390		struct btrfs_tree_parent_check check = {
2391			.has_first_key = false,
2392			.transid = root_gen,
2393			.level = root_level
2394		};
2395
2396		ret = btrfs_read_extent_buffer(root_eb, &check);
2397		if (ret)
2398			goto out;
2399	}
2400
2401	if (root_level == 0) {
2402		ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2403		goto out;
2404	}
2405
2406	path = btrfs_alloc_path();
2407	if (!path)
2408		return -ENOMEM;
2409
2410	/*
2411	 * Walk down the tree.  Missing extent blocks are filled in as
2412	 * we go. Metadata is accounted every time we read a new
2413	 * extent block.
2414	 *
2415	 * When we reach a leaf, we account for file extent items in it,
2416	 * walk back up the tree (adjusting slot pointers as we go)
2417	 * and restart the search process.
2418	 */
2419	atomic_inc(&root_eb->refs);	/* For path */
2420	path->nodes[root_level] = root_eb;
2421	path->slots[root_level] = 0;
2422	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2423walk_down:
2424	level = root_level;
2425	while (level >= 0) {
2426		if (path->nodes[level] == NULL) {
2427			int parent_slot;
2428			u64 child_bytenr;
2429
2430			/*
2431			 * We need to get child blockptr from parent before we
2432			 * can read it.
2433			 */
2434			eb = path->nodes[level + 1];
2435			parent_slot = path->slots[level + 1];
2436			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2437
2438			eb = btrfs_read_node_slot(eb, parent_slot);
2439			if (IS_ERR(eb)) {
2440				ret = PTR_ERR(eb);
2441				goto out;
2442			}
2443
2444			path->nodes[level] = eb;
2445			path->slots[level] = 0;
2446
2447			btrfs_tree_read_lock(eb);
2448			path->locks[level] = BTRFS_READ_LOCK;
2449
2450			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2451							fs_info->nodesize);
2452			if (ret)
2453				goto out;
2454		}
2455
2456		if (level == 0) {
2457			ret = btrfs_qgroup_trace_leaf_items(trans,
2458							    path->nodes[level]);
2459			if (ret)
2460				goto out;
2461
2462			/* Nonzero return here means we completed our search */
2463			ret = adjust_slots_upwards(path, root_level);
2464			if (ret)
2465				break;
2466
2467			/* Restart search with new slots */
2468			goto walk_down;
2469		}
2470
2471		level--;
2472	}
2473
2474	ret = 0;
2475out:
2476	btrfs_free_path(path);
2477
2478	return ret;
2479}
2480
2481#define UPDATE_NEW	0
2482#define UPDATE_OLD	1
2483/*
2484 * Walk all of the roots that point to the bytenr and adjust their refcnts.
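 *
 * Worked example (hypothetical values): with fs_info->qgroup_seq == 100, a
 * qgroup whose stale old_refcnt is 7 is first reset to the seq baseline and
 * then incremented, ending at 101, so btrfs_qgroup_get_old_refcnt(qg, 100)
 * later reports 1.  The seq baseline lets the counters be reused without
 * zeroing every qgroup between extents.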
2485 */
2486static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2487				struct ulist *roots, struct ulist *tmp,
2488				struct ulist *qgroups, u64 seq, int update_old)
2489{
2490	struct ulist_node *unode;
2491	struct ulist_iterator uiter;
2492	struct ulist_node *tmp_unode;
2493	struct ulist_iterator tmp_uiter;
2494	struct btrfs_qgroup *qg;
2495	int ret = 0;
2496
2497	if (!roots)
2498		return 0;
2499	ULIST_ITER_INIT(&uiter);
2500	while ((unode = ulist_next(roots, &uiter))) {
2501		qg = find_qgroup_rb(fs_info, unode->val);
2502		if (!qg)
2503			continue;
2504
2505		ulist_reinit(tmp);
2506		ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2507				GFP_ATOMIC);
2508		if (ret < 0)
2509			return ret;
2510		ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2511		if (ret < 0)
2512			return ret;
2513		ULIST_ITER_INIT(&tmp_uiter);
2514		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2515			struct btrfs_qgroup_list *glist;
2516
2517			qg = unode_aux_to_qgroup(tmp_unode);
2518			if (update_old)
2519				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2520			else
2521				btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2522			list_for_each_entry(glist, &qg->groups, next_group) {
2523				ret = ulist_add(qgroups, glist->group->qgroupid,
2524						qgroup_to_aux(glist->group),
2525						GFP_ATOMIC);
2526				if (ret < 0)
2527					return ret;
2528				ret = ulist_add(tmp, glist->group->qgroupid,
2529						qgroup_to_aux(glist->group),
2530						GFP_ATOMIC);
2531				if (ret < 0)
2532					return ret;
2533			}
2534		}
2535	}
2536	return 0;
2537}
2538
2539/*
2540 * Update qgroup rfer/excl counters.
2541 * Rfer update is easy, the code explains itself.
2542 *
2543 * Excl update is tricky, the update is split into 2 parts.
2544 * Part 1: Possible exclusive <-> sharing detect:
2545 *	|	A	|	!A	|
2546 *  -------------------------------------
2547 *  B	|	*	|	-	|
2548 *  -------------------------------------
2549 *  !B	|	+	|	**	|
2550 *  -------------------------------------
2551 *
2552 * Conditions:
2553 * A:	cur_old_roots < nr_old_roots	(not exclusive before)
2554 * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
2555 * B:	cur_new_roots < nr_new_roots	(not exclusive now)
2556 * !B:	cur_new_roots == nr_new_roots	(possible exclusive now)
2557 *
2558 * Results:
2559 * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
2560 * *: Definitely not changed.		**: Possible unchanged.
2561 *
2562 * For the !A and !B conditions, the exception is the cur_old/new_roots
2563 * == 0 case.
2564 *
2565 * To make the logic clear, we first use conditions A and B to split the
2566 * combinations into 4 results.
2567 *
2568 * Then, for results "+" and "-", check the old/new_roots == 0 case, as
2569 * in those only one variant may be 0.
2570 *
2571 * Lastly, check result **; since both variants may be 0 there, split it
2572 * again (2x2).  But this time we don't need to consider other things -
2573 * the code and logic are easy to follow now.
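 *
 * Worked example (hypothetical numbers): a 16K extent goes from
 * old_roots = {257, 258} to new_roots = {257}, so nr_old_roots == 2 and
 * nr_new_roots == 1.  For qgroup 0/257, cur_old_count == 1 < nr_old_roots
 * and cur_new_count == 1 == nr_new_roots, so it takes the shared ->
 * exclusive branch: excl grows by 16K while rfer is unchanged.  For 0/258,
 * cur_new_count == 0, so rfer shrinks by 16K while excl is untouched, as
 * the extent was shared before.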
2574 */
2575static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2576				  struct ulist *qgroups,
2577				  u64 nr_old_roots,
2578				  u64 nr_new_roots,
2579				  u64 num_bytes, u64 seq)
2580{
2581	struct ulist_node *unode;
2582	struct ulist_iterator uiter;
2583	struct btrfs_qgroup *qg;
2584	u64 cur_new_count, cur_old_count;
2585
2586	ULIST_ITER_INIT(&uiter);
2587	while ((unode = ulist_next(qgroups, &uiter))) {
2588		bool dirty = false;
2589
2590		qg = unode_aux_to_qgroup(unode);
2591		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2592		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2593
2594		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2595					     cur_new_count);
2596
2597		/* Rfer update part */
2598		if (cur_old_count == 0 && cur_new_count > 0) {
2599			qg->rfer += num_bytes;
2600			qg->rfer_cmpr += num_bytes;
2601			dirty = true;
2602		}
2603		if (cur_old_count > 0 && cur_new_count == 0) {
2604			qg->rfer -= num_bytes;
2605			qg->rfer_cmpr -= num_bytes;
2606			dirty = true;
2607		}
2608
2609		/* Excl update part */
2610		/* Exclusive/none -> shared case */
2611		if (cur_old_count == nr_old_roots &&
2612		    cur_new_count < nr_new_roots) {
2613			/* Exclusive -> shared */
2614			if (cur_old_count != 0) {
2615				qg->excl -= num_bytes;
2616				qg->excl_cmpr -= num_bytes;
2617				dirty = true;
2618			}
2619		}
2620
2621		/* Shared -> exclusive/none case */
2622		if (cur_old_count < nr_old_roots &&
2623		    cur_new_count == nr_new_roots) {
2624			/* Shared->exclusive */
2625			if (cur_new_count != 0) {
2626				qg->excl += num_bytes;
2627				qg->excl_cmpr += num_bytes;
2628				dirty = true;
2629			}
2630		}
2631
2632		/* Exclusive/none -> exclusive/none case */
2633		if (cur_old_count == nr_old_roots &&
2634		    cur_new_count == nr_new_roots) {
2635			if (cur_old_count == 0) {
2636				/* None -> exclusive/none */
2637
2638				if (cur_new_count != 0) {
2639					/* None -> exclusive */
2640					qg->excl += num_bytes;
2641					qg->excl_cmpr += num_bytes;
2642					dirty = true;
2643				}
2644				/* None -> none, nothing changed */
2645			} else {
2646				/* Exclusive -> exclusive/none */
2647
2648				if (cur_new_count == 0) {
2649					/* Exclusive -> none */
2650					qg->excl -= num_bytes;
2651					qg->excl_cmpr -= num_bytes;
2652					dirty = true;
2653				}
2654				/* Exclusive -> exclusive, nothing changed */
2655			}
2656		}
2657
2658		if (dirty)
2659			qgroup_dirty(fs_info, qg);
2660	}
2661	return 0;
2662}
2663
2664/*
2665 * Check if @roots is potentially a list of fs tree roots
2666 *
2667 * Return 0 if the list definitely contains no fs/subvol tree roots
2668 * Return 1 for possible fs/subvol tree roots in the list (an empty list
2669 *          counts as well)
2670 */
2671static int maybe_fs_roots(struct ulist *roots)
2672{
2673	struct ulist_node *unode;
2674	struct ulist_iterator uiter;
2675
2676	/* Empty one, still possible for fs roots */
2677	if (!roots || roots->nnodes == 0)
2678		return 1;
2679
2680	ULIST_ITER_INIT(&uiter);
2681	unode = ulist_next(roots, &uiter);
2682	if (!unode)
2683		return 1;
2684
2685	/*
2686	 * If it contains fs tree roots, then it must belong to fs/subvol
2687	 * trees.
2688	 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2689	 */
2690	return is_fstree(unode->val);
2691}
2692
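/*
 * Account one extent against all qgroups that referred or refer to it.
 *
 * Note the ownership contract: @old_roots and @new_roots are always freed
 * via ulist_free() before returning, whether we account the extent,
 * short-circuit, or fail, so callers must not touch them afterwards.
 */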
2693int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2694				u64 num_bytes, struct ulist *old_roots,
2695				struct ulist *new_roots)
2696{
2697	struct btrfs_fs_info *fs_info = trans->fs_info;
2698	struct ulist *qgroups = NULL;
2699	struct ulist *tmp = NULL;
2700	u64 seq;
2701	u64 nr_new_roots = 0;
2702	u64 nr_old_roots = 0;
2703	int ret = 0;
2704
2705	/*
2706	 * If quotas get disabled meanwhile, the resources need to be freed and
2707	 * we can't just exit here.
2708	 */
2709	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
2710	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
2711		goto out_free;
2712
2713	if (new_roots) {
2714		if (!maybe_fs_roots(new_roots))
2715			goto out_free;
2716		nr_new_roots = new_roots->nnodes;
2717	}
2718	if (old_roots) {
2719		if (!maybe_fs_roots(old_roots))
2720			goto out_free;
2721		nr_old_roots = old_roots->nnodes;
2722	}
2723
2724	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
2725	if (nr_old_roots == 0 && nr_new_roots == 0)
2726		goto out_free;
2727
2728	BUG_ON(!fs_info->quota_root);
2729
2730	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2731					num_bytes, nr_old_roots, nr_new_roots);
2732
2733	qgroups = ulist_alloc(GFP_NOFS);
2734	if (!qgroups) {
2735		ret = -ENOMEM;
2736		goto out_free;
2737	}
2738	tmp = ulist_alloc(GFP_NOFS);
2739	if (!tmp) {
2740		ret = -ENOMEM;
2741		goto out_free;
2742	}
2743
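	/*
	 * If a rescan is running and has not passed this bytenr yet, skip the
	 * accounting here - the rescan worker will account the extent once it
	 * reaches it.
	 */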
2744	mutex_lock(&fs_info->qgroup_rescan_lock);
2745	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2746		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2747			mutex_unlock(&fs_info->qgroup_rescan_lock);
2748			ret = 0;
2749			goto out_free;
2750		}
2751	}
2752	mutex_unlock(&fs_info->qgroup_rescan_lock);
2753
2754	spin_lock(&fs_info->qgroup_lock);
2755	seq = fs_info->qgroup_seq;
2756
2757	/* Update old refcnts using old_roots */
2758	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2759				   UPDATE_OLD);
2760	if (ret < 0)
2761		goto out;
2762
2763	/* Update new refcnts using new_roots */
2764	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2765				   UPDATE_NEW);
2766	if (ret < 0)
2767		goto out;
2768
2769	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2770			       num_bytes, seq);
2771
2772	/*
2773	 * Bump qgroup_seq to avoid seq overlap
2774	 */
2775	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2776out:
2777	spin_unlock(&fs_info->qgroup_lock);
2778out_free:
2779	ulist_free(tmp);
2780	ulist_free(qgroups);
2781	ulist_free(old_roots);
2782	ulist_free(new_roots);
2783	return ret;
2784}
2785
2786int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2787{
2788	struct btrfs_fs_info *fs_info = trans->fs_info;
2789	struct btrfs_qgroup_extent_record *record;
2790	struct btrfs_delayed_ref_root *delayed_refs;
2791	struct ulist *new_roots = NULL;
2792	struct rb_node *node;
2793	u64 num_dirty_extents = 0;
2794	u64 qgroup_to_skip;
2795	int ret = 0;
2796
2797	delayed_refs = &trans->transaction->delayed_refs;
2798	qgroup_to_skip = delayed_refs->qgroup_to_skip;
2799	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2800		record = rb_entry(node, struct btrfs_qgroup_extent_record,
2801				  node);
2802
2803		num_dirty_extents++;
2804		trace_btrfs_qgroup_account_extents(fs_info, record);
2805
2806		if (!ret && !(fs_info->qgroup_flags &
2807			      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
2808			struct btrfs_backref_walk_ctx ctx = { 0 };
2809
2810			ctx.bytenr = record->bytenr;
2811			ctx.fs_info = fs_info;
2812
2813			/*
2814			 * Old roots should be searched when inserting qgroup
2815			 * extent record.
2816			 *
2817			 * But for the INCONSISTENT (NO_ACCOUNTING) -> rescan
2818			 * case, we may have some records inserted while
2819			 * NO_ACCOUNTING was set (thus no old_roots
2820			 * populated); a later rescan clears NO_ACCOUNTING,
2821			 * leaving those inserted records without old_roots
2822			 * populated.
2823			 *
2824			 * Those cases are rare and should not cost too much
2825			 * time during commit_transaction().
2826			 */
2827			if (!record->old_roots) {
2828				/* Search commit root to find old_roots */
2829				ret = btrfs_find_all_roots(&ctx, false);
2830				if (ret < 0)
2831					goto cleanup;
2832				record->old_roots = ctx.roots;
2833				ctx.roots = NULL;
2834			}
2835
2836			/*
2837			 * Use BTRFS_SEQ_LAST as time_seq to do special search,
2838			 * which doesn't lock tree or delayed_refs and search
2839			 * current root. It's safe inside commit_transaction().
2840			 */
2841			ctx.trans = trans;
2842			ctx.time_seq = BTRFS_SEQ_LAST;
2843			ret = btrfs_find_all_roots(&ctx, false);
2844			if (ret < 0)
2845				goto cleanup;
2846			new_roots = ctx.roots;
2847			if (qgroup_to_skip) {
2848				ulist_del(new_roots, qgroup_to_skip, 0);
2849				ulist_del(record->old_roots, qgroup_to_skip,
2850					  0);
2851			}
2852			ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2853							  record->num_bytes,
2854							  record->old_roots,
2855							  new_roots);
2856			record->old_roots = NULL;
2857			new_roots = NULL;
2858		}
2859		/* Free the reserved data space */
2860		btrfs_qgroup_free_refroot(fs_info,
2861				record->data_rsv_refroot,
2862				record->data_rsv,
2863				BTRFS_QGROUP_RSV_DATA);
2864cleanup:
2865		ulist_free(record->old_roots);
2866		ulist_free(new_roots);
2867		new_roots = NULL;
2868		rb_erase(node, &delayed_refs->dirty_extent_root);
2869		kfree(record);
2871	}
2872	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2873				       num_dirty_extents);
2874	return ret;
2875}
2876
2877/*
2878 * Writes all changed qgroups to disk.
2879 * Called by the transaction commit path and the qgroup assign ioctl.
2880 */
2881int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2882{
2883	struct btrfs_fs_info *fs_info = trans->fs_info;
2884	int ret = 0;
2885
2886	/*
2887	 * In case we are called from the qgroup assign ioctl, assert that we
2888	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
2889	 * disable operation (ioctl) and access a freed quota root.
2890	 */
2891	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
2892		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
2893
2894	if (!fs_info->quota_root)
2895		return ret;
2896
2897	spin_lock(&fs_info->qgroup_lock);
2898	while (!list_empty(&fs_info->dirty_qgroups)) {
2899		struct btrfs_qgroup *qgroup;
2900		qgroup = list_first_entry(&fs_info->dirty_qgroups,
2901					  struct btrfs_qgroup, dirty);
2902		list_del_init(&qgroup->dirty);
2903		spin_unlock(&fs_info->qgroup_lock);
2904		ret = update_qgroup_info_item(trans, qgroup);
2905		if (ret)
2906			qgroup_mark_inconsistent(fs_info);
2907		ret = update_qgroup_limit_item(trans, qgroup);
2908		if (ret)
2909			qgroup_mark_inconsistent(fs_info);
2910		spin_lock(&fs_info->qgroup_lock);
2911	}
2912	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2913		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2914	else
2915		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2916	spin_unlock(&fs_info->qgroup_lock);
2917
2918	ret = update_qgroup_status_item(trans);
2919	if (ret)
2920		qgroup_mark_inconsistent(fs_info);
2921
2922	return ret;
2923}
2924
2925/*
2926 * Copy the accounting information between qgroups. This is necessary
2927 * when a snapshot or a subvolume is created. Throwing an error will
2928 * cause a transaction abort so we take extra care here to only error
2929 * when a readonly fs is a reasonable outcome.
2930 */
2931int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2932			 u64 objectid, struct btrfs_qgroup_inherit *inherit)
2933{
2934	int ret = 0;
2935	int i;
2936	u64 *i_qgroups;
2937	bool committing = false;
2938	struct btrfs_fs_info *fs_info = trans->fs_info;
2939	struct btrfs_root *quota_root;
2940	struct btrfs_qgroup *srcgroup;
2941	struct btrfs_qgroup *dstgroup;
2942	bool need_rescan = false;
2943	u32 level_size = 0;
2944	u64 nums;
2945
2946	/*
2947	 * There are only two callers of this function.
2948	 *
2949	 * One in create_subvol() in the ioctl context, which needs to hold
2950	 * the qgroup_ioctl_lock.
2951	 *
2952	 * The other one is in create_pending_snapshot(), where no other qgroup
2953	 * code can modify the fs, as they all need to either start a new trans
2954	 * or hold a trans handle, thus we don't need to hold the
2955	 * qgroup_ioctl_lock.
2956	 * This avoids a long and complex lock chain and makes lockdep happy.
2957	 */
2958	spin_lock(&fs_info->trans_lock);
2959	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2960		committing = true;
2961	spin_unlock(&fs_info->trans_lock);
2962
2963	if (!committing)
2964		mutex_lock(&fs_info->qgroup_ioctl_lock);
2965	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2966		goto out;
2967
2968	quota_root = fs_info->quota_root;
2969	if (!quota_root) {
2970		ret = -EINVAL;
2971		goto out;
2972	}
2973
2974	if (inherit) {
2975		i_qgroups = (u64 *)(inherit + 1);
2976		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2977		       2 * inherit->num_excl_copies;
2978		for (i = 0; i < nums; ++i) {
2979			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2980
2981			/*
2982			 * Zero out invalid groups so we can ignore
2983			 * them later.
2984			 */
2985			if (!srcgroup ||
2986			    ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2987				*i_qgroups = 0ULL;
2988
2989			++i_qgroups;
2990		}
2991	}
2992
2993	/*
2994	 * create a tracking group for the subvol itself
2995	 */
2996	ret = add_qgroup_item(trans, quota_root, objectid);
2997	if (ret)
2998		goto out;
2999
3000	/*
3001	 * add qgroup to all inherited groups
3002	 */
3003	if (inherit) {
3004		i_qgroups = (u64 *)(inherit + 1);
3005		for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
3006			if (*i_qgroups == 0)
3007				continue;
3008			ret = add_qgroup_relation_item(trans, objectid,
3009						       *i_qgroups);
3010			if (ret && ret != -EEXIST)
3011				goto out;
3012			ret = add_qgroup_relation_item(trans, *i_qgroups,
3013						       objectid);
3014			if (ret && ret != -EEXIST)
3015				goto out;
3016		}
3017		ret = 0;
3018	}
3019
3020
3021	spin_lock(&fs_info->qgroup_lock);
3022
3023	dstgroup = add_qgroup_rb(fs_info, objectid);
3024	if (IS_ERR(dstgroup)) {
3025		ret = PTR_ERR(dstgroup);
3026		goto unlock;
3027	}
3028
3029	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
3030		dstgroup->lim_flags = inherit->lim.flags;
3031		dstgroup->max_rfer = inherit->lim.max_rfer;
3032		dstgroup->max_excl = inherit->lim.max_excl;
3033		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
3034		dstgroup->rsv_excl = inherit->lim.rsv_excl;
3035
3036		qgroup_dirty(fs_info, dstgroup);
3037	}
3038
3039	if (srcid) {
3040		srcgroup = find_qgroup_rb(fs_info, srcid);
3041		if (!srcgroup)
3042			goto unlock;
3043
3044		/*
3045		 * We call inherit after we clone the root in order to make sure
3046		 * our counts don't go crazy, so at this point the only
3047		 * difference between the two roots should be the root node.
3048		 */
3049		level_size = fs_info->nodesize;
3050		dstgroup->rfer = srcgroup->rfer;
3051		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
3052		dstgroup->excl = level_size;
3053		dstgroup->excl_cmpr = level_size;
3054		srcgroup->excl = level_size;
3055		srcgroup->excl_cmpr = level_size;
3056
3057		/* inherit the limit info */
3058		dstgroup->lim_flags = srcgroup->lim_flags;
3059		dstgroup->max_rfer = srcgroup->max_rfer;
3060		dstgroup->max_excl = srcgroup->max_excl;
3061		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
3062		dstgroup->rsv_excl = srcgroup->rsv_excl;
3063
3064		qgroup_dirty(fs_info, dstgroup);
3065		qgroup_dirty(fs_info, srcgroup);
3066	}
3067
3068	if (!inherit)
3069		goto unlock;
3070
3071	i_qgroups = (u64 *)(inherit + 1);
3072	for (i = 0; i < inherit->num_qgroups; ++i) {
3073		if (*i_qgroups) {
3074			ret = add_relation_rb(fs_info, objectid, *i_qgroups);
3075			if (ret)
3076				goto unlock;
3077		}
3078		++i_qgroups;
3079
3080		/*
3081		 * If we're doing a snapshot, and adding the snapshot to a new
3082		 * qgroup, the numbers are guaranteed to be incorrect.
3083		 */
3084		if (srcid)
3085			need_rescan = true;
3086	}
3087
3088	for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
3089		struct btrfs_qgroup *src;
3090		struct btrfs_qgroup *dst;
3091
3092		if (!i_qgroups[0] || !i_qgroups[1])
3093			continue;
3094
3095		src = find_qgroup_rb(fs_info, i_qgroups[0]);
3096		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
3097
3098		if (!src || !dst) {
3099			ret = -EINVAL;
3100			goto unlock;
3101		}
3102
3103		dst->rfer = src->rfer - level_size;
3104		dst->rfer_cmpr = src->rfer_cmpr - level_size;
3105
3106		/* Manually tweaking numbers certainly needs a rescan */
3107		need_rescan = true;
3108	}
3109	for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
3110		struct btrfs_qgroup *src;
3111		struct btrfs_qgroup *dst;
3112
3113		if (!i_qgroups[0] || !i_qgroups[1])
3114			continue;
3115
3116		src = find_qgroup_rb(fs_info, i_qgroups[0]);
3117		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
3118
3119		if (!src || !dst) {
3120			ret = -EINVAL;
3121			goto unlock;
3122		}
3123
3124		dst->excl = src->excl + level_size;
3125		dst->excl_cmpr = src->excl_cmpr + level_size;
3126		need_rescan = true;
3127	}
3128
3129unlock:
3130	spin_unlock(&fs_info->qgroup_lock);
3131	if (!ret)
3132		ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
3133out:
3134	if (!committing)
3135		mutex_unlock(&fs_info->qgroup_ioctl_lock);
3136	if (need_rescan)
3137		qgroup_mark_inconsistent(fs_info);
3138	return ret;
3139}
3140
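/*
 * Worked example (hypothetical values): with max_rfer == 1G, rfer == 900M
 * and 50M already reserved, a new 100M reservation fails the check below,
 * since 50M + 900M + 100M > 1G, and qgroup_reserve() then returns -EDQUOT
 * before any counter is touched.
 */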
3141static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3142{
3143	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3144	    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3145		return false;
3146
3147	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3148	    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3149		return false;
3150
3151	return true;
3152}
3153
3154static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3155			  enum btrfs_qgroup_rsv_type type)
3156{
3157	struct btrfs_qgroup *qgroup;
3158	struct btrfs_fs_info *fs_info = root->fs_info;
3159	u64 ref_root = root->root_key.objectid;
3160	int ret = 0;
3161	LIST_HEAD(qgroup_list);
3162
3163	if (!is_fstree(ref_root))
3164		return 0;
3165
3166	if (num_bytes == 0)
3167		return 0;
3168
3169	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3170	    capable(CAP_SYS_RESOURCE))
3171		enforce = false;
3172
3173	spin_lock(&fs_info->qgroup_lock);
3174	if (!fs_info->quota_root)
3175		goto out;
3176
3177	qgroup = find_qgroup_rb(fs_info, ref_root);
3178	if (!qgroup)
3179		goto out;
3180
3181	qgroup_iterator_add(&qgroup_list, qgroup);
3182	list_for_each_entry(qgroup, &qgroup_list, iterator) {
3183		struct btrfs_qgroup_list *glist;
3184
3185		if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
3186			ret = -EDQUOT;
3187			goto out;
3188		}
3189
3190		list_for_each_entry(glist, &qgroup->groups, next_group)
3191			qgroup_iterator_add(&qgroup_list, glist->group);
3192	}
3193
3194	ret = 0;
3195	/*
3196	 * no limits exceeded, now record the reservation into all qgroups
3197	 */
3198	list_for_each_entry(qgroup, &qgroup_list, iterator)
3199		qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
3200
3201out:
3202	qgroup_iterator_clean(&qgroup_list);
3203	spin_unlock(&fs_info->qgroup_lock);
3204	return ret;
3205}
3206
3207/*
3208 * Free @num_bytes of reserved space with @type for the qgroup (normally a
3209 * level 0 qgroup).
3210 *
3211 * Will handle all higher level qgroups too.
3212 *
3213 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3214 * This special case is only used for META_PERTRANS type.
3215 */
3216void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3217			       u64 ref_root, u64 num_bytes,
3218			       enum btrfs_qgroup_rsv_type type)
3219{
3220	struct btrfs_qgroup *qgroup;
3221	struct ulist_node *unode;
3222	struct ulist_iterator uiter;
3223	int ret = 0;
3224
3225	if (!is_fstree(ref_root))
3226		return;
3227
3228	if (num_bytes == 0)
3229		return;
3230
3231	if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3232		WARN(1, "%s: Invalid type to free", __func__);
3233		return;
3234	}
3235	spin_lock(&fs_info->qgroup_lock);
3236
3237	if (!fs_info->quota_root)
3238		goto out;
3239
3240	qgroup = find_qgroup_rb(fs_info, ref_root);
3241	if (!qgroup)
3242		goto out;
3243
3244	if (num_bytes == (u64)-1)
3245		/*
3246		 * We're freeing all pertrans rsv, get reserved value from
3247		 * level 0 qgroup as real num_bytes to free.
3248		 */
3249		num_bytes = qgroup->rsv.values[type];
3250
3251	ulist_reinit(fs_info->qgroup_ulist);
3252	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3253			qgroup_to_aux(qgroup), GFP_ATOMIC);
3254	if (ret < 0)
3255		goto out;
3256	ULIST_ITER_INIT(&uiter);
3257	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3258		struct btrfs_qgroup *qg;
3259		struct btrfs_qgroup_list *glist;
3260
3261		qg = unode_aux_to_qgroup(unode);
3262
3263		qgroup_rsv_release(fs_info, qg, num_bytes, type);
3264
3265		list_for_each_entry(glist, &qg->groups, next_group) {
3266			ret = ulist_add(fs_info->qgroup_ulist,
3267					glist->group->qgroupid,
3268					qgroup_to_aux(glist->group), GFP_ATOMIC);
3269			if (ret < 0)
3270				goto out;
3271		}
3272	}
3273
3274out:
3275	spin_unlock(&fs_info->qgroup_lock);
3276}
3277
3278/*
3279 * Check if the leaf is the last leaf, which means all node pointers
3280 * are at their last position.
3281 */
3282static bool is_last_leaf(struct btrfs_path *path)
3283{
3284	int i;
3285
3286	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3287		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3288			return false;
3289	}
3290	return true;
3291}
3292
3293/*
3294 * Returns < 0 on error, 0 when more leaves are to be scanned.
3295 * Returns 1 when done.
3296 */
3297static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3298			      struct btrfs_path *path)
3299{
3300	struct btrfs_fs_info *fs_info = trans->fs_info;
3301	struct btrfs_root *extent_root;
3302	struct btrfs_key found;
3303	struct extent_buffer *scratch_leaf = NULL;
3304	u64 num_bytes;
3305	bool done;
3306	int slot;
3307	int ret;
3308
3309	mutex_lock(&fs_info->qgroup_rescan_lock);
3310	extent_root = btrfs_extent_root(fs_info,
3311				fs_info->qgroup_rescan_progress.objectid);
3312	ret = btrfs_search_slot_for_read(extent_root,
3313					 &fs_info->qgroup_rescan_progress,
3314					 path, 1, 0);
3315
3316	btrfs_debug(fs_info,
3317		"current progress key (%llu %u %llu), search_slot ret %d",
3318		fs_info->qgroup_rescan_progress.objectid,
3319		fs_info->qgroup_rescan_progress.type,
3320		fs_info->qgroup_rescan_progress.offset, ret);
3321
3322	if (ret) {
3323		/*
3324		 * The rescan is about to end, we will not be scanning any
3325		 * further blocks. We cannot unset the RESCAN flag here, because
3326		 * we want to commit the transaction if everything went well.
3327		 * To make the live accounting work in this phase, we set our
3328		 * scan progress pointer such that every real extent objectid
3329		 * will be smaller.
3330		 */
3331		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3332		btrfs_release_path(path);
3333		mutex_unlock(&fs_info->qgroup_rescan_lock);
3334		return ret;
3335	}
3336	done = is_last_leaf(path);
3337
3338	btrfs_item_key_to_cpu(path->nodes[0], &found,
3339			      btrfs_header_nritems(path->nodes[0]) - 1);
3340	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3341
3342	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3343	if (!scratch_leaf) {
3344		ret = -ENOMEM;
3345		mutex_unlock(&fs_info->qgroup_rescan_lock);
3346		goto out;
3347	}
3348	slot = path->slots[0];
3349	btrfs_release_path(path);
3350	mutex_unlock(&fs_info->qgroup_rescan_lock);
3351
3352	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3353		struct btrfs_backref_walk_ctx ctx = { 0 };
3354
3355		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3356		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3357		    found.type != BTRFS_METADATA_ITEM_KEY)
3358			continue;
3359		if (found.type == BTRFS_METADATA_ITEM_KEY)
3360			num_bytes = fs_info->nodesize;
3361		else
3362			num_bytes = found.offset;
3363
3364		ctx.bytenr = found.objectid;
3365		ctx.fs_info = fs_info;
3366
3367		ret = btrfs_find_all_roots(&ctx, false);
3368		if (ret < 0)
3369			goto out;
3370		/* For rescan, just pass old_roots as NULL */
3371		ret = btrfs_qgroup_account_extent(trans, found.objectid,
3372						  num_bytes, NULL, ctx.roots);
3373		if (ret < 0)
3374			goto out;
3375	}
3376out:
3377	if (scratch_leaf)
3378		free_extent_buffer(scratch_leaf);
3379
3380	if (done && !ret) {
3381		ret = 1;
3382		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3383	}
3384	return ret;
3385}
3386
3387static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3388{
3389	return btrfs_fs_closing(fs_info) ||
3390		test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3391		!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3392			  fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
3393}
3394
3395static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3396{
3397	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3398						     qgroup_rescan_work);
3399	struct btrfs_path *path;
3400	struct btrfs_trans_handle *trans = NULL;
3401	int err = -ENOMEM;
3402	int ret = 0;
3403	bool stopped = false;
3404	bool did_leaf_rescans = false;
3405
3406	path = btrfs_alloc_path();
3407	if (!path)
3408		goto out;
3409	/*
3410	 * Rescan should only search the commit root, and any later difference
3411	 * is recorded by the qgroup tracing code.
3412	 */
3413	path->search_commit_root = 1;
3414	path->skip_locking = 1;
3415
3416	err = 0;
3417	while (!err && !(stopped = rescan_should_stop(fs_info))) {
3418		trans = btrfs_start_transaction(fs_info->fs_root, 0);
3419		if (IS_ERR(trans)) {
3420			err = PTR_ERR(trans);
3421			break;
3422		}
3423
3424		err = qgroup_rescan_leaf(trans, path);
3425		did_leaf_rescans = true;
3426
3427		if (err > 0)
3428			btrfs_commit_transaction(trans);
3429		else
3430			btrfs_end_transaction(trans);
3431	}
3432
3433out:
3434	btrfs_free_path(path);
3435
3436	mutex_lock(&fs_info->qgroup_rescan_lock);
3437	if (err > 0 &&
3438	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3439		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3440	} else if (err < 0 || stopped) {
3441		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3442	}
3443	mutex_unlock(&fs_info->qgroup_rescan_lock);
3444
3445	/*
3446	 * Only update status, since the previous part has already updated the
3447	 * qgroup info, and only if we did any actual work. This also prevents
3448	 * race with a concurrent quota disable, which has already set
3449	 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
3450	 * btrfs_quota_disable().
3451	 */
3452	if (did_leaf_rescans) {
3453		trans = btrfs_start_transaction(fs_info->quota_root, 1);
3454		if (IS_ERR(trans)) {
3455			err = PTR_ERR(trans);
3456			trans = NULL;
3457			btrfs_err(fs_info,
3458				  "fail to start transaction for status update: %d",
3459				  err);
3460		}
3461	} else {
3462		trans = NULL;
3463	}
3464
3465	mutex_lock(&fs_info->qgroup_rescan_lock);
3466	if (!stopped ||
3467	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
3468		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3469	if (trans) {
3470		ret = update_qgroup_status_item(trans);
3471		if (ret < 0) {
3472			err = ret;
3473			btrfs_err(fs_info, "fail to update qgroup status: %d",
3474				  err);
3475		}
3476	}
3477	fs_info->qgroup_rescan_running = false;
3478	fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
3479	complete_all(&fs_info->qgroup_rescan_completion);
3480	mutex_unlock(&fs_info->qgroup_rescan_lock);
3481
3482	if (!trans)
3483		return;
3484
3485	btrfs_end_transaction(trans);
3486
3487	if (stopped) {
3488		btrfs_info(fs_info, "qgroup scan paused");
3489	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
3490		btrfs_info(fs_info, "qgroup scan cancelled");
3491	} else if (err >= 0) {
3492		btrfs_info(fs_info, "qgroup scan completed%s",
3493			err > 0 ? " (inconsistency flag cleared)" : "");
3494	} else {
3495		btrfs_err(fs_info, "qgroup scan failed with %d", err);
3496	}
3497}
3498
3499/*
3500 * Checks that (a) no rescan is running and (b) quota is enabled, and
3501 * initializes the rescan context.
3502 */
3503static int
3504qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3505		   int init_flags)
3506{
3507	int ret = 0;
3508
3509	if (!init_flags) {
3510		/* we're resuming qgroup rescan at mount time */
3511		if (!(fs_info->qgroup_flags &
3512		      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3513			btrfs_warn(fs_info,
3514			"qgroup rescan init failed, qgroup rescan is not queued");
3515			ret = -EINVAL;
3516		} else if (!(fs_info->qgroup_flags &
3517			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
3518			btrfs_warn(fs_info,
3519			"qgroup rescan init failed, qgroup is not enabled");
3520			ret = -EINVAL;
3521		}
3522
3523		if (ret)
3524			return ret;
3525	}
3526
3527	mutex_lock(&fs_info->qgroup_rescan_lock);
3528
3529	if (init_flags) {
3530		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3531			btrfs_warn(fs_info,
3532				   "qgroup rescan is already in progress");
3533			ret = -EINPROGRESS;
3534		} else if (!(fs_info->qgroup_flags &
3535			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
3536			btrfs_warn(fs_info,
3537			"qgroup rescan init failed, qgroup is not enabled");
3538			ret = -EINVAL;
3539		} else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3540			/* Quota disable is in progress */
3541			ret = -EBUSY;
3542		}
3543
3544		if (ret) {
3545			mutex_unlock(&fs_info->qgroup_rescan_lock);
3546			return ret;
3547		}
3548		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3549	}
3550
3551	memset(&fs_info->qgroup_rescan_progress, 0,
3552		sizeof(fs_info->qgroup_rescan_progress));
3553	fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
3554				   BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
3555	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3556	init_completion(&fs_info->qgroup_rescan_completion);
3557	mutex_unlock(&fs_info->qgroup_rescan_lock);
3558
3559	btrfs_init_work(&fs_info->qgroup_rescan_work,
3560			btrfs_qgroup_rescan_worker, NULL, NULL);
3561	return 0;
3562}
3563
3564static void
3565qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3566{
3567	struct rb_node *n;
3568	struct btrfs_qgroup *qgroup;
3569
3570	spin_lock(&fs_info->qgroup_lock);
3571	/* clear all current qgroup tracking information */
3572	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3573		qgroup = rb_entry(n, struct btrfs_qgroup, node);
3574		qgroup->rfer = 0;
3575		qgroup->rfer_cmpr = 0;
3576		qgroup->excl = 0;
3577		qgroup->excl_cmpr = 0;
3578		qgroup_dirty(fs_info, qgroup);
3579	}
3580	spin_unlock(&fs_info->qgroup_lock);
3581}
3582
3583int
3584btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3585{
3586	int ret = 0;
3587	struct btrfs_trans_handle *trans;
3588
3589	ret = qgroup_rescan_init(fs_info, 0, 1);
3590	if (ret)
3591		return ret;
3592
3593	/*
3594	 * We have set the rescan_progress to 0, which means no more
3595	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
3596	 * However, a concurrent btrfs_qgroup_account_ref may already be
3597	 * past its call to btrfs_find_all_roots, in which case it would
3598	 * still do the accounting.
3599	 * To solve this, we're committing the transaction, which will
3600	 * ensure we run all delayed refs and only after that, we are
3601	 * going to clear all tracking information for a clean start.
3602	 */
3603
3604	trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
3605	if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
3606		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3607		return PTR_ERR(trans);
3608	} else if (trans != ERR_PTR(-ENOENT)) {
3609		ret = btrfs_commit_transaction(trans);
3610		if (ret) {
3611			fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3612			return ret;
3613		}
3614	}
3615
3616	qgroup_rescan_zero_tracking(fs_info);
3617
3618	mutex_lock(&fs_info->qgroup_rescan_lock);
3619	fs_info->qgroup_rescan_running = true;
3620	btrfs_queue_work(fs_info->qgroup_rescan_workers,
3621			 &fs_info->qgroup_rescan_work);
3622	mutex_unlock(&fs_info->qgroup_rescan_lock);
3623
3624	return 0;
3625}
3626
3627int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3628				     bool interruptible)
3629{
3630	int running;
3631	int ret = 0;
3632
3633	mutex_lock(&fs_info->qgroup_rescan_lock);
3634	running = fs_info->qgroup_rescan_running;
3635	mutex_unlock(&fs_info->qgroup_rescan_lock);
3636
3637	if (!running)
3638		return 0;
3639
3640	if (interruptible)
3641		ret = wait_for_completion_interruptible(
3642					&fs_info->qgroup_rescan_completion);
3643	else
3644		wait_for_completion(&fs_info->qgroup_rescan_completion);
3645
3646	return ret;
3647}
3648
3649/*
3650 * this is only called from open_ctree where we're still single threaded, thus
3651 * locking is omitted here.
3652 */
3653void
3654btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3655{
3656	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3657		mutex_lock(&fs_info->qgroup_rescan_lock);
3658		fs_info->qgroup_rescan_running = true;
3659		btrfs_queue_work(fs_info->qgroup_rescan_workers,
3660				 &fs_info->qgroup_rescan_work);
3661		mutex_unlock(&fs_info->qgroup_rescan_lock);
3662	}
3663}
3664
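/*
 * Iterate an rbtree while allowing the loop body to erase @node: @next is
 * sampled before the body runs, so erasing @node does not break iteration.
 */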
3665#define rbtree_iterate_from_safe(node, next, start)				\
3666	for (node = start; node && ({ next = rb_next(node); 1; }); node = next)
3667
3668static int qgroup_unreserve_range(struct btrfs_inode *inode,
3669				  struct extent_changeset *reserved, u64 start,
3670				  u64 len)
3671{
3672	struct rb_node *node;
3673	struct rb_node *next;
3674	struct ulist_node *entry;
3675	int ret = 0;
3676
3677	node = reserved->range_changed.root.rb_node;
3678	if (!node)
3679		return 0;
3680	while (node) {
3681		entry = rb_entry(node, struct ulist_node, rb_node);
3682		if (entry->val < start)
3683			node = node->rb_right;
3684		else
3685			node = node->rb_left;
3686	}
3687
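	/*
	 * The entry found by the descent may begin after @start; step back one
	 * entry in case an earlier range still overlaps [start, start + len).
	 */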
3688	if (entry->val > start && rb_prev(&entry->rb_node))
3689		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3690				 rb_node);
3691
3692	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3693		u64 entry_start;
3694		u64 entry_end;
3695		u64 entry_len;
3696		int clear_ret;
3697
3698		entry = rb_entry(node, struct ulist_node, rb_node);
3699		entry_start = entry->val;
3700		entry_end = entry->aux;
3701		entry_len = entry_end - entry_start + 1;
3702
3703		if (entry_start >= start + len)
3704			break;
3705		if (entry_start + entry_len <= start)
3706			continue;
3707		/*
3708		 * Now the entry overlaps [start, start + len), revert the
3709		 * EXTENT_QGROUP_RESERVED bit.
3710		 */
3711		clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3712					      entry_end, EXTENT_QGROUP_RESERVED);
3713		if (!ret && clear_ret < 0)
3714			ret = clear_ret;
3715
3716		ulist_del(&reserved->range_changed, entry->val, entry->aux);
3717		if (likely(reserved->bytes_changed >= entry_len)) {
3718			reserved->bytes_changed -= entry_len;
3719		} else {
3720			WARN_ON(1);
3721			reserved->bytes_changed = 0;
3722		}
3723	}
3724
3725	return ret;
3726}
3727
3728/*
3729 * Try to free some space for qgroup.
3730 *
3731 * For qgroup, there are only 3 ways to free qgroup space:
3732 * - Flush nodatacow write
3733 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3734 *   In theory, we should only flush nodatacow inodes, but it's not yet
3735 *   possible, so we need to flush the whole root.
3736 *
3737 * - Wait for ordered extents
3738 *   When ordered extents are finished, their reserved metadata is finally
3739 *   converted to per_trans status, which can be freed by later commit
3740 *   transaction.
3741 *
3742 * - Commit transaction
3743 *   This would free the meta_per_trans space.
3744 *   In theory this shouldn't provide much space, but any extra qgroup
3745 *   space is better than nothing when more is needed.
3746 */
3747static int try_flush_qgroup(struct btrfs_root *root)
3748{
3749	struct btrfs_trans_handle *trans;
3750	int ret;
3751
3752	/* Can't hold an open transaction or we run the risk of deadlocking. */
3753	ASSERT(current->journal_info == NULL);
3754	if (WARN_ON(current->journal_info))
3755		return 0;
3756
3757	/*
3758	 * We don't want to run flush again and again, so if there is a running
3759	 * one, we just wait for it to finish instead of starting a new flush.
3760	 */
3761	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3762		wait_event(root->qgroup_flush_wait,
3763			!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3764		return 0;
3765	}
3766
3767	ret = btrfs_start_delalloc_snapshot(root, true);
3768	if (ret < 0)
3769		goto out;
3770	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3771
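	/* -ENOENT from the attach means there is no transaction to commit. */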
3772	trans = btrfs_attach_transaction_barrier(root);
3773	if (IS_ERR(trans)) {
3774		ret = PTR_ERR(trans);
3775		if (ret == -ENOENT)
3776			ret = 0;
3777		goto out;
3778	}
3779
3780	ret = btrfs_commit_transaction(trans);
3781out:
3782	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3783	wake_up(&root->qgroup_flush_wait);
3784	return ret;
3785}
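
/*
 * Sketch (an addition, not in the original source): the flush-and-retry
 * pattern that btrfs_qgroup_reserve_data() and __btrfs_qgroup_reserve_meta()
 * below build on top of try_flush_qgroup(). @attempt is a hypothetical
 * callback standing in for the actual reservation call.
 */
static int __maybe_unused example_flush_retry(struct btrfs_root *root,
					      int (*attempt)(struct btrfs_root *root))
{
	int ret;

	ret = attempt(root);
	if (ret != -EDQUOT)
		return ret;

	/* Out of qgroup space: flush once, then retry exactly once. */
	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return attempt(root);
}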

static int qgroup_reserve_data(struct btrfs_inode *inode,
			struct extent_changeset **reserved_ret, u64 start,
			u64 len)
{
	struct btrfs_root *root = inode->root;
	struct extent_changeset *reserved;
	bool new_reserved = false;
	u64 orig_reserved;
	u64 to_reserve;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
	    !is_fstree(root->root_key.objectid) || len == 0)
		return 0;

	/* @reserved parameter is mandatory for qgroup */
	if (WARN_ON(!reserved_ret))
		return -EINVAL;
	if (!*reserved_ret) {
		new_reserved = true;
		*reserved_ret = extent_changeset_alloc();
		if (!*reserved_ret)
			return -ENOMEM;
	}
	reserved = *reserved_ret;
	/* Record already reserved space */
	orig_reserved = reserved->bytes_changed;
	ret = set_record_extent_bits(&inode->io_tree, start,
			start + len - 1, EXTENT_QGROUP_RESERVED, reserved);

	/* Newly reserved space */
	to_reserve = reserved->bytes_changed - orig_reserved;
	trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
					to_reserve, QGROUP_RESERVE);
	if (ret < 0)
		goto out;
	ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
	if (ret < 0)
		goto cleanup;

	return ret;

cleanup:
	qgroup_unreserve_range(inode, reserved, start, len);
out:
	if (new_reserved) {
		extent_changeset_free(reserved);
		*reserved_ret = NULL;
	}
	return ret;
}

/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do nothing
 * if the range is already reserved.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing
 *	 and transaction commit, so the caller should not hold any dirty page
 *	 locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
			struct extent_changeset **reserved_ret, u64 start,
			u64 len)
{
	int ret;

	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
	if (ret <= 0 && ret != -EDQUOT)
		return ret;

	ret = try_flush_qgroup(inode->root);
	if (ret < 0)
		return ret;
	return qgroup_reserve_data(inode, reserved_ret, start, len);
}
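
/*
 * Sketch of a hypothetical caller (an addition, not in the original source):
 * reserve data space and hand it back on the error path. The changeset
 * records exactly which ranges were newly reserved, so only those are freed.
 */
static int __maybe_unused example_data_reserve(struct btrfs_inode *inode,
					       u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
	if (ret < 0)
		return ret;

	/* ... if the write then fails before the data reaches disk ... */
	ret = btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
	extent_changeset_free(reserved);
	return ret;
}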

/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
				     struct extent_changeset *reserved,
				     u64 start, u64 len, u64 *freed_ret)
{
	struct btrfs_root *root = inode->root;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct extent_changeset changeset;
	u64 freed = 0;
	int ret;

	extent_changeset_init(&changeset);
	len = round_up(start + len, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);
	len -= start;
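	/*
	 * Worked example of the alignment above (illustrative numbers,
	 * assuming a 4K sectorsize): start = 5000, len = 100 gives:
	 *   len   = round_up(5100, 4096)   = 8192 (aligned exclusive end)
	 *   start = round_down(5000, 4096) = 4096
	 *   len   = 8192 - 4096            = 4096
	 */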

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
		u64 range_start = unode->val;
		/* unode->aux is the inclusive end */
		u64 range_len = unode->aux - range_start + 1;
		u64 free_start;
		u64 free_len;

		extent_changeset_release(&changeset);

		/* Only free the part of the range within [start, start + len) */
		if (range_start >= start + len ||
		    range_start + range_len <= start)
			continue;
		free_start = max(range_start, start);
		free_len = min(start + len, range_start + range_len) -
			   free_start;
		/*
		 * TODO: Also modify reserved->ranges_reserved to reflect the
		 * modification.
		 *
		 * However, as long as we free the qgroup reservation according
		 * to EXTENT_QGROUP_RESERVED, we won't double free, so there is
		 * no need to rush.
		 */
		ret = clear_record_extent_bits(&inode->io_tree, free_start,
				free_start + free_len - 1,
				EXTENT_QGROUP_RESERVED, &changeset);
		if (ret < 0)
			goto out;
		freed += changeset.bytes_changed;
	}
	btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
				  BTRFS_QGROUP_RSV_DATA);
	if (freed_ret)
		*freed_ret = freed;
	ret = 0;
out:
	extent_changeset_release(&changeset);
	return ret;
}

static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			u64 *released, int free)
{
	struct extent_changeset changeset;
	int trace_op = QGROUP_RELEASE;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
		return 0;

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len, released);
	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
				       EXTENT_QGROUP_RESERVED, &changeset);
	if (ret < 0)
		goto out;

	if (free)
		trace_op = QGROUP_FREE;
	trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
					changeset.bytes_changed, trace_op);
	if (free)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
				inode->root->root_key.objectid,
				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	if (released)
		*released = changeset.bytes_changed;
out:
	extent_changeset_release(&changeset);
	return ret;
}

/*
 * Free a reserved space range from io_tree and related qgroups
 *
 * Should be called when a range of pages gets invalidated before reaching
 * disk, or for the error cleanup case.
 * If @reserved is given, only the reserved ranges in [@start, @start + @len)
 * will be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
			   struct extent_changeset *reserved,
			   u64 start, u64 len, u64 *freed)
{
	return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}

/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages gets written to disk and the
 * corresponding FILE_EXTENT item is inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework only updates qgroup numbers at
 * commit_transaction() time, the reserved space shouldn't be freed from the
 * related qgroups.
 *
 * But we should release the range from io_tree, to allow further write to be
 * COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
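
/*
 * Sketch of a hypothetical caller (an addition, not in the original source):
 * the two ways a data reservation ends. On successful writeback the range is
 * only released (the numbers are accounted at commit time); on failure it is
 * freed straight back to the qgroups.
 */
static int __maybe_unused example_data_finish(struct btrfs_inode *inode,
					      u64 start, u64 len, bool write_ok)
{
	if (write_ok)
		/* Data made it to disk: keep the accounting until commit. */
		return btrfs_qgroup_release_data(inode, start, len, NULL);
	/* Write failed: hand the space back to the qgroups. */
	return btrfs_qgroup_free_data(inode, NULL, start, len, NULL);
}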

static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return;
	if (num_bytes == 0)
		return;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		root->qgroup_meta_rsv_prealloc += num_bytes;
	else
		root->qgroup_meta_rsv_pertrans += num_bytes;
	spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			     enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return 0;
	if (num_bytes == 0)
		return 0;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
				  num_bytes);
		root->qgroup_meta_rsv_prealloc -= num_bytes;
	} else {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
				  num_bytes);
		root->qgroup_meta_rsv_pertrans -= num_bytes;
	}
	spin_unlock(&root->qgroup_meta_rsv_lock);
	return num_bytes;
}
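
/*
 * Worked example of the clamping above (illustrative numbers, an addition to
 * the original source): with root->qgroup_meta_rsv_prealloc == 8192, a
 * request to subtract 16384 returns 8192 and leaves the counter at 0, so a
 * later release can never free more than was actually recorded.
 */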

int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid) || num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret < 0)
		return ret;
	/*
	 * Record what we have reserved into the root, to avoid an underflow
	 * across a quota disabled -> enabled transition.
	 *
	 * In that case we may try to free space we haven't reserved (since
	 * quota was disabled), so record what we reserved into the root and
	 * ensure a later release won't underflow this number.
	 */
	add_root_meta_rsv(root, num_bytes, type);
	return ret;
}

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
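
/*
 * Sketch of a hypothetical caller (an addition, not in the original source):
 * a nodesize-aligned PREALLOC metadata reservation paired with its error
 * path release.
 */
static int __maybe_unused example_meta_prealloc(struct btrfs_root *root)
{
	const int num_bytes = root->fs_info->nodesize;
	int ret;

	ret = __btrfs_qgroup_reserve_meta(root, num_bytes,
					  BTRFS_QGROUP_RSV_META_PREALLOC,
					  true, false);
	if (ret < 0)
		return ret;
	/* ... if the operation is aborted before the metadata is written: */
	btrfs_qgroup_free_meta_prealloc(root, num_bytes);
	return 0;
}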

void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;

	/* TODO: Update trace point to handle such free */
	trace_qgroup_meta_free_all_pertrans(root);
	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;

	/*
	 * A reservation for META_PREALLOC can happen before quota is enabled,
	 * which can lead to underflow.
	 * Ensure here that we only free what we have really reserved.
	 */
	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
				  num_bytes, type);
}

static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (num_bytes == 0)
		return;
	if (!fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes,
				BTRFS_QGROUP_RSV_META_PREALLOC);
		if (!sb_rdonly(fs_info->sb))
			qgroup_rsv_add(fs_info, qgroup, num_bytes,
				       BTRFS_QGROUP_RSV_META_PERTRANS);

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}

void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
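
/*
 * Sketch of a hypothetical caller (an addition, not in the original source):
 * the convert-on-success / free-on-error pattern for PREALLOC reservations.
 * Once the metadata is really written, the reservation becomes PERTRANS and
 * is later dropped wholesale by btrfs_qgroup_free_meta_all_pertrans().
 */
static void __maybe_unused example_meta_finish(struct btrfs_root *root,
					       int num_bytes, bool success)
{
	if (success)
		btrfs_qgroup_convert_reserved_meta(root, num_bytes);
	else
		btrfs_qgroup_free_meta_prealloc(root, num_bytes);
}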

/*
 * Check for leaked qgroup reserved space, normally at inode destruction
 * time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
			EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
				inode->root->root_key.objectid,
				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}

/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}

/*
 * Add subtree root records into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree that got swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **cur;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
	 * so no one else can modify tree blocks and thus the qgroup numbers
	 * will not change, no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
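	/*
	 * Note: this tree keeps larger subvol_bytenr values in the left
	 * subtree, i.e. reversed compared to the usual ascending order. The
	 * lookup in btrfs_qgroup_trace_subtree_after_cow() uses the same
	 * reversed comparison, so the two stay consistent.
	 */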
	spin_lock(&blocks->lock);
	cur = &blocks->blocks[level].rb_node;
	while (*cur) {
		struct btrfs_qgroup_swapped_block *entry;

		parent = *cur;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			cur = &(*cur)->rb_left;
		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
			cur = &(*cur)->rb_right;
		} else {
			if (entry->subvol_generation !=
					block->subvol_generation ||
			    entry->reloc_bytenr != block->reloc_bytenr ||
			    entry->reloc_generation !=
					block->reloc_generation) {
				/*
				 * Duplicate entry found with mismatched
				 * data, which shouldn't happen.
				 *
				 * Marking qgroup inconsistent should be enough
				 * for end users.
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}

/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	check.level = block->level;
	check.transid = block->reloc_generation;
	check.has_first_key = true;
	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
			block->last_snapshot, block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		qgroup_mark_inconsistent(fs_info);
	}
	return ret;
}

void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	struct btrfs_qgroup_extent_record *next;
	struct rb_root *root;

	root = &trans->delayed_refs.dirty_extent_root;
	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
	*root = RB_ROOT;
}
