162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci
362306a36Sopenharmony_ci#include "misc.h"
462306a36Sopenharmony_ci#include "ctree.h"
562306a36Sopenharmony_ci#include "space-info.h"
662306a36Sopenharmony_ci#include "sysfs.h"
762306a36Sopenharmony_ci#include "volumes.h"
862306a36Sopenharmony_ci#include "free-space-cache.h"
962306a36Sopenharmony_ci#include "ordered-data.h"
1062306a36Sopenharmony_ci#include "transaction.h"
1162306a36Sopenharmony_ci#include "block-group.h"
1262306a36Sopenharmony_ci#include "zoned.h"
1362306a36Sopenharmony_ci#include "fs.h"
1462306a36Sopenharmony_ci#include "accessors.h"
1562306a36Sopenharmony_ci#include "extent-tree.h"
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci/*
1862306a36Sopenharmony_ci * HOW DOES SPACE RESERVATION WORK
1962306a36Sopenharmony_ci *
2062306a36Sopenharmony_ci * If you want to know about delalloc specifically, there is a separate comment
2162306a36Sopenharmony_ci * for that with the delalloc code.  This comment is about how the whole system
2262306a36Sopenharmony_ci * works generally.
2362306a36Sopenharmony_ci *
2462306a36Sopenharmony_ci * BASIC CONCEPTS
2562306a36Sopenharmony_ci *
2662306a36Sopenharmony_ci *   1) space_info.  This is the ultimate arbiter of how much space we can use.
2762306a36Sopenharmony_ci *   There's a description of the bytes_ fields with the struct declaration,
2862306a36Sopenharmony_ci *   refer to that for specifics on each field.  Suffice it to say that for
2962306a36Sopenharmony_ci *   reservations we care about total_bytes - SUM(space_info->bytes_) when
3062306a36Sopenharmony_ci *   determining if there is space to make an allocation.  There is a space_info
3162306a36Sopenharmony_ci *   for METADATA, SYSTEM, and DATA areas.
3262306a36Sopenharmony_ci *
3362306a36Sopenharmony_ci *   2) block_rsv's.  These are basically buckets for every different type of
3462306a36Sopenharmony_ci *   metadata reservation we have.  You can see the comment in the block_rsv
3562306a36Sopenharmony_ci *   code on the rules for each type, but generally block_rsv->reserved is how
3662306a36Sopenharmony_ci *   much space is accounted for in space_info->bytes_may_use.
3762306a36Sopenharmony_ci *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use based
 *   on the number of items we will want to modify.  We have one for changing
 *   items, and one for inserting new items.  Generally we use these helpers to
 *   determine the size of the block reserves, and then use the actual bytes
 *   values to adjust the space_info counters.
4362306a36Sopenharmony_ci *
4462306a36Sopenharmony_ci * MAKING RESERVATIONS, THE NORMAL CASE
4562306a36Sopenharmony_ci *
4662306a36Sopenharmony_ci *   We call into either btrfs_reserve_data_bytes() or
4762306a36Sopenharmony_ci *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
4862306a36Sopenharmony_ci *   num_bytes we want to reserve.
4962306a36Sopenharmony_ci *
 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
5762306a36Sopenharmony_ci *
5862306a36Sopenharmony_ci *   ->insert reference
5962306a36Sopenharmony_ci *     Call btrfs_update_block_group() which does
6062306a36Sopenharmony_ci *     space_info->bytes_reserved -= extent_bytes
6162306a36Sopenharmony_ci *     space_info->bytes_used += extent_bytes
6262306a36Sopenharmony_ci *
6362306a36Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
6462306a36Sopenharmony_ci *
6562306a36Sopenharmony_ci *   Assume we are unable to simply make the reservation because we do not have
6662306a36Sopenharmony_ci *   enough space
6762306a36Sopenharmony_ci *
6862306a36Sopenharmony_ci *   -> __reserve_bytes
6962306a36Sopenharmony_ci *     create a reserve_ticket with ->bytes set to our reservation, add it to
7062306a36Sopenharmony_ci *     the tail of space_info->tickets, kick async flush thread
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci *   ->handle_reserve_ticket
7362306a36Sopenharmony_ci *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
7462306a36Sopenharmony_ci *     on the ticket.
7562306a36Sopenharmony_ci *
7662306a36Sopenharmony_ci *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
7762306a36Sopenharmony_ci *     Flushes various things attempting to free up space.
7862306a36Sopenharmony_ci *
7962306a36Sopenharmony_ci *   -> btrfs_try_granting_tickets()
8062306a36Sopenharmony_ci *     This is called by anything that either subtracts space from
8162306a36Sopenharmony_ci *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
8262306a36Sopenharmony_ci *     space_info->total_bytes.  This loops through the ->priority_tickets and
8362306a36Sopenharmony_ci *     then the ->tickets list checking to see if the reservation can be
8462306a36Sopenharmony_ci *     completed.  If it can the space is added to space_info->bytes_may_use and
8562306a36Sopenharmony_ci *     the ticket is woken up.
8662306a36Sopenharmony_ci *
8762306a36Sopenharmony_ci *   -> ticket wakeup
8862306a36Sopenharmony_ci *     Check if ->bytes == 0, if it does we got our reservation and we can carry
8962306a36Sopenharmony_ci *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
9062306a36Sopenharmony_ci *     were interrupted.)
9162306a36Sopenharmony_ci *
9262306a36Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
9362306a36Sopenharmony_ci *
9462306a36Sopenharmony_ci *   Same as the above, except we add ourselves to the
9562306a36Sopenharmony_ci *   space_info->priority_tickets, and we do not use ticket->wait, we simply
9662306a36Sopenharmony_ci *   call flush_space() ourselves for the states that are safe for us to call
9762306a36Sopenharmony_ci *   without deadlocking and hope for the best.
9862306a36Sopenharmony_ci *
9962306a36Sopenharmony_ci * THE FLUSHING STATES
10062306a36Sopenharmony_ci *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and a "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things however holds reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
10762306a36Sopenharmony_ci *
10862306a36Sopenharmony_ci *   FLUSH_DELAYED_ITEMS
10962306a36Sopenharmony_ci *     Every inode has a delayed item to update the inode.  Take a simple write
11062306a36Sopenharmony_ci *     for example, we would update the inode item at write time to update the
11162306a36Sopenharmony_ci *     mtime, and then again at finish_ordered_io() time in order to update the
11262306a36Sopenharmony_ci *     isize or bytes.  We keep these delayed items to coalesce these operations
11362306a36Sopenharmony_ci *     into a single operation done on demand.  These are an easy way to reclaim
11462306a36Sopenharmony_ci *     metadata space.
11562306a36Sopenharmony_ci *
11662306a36Sopenharmony_ci *   FLUSH_DELALLOC
11762306a36Sopenharmony_ci *     Look at the delalloc comment to get an idea of how much space is reserved
11862306a36Sopenharmony_ci *     for delayed allocation.  We can reclaim some of this space simply by
11962306a36Sopenharmony_ci *     running delalloc, but usually we need to wait for ordered extents to
12062306a36Sopenharmony_ci *     reclaim the bulk of this space.
12162306a36Sopenharmony_ci *
12262306a36Sopenharmony_ci *   FLUSH_DELAYED_REFS
12362306a36Sopenharmony_ci *     We have a block reserve for the outstanding delayed refs space, and every
12462306a36Sopenharmony_ci *     delayed ref operation holds a reservation.  Running these is a quick way
12562306a36Sopenharmony_ci *     to reclaim space, but we want to hold this until the end because COW can
12662306a36Sopenharmony_ci *     churn a lot and we can avoid making some extent tree modifications if we
12762306a36Sopenharmony_ci *     are able to delay for as long as possible.
12862306a36Sopenharmony_ci *
12962306a36Sopenharmony_ci *   ALLOC_CHUNK
13062306a36Sopenharmony_ci *     We will skip this the first time through space reservation, because of
13162306a36Sopenharmony_ci *     overcommit and we don't want to have a lot of useless metadata space when
13262306a36Sopenharmony_ci *     our worst case reservations will likely never come true.
13362306a36Sopenharmony_ci *
13462306a36Sopenharmony_ci *   RUN_DELAYED_IPUTS
13562306a36Sopenharmony_ci *     If we're freeing inodes we're likely freeing checksums, file extent
13662306a36Sopenharmony_ci *     items, and extent tree items.  Loads of space could be freed up by these
13762306a36Sopenharmony_ci *     operations, however they won't be usable until the transaction commits.
13862306a36Sopenharmony_ci *
13962306a36Sopenharmony_ci *   COMMIT_TRANS
 *     This will commit the transaction.  Historically we had a lot of logic
 *     surrounding whether or not we'd commit the transaction, but this was born
 *     out of a pre-tickets era where we could end up committing the transaction
 *     thousands of times in a row without making progress.  Now thanks to our
 *     ticketing system we know if we're not making progress and can error
 *     everybody out after a few commits rather than burning the disk hoping for
 *     a different answer.
14762306a36Sopenharmony_ci *
14862306a36Sopenharmony_ci * OVERCOMMIT
14962306a36Sopenharmony_ci *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata, data does not allow
 *   overcommitting.
15462306a36Sopenharmony_ci *
15562306a36Sopenharmony_ci *   You can see the current logic for when we allow overcommit in
15662306a36Sopenharmony_ci *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
15762306a36Sopenharmony_ci *   is no unallocated space to be had, all reservations are kept within the
15862306a36Sopenharmony_ci *   free space in the allocated metadata chunks.
15962306a36Sopenharmony_ci *
16062306a36Sopenharmony_ci *   Because of overcommitting, you generally want to use the
16162306a36Sopenharmony_ci *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
16262306a36Sopenharmony_ci *   thing with or without extra unallocated space.
16362306a36Sopenharmony_ci */
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ciu64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
16662306a36Sopenharmony_ci			  bool may_use_included)
16762306a36Sopenharmony_ci{
16862306a36Sopenharmony_ci	ASSERT(s_info);
16962306a36Sopenharmony_ci	return s_info->bytes_used + s_info->bytes_reserved +
17062306a36Sopenharmony_ci		s_info->bytes_pinned + s_info->bytes_readonly +
17162306a36Sopenharmony_ci		s_info->bytes_zone_unusable +
17262306a36Sopenharmony_ci		(may_use_included ? s_info->bytes_may_use : 0);
17362306a36Sopenharmony_ci}
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci/*
17662306a36Sopenharmony_ci * after adding space to the filesystem, we need to clear the full flags
17762306a36Sopenharmony_ci * on all the space infos.
17862306a36Sopenharmony_ci */
17962306a36Sopenharmony_civoid btrfs_clear_space_info_full(struct btrfs_fs_info *info)
18062306a36Sopenharmony_ci{
18162306a36Sopenharmony_ci	struct list_head *head = &info->space_info;
18262306a36Sopenharmony_ci	struct btrfs_space_info *found;
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci	list_for_each_entry(found, head, list)
18562306a36Sopenharmony_ci		found->full = 0;
18662306a36Sopenharmony_ci}
18762306a36Sopenharmony_ci
/*
 * Default background reclaim threshold, in percent: block groups whose
 * unusable space exceeds this value get scheduled for background reclaim.
 */
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH			(75)
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci/*
19562306a36Sopenharmony_ci * Calculate chunk size depending on volume type (regular or zoned).
19662306a36Sopenharmony_ci */
19762306a36Sopenharmony_cistatic u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
19862306a36Sopenharmony_ci{
19962306a36Sopenharmony_ci	if (btrfs_is_zoned(fs_info))
20062306a36Sopenharmony_ci		return fs_info->zone_size;
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_ci	ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_ci	if (flags & BTRFS_BLOCK_GROUP_DATA)
20562306a36Sopenharmony_ci		return BTRFS_MAX_DATA_CHUNK_SIZE;
20662306a36Sopenharmony_ci	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
20762306a36Sopenharmony_ci		return SZ_32M;
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci	/* Handle BTRFS_BLOCK_GROUP_METADATA */
21062306a36Sopenharmony_ci	if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
21162306a36Sopenharmony_ci		return SZ_1G;
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	return SZ_256M;
21462306a36Sopenharmony_ci}
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci/*
21762306a36Sopenharmony_ci * Update default chunk size.
21862306a36Sopenharmony_ci */
21962306a36Sopenharmony_civoid btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
22062306a36Sopenharmony_ci					u64 chunk_size)
22162306a36Sopenharmony_ci{
22262306a36Sopenharmony_ci	WRITE_ONCE(space_info->chunk_size, chunk_size);
22362306a36Sopenharmony_ci}
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_cistatic int create_space_info(struct btrfs_fs_info *info, u64 flags)
22662306a36Sopenharmony_ci{
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
22962306a36Sopenharmony_ci	int i;
23062306a36Sopenharmony_ci	int ret;
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ci	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
23362306a36Sopenharmony_ci	if (!space_info)
23462306a36Sopenharmony_ci		return -ENOMEM;
23562306a36Sopenharmony_ci
23662306a36Sopenharmony_ci	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
23762306a36Sopenharmony_ci		INIT_LIST_HEAD(&space_info->block_groups[i]);
23862306a36Sopenharmony_ci	init_rwsem(&space_info->groups_sem);
23962306a36Sopenharmony_ci	spin_lock_init(&space_info->lock);
24062306a36Sopenharmony_ci	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
24162306a36Sopenharmony_ci	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
24262306a36Sopenharmony_ci	INIT_LIST_HEAD(&space_info->ro_bgs);
24362306a36Sopenharmony_ci	INIT_LIST_HEAD(&space_info->tickets);
24462306a36Sopenharmony_ci	INIT_LIST_HEAD(&space_info->priority_tickets);
24562306a36Sopenharmony_ci	space_info->clamp = 1;
24662306a36Sopenharmony_ci	btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci	if (btrfs_is_zoned(info))
24962306a36Sopenharmony_ci		space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci	ret = btrfs_sysfs_add_space_info_type(info, space_info);
25262306a36Sopenharmony_ci	if (ret)
25362306a36Sopenharmony_ci		return ret;
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	list_add(&space_info->list, &info->space_info);
25662306a36Sopenharmony_ci	if (flags & BTRFS_BLOCK_GROUP_DATA)
25762306a36Sopenharmony_ci		info->data_sinfo = space_info;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	return ret;
26062306a36Sopenharmony_ci}
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ciint btrfs_init_space_info(struct btrfs_fs_info *fs_info)
26362306a36Sopenharmony_ci{
26462306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
26562306a36Sopenharmony_ci	u64 features;
26662306a36Sopenharmony_ci	u64 flags;
26762306a36Sopenharmony_ci	int mixed = 0;
26862306a36Sopenharmony_ci	int ret;
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	disk_super = fs_info->super_copy;
27162306a36Sopenharmony_ci	if (!btrfs_super_root(disk_super))
27262306a36Sopenharmony_ci		return -EINVAL;
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci	features = btrfs_super_incompat_flags(disk_super);
27562306a36Sopenharmony_ci	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
27662306a36Sopenharmony_ci		mixed = 1;
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	flags = BTRFS_BLOCK_GROUP_SYSTEM;
27962306a36Sopenharmony_ci	ret = create_space_info(fs_info, flags);
28062306a36Sopenharmony_ci	if (ret)
28162306a36Sopenharmony_ci		goto out;
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	if (mixed) {
28462306a36Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
28562306a36Sopenharmony_ci		ret = create_space_info(fs_info, flags);
28662306a36Sopenharmony_ci	} else {
28762306a36Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_METADATA;
28862306a36Sopenharmony_ci		ret = create_space_info(fs_info, flags);
28962306a36Sopenharmony_ci		if (ret)
29062306a36Sopenharmony_ci			goto out;
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_DATA;
29362306a36Sopenharmony_ci		ret = create_space_info(fs_info, flags);
29462306a36Sopenharmony_ci	}
29562306a36Sopenharmony_ciout:
29662306a36Sopenharmony_ci	return ret;
29762306a36Sopenharmony_ci}
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_civoid btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
30062306a36Sopenharmony_ci				struct btrfs_block_group *block_group)
30162306a36Sopenharmony_ci{
30262306a36Sopenharmony_ci	struct btrfs_space_info *found;
30362306a36Sopenharmony_ci	int factor, index;
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_ci	factor = btrfs_bg_type_to_factor(block_group->flags);
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci	found = btrfs_find_space_info(info, block_group->flags);
30862306a36Sopenharmony_ci	ASSERT(found);
30962306a36Sopenharmony_ci	spin_lock(&found->lock);
31062306a36Sopenharmony_ci	found->total_bytes += block_group->length;
31162306a36Sopenharmony_ci	found->disk_total += block_group->length * factor;
31262306a36Sopenharmony_ci	found->bytes_used += block_group->used;
31362306a36Sopenharmony_ci	found->disk_used += block_group->used * factor;
31462306a36Sopenharmony_ci	found->bytes_readonly += block_group->bytes_super;
31562306a36Sopenharmony_ci	found->bytes_zone_unusable += block_group->zone_unusable;
31662306a36Sopenharmony_ci	if (block_group->length > 0)
31762306a36Sopenharmony_ci		found->full = 0;
31862306a36Sopenharmony_ci	btrfs_try_granting_tickets(info, found);
31962306a36Sopenharmony_ci	spin_unlock(&found->lock);
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	block_group->space_info = found;
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	index = btrfs_bg_flags_to_raid_index(block_group->flags);
32462306a36Sopenharmony_ci	down_write(&found->groups_sem);
32562306a36Sopenharmony_ci	list_add_tail(&block_group->list, &found->block_groups[index]);
32662306a36Sopenharmony_ci	up_write(&found->groups_sem);
32762306a36Sopenharmony_ci}
32862306a36Sopenharmony_ci
32962306a36Sopenharmony_cistruct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
33062306a36Sopenharmony_ci					       u64 flags)
33162306a36Sopenharmony_ci{
33262306a36Sopenharmony_ci	struct list_head *head = &info->space_info;
33362306a36Sopenharmony_ci	struct btrfs_space_info *found;
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	list_for_each_entry(found, head, list) {
33862306a36Sopenharmony_ci		if (found->flags & flags)
33962306a36Sopenharmony_ci			return found;
34062306a36Sopenharmony_ci	}
34162306a36Sopenharmony_ci	return NULL;
34262306a36Sopenharmony_ci}
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_cistatic u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
34562306a36Sopenharmony_ci			  struct btrfs_space_info *space_info,
34662306a36Sopenharmony_ci			  enum btrfs_reserve_flush_enum flush)
34762306a36Sopenharmony_ci{
34862306a36Sopenharmony_ci	u64 profile;
34962306a36Sopenharmony_ci	u64 avail;
35062306a36Sopenharmony_ci	int factor;
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_ci	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
35362306a36Sopenharmony_ci		profile = btrfs_system_alloc_profile(fs_info);
35462306a36Sopenharmony_ci	else
35562306a36Sopenharmony_ci		profile = btrfs_metadata_alloc_profile(fs_info);
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_ci	avail = atomic64_read(&fs_info->free_chunk_space);
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci	/*
36062306a36Sopenharmony_ci	 * If we have dup, raid1 or raid10 then only half of the free
36162306a36Sopenharmony_ci	 * space is actually usable.  For raid56, the space info used
36262306a36Sopenharmony_ci	 * doesn't include the parity drive, so we don't have to
36362306a36Sopenharmony_ci	 * change the math
36462306a36Sopenharmony_ci	 */
36562306a36Sopenharmony_ci	factor = btrfs_bg_type_to_factor(profile);
36662306a36Sopenharmony_ci	avail = div_u64(avail, factor);
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	/*
36962306a36Sopenharmony_ci	 * If we aren't flushing all things, let us overcommit up to
37062306a36Sopenharmony_ci	 * 1/2th of the space. If we can flush, don't let us overcommit
37162306a36Sopenharmony_ci	 * too much, let it overcommit up to 1/8 of the space.
37262306a36Sopenharmony_ci	 */
37362306a36Sopenharmony_ci	if (flush == BTRFS_RESERVE_FLUSH_ALL)
37462306a36Sopenharmony_ci		avail >>= 3;
37562306a36Sopenharmony_ci	else
37662306a36Sopenharmony_ci		avail >>= 1;
37762306a36Sopenharmony_ci	return avail;
37862306a36Sopenharmony_ci}
37962306a36Sopenharmony_ci
38062306a36Sopenharmony_ciint btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
38162306a36Sopenharmony_ci			 struct btrfs_space_info *space_info, u64 bytes,
38262306a36Sopenharmony_ci			 enum btrfs_reserve_flush_enum flush)
38362306a36Sopenharmony_ci{
38462306a36Sopenharmony_ci	u64 avail;
38562306a36Sopenharmony_ci	u64 used;
38662306a36Sopenharmony_ci
38762306a36Sopenharmony_ci	/* Don't overcommit when in mixed mode */
38862306a36Sopenharmony_ci	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
38962306a36Sopenharmony_ci		return 0;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
39262306a36Sopenharmony_ci	avail = calc_available_free_space(fs_info, space_info, flush);
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ci	if (used + bytes < space_info->total_bytes + avail)
39562306a36Sopenharmony_ci		return 1;
39662306a36Sopenharmony_ci	return 0;
39762306a36Sopenharmony_ci}
39862306a36Sopenharmony_ci
39962306a36Sopenharmony_cistatic void remove_ticket(struct btrfs_space_info *space_info,
40062306a36Sopenharmony_ci			  struct reserve_ticket *ticket)
40162306a36Sopenharmony_ci{
40262306a36Sopenharmony_ci	if (!list_empty(&ticket->list)) {
40362306a36Sopenharmony_ci		list_del_init(&ticket->list);
40462306a36Sopenharmony_ci		ASSERT(space_info->reclaim_size >= ticket->bytes);
40562306a36Sopenharmony_ci		space_info->reclaim_size -= ticket->bytes;
40662306a36Sopenharmony_ci	}
40762306a36Sopenharmony_ci}
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci/*
41062306a36Sopenharmony_ci * This is for space we already have accounted in space_info->bytes_may_use, so
41162306a36Sopenharmony_ci * basically when we're returning space from block_rsv's.
41262306a36Sopenharmony_ci */
41362306a36Sopenharmony_civoid btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
41462306a36Sopenharmony_ci				struct btrfs_space_info *space_info)
41562306a36Sopenharmony_ci{
41662306a36Sopenharmony_ci	struct list_head *head;
41762306a36Sopenharmony_ci	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	lockdep_assert_held(&space_info->lock);
42062306a36Sopenharmony_ci
42162306a36Sopenharmony_ci	head = &space_info->priority_tickets;
42262306a36Sopenharmony_ciagain:
42362306a36Sopenharmony_ci	while (!list_empty(head)) {
42462306a36Sopenharmony_ci		struct reserve_ticket *ticket;
42562306a36Sopenharmony_ci		u64 used = btrfs_space_info_used(space_info, true);
42662306a36Sopenharmony_ci
42762306a36Sopenharmony_ci		ticket = list_first_entry(head, struct reserve_ticket, list);
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci		/* Check and see if our ticket can be satisfied now. */
43062306a36Sopenharmony_ci		if ((used + ticket->bytes <= space_info->total_bytes) ||
43162306a36Sopenharmony_ci		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
43262306a36Sopenharmony_ci					 flush)) {
43362306a36Sopenharmony_ci			btrfs_space_info_update_bytes_may_use(fs_info,
43462306a36Sopenharmony_ci							      space_info,
43562306a36Sopenharmony_ci							      ticket->bytes);
43662306a36Sopenharmony_ci			remove_ticket(space_info, ticket);
43762306a36Sopenharmony_ci			ticket->bytes = 0;
43862306a36Sopenharmony_ci			space_info->tickets_id++;
43962306a36Sopenharmony_ci			wake_up(&ticket->wait);
44062306a36Sopenharmony_ci		} else {
44162306a36Sopenharmony_ci			break;
44262306a36Sopenharmony_ci		}
44362306a36Sopenharmony_ci	}
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_ci	if (head == &space_info->priority_tickets) {
44662306a36Sopenharmony_ci		head = &space_info->tickets;
44762306a36Sopenharmony_ci		flush = BTRFS_RESERVE_FLUSH_ALL;
44862306a36Sopenharmony_ci		goto again;
44962306a36Sopenharmony_ci	}
45062306a36Sopenharmony_ci}
45162306a36Sopenharmony_ci
/*
 * Log the size and reserved counters of the block reserve fs_info->rsv_name.
 * Both values are read while holding that reserve's lock.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__dump_rsv = &(fs_info)->rsv_name;	\
	spin_lock(&__dump_rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __dump_rsv->size, __dump_rsv->reserved);		\
	spin_unlock(&__dump_rsv->lock);					\
} while (0)
46062306a36Sopenharmony_ci
46162306a36Sopenharmony_cistatic const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
46262306a36Sopenharmony_ci{
46362306a36Sopenharmony_ci	switch (space_info->flags) {
46462306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_SYSTEM:
46562306a36Sopenharmony_ci		return "SYSTEM";
46662306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
46762306a36Sopenharmony_ci		return "DATA+METADATA";
46862306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_DATA:
46962306a36Sopenharmony_ci		return "DATA";
47062306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_METADATA:
47162306a36Sopenharmony_ci		return "METADATA";
47262306a36Sopenharmony_ci	default:
47362306a36Sopenharmony_ci		return "UNKNOWN";
47462306a36Sopenharmony_ci	}
47562306a36Sopenharmony_ci}
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_cistatic void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
47862306a36Sopenharmony_ci{
47962306a36Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
48062306a36Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
48162306a36Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
48262306a36Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
48362306a36Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
48462306a36Sopenharmony_ci}
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_cistatic void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
48762306a36Sopenharmony_ci				    struct btrfs_space_info *info)
48862306a36Sopenharmony_ci{
48962306a36Sopenharmony_ci	const char *flag_str = space_info_flag_to_str(info);
49062306a36Sopenharmony_ci	lockdep_assert_held(&info->lock);
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	/* The free space could be negative in case of overcommit */
49362306a36Sopenharmony_ci	btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
49462306a36Sopenharmony_ci		   flag_str,
49562306a36Sopenharmony_ci		   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
49662306a36Sopenharmony_ci		   info->full ? "" : "not ");
49762306a36Sopenharmony_ci	btrfs_info(fs_info,
49862306a36Sopenharmony_ci"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
49962306a36Sopenharmony_ci		info->total_bytes, info->bytes_used, info->bytes_pinned,
50062306a36Sopenharmony_ci		info->bytes_reserved, info->bytes_may_use,
50162306a36Sopenharmony_ci		info->bytes_readonly, info->bytes_zone_unusable);
50262306a36Sopenharmony_ci}
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_civoid btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
50562306a36Sopenharmony_ci			   struct btrfs_space_info *info, u64 bytes,
50662306a36Sopenharmony_ci			   int dump_block_groups)
50762306a36Sopenharmony_ci{
50862306a36Sopenharmony_ci	struct btrfs_block_group *cache;
50962306a36Sopenharmony_ci	u64 total_avail = 0;
51062306a36Sopenharmony_ci	int index = 0;
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	spin_lock(&info->lock);
51362306a36Sopenharmony_ci	__btrfs_dump_space_info(fs_info, info);
51462306a36Sopenharmony_ci	dump_global_block_rsv(fs_info);
51562306a36Sopenharmony_ci	spin_unlock(&info->lock);
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci	if (!dump_block_groups)
51862306a36Sopenharmony_ci		return;
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci	down_read(&info->groups_sem);
52162306a36Sopenharmony_ciagain:
52262306a36Sopenharmony_ci	list_for_each_entry(cache, &info->block_groups[index], list) {
52362306a36Sopenharmony_ci		u64 avail;
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci		spin_lock(&cache->lock);
52662306a36Sopenharmony_ci		avail = cache->length - cache->used - cache->pinned -
52762306a36Sopenharmony_ci			cache->reserved - cache->delalloc_bytes -
52862306a36Sopenharmony_ci			cache->bytes_super - cache->zone_unusable;
52962306a36Sopenharmony_ci		btrfs_info(fs_info,
53062306a36Sopenharmony_ci"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
53162306a36Sopenharmony_ci			   cache->start, cache->length, cache->used, cache->pinned,
53262306a36Sopenharmony_ci			   cache->reserved, cache->delalloc_bytes,
53362306a36Sopenharmony_ci			   cache->bytes_super, cache->zone_unusable,
53462306a36Sopenharmony_ci			   avail, cache->ro ? "[readonly]" : "");
53562306a36Sopenharmony_ci		spin_unlock(&cache->lock);
53662306a36Sopenharmony_ci		btrfs_dump_free_space(cache, bytes);
53762306a36Sopenharmony_ci		total_avail += avail;
53862306a36Sopenharmony_ci	}
53962306a36Sopenharmony_ci	if (++index < BTRFS_NR_RAID_TYPES)
54062306a36Sopenharmony_ci		goto again;
54162306a36Sopenharmony_ci	up_read(&info->groups_sem);
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_ci	btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail);
54462306a36Sopenharmony_ci}
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_cistatic inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
54762306a36Sopenharmony_ci					u64 to_reclaim)
54862306a36Sopenharmony_ci{
54962306a36Sopenharmony_ci	u64 bytes;
55062306a36Sopenharmony_ci	u64 nr;
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_ci	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
55362306a36Sopenharmony_ci	nr = div64_u64(to_reclaim, bytes);
55462306a36Sopenharmony_ci	if (!nr)
55562306a36Sopenharmony_ci		nr = 1;
55662306a36Sopenharmony_ci	return nr;
55762306a36Sopenharmony_ci}
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_cistatic inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
56062306a36Sopenharmony_ci				       u64 to_reclaim)
56162306a36Sopenharmony_ci{
56262306a36Sopenharmony_ci	const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
56362306a36Sopenharmony_ci	u64 nr;
56462306a36Sopenharmony_ci
56562306a36Sopenharmony_ci	nr = div64_u64(to_reclaim, bytes);
56662306a36Sopenharmony_ci	if (!nr)
56762306a36Sopenharmony_ci		nr = 1;
56862306a36Sopenharmony_ci	return nr;
56962306a36Sopenharmony_ci}
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci#define EXTENT_SIZE_PER_ITEM	SZ_256K
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci/*
57462306a36Sopenharmony_ci * shrink metadata reservation for delalloc
57562306a36Sopenharmony_ci */
57662306a36Sopenharmony_cistatic void shrink_delalloc(struct btrfs_fs_info *fs_info,
57762306a36Sopenharmony_ci			    struct btrfs_space_info *space_info,
57862306a36Sopenharmony_ci			    u64 to_reclaim, bool wait_ordered,
57962306a36Sopenharmony_ci			    bool for_preempt)
58062306a36Sopenharmony_ci{
58162306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
58262306a36Sopenharmony_ci	u64 delalloc_bytes;
58362306a36Sopenharmony_ci	u64 ordered_bytes;
58462306a36Sopenharmony_ci	u64 items;
58562306a36Sopenharmony_ci	long time_left;
58662306a36Sopenharmony_ci	int loops;
58762306a36Sopenharmony_ci
58862306a36Sopenharmony_ci	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
58962306a36Sopenharmony_ci	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
59062306a36Sopenharmony_ci	if (delalloc_bytes == 0 && ordered_bytes == 0)
59162306a36Sopenharmony_ci		return;
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ci	/* Calc the number of the pages we need flush for space reservation */
59462306a36Sopenharmony_ci	if (to_reclaim == U64_MAX) {
59562306a36Sopenharmony_ci		items = U64_MAX;
59662306a36Sopenharmony_ci	} else {
59762306a36Sopenharmony_ci		/*
59862306a36Sopenharmony_ci		 * to_reclaim is set to however much metadata we need to
59962306a36Sopenharmony_ci		 * reclaim, but reclaiming that much data doesn't really track
60062306a36Sopenharmony_ci		 * exactly.  What we really want to do is reclaim full inode's
60162306a36Sopenharmony_ci		 * worth of reservations, however that's not available to us
60262306a36Sopenharmony_ci		 * here.  We will take a fraction of the delalloc bytes for our
60362306a36Sopenharmony_ci		 * flushing loops and hope for the best.  Delalloc will expand
60462306a36Sopenharmony_ci		 * the amount we write to cover an entire dirty extent, which
60562306a36Sopenharmony_ci		 * will reclaim the metadata reservation for that range.  If
60662306a36Sopenharmony_ci		 * it's not enough subsequent flush stages will be more
60762306a36Sopenharmony_ci		 * aggressive.
60862306a36Sopenharmony_ci		 */
60962306a36Sopenharmony_ci		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
61062306a36Sopenharmony_ci		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
61162306a36Sopenharmony_ci	}
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_ci	trans = current->journal_info;
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci	/*
61662306a36Sopenharmony_ci	 * If we are doing more ordered than delalloc we need to just wait on
61762306a36Sopenharmony_ci	 * ordered extents, otherwise we'll waste time trying to flush delalloc
61862306a36Sopenharmony_ci	 * that likely won't give us the space back we need.
61962306a36Sopenharmony_ci	 */
62062306a36Sopenharmony_ci	if (ordered_bytes > delalloc_bytes && !for_preempt)
62162306a36Sopenharmony_ci		wait_ordered = true;
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ci	loops = 0;
62462306a36Sopenharmony_ci	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
62562306a36Sopenharmony_ci		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
62662306a36Sopenharmony_ci		long nr_pages = min_t(u64, temp, LONG_MAX);
62762306a36Sopenharmony_ci		int async_pages;
62862306a36Sopenharmony_ci
62962306a36Sopenharmony_ci		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
63062306a36Sopenharmony_ci
63162306a36Sopenharmony_ci		/*
63262306a36Sopenharmony_ci		 * We need to make sure any outstanding async pages are now
63362306a36Sopenharmony_ci		 * processed before we continue.  This is because things like
63462306a36Sopenharmony_ci		 * sync_inode() try to be smart and skip writing if the inode is
63562306a36Sopenharmony_ci		 * marked clean.  We don't use filemap_fwrite for flushing
63662306a36Sopenharmony_ci		 * because we want to control how many pages we write out at a
63762306a36Sopenharmony_ci		 * time, thus this is the only safe way to make sure we've
63862306a36Sopenharmony_ci		 * waited for outstanding compressed workers to have started
63962306a36Sopenharmony_ci		 * their jobs and thus have ordered extents set up properly.
64062306a36Sopenharmony_ci		 *
64162306a36Sopenharmony_ci		 * This exists because we do not want to wait for each
64262306a36Sopenharmony_ci		 * individual inode to finish its async work, we simply want to
64362306a36Sopenharmony_ci		 * start the IO on everybody, and then come back here and wait
64462306a36Sopenharmony_ci		 * for all of the async work to catch up.  Once we're done with
64562306a36Sopenharmony_ci		 * that we know we'll have ordered extents for everything and we
64662306a36Sopenharmony_ci		 * can decide if we wait for that or not.
64762306a36Sopenharmony_ci		 *
64862306a36Sopenharmony_ci		 * If we choose to replace this in the future, make absolutely
64962306a36Sopenharmony_ci		 * sure that the proper waiting is being done in the async case,
65062306a36Sopenharmony_ci		 * as there have been bugs in that area before.
65162306a36Sopenharmony_ci		 */
65262306a36Sopenharmony_ci		async_pages = atomic_read(&fs_info->async_delalloc_pages);
65362306a36Sopenharmony_ci		if (!async_pages)
65462306a36Sopenharmony_ci			goto skip_async;
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci		/*
65762306a36Sopenharmony_ci		 * We don't want to wait forever, if we wrote less pages in this
65862306a36Sopenharmony_ci		 * loop than we have outstanding, only wait for that number of
65962306a36Sopenharmony_ci		 * pages, otherwise we can wait for all async pages to finish
66062306a36Sopenharmony_ci		 * before continuing.
66162306a36Sopenharmony_ci		 */
66262306a36Sopenharmony_ci		if (async_pages > nr_pages)
66362306a36Sopenharmony_ci			async_pages -= nr_pages;
66462306a36Sopenharmony_ci		else
66562306a36Sopenharmony_ci			async_pages = 0;
66662306a36Sopenharmony_ci		wait_event(fs_info->async_submit_wait,
66762306a36Sopenharmony_ci			   atomic_read(&fs_info->async_delalloc_pages) <=
66862306a36Sopenharmony_ci			   async_pages);
66962306a36Sopenharmony_ciskip_async:
67062306a36Sopenharmony_ci		loops++;
67162306a36Sopenharmony_ci		if (wait_ordered && !trans) {
67262306a36Sopenharmony_ci			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
67362306a36Sopenharmony_ci		} else {
67462306a36Sopenharmony_ci			time_left = schedule_timeout_killable(1);
67562306a36Sopenharmony_ci			if (time_left)
67662306a36Sopenharmony_ci				break;
67762306a36Sopenharmony_ci		}
67862306a36Sopenharmony_ci
67962306a36Sopenharmony_ci		/*
68062306a36Sopenharmony_ci		 * If we are for preemption we just want a one-shot of delalloc
68162306a36Sopenharmony_ci		 * flushing so we can stop flushing if we decide we don't need
68262306a36Sopenharmony_ci		 * to anymore.
68362306a36Sopenharmony_ci		 */
68462306a36Sopenharmony_ci		if (for_preempt)
68562306a36Sopenharmony_ci			break;
68662306a36Sopenharmony_ci
68762306a36Sopenharmony_ci		spin_lock(&space_info->lock);
68862306a36Sopenharmony_ci		if (list_empty(&space_info->tickets) &&
68962306a36Sopenharmony_ci		    list_empty(&space_info->priority_tickets)) {
69062306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
69162306a36Sopenharmony_ci			break;
69262306a36Sopenharmony_ci		}
69362306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci		delalloc_bytes = percpu_counter_sum_positive(
69662306a36Sopenharmony_ci						&fs_info->delalloc_bytes);
69762306a36Sopenharmony_ci		ordered_bytes = percpu_counter_sum_positive(
69862306a36Sopenharmony_ci						&fs_info->ordered_bytes);
69962306a36Sopenharmony_ci	}
70062306a36Sopenharmony_ci}
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci/*
70362306a36Sopenharmony_ci * Try to flush some data based on policy set by @state. This is only advisory
70462306a36Sopenharmony_ci * and may fail for various reasons. The caller is supposed to examine the
70562306a36Sopenharmony_ci * state of @space_info to detect the outcome.
70662306a36Sopenharmony_ci */
70762306a36Sopenharmony_cistatic void flush_space(struct btrfs_fs_info *fs_info,
70862306a36Sopenharmony_ci		       struct btrfs_space_info *space_info, u64 num_bytes,
70962306a36Sopenharmony_ci		       enum btrfs_flush_state state, bool for_preempt)
71062306a36Sopenharmony_ci{
71162306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->tree_root;
71262306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
71362306a36Sopenharmony_ci	int nr;
71462306a36Sopenharmony_ci	int ret = 0;
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ci	switch (state) {
71762306a36Sopenharmony_ci	case FLUSH_DELAYED_ITEMS_NR:
71862306a36Sopenharmony_ci	case FLUSH_DELAYED_ITEMS:
71962306a36Sopenharmony_ci		if (state == FLUSH_DELAYED_ITEMS_NR)
72062306a36Sopenharmony_ci			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
72162306a36Sopenharmony_ci		else
72262306a36Sopenharmony_ci			nr = -1;
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci		trans = btrfs_join_transaction_nostart(root);
72562306a36Sopenharmony_ci		if (IS_ERR(trans)) {
72662306a36Sopenharmony_ci			ret = PTR_ERR(trans);
72762306a36Sopenharmony_ci			if (ret == -ENOENT)
72862306a36Sopenharmony_ci				ret = 0;
72962306a36Sopenharmony_ci			break;
73062306a36Sopenharmony_ci		}
73162306a36Sopenharmony_ci		ret = btrfs_run_delayed_items_nr(trans, nr);
73262306a36Sopenharmony_ci		btrfs_end_transaction(trans);
73362306a36Sopenharmony_ci		break;
73462306a36Sopenharmony_ci	case FLUSH_DELALLOC:
73562306a36Sopenharmony_ci	case FLUSH_DELALLOC_WAIT:
73662306a36Sopenharmony_ci	case FLUSH_DELALLOC_FULL:
73762306a36Sopenharmony_ci		if (state == FLUSH_DELALLOC_FULL)
73862306a36Sopenharmony_ci			num_bytes = U64_MAX;
73962306a36Sopenharmony_ci		shrink_delalloc(fs_info, space_info, num_bytes,
74062306a36Sopenharmony_ci				state != FLUSH_DELALLOC, for_preempt);
74162306a36Sopenharmony_ci		break;
74262306a36Sopenharmony_ci	case FLUSH_DELAYED_REFS_NR:
74362306a36Sopenharmony_ci	case FLUSH_DELAYED_REFS:
74462306a36Sopenharmony_ci		trans = btrfs_join_transaction_nostart(root);
74562306a36Sopenharmony_ci		if (IS_ERR(trans)) {
74662306a36Sopenharmony_ci			ret = PTR_ERR(trans);
74762306a36Sopenharmony_ci			if (ret == -ENOENT)
74862306a36Sopenharmony_ci				ret = 0;
74962306a36Sopenharmony_ci			break;
75062306a36Sopenharmony_ci		}
75162306a36Sopenharmony_ci		if (state == FLUSH_DELAYED_REFS_NR)
75262306a36Sopenharmony_ci			nr = calc_delayed_refs_nr(fs_info, num_bytes);
75362306a36Sopenharmony_ci		else
75462306a36Sopenharmony_ci			nr = 0;
75562306a36Sopenharmony_ci		btrfs_run_delayed_refs(trans, nr);
75662306a36Sopenharmony_ci		btrfs_end_transaction(trans);
75762306a36Sopenharmony_ci		break;
75862306a36Sopenharmony_ci	case ALLOC_CHUNK:
75962306a36Sopenharmony_ci	case ALLOC_CHUNK_FORCE:
76062306a36Sopenharmony_ci		trans = btrfs_join_transaction(root);
76162306a36Sopenharmony_ci		if (IS_ERR(trans)) {
76262306a36Sopenharmony_ci			ret = PTR_ERR(trans);
76362306a36Sopenharmony_ci			break;
76462306a36Sopenharmony_ci		}
76562306a36Sopenharmony_ci		ret = btrfs_chunk_alloc(trans,
76662306a36Sopenharmony_ci				btrfs_get_alloc_profile(fs_info, space_info->flags),
76762306a36Sopenharmony_ci				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
76862306a36Sopenharmony_ci					CHUNK_ALLOC_FORCE);
76962306a36Sopenharmony_ci		btrfs_end_transaction(trans);
77062306a36Sopenharmony_ci
77162306a36Sopenharmony_ci		if (ret > 0 || ret == -ENOSPC)
77262306a36Sopenharmony_ci			ret = 0;
77362306a36Sopenharmony_ci		break;
77462306a36Sopenharmony_ci	case RUN_DELAYED_IPUTS:
77562306a36Sopenharmony_ci		/*
77662306a36Sopenharmony_ci		 * If we have pending delayed iputs then we could free up a
77762306a36Sopenharmony_ci		 * bunch of pinned space, so make sure we run the iputs before
77862306a36Sopenharmony_ci		 * we do our pinned bytes check below.
77962306a36Sopenharmony_ci		 */
78062306a36Sopenharmony_ci		btrfs_run_delayed_iputs(fs_info);
78162306a36Sopenharmony_ci		btrfs_wait_on_delayed_iputs(fs_info);
78262306a36Sopenharmony_ci		break;
78362306a36Sopenharmony_ci	case COMMIT_TRANS:
78462306a36Sopenharmony_ci		ASSERT(current->journal_info == NULL);
78562306a36Sopenharmony_ci		/*
78662306a36Sopenharmony_ci		 * We don't want to start a new transaction, just attach to the
78762306a36Sopenharmony_ci		 * current one or wait it fully commits in case its commit is
78862306a36Sopenharmony_ci		 * happening at the moment. Note: we don't use a nostart join
78962306a36Sopenharmony_ci		 * because that does not wait for a transaction to fully commit
79062306a36Sopenharmony_ci		 * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
79162306a36Sopenharmony_ci		 */
79262306a36Sopenharmony_ci		trans = btrfs_attach_transaction_barrier(root);
79362306a36Sopenharmony_ci		if (IS_ERR(trans)) {
79462306a36Sopenharmony_ci			ret = PTR_ERR(trans);
79562306a36Sopenharmony_ci			if (ret == -ENOENT)
79662306a36Sopenharmony_ci				ret = 0;
79762306a36Sopenharmony_ci			break;
79862306a36Sopenharmony_ci		}
79962306a36Sopenharmony_ci		ret = btrfs_commit_transaction(trans);
80062306a36Sopenharmony_ci		break;
80162306a36Sopenharmony_ci	default:
80262306a36Sopenharmony_ci		ret = -ENOSPC;
80362306a36Sopenharmony_ci		break;
80462306a36Sopenharmony_ci	}
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
80762306a36Sopenharmony_ci				ret, for_preempt);
80862306a36Sopenharmony_ci	return;
80962306a36Sopenharmony_ci}
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_cistatic inline u64
81262306a36Sopenharmony_cibtrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
81362306a36Sopenharmony_ci				 struct btrfs_space_info *space_info)
81462306a36Sopenharmony_ci{
81562306a36Sopenharmony_ci	u64 used;
81662306a36Sopenharmony_ci	u64 avail;
81762306a36Sopenharmony_ci	u64 to_reclaim = space_info->reclaim_size;
81862306a36Sopenharmony_ci
81962306a36Sopenharmony_ci	lockdep_assert_held(&space_info->lock);
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci	avail = calc_available_free_space(fs_info, space_info,
82262306a36Sopenharmony_ci					  BTRFS_RESERVE_FLUSH_ALL);
82362306a36Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci	/*
82662306a36Sopenharmony_ci	 * We may be flushing because suddenly we have less space than we had
82762306a36Sopenharmony_ci	 * before, and now we're well over-committed based on our current free
82862306a36Sopenharmony_ci	 * space.  If that's the case add in our overage so we make sure to put
82962306a36Sopenharmony_ci	 * appropriate pressure on the flushing state machine.
83062306a36Sopenharmony_ci	 */
83162306a36Sopenharmony_ci	if (space_info->total_bytes + avail < used)
83262306a36Sopenharmony_ci		to_reclaim += used - (space_info->total_bytes + avail);
83362306a36Sopenharmony_ci
83462306a36Sopenharmony_ci	return to_reclaim;
83562306a36Sopenharmony_ci}
83662306a36Sopenharmony_ci
83762306a36Sopenharmony_cistatic bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
83862306a36Sopenharmony_ci				    struct btrfs_space_info *space_info)
83962306a36Sopenharmony_ci{
84062306a36Sopenharmony_ci	const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
84162306a36Sopenharmony_ci	u64 ordered, delalloc;
84262306a36Sopenharmony_ci	u64 thresh;
84362306a36Sopenharmony_ci	u64 used;
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci	thresh = mult_perc(space_info->total_bytes, 90);
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci	lockdep_assert_held(&space_info->lock);
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci	/* If we're just plain full then async reclaim just slows us down. */
85062306a36Sopenharmony_ci	if ((space_info->bytes_used + space_info->bytes_reserved +
85162306a36Sopenharmony_ci	     global_rsv_size) >= thresh)
85262306a36Sopenharmony_ci		return false;
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	used = space_info->bytes_may_use + space_info->bytes_pinned;
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci	/* The total flushable belongs to the global rsv, don't flush. */
85762306a36Sopenharmony_ci	if (global_rsv_size >= used)
85862306a36Sopenharmony_ci		return false;
85962306a36Sopenharmony_ci
86062306a36Sopenharmony_ci	/*
86162306a36Sopenharmony_ci	 * 128MiB is 1/4 of the maximum global rsv size.  If we have less than
86262306a36Sopenharmony_ci	 * that devoted to other reservations then there's no sense in flushing,
86362306a36Sopenharmony_ci	 * we don't have a lot of things that need flushing.
86462306a36Sopenharmony_ci	 */
86562306a36Sopenharmony_ci	if (used - global_rsv_size <= SZ_128M)
86662306a36Sopenharmony_ci		return false;
86762306a36Sopenharmony_ci
86862306a36Sopenharmony_ci	/*
86962306a36Sopenharmony_ci	 * We have tickets queued, bail so we don't compete with the async
87062306a36Sopenharmony_ci	 * flushers.
87162306a36Sopenharmony_ci	 */
87262306a36Sopenharmony_ci	if (space_info->reclaim_size)
87362306a36Sopenharmony_ci		return false;
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci	/*
87662306a36Sopenharmony_ci	 * If we have over half of the free space occupied by reservations or
87762306a36Sopenharmony_ci	 * pinned then we want to start flushing.
87862306a36Sopenharmony_ci	 *
87962306a36Sopenharmony_ci	 * We do not do the traditional thing here, which is to say
88062306a36Sopenharmony_ci	 *
88162306a36Sopenharmony_ci	 *   if (used >= ((total_bytes + avail) / 2))
88262306a36Sopenharmony_ci	 *     return 1;
88362306a36Sopenharmony_ci	 *
88462306a36Sopenharmony_ci	 * because this doesn't quite work how we want.  If we had more than 50%
88562306a36Sopenharmony_ci	 * of the space_info used by bytes_used and we had 0 available we'd just
88662306a36Sopenharmony_ci	 * constantly run the background flusher.  Instead we want it to kick in
88762306a36Sopenharmony_ci	 * if our reclaimable space exceeds our clamped free space.
88862306a36Sopenharmony_ci	 *
88962306a36Sopenharmony_ci	 * Our clamping range is 2^1 -> 2^8.  Practically speaking that means
89062306a36Sopenharmony_ci	 * the following:
89162306a36Sopenharmony_ci	 *
89262306a36Sopenharmony_ci	 * Amount of RAM        Minimum threshold       Maximum threshold
89362306a36Sopenharmony_ci	 *
89462306a36Sopenharmony_ci	 *        256GiB                     1GiB                  128GiB
89562306a36Sopenharmony_ci	 *        128GiB                   512MiB                   64GiB
89662306a36Sopenharmony_ci	 *         64GiB                   256MiB                   32GiB
89762306a36Sopenharmony_ci	 *         32GiB                   128MiB                   16GiB
89862306a36Sopenharmony_ci	 *         16GiB                    64MiB                    8GiB
89962306a36Sopenharmony_ci	 *
90062306a36Sopenharmony_ci	 * These are the range our thresholds will fall in, corresponding to how
90162306a36Sopenharmony_ci	 * much delalloc we need for the background flusher to kick in.
90262306a36Sopenharmony_ci	 */
90362306a36Sopenharmony_ci
90462306a36Sopenharmony_ci	thresh = calc_available_free_space(fs_info, space_info,
90562306a36Sopenharmony_ci					   BTRFS_RESERVE_FLUSH_ALL);
90662306a36Sopenharmony_ci	used = space_info->bytes_used + space_info->bytes_reserved +
90762306a36Sopenharmony_ci	       space_info->bytes_readonly + global_rsv_size;
90862306a36Sopenharmony_ci	if (used < space_info->total_bytes)
90962306a36Sopenharmony_ci		thresh += space_info->total_bytes - used;
91062306a36Sopenharmony_ci	thresh >>= space_info->clamp;
91162306a36Sopenharmony_ci
91262306a36Sopenharmony_ci	used = space_info->bytes_pinned;
91362306a36Sopenharmony_ci
91462306a36Sopenharmony_ci	/*
91562306a36Sopenharmony_ci	 * If we have more ordered bytes than delalloc bytes then we're either
91662306a36Sopenharmony_ci	 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
91762306a36Sopenharmony_ci	 * around.  Preemptive flushing is only useful in that it can free up
91862306a36Sopenharmony_ci	 * space before tickets need to wait for things to finish.  In the case
91962306a36Sopenharmony_ci	 * of ordered extents, preemptively waiting on ordered extents gets us
92062306a36Sopenharmony_ci	 * nothing, if our reservations are tied up in ordered extents we'll
92162306a36Sopenharmony_ci	 * simply have to slow down writers by forcing them to wait on ordered
92262306a36Sopenharmony_ci	 * extents.
92362306a36Sopenharmony_ci	 *
92462306a36Sopenharmony_ci	 * In the case that ordered is larger than delalloc, only include the
92562306a36Sopenharmony_ci	 * block reserves that we would actually be able to directly reclaim
92662306a36Sopenharmony_ci	 * from.  In this case if we're heavy on metadata operations this will
92762306a36Sopenharmony_ci	 * clearly be heavy enough to warrant preemptive flushing.  In the case
92862306a36Sopenharmony_ci	 * of heavy DIO or ordered reservations, preemptive flushing will just
92962306a36Sopenharmony_ci	 * waste time and cause us to slow down.
93062306a36Sopenharmony_ci	 *
93162306a36Sopenharmony_ci	 * We want to make sure we truly are maxed out on ordered however, so
93262306a36Sopenharmony_ci	 * cut ordered in half, and if it's still higher than delalloc then we
93362306a36Sopenharmony_ci	 * can keep flushing.  This is to avoid the case where we start
93462306a36Sopenharmony_ci	 * flushing, and now delalloc == ordered and we stop preemptively
93562306a36Sopenharmony_ci	 * flushing when we could still have several gigs of delalloc to flush.
93662306a36Sopenharmony_ci	 */
93762306a36Sopenharmony_ci	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
93862306a36Sopenharmony_ci	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
93962306a36Sopenharmony_ci	if (ordered >= delalloc)
94062306a36Sopenharmony_ci		used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
94162306a36Sopenharmony_ci			btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
94262306a36Sopenharmony_ci	else
94362306a36Sopenharmony_ci		used += space_info->bytes_may_use - global_rsv_size;
94462306a36Sopenharmony_ci
94562306a36Sopenharmony_ci	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
94662306a36Sopenharmony_ci		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
94762306a36Sopenharmony_ci}
94862306a36Sopenharmony_ci
94962306a36Sopenharmony_cistatic bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
95062306a36Sopenharmony_ci				  struct btrfs_space_info *space_info,
95162306a36Sopenharmony_ci				  struct reserve_ticket *ticket)
95262306a36Sopenharmony_ci{
95362306a36Sopenharmony_ci	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
95462306a36Sopenharmony_ci	u64 min_bytes;
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci	if (!ticket->steal)
95762306a36Sopenharmony_ci		return false;
95862306a36Sopenharmony_ci
95962306a36Sopenharmony_ci	if (global_rsv->space_info != space_info)
96062306a36Sopenharmony_ci		return false;
96162306a36Sopenharmony_ci
96262306a36Sopenharmony_ci	spin_lock(&global_rsv->lock);
96362306a36Sopenharmony_ci	min_bytes = mult_perc(global_rsv->size, 10);
96462306a36Sopenharmony_ci	if (global_rsv->reserved < min_bytes + ticket->bytes) {
96562306a36Sopenharmony_ci		spin_unlock(&global_rsv->lock);
96662306a36Sopenharmony_ci		return false;
96762306a36Sopenharmony_ci	}
96862306a36Sopenharmony_ci	global_rsv->reserved -= ticket->bytes;
96962306a36Sopenharmony_ci	remove_ticket(space_info, ticket);
97062306a36Sopenharmony_ci	ticket->bytes = 0;
97162306a36Sopenharmony_ci	wake_up(&ticket->wait);
97262306a36Sopenharmony_ci	space_info->tickets_id++;
97362306a36Sopenharmony_ci	if (global_rsv->reserved < global_rsv->size)
97462306a36Sopenharmony_ci		global_rsv->full = 0;
97562306a36Sopenharmony_ci	spin_unlock(&global_rsv->lock);
97662306a36Sopenharmony_ci
97762306a36Sopenharmony_ci	return true;
97862306a36Sopenharmony_ci}
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_ci/*
98162306a36Sopenharmony_ci * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
98262306a36Sopenharmony_ci * @fs_info - fs_info for this fs
98362306a36Sopenharmony_ci * @space_info - the space info we were flushing
98462306a36Sopenharmony_ci *
98562306a36Sopenharmony_ci * We call this when we've exhausted our flushing ability and haven't made
98662306a36Sopenharmony_ci * progress in satisfying tickets.  The reservation code handles tickets in
98762306a36Sopenharmony_ci * order, so if there is a large ticket first and then smaller ones we could
98862306a36Sopenharmony_ci * very well satisfy the smaller tickets.  This will attempt to wake up any
98962306a36Sopenharmony_ci * tickets in the list to catch this case.
99062306a36Sopenharmony_ci *
99162306a36Sopenharmony_ci * This function returns true if it was able to make progress by clearing out
99262306a36Sopenharmony_ci * other tickets, or if it stumbles across a ticket that was smaller than the
99362306a36Sopenharmony_ci * first ticket.
99462306a36Sopenharmony_ci */
99562306a36Sopenharmony_cistatic bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
99662306a36Sopenharmony_ci				   struct btrfs_space_info *space_info)
99762306a36Sopenharmony_ci{
99862306a36Sopenharmony_ci	struct reserve_ticket *ticket;
99962306a36Sopenharmony_ci	u64 tickets_id = space_info->tickets_id;
100062306a36Sopenharmony_ci	const bool aborted = BTRFS_FS_ERROR(fs_info);
100162306a36Sopenharmony_ci
100262306a36Sopenharmony_ci	trace_btrfs_fail_all_tickets(fs_info, space_info);
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
100562306a36Sopenharmony_ci		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
100662306a36Sopenharmony_ci		__btrfs_dump_space_info(fs_info, space_info);
100762306a36Sopenharmony_ci	}
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	while (!list_empty(&space_info->tickets) &&
101062306a36Sopenharmony_ci	       tickets_id == space_info->tickets_id) {
101162306a36Sopenharmony_ci		ticket = list_first_entry(&space_info->tickets,
101262306a36Sopenharmony_ci					  struct reserve_ticket, list);
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_ci		if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
101562306a36Sopenharmony_ci			return true;
101662306a36Sopenharmony_ci
101762306a36Sopenharmony_ci		if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
101862306a36Sopenharmony_ci			btrfs_info(fs_info, "failing ticket with %llu bytes",
101962306a36Sopenharmony_ci				   ticket->bytes);
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_ci		remove_ticket(space_info, ticket);
102262306a36Sopenharmony_ci		if (aborted)
102362306a36Sopenharmony_ci			ticket->error = -EIO;
102462306a36Sopenharmony_ci		else
102562306a36Sopenharmony_ci			ticket->error = -ENOSPC;
102662306a36Sopenharmony_ci		wake_up(&ticket->wait);
102762306a36Sopenharmony_ci
102862306a36Sopenharmony_ci		/*
102962306a36Sopenharmony_ci		 * We're just throwing tickets away, so more flushing may not
103062306a36Sopenharmony_ci		 * trip over btrfs_try_granting_tickets, so we need to call it
103162306a36Sopenharmony_ci		 * here to see if we can make progress with the next ticket in
103262306a36Sopenharmony_ci		 * the list.
103362306a36Sopenharmony_ci		 */
103462306a36Sopenharmony_ci		if (!aborted)
103562306a36Sopenharmony_ci			btrfs_try_granting_tickets(fs_info, space_info);
103662306a36Sopenharmony_ci	}
103762306a36Sopenharmony_ci	return (tickets_id != space_info->tickets_id);
103862306a36Sopenharmony_ci}
103962306a36Sopenharmony_ci
104062306a36Sopenharmony_ci/*
104162306a36Sopenharmony_ci * This is for normal flushers, we can wait all goddamned day if we want to.  We
104262306a36Sopenharmony_ci * will loop and continuously try to flush as long as we are making progress.
104362306a36Sopenharmony_ci * We count progress as clearing off tickets each time we have to loop.
104462306a36Sopenharmony_ci */
104562306a36Sopenharmony_cistatic void btrfs_async_reclaim_metadata_space(struct work_struct *work)
104662306a36Sopenharmony_ci{
104762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info;
104862306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
104962306a36Sopenharmony_ci	u64 to_reclaim;
105062306a36Sopenharmony_ci	enum btrfs_flush_state flush_state;
105162306a36Sopenharmony_ci	int commit_cycles = 0;
105262306a36Sopenharmony_ci	u64 last_tickets_id;
105362306a36Sopenharmony_ci
105462306a36Sopenharmony_ci	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
105562306a36Sopenharmony_ci	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
105662306a36Sopenharmony_ci
105762306a36Sopenharmony_ci	spin_lock(&space_info->lock);
105862306a36Sopenharmony_ci	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
105962306a36Sopenharmony_ci	if (!to_reclaim) {
106062306a36Sopenharmony_ci		space_info->flush = 0;
106162306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
106262306a36Sopenharmony_ci		return;
106362306a36Sopenharmony_ci	}
106462306a36Sopenharmony_ci	last_tickets_id = space_info->tickets_id;
106562306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci	flush_state = FLUSH_DELAYED_ITEMS_NR;
106862306a36Sopenharmony_ci	do {
106962306a36Sopenharmony_ci		flush_space(fs_info, space_info, to_reclaim, flush_state, false);
107062306a36Sopenharmony_ci		spin_lock(&space_info->lock);
107162306a36Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
107262306a36Sopenharmony_ci			space_info->flush = 0;
107362306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
107462306a36Sopenharmony_ci			return;
107562306a36Sopenharmony_ci		}
107662306a36Sopenharmony_ci		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
107762306a36Sopenharmony_ci							      space_info);
107862306a36Sopenharmony_ci		if (last_tickets_id == space_info->tickets_id) {
107962306a36Sopenharmony_ci			flush_state++;
108062306a36Sopenharmony_ci		} else {
108162306a36Sopenharmony_ci			last_tickets_id = space_info->tickets_id;
108262306a36Sopenharmony_ci			flush_state = FLUSH_DELAYED_ITEMS_NR;
108362306a36Sopenharmony_ci			if (commit_cycles)
108462306a36Sopenharmony_ci				commit_cycles--;
108562306a36Sopenharmony_ci		}
108662306a36Sopenharmony_ci
108762306a36Sopenharmony_ci		/*
108862306a36Sopenharmony_ci		 * We do not want to empty the system of delalloc unless we're
108962306a36Sopenharmony_ci		 * under heavy pressure, so allow one trip through the flushing
109062306a36Sopenharmony_ci		 * logic before we start doing a FLUSH_DELALLOC_FULL.
109162306a36Sopenharmony_ci		 */
109262306a36Sopenharmony_ci		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
109362306a36Sopenharmony_ci			flush_state++;
109462306a36Sopenharmony_ci
109562306a36Sopenharmony_ci		/*
109662306a36Sopenharmony_ci		 * We don't want to force a chunk allocation until we've tried
109762306a36Sopenharmony_ci		 * pretty hard to reclaim space.  Think of the case where we
109862306a36Sopenharmony_ci		 * freed up a bunch of space and so have a lot of pinned space
109962306a36Sopenharmony_ci		 * to reclaim.  We would rather use that than possibly create an
110062306a36Sopenharmony_ci		 * underutilized metadata chunk.  So if this is our first run
110162306a36Sopenharmony_ci		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
110262306a36Sopenharmony_ci		 * commit the transaction.  If nothing has changed the next go
110362306a36Sopenharmony_ci		 * around then we can force a chunk allocation.
110462306a36Sopenharmony_ci		 */
110562306a36Sopenharmony_ci		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
110662306a36Sopenharmony_ci			flush_state++;
110762306a36Sopenharmony_ci
110862306a36Sopenharmony_ci		if (flush_state > COMMIT_TRANS) {
110962306a36Sopenharmony_ci			commit_cycles++;
111062306a36Sopenharmony_ci			if (commit_cycles > 2) {
111162306a36Sopenharmony_ci				if (maybe_fail_all_tickets(fs_info, space_info)) {
111262306a36Sopenharmony_ci					flush_state = FLUSH_DELAYED_ITEMS_NR;
111362306a36Sopenharmony_ci					commit_cycles--;
111462306a36Sopenharmony_ci				} else {
111562306a36Sopenharmony_ci					space_info->flush = 0;
111662306a36Sopenharmony_ci				}
111762306a36Sopenharmony_ci			} else {
111862306a36Sopenharmony_ci				flush_state = FLUSH_DELAYED_ITEMS_NR;
111962306a36Sopenharmony_ci			}
112062306a36Sopenharmony_ci		}
112162306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
112262306a36Sopenharmony_ci	} while (flush_state <= COMMIT_TRANS);
112362306a36Sopenharmony_ci}
112462306a36Sopenharmony_ci
112562306a36Sopenharmony_ci/*
112662306a36Sopenharmony_ci * This handles pre-flushing of metadata space before we get to the point that
112762306a36Sopenharmony_ci * we need to start blocking threads on tickets.  The logic here is different
112862306a36Sopenharmony_ci * from the other flush paths because it doesn't rely on tickets to tell us how
112962306a36Sopenharmony_ci * much we need to flush, instead it attempts to keep us below the 80% full
113062306a36Sopenharmony_ci * watermark of space by flushing whichever reservation pool is currently the
113162306a36Sopenharmony_ci * largest.
113262306a36Sopenharmony_ci */
113362306a36Sopenharmony_cistatic void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
113462306a36Sopenharmony_ci{
113562306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info;
113662306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
113762306a36Sopenharmony_ci	struct btrfs_block_rsv *delayed_block_rsv;
113862306a36Sopenharmony_ci	struct btrfs_block_rsv *delayed_refs_rsv;
113962306a36Sopenharmony_ci	struct btrfs_block_rsv *global_rsv;
114062306a36Sopenharmony_ci	struct btrfs_block_rsv *trans_rsv;
114162306a36Sopenharmony_ci	int loops = 0;
114262306a36Sopenharmony_ci
114362306a36Sopenharmony_ci	fs_info = container_of(work, struct btrfs_fs_info,
114462306a36Sopenharmony_ci			       preempt_reclaim_work);
114562306a36Sopenharmony_ci	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
114662306a36Sopenharmony_ci	delayed_block_rsv = &fs_info->delayed_block_rsv;
114762306a36Sopenharmony_ci	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
114862306a36Sopenharmony_ci	global_rsv = &fs_info->global_block_rsv;
114962306a36Sopenharmony_ci	trans_rsv = &fs_info->trans_block_rsv;
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci	spin_lock(&space_info->lock);
115262306a36Sopenharmony_ci	while (need_preemptive_reclaim(fs_info, space_info)) {
115362306a36Sopenharmony_ci		enum btrfs_flush_state flush;
115462306a36Sopenharmony_ci		u64 delalloc_size = 0;
115562306a36Sopenharmony_ci		u64 to_reclaim, block_rsv_size;
115662306a36Sopenharmony_ci		const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
115762306a36Sopenharmony_ci
115862306a36Sopenharmony_ci		loops++;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci		/*
116162306a36Sopenharmony_ci		 * We don't have a precise counter for the metadata being
116262306a36Sopenharmony_ci		 * reserved for delalloc, so we'll approximate it by subtracting
116362306a36Sopenharmony_ci		 * out the block rsv's space from the bytes_may_use.  If that
116462306a36Sopenharmony_ci		 * amount is higher than the individual reserves, then we can
116562306a36Sopenharmony_ci		 * assume it's tied up in delalloc reservations.
116662306a36Sopenharmony_ci		 */
116762306a36Sopenharmony_ci		block_rsv_size = global_rsv_size +
116862306a36Sopenharmony_ci			btrfs_block_rsv_reserved(delayed_block_rsv) +
116962306a36Sopenharmony_ci			btrfs_block_rsv_reserved(delayed_refs_rsv) +
117062306a36Sopenharmony_ci			btrfs_block_rsv_reserved(trans_rsv);
117162306a36Sopenharmony_ci		if (block_rsv_size < space_info->bytes_may_use)
117262306a36Sopenharmony_ci			delalloc_size = space_info->bytes_may_use - block_rsv_size;
117362306a36Sopenharmony_ci
117462306a36Sopenharmony_ci		/*
117562306a36Sopenharmony_ci		 * We don't want to include the global_rsv in our calculation,
117662306a36Sopenharmony_ci		 * because that's space we can't touch.  Subtract it from the
117762306a36Sopenharmony_ci		 * block_rsv_size for the next checks.
117862306a36Sopenharmony_ci		 */
117962306a36Sopenharmony_ci		block_rsv_size -= global_rsv_size;
118062306a36Sopenharmony_ci
118162306a36Sopenharmony_ci		/*
118262306a36Sopenharmony_ci		 * We really want to avoid flushing delalloc too much, as it
118362306a36Sopenharmony_ci		 * could result in poor allocation patterns, so only flush it if
118462306a36Sopenharmony_ci		 * it's larger than the rest of the pools combined.
118562306a36Sopenharmony_ci		 */
118662306a36Sopenharmony_ci		if (delalloc_size > block_rsv_size) {
118762306a36Sopenharmony_ci			to_reclaim = delalloc_size;
118862306a36Sopenharmony_ci			flush = FLUSH_DELALLOC;
118962306a36Sopenharmony_ci		} else if (space_info->bytes_pinned >
119062306a36Sopenharmony_ci			   (btrfs_block_rsv_reserved(delayed_block_rsv) +
119162306a36Sopenharmony_ci			    btrfs_block_rsv_reserved(delayed_refs_rsv))) {
119262306a36Sopenharmony_ci			to_reclaim = space_info->bytes_pinned;
119362306a36Sopenharmony_ci			flush = COMMIT_TRANS;
119462306a36Sopenharmony_ci		} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
119562306a36Sopenharmony_ci			   btrfs_block_rsv_reserved(delayed_refs_rsv)) {
119662306a36Sopenharmony_ci			to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
119762306a36Sopenharmony_ci			flush = FLUSH_DELAYED_ITEMS_NR;
119862306a36Sopenharmony_ci		} else {
119962306a36Sopenharmony_ci			to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
120062306a36Sopenharmony_ci			flush = FLUSH_DELAYED_REFS_NR;
120162306a36Sopenharmony_ci		}
120262306a36Sopenharmony_ci
120362306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci		/*
120662306a36Sopenharmony_ci		 * We don't want to reclaim everything, just a portion, so scale
120762306a36Sopenharmony_ci		 * down the to_reclaim by 1/4.  If it takes us down to 0,
120862306a36Sopenharmony_ci		 * reclaim 1 items worth.
120962306a36Sopenharmony_ci		 */
121062306a36Sopenharmony_ci		to_reclaim >>= 2;
121162306a36Sopenharmony_ci		if (!to_reclaim)
121262306a36Sopenharmony_ci			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
121362306a36Sopenharmony_ci		flush_space(fs_info, space_info, to_reclaim, flush, true);
121462306a36Sopenharmony_ci		cond_resched();
121562306a36Sopenharmony_ci		spin_lock(&space_info->lock);
121662306a36Sopenharmony_ci	}
121762306a36Sopenharmony_ci
121862306a36Sopenharmony_ci	/* We only went through once, back off our clamping. */
121962306a36Sopenharmony_ci	if (loops == 1 && !space_info->reclaim_size)
122062306a36Sopenharmony_ci		space_info->clamp = max(1, space_info->clamp - 1);
122162306a36Sopenharmony_ci	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
122262306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
122362306a36Sopenharmony_ci}
122462306a36Sopenharmony_ci
122562306a36Sopenharmony_ci/*
122662306a36Sopenharmony_ci * FLUSH_DELALLOC_WAIT:
122762306a36Sopenharmony_ci *   Space is freed from flushing delalloc in one of two ways.
122862306a36Sopenharmony_ci *
122962306a36Sopenharmony_ci *   1) compression is on and we allocate less space than we reserved
123062306a36Sopenharmony_ci *   2) we are overwriting existing space
123162306a36Sopenharmony_ci *
123262306a36Sopenharmony_ci *   For #1 that extra space is reclaimed as soon as the delalloc pages are
123362306a36Sopenharmony_ci *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
123462306a36Sopenharmony_ci *   length to ->bytes_reserved, and subtracts the reserved space from
123562306a36Sopenharmony_ci *   ->bytes_may_use.
123662306a36Sopenharmony_ci *
123762306a36Sopenharmony_ci *   For #2 this is trickier.  Once the ordered extent runs we will drop the
123862306a36Sopenharmony_ci *   extent in the range we are overwriting, which creates a delayed ref for
123962306a36Sopenharmony_ci *   that freed extent.  This however is not reclaimed until the transaction
124062306a36Sopenharmony_ci *   commits, thus the next stages.
124162306a36Sopenharmony_ci *
124262306a36Sopenharmony_ci * RUN_DELAYED_IPUTS
124362306a36Sopenharmony_ci *   If we are freeing inodes, we want to make sure all delayed iputs have
124462306a36Sopenharmony_ci *   completed, because they could have been on an inode with i_nlink == 0, and
124562306a36Sopenharmony_ci *   thus have been truncated and freed up space.  But again this space is not
124662306a36Sopenharmony_ci *   immediately re-usable, it comes in the form of a delayed ref, which must be
124762306a36Sopenharmony_ci *   run and then the transaction must be committed.
124862306a36Sopenharmony_ci *
124962306a36Sopenharmony_ci * COMMIT_TRANS
125062306a36Sopenharmony_ci *   This is where we reclaim all of the pinned space generated by running the
125162306a36Sopenharmony_ci *   iputs
125262306a36Sopenharmony_ci *
125362306a36Sopenharmony_ci * ALLOC_CHUNK_FORCE
125462306a36Sopenharmony_ci *   For data we start with alloc chunk force, however we could have been full
125562306a36Sopenharmony_ci *   before, and then the transaction commit could have freed new block groups,
125662306a36Sopenharmony_ci *   so if we now have space to allocate do the force chunk allocation.
125762306a36Sopenharmony_ci */
125862306a36Sopenharmony_cistatic const enum btrfs_flush_state data_flush_states[] = {
125962306a36Sopenharmony_ci	FLUSH_DELALLOC_FULL,
126062306a36Sopenharmony_ci	RUN_DELAYED_IPUTS,
126162306a36Sopenharmony_ci	COMMIT_TRANS,
126262306a36Sopenharmony_ci	ALLOC_CHUNK_FORCE,
126362306a36Sopenharmony_ci};
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_cistatic void btrfs_async_reclaim_data_space(struct work_struct *work)
126662306a36Sopenharmony_ci{
126762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info;
126862306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
126962306a36Sopenharmony_ci	u64 last_tickets_id;
127062306a36Sopenharmony_ci	enum btrfs_flush_state flush_state = 0;
127162306a36Sopenharmony_ci
127262306a36Sopenharmony_ci	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
127362306a36Sopenharmony_ci	space_info = fs_info->data_sinfo;
127462306a36Sopenharmony_ci
127562306a36Sopenharmony_ci	spin_lock(&space_info->lock);
127662306a36Sopenharmony_ci	if (list_empty(&space_info->tickets)) {
127762306a36Sopenharmony_ci		space_info->flush = 0;
127862306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
127962306a36Sopenharmony_ci		return;
128062306a36Sopenharmony_ci	}
128162306a36Sopenharmony_ci	last_tickets_id = space_info->tickets_id;
128262306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
128362306a36Sopenharmony_ci
128462306a36Sopenharmony_ci	while (!space_info->full) {
128562306a36Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
128662306a36Sopenharmony_ci		spin_lock(&space_info->lock);
128762306a36Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
128862306a36Sopenharmony_ci			space_info->flush = 0;
128962306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
129062306a36Sopenharmony_ci			return;
129162306a36Sopenharmony_ci		}
129262306a36Sopenharmony_ci
129362306a36Sopenharmony_ci		/* Something happened, fail everything and bail. */
129462306a36Sopenharmony_ci		if (BTRFS_FS_ERROR(fs_info))
129562306a36Sopenharmony_ci			goto aborted_fs;
129662306a36Sopenharmony_ci		last_tickets_id = space_info->tickets_id;
129762306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
129862306a36Sopenharmony_ci	}
129962306a36Sopenharmony_ci
130062306a36Sopenharmony_ci	while (flush_state < ARRAY_SIZE(data_flush_states)) {
130162306a36Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX,
130262306a36Sopenharmony_ci			    data_flush_states[flush_state], false);
130362306a36Sopenharmony_ci		spin_lock(&space_info->lock);
130462306a36Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
130562306a36Sopenharmony_ci			space_info->flush = 0;
130662306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
130762306a36Sopenharmony_ci			return;
130862306a36Sopenharmony_ci		}
130962306a36Sopenharmony_ci
131062306a36Sopenharmony_ci		if (last_tickets_id == space_info->tickets_id) {
131162306a36Sopenharmony_ci			flush_state++;
131262306a36Sopenharmony_ci		} else {
131362306a36Sopenharmony_ci			last_tickets_id = space_info->tickets_id;
131462306a36Sopenharmony_ci			flush_state = 0;
131562306a36Sopenharmony_ci		}
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
131862306a36Sopenharmony_ci			if (space_info->full) {
131962306a36Sopenharmony_ci				if (maybe_fail_all_tickets(fs_info, space_info))
132062306a36Sopenharmony_ci					flush_state = 0;
132162306a36Sopenharmony_ci				else
132262306a36Sopenharmony_ci					space_info->flush = 0;
132362306a36Sopenharmony_ci			} else {
132462306a36Sopenharmony_ci				flush_state = 0;
132562306a36Sopenharmony_ci			}
132662306a36Sopenharmony_ci
132762306a36Sopenharmony_ci			/* Something happened, fail everything and bail. */
132862306a36Sopenharmony_ci			if (BTRFS_FS_ERROR(fs_info))
132962306a36Sopenharmony_ci				goto aborted_fs;
133062306a36Sopenharmony_ci
133162306a36Sopenharmony_ci		}
133262306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
133362306a36Sopenharmony_ci	}
133462306a36Sopenharmony_ci	return;
133562306a36Sopenharmony_ci
133662306a36Sopenharmony_ciaborted_fs:
133762306a36Sopenharmony_ci	maybe_fail_all_tickets(fs_info, space_info);
133862306a36Sopenharmony_ci	space_info->flush = 0;
133962306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
134062306a36Sopenharmony_ci}
134162306a36Sopenharmony_ci
134262306a36Sopenharmony_civoid btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
134362306a36Sopenharmony_ci{
134462306a36Sopenharmony_ci	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
134562306a36Sopenharmony_ci	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
134662306a36Sopenharmony_ci	INIT_WORK(&fs_info->preempt_reclaim_work,
134762306a36Sopenharmony_ci		  btrfs_preempt_reclaim_metadata_space);
134862306a36Sopenharmony_ci}
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_cistatic const enum btrfs_flush_state priority_flush_states[] = {
135162306a36Sopenharmony_ci	FLUSH_DELAYED_ITEMS_NR,
135262306a36Sopenharmony_ci	FLUSH_DELAYED_ITEMS,
135362306a36Sopenharmony_ci	ALLOC_CHUNK,
135462306a36Sopenharmony_ci};
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_cistatic const enum btrfs_flush_state evict_flush_states[] = {
135762306a36Sopenharmony_ci	FLUSH_DELAYED_ITEMS_NR,
135862306a36Sopenharmony_ci	FLUSH_DELAYED_ITEMS,
135962306a36Sopenharmony_ci	FLUSH_DELAYED_REFS_NR,
136062306a36Sopenharmony_ci	FLUSH_DELAYED_REFS,
136162306a36Sopenharmony_ci	FLUSH_DELALLOC,
136262306a36Sopenharmony_ci	FLUSH_DELALLOC_WAIT,
136362306a36Sopenharmony_ci	FLUSH_DELALLOC_FULL,
136462306a36Sopenharmony_ci	ALLOC_CHUNK,
136562306a36Sopenharmony_ci	COMMIT_TRANS,
136662306a36Sopenharmony_ci};
136762306a36Sopenharmony_ci
136862306a36Sopenharmony_cistatic void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
136962306a36Sopenharmony_ci				struct btrfs_space_info *space_info,
137062306a36Sopenharmony_ci				struct reserve_ticket *ticket,
137162306a36Sopenharmony_ci				const enum btrfs_flush_state *states,
137262306a36Sopenharmony_ci				int states_nr)
137362306a36Sopenharmony_ci{
137462306a36Sopenharmony_ci	u64 to_reclaim;
137562306a36Sopenharmony_ci	int flush_state = 0;
137662306a36Sopenharmony_ci
137762306a36Sopenharmony_ci	spin_lock(&space_info->lock);
137862306a36Sopenharmony_ci	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
137962306a36Sopenharmony_ci	/*
138062306a36Sopenharmony_ci	 * This is the priority reclaim path, so to_reclaim could be >0 still
138162306a36Sopenharmony_ci	 * because we may have only satisfied the priority tickets and still
138262306a36Sopenharmony_ci	 * left non priority tickets on the list.  We would then have
138362306a36Sopenharmony_ci	 * to_reclaim but ->bytes == 0.
138462306a36Sopenharmony_ci	 */
138562306a36Sopenharmony_ci	if (ticket->bytes == 0) {
138662306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
138762306a36Sopenharmony_ci		return;
138862306a36Sopenharmony_ci	}
138962306a36Sopenharmony_ci
139062306a36Sopenharmony_ci	while (flush_state < states_nr) {
139162306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
139262306a36Sopenharmony_ci		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
139362306a36Sopenharmony_ci			    false);
139462306a36Sopenharmony_ci		flush_state++;
139562306a36Sopenharmony_ci		spin_lock(&space_info->lock);
139662306a36Sopenharmony_ci		if (ticket->bytes == 0) {
139762306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
139862306a36Sopenharmony_ci			return;
139962306a36Sopenharmony_ci		}
140062306a36Sopenharmony_ci	}
140162306a36Sopenharmony_ci
140262306a36Sopenharmony_ci	/*
140362306a36Sopenharmony_ci	 * Attempt to steal from the global rsv if we can, except if the fs was
140462306a36Sopenharmony_ci	 * turned into error mode due to a transaction abort when flushing space
140562306a36Sopenharmony_ci	 * above, in that case fail with the abort error instead of returning
140662306a36Sopenharmony_ci	 * success to the caller if we can steal from the global rsv - this is
140762306a36Sopenharmony_ci	 * just to have caller fail immeditelly instead of later when trying to
140862306a36Sopenharmony_ci	 * modify the fs, making it easier to debug -ENOSPC problems.
140962306a36Sopenharmony_ci	 */
141062306a36Sopenharmony_ci	if (BTRFS_FS_ERROR(fs_info)) {
141162306a36Sopenharmony_ci		ticket->error = BTRFS_FS_ERROR(fs_info);
141262306a36Sopenharmony_ci		remove_ticket(space_info, ticket);
141362306a36Sopenharmony_ci	} else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
141462306a36Sopenharmony_ci		ticket->error = -ENOSPC;
141562306a36Sopenharmony_ci		remove_ticket(space_info, ticket);
141662306a36Sopenharmony_ci	}
141762306a36Sopenharmony_ci
141862306a36Sopenharmony_ci	/*
141962306a36Sopenharmony_ci	 * We must run try_granting_tickets here because we could be a large
142062306a36Sopenharmony_ci	 * ticket in front of a smaller ticket that can now be satisfied with
142162306a36Sopenharmony_ci	 * the available space.
142262306a36Sopenharmony_ci	 */
142362306a36Sopenharmony_ci	btrfs_try_granting_tickets(fs_info, space_info);
142462306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
142562306a36Sopenharmony_ci}
142662306a36Sopenharmony_ci
142762306a36Sopenharmony_cistatic void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
142862306a36Sopenharmony_ci					struct btrfs_space_info *space_info,
142962306a36Sopenharmony_ci					struct reserve_ticket *ticket)
143062306a36Sopenharmony_ci{
143162306a36Sopenharmony_ci	spin_lock(&space_info->lock);
143262306a36Sopenharmony_ci
143362306a36Sopenharmony_ci	/* We could have been granted before we got here. */
143462306a36Sopenharmony_ci	if (ticket->bytes == 0) {
143562306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
143662306a36Sopenharmony_ci		return;
143762306a36Sopenharmony_ci	}
143862306a36Sopenharmony_ci
143962306a36Sopenharmony_ci	while (!space_info->full) {
144062306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
144162306a36Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
144262306a36Sopenharmony_ci		spin_lock(&space_info->lock);
144362306a36Sopenharmony_ci		if (ticket->bytes == 0) {
144462306a36Sopenharmony_ci			spin_unlock(&space_info->lock);
144562306a36Sopenharmony_ci			return;
144662306a36Sopenharmony_ci		}
144762306a36Sopenharmony_ci	}
144862306a36Sopenharmony_ci
144962306a36Sopenharmony_ci	ticket->error = -ENOSPC;
145062306a36Sopenharmony_ci	remove_ticket(space_info, ticket);
145162306a36Sopenharmony_ci	btrfs_try_granting_tickets(fs_info, space_info);
145262306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
145362306a36Sopenharmony_ci}
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_cistatic void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
145662306a36Sopenharmony_ci				struct btrfs_space_info *space_info,
145762306a36Sopenharmony_ci				struct reserve_ticket *ticket)
145862306a36Sopenharmony_ci
145962306a36Sopenharmony_ci{
146062306a36Sopenharmony_ci	DEFINE_WAIT(wait);
146162306a36Sopenharmony_ci	int ret = 0;
146262306a36Sopenharmony_ci
146362306a36Sopenharmony_ci	spin_lock(&space_info->lock);
146462306a36Sopenharmony_ci	while (ticket->bytes > 0 && ticket->error == 0) {
146562306a36Sopenharmony_ci		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
146662306a36Sopenharmony_ci		if (ret) {
146762306a36Sopenharmony_ci			/*
146862306a36Sopenharmony_ci			 * Delete us from the list. After we unlock the space
146962306a36Sopenharmony_ci			 * info, we don't want the async reclaim job to reserve
147062306a36Sopenharmony_ci			 * space for this ticket. If that would happen, then the
147162306a36Sopenharmony_ci			 * ticket's task would not known that space was reserved
147262306a36Sopenharmony_ci			 * despite getting an error, resulting in a space leak
147362306a36Sopenharmony_ci			 * (bytes_may_use counter of our space_info).
147462306a36Sopenharmony_ci			 */
147562306a36Sopenharmony_ci			remove_ticket(space_info, ticket);
147662306a36Sopenharmony_ci			ticket->error = -EINTR;
147762306a36Sopenharmony_ci			break;
147862306a36Sopenharmony_ci		}
147962306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
148062306a36Sopenharmony_ci
148162306a36Sopenharmony_ci		schedule();
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci		finish_wait(&ticket->wait, &wait);
148462306a36Sopenharmony_ci		spin_lock(&space_info->lock);
148562306a36Sopenharmony_ci	}
148662306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
148762306a36Sopenharmony_ci}
148862306a36Sopenharmony_ci
148962306a36Sopenharmony_ci/*
149062306a36Sopenharmony_ci * Do the appropriate flushing and waiting for a ticket.
149162306a36Sopenharmony_ci *
149262306a36Sopenharmony_ci * @fs_info:    the filesystem
149362306a36Sopenharmony_ci * @space_info: space info for the reservation
149462306a36Sopenharmony_ci * @ticket:     ticket for the reservation
149562306a36Sopenharmony_ci * @start_ns:   timestamp when the reservation started
149662306a36Sopenharmony_ci * @orig_bytes: amount of bytes originally reserved
149762306a36Sopenharmony_ci * @flush:      how much we can flush
149862306a36Sopenharmony_ci *
149962306a36Sopenharmony_ci * This does the work of figuring out how to flush for the ticket, waiting for
150062306a36Sopenharmony_ci * the reservation, and returning the appropriate error if there is one.
150162306a36Sopenharmony_ci */
150262306a36Sopenharmony_cistatic int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
150362306a36Sopenharmony_ci				 struct btrfs_space_info *space_info,
150462306a36Sopenharmony_ci				 struct reserve_ticket *ticket,
150562306a36Sopenharmony_ci				 u64 start_ns, u64 orig_bytes,
150662306a36Sopenharmony_ci				 enum btrfs_reserve_flush_enum flush)
150762306a36Sopenharmony_ci{
150862306a36Sopenharmony_ci	int ret;
150962306a36Sopenharmony_ci
151062306a36Sopenharmony_ci	switch (flush) {
151162306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_DATA:
151262306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_ALL:
151362306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
151462306a36Sopenharmony_ci		wait_reserve_ticket(fs_info, space_info, ticket);
151562306a36Sopenharmony_ci		break;
151662306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_LIMIT:
151762306a36Sopenharmony_ci		priority_reclaim_metadata_space(fs_info, space_info, ticket,
151862306a36Sopenharmony_ci						priority_flush_states,
151962306a36Sopenharmony_ci						ARRAY_SIZE(priority_flush_states));
152062306a36Sopenharmony_ci		break;
152162306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_EVICT:
152262306a36Sopenharmony_ci		priority_reclaim_metadata_space(fs_info, space_info, ticket,
152362306a36Sopenharmony_ci						evict_flush_states,
152462306a36Sopenharmony_ci						ARRAY_SIZE(evict_flush_states));
152562306a36Sopenharmony_ci		break;
152662306a36Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
152762306a36Sopenharmony_ci		priority_reclaim_data_space(fs_info, space_info, ticket);
152862306a36Sopenharmony_ci		break;
152962306a36Sopenharmony_ci	default:
153062306a36Sopenharmony_ci		ASSERT(0);
153162306a36Sopenharmony_ci		break;
153262306a36Sopenharmony_ci	}
153362306a36Sopenharmony_ci
153462306a36Sopenharmony_ci	ret = ticket->error;
153562306a36Sopenharmony_ci	ASSERT(list_empty(&ticket->list));
153662306a36Sopenharmony_ci	/*
153762306a36Sopenharmony_ci	 * Check that we can't have an error set if the reservation succeeded,
153862306a36Sopenharmony_ci	 * as that would confuse tasks and lead them to error out without
153962306a36Sopenharmony_ci	 * releasing reserved space (if an error happens the expectation is that
154062306a36Sopenharmony_ci	 * space wasn't reserved at all).
154162306a36Sopenharmony_ci	 */
154262306a36Sopenharmony_ci	ASSERT(!(ticket->bytes == 0 && ticket->error));
154362306a36Sopenharmony_ci	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
154462306a36Sopenharmony_ci				   start_ns, flush, ticket->error);
154562306a36Sopenharmony_ci	return ret;
154662306a36Sopenharmony_ci}
154762306a36Sopenharmony_ci
154862306a36Sopenharmony_ci/*
154962306a36Sopenharmony_ci * This returns true if this flush state will go through the ordinary flushing
155062306a36Sopenharmony_ci * code.
155162306a36Sopenharmony_ci */
155262306a36Sopenharmony_cistatic inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
155362306a36Sopenharmony_ci{
155462306a36Sopenharmony_ci	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
155562306a36Sopenharmony_ci		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
155662306a36Sopenharmony_ci}
155762306a36Sopenharmony_ci
155862306a36Sopenharmony_cistatic inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
155962306a36Sopenharmony_ci				       struct btrfs_space_info *space_info)
156062306a36Sopenharmony_ci{
156162306a36Sopenharmony_ci	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
156262306a36Sopenharmony_ci	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
156362306a36Sopenharmony_ci
156462306a36Sopenharmony_ci	/*
156562306a36Sopenharmony_ci	 * If we're heavy on ordered operations then clamping won't help us.  We
156662306a36Sopenharmony_ci	 * need to clamp specifically to keep up with dirty'ing buffered
156762306a36Sopenharmony_ci	 * writers, because there's not a 1:1 correlation of writing delalloc
156862306a36Sopenharmony_ci	 * and freeing space, like there is with flushing delayed refs or
156962306a36Sopenharmony_ci	 * delayed nodes.  If we're already more ordered than delalloc then
157062306a36Sopenharmony_ci	 * we're keeping up, otherwise we aren't and should probably clamp.
157162306a36Sopenharmony_ci	 */
157262306a36Sopenharmony_ci	if (ordered < delalloc)
157362306a36Sopenharmony_ci		space_info->clamp = min(space_info->clamp + 1, 8);
157462306a36Sopenharmony_ci}
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_cistatic inline bool can_steal(enum btrfs_reserve_flush_enum flush)
157762306a36Sopenharmony_ci{
157862306a36Sopenharmony_ci	return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
157962306a36Sopenharmony_ci		flush == BTRFS_RESERVE_FLUSH_EVICT);
158062306a36Sopenharmony_ci}
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci/*
158362306a36Sopenharmony_ci * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
158462306a36Sopenharmony_ci * fail as quickly as possible.
158562306a36Sopenharmony_ci */
158662306a36Sopenharmony_cistatic inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
158762306a36Sopenharmony_ci{
158862306a36Sopenharmony_ci	return (flush != BTRFS_RESERVE_NO_FLUSH &&
158962306a36Sopenharmony_ci		flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
159062306a36Sopenharmony_ci}
159162306a36Sopenharmony_ci
159262306a36Sopenharmony_ci/*
159362306a36Sopenharmony_ci * Try to reserve bytes from the block_rsv's space.
159462306a36Sopenharmony_ci *
159562306a36Sopenharmony_ci * @fs_info:    the filesystem
159662306a36Sopenharmony_ci * @space_info: space info we want to allocate from
159762306a36Sopenharmony_ci * @orig_bytes: number of bytes we want
159862306a36Sopenharmony_ci * @flush:      whether or not we can flush to make our reservation
159962306a36Sopenharmony_ci *
160062306a36Sopenharmony_ci * This will reserve orig_bytes number of bytes from the space info associated
160162306a36Sopenharmony_ci * with the block_rsv.  If there is not enough space it will make an attempt to
160262306a36Sopenharmony_ci * flush out space to make room.  It will do this by flushing delalloc if
160362306a36Sopenharmony_ci * possible or committing the transaction.  If flush is 0 then no attempts to
160462306a36Sopenharmony_ci * regain reservations will be made and this will fail if there is not enough
160562306a36Sopenharmony_ci * space already.
160662306a36Sopenharmony_ci */
160762306a36Sopenharmony_cistatic int __reserve_bytes(struct btrfs_fs_info *fs_info,
160862306a36Sopenharmony_ci			   struct btrfs_space_info *space_info, u64 orig_bytes,
160962306a36Sopenharmony_ci			   enum btrfs_reserve_flush_enum flush)
161062306a36Sopenharmony_ci{
161162306a36Sopenharmony_ci	struct work_struct *async_work;
161262306a36Sopenharmony_ci	struct reserve_ticket ticket;
161362306a36Sopenharmony_ci	u64 start_ns = 0;
161462306a36Sopenharmony_ci	u64 used;
161562306a36Sopenharmony_ci	int ret = -ENOSPC;
161662306a36Sopenharmony_ci	bool pending_tickets;
161762306a36Sopenharmony_ci
161862306a36Sopenharmony_ci	ASSERT(orig_bytes);
161962306a36Sopenharmony_ci	/*
162062306a36Sopenharmony_ci	 * If we have a transaction handle (current->journal_info != NULL),
162162306a36Sopenharmony_ci	 * then the flush method must be neither BTRFS_RESERVE_FLUSH_ALL* nor
162262306a36Sopenharmony_ci	 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
162362306a36Sopenharmony_ci	 * flushing methods can trigger transaction commits.
162462306a36Sopenharmony_ci	 */
162562306a36Sopenharmony_ci	if (current->journal_info) {
162662306a36Sopenharmony_ci		/* One assert per line for easier debugging. */
162762306a36Sopenharmony_ci		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
162862306a36Sopenharmony_ci		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
162962306a36Sopenharmony_ci		ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
163062306a36Sopenharmony_ci	}
163162306a36Sopenharmony_ci
163262306a36Sopenharmony_ci	if (flush == BTRFS_RESERVE_FLUSH_DATA)
163362306a36Sopenharmony_ci		async_work = &fs_info->async_data_reclaim_work;
163462306a36Sopenharmony_ci	else
163562306a36Sopenharmony_ci		async_work = &fs_info->async_reclaim_work;
163662306a36Sopenharmony_ci
163762306a36Sopenharmony_ci	spin_lock(&space_info->lock);
163862306a36Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	/*
164162306a36Sopenharmony_ci	 * We don't want NO_FLUSH allocations to jump everybody, they can
164262306a36Sopenharmony_ci	 * generally handle ENOSPC in a different way, so treat them the same as
164362306a36Sopenharmony_ci	 * normal flushers when it comes to skipping pending tickets.
164462306a36Sopenharmony_ci	 */
164562306a36Sopenharmony_ci	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
164662306a36Sopenharmony_ci		pending_tickets = !list_empty(&space_info->tickets) ||
164762306a36Sopenharmony_ci			!list_empty(&space_info->priority_tickets);
164862306a36Sopenharmony_ci	else
164962306a36Sopenharmony_ci		pending_tickets = !list_empty(&space_info->priority_tickets);
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci	/*
165262306a36Sopenharmony_ci	 * Carry on if we have enough space (short-circuit) OR call
165362306a36Sopenharmony_ci	 * can_overcommit() to ensure we can overcommit to continue.
165462306a36Sopenharmony_ci	 */
165562306a36Sopenharmony_ci	if (!pending_tickets &&
165662306a36Sopenharmony_ci	    ((used + orig_bytes <= space_info->total_bytes) ||
165762306a36Sopenharmony_ci	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
165862306a36Sopenharmony_ci		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
165962306a36Sopenharmony_ci						      orig_bytes);
166062306a36Sopenharmony_ci		ret = 0;
166162306a36Sopenharmony_ci	}
166262306a36Sopenharmony_ci
166362306a36Sopenharmony_ci	/*
166462306a36Sopenharmony_ci	 * Things are dire, we need to make a reservation so we don't abort.  We
166562306a36Sopenharmony_ci	 * will let this reservation go through as long as we have actual space
166662306a36Sopenharmony_ci	 * left to allocate for the block.
166762306a36Sopenharmony_ci	 */
166862306a36Sopenharmony_ci	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
166962306a36Sopenharmony_ci		used = btrfs_space_info_used(space_info, false);
167062306a36Sopenharmony_ci		if (used + orig_bytes <= space_info->total_bytes) {
167162306a36Sopenharmony_ci			btrfs_space_info_update_bytes_may_use(fs_info, space_info,
167262306a36Sopenharmony_ci							      orig_bytes);
167362306a36Sopenharmony_ci			ret = 0;
167462306a36Sopenharmony_ci		}
167562306a36Sopenharmony_ci	}
167662306a36Sopenharmony_ci
167762306a36Sopenharmony_ci	/*
167862306a36Sopenharmony_ci	 * If we couldn't make a reservation then setup our reservation ticket
167962306a36Sopenharmony_ci	 * and kick the async worker if it's not already running.
168062306a36Sopenharmony_ci	 *
168162306a36Sopenharmony_ci	 * If we are a priority flusher then we just need to add our ticket to
168262306a36Sopenharmony_ci	 * the list and we will do our own flushing further down.
168362306a36Sopenharmony_ci	 */
168462306a36Sopenharmony_ci	if (ret && can_ticket(flush)) {
168562306a36Sopenharmony_ci		ticket.bytes = orig_bytes;
168662306a36Sopenharmony_ci		ticket.error = 0;
168762306a36Sopenharmony_ci		space_info->reclaim_size += ticket.bytes;
168862306a36Sopenharmony_ci		init_waitqueue_head(&ticket.wait);
168962306a36Sopenharmony_ci		ticket.steal = can_steal(flush);
169062306a36Sopenharmony_ci		if (trace_btrfs_reserve_ticket_enabled())
169162306a36Sopenharmony_ci			start_ns = ktime_get_ns();
169262306a36Sopenharmony_ci
169362306a36Sopenharmony_ci		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
169462306a36Sopenharmony_ci		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
169562306a36Sopenharmony_ci		    flush == BTRFS_RESERVE_FLUSH_DATA) {
169662306a36Sopenharmony_ci			list_add_tail(&ticket.list, &space_info->tickets);
169762306a36Sopenharmony_ci			if (!space_info->flush) {
169862306a36Sopenharmony_ci				/*
169962306a36Sopenharmony_ci				 * We were forced to add a reserve ticket, so
170062306a36Sopenharmony_ci				 * our preemptive flushing is unable to keep
170162306a36Sopenharmony_ci				 * up.  Clamp down on the threshold for the
170262306a36Sopenharmony_ci				 * preemptive flushing in order to keep up with
170362306a36Sopenharmony_ci				 * the workload.
170462306a36Sopenharmony_ci				 */
170562306a36Sopenharmony_ci				maybe_clamp_preempt(fs_info, space_info);
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_ci				space_info->flush = 1;
170862306a36Sopenharmony_ci				trace_btrfs_trigger_flush(fs_info,
170962306a36Sopenharmony_ci							  space_info->flags,
171062306a36Sopenharmony_ci							  orig_bytes, flush,
171162306a36Sopenharmony_ci							  "enospc");
171262306a36Sopenharmony_ci				queue_work(system_unbound_wq, async_work);
171362306a36Sopenharmony_ci			}
171462306a36Sopenharmony_ci		} else {
171562306a36Sopenharmony_ci			list_add_tail(&ticket.list,
171662306a36Sopenharmony_ci				      &space_info->priority_tickets);
171762306a36Sopenharmony_ci		}
171862306a36Sopenharmony_ci	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
171962306a36Sopenharmony_ci		/*
172062306a36Sopenharmony_ci		 * We will do the space reservation dance during log replay,
172162306a36Sopenharmony_ci		 * which means we won't have fs_info->fs_root set, so don't do
172262306a36Sopenharmony_ci		 * the async reclaim as we will panic.
172362306a36Sopenharmony_ci		 */
172462306a36Sopenharmony_ci		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
172562306a36Sopenharmony_ci		    !work_busy(&fs_info->preempt_reclaim_work) &&
172662306a36Sopenharmony_ci		    need_preemptive_reclaim(fs_info, space_info)) {
172762306a36Sopenharmony_ci			trace_btrfs_trigger_flush(fs_info, space_info->flags,
172862306a36Sopenharmony_ci						  orig_bytes, flush, "preempt");
172962306a36Sopenharmony_ci			queue_work(system_unbound_wq,
173062306a36Sopenharmony_ci				   &fs_info->preempt_reclaim_work);
173162306a36Sopenharmony_ci		}
173262306a36Sopenharmony_ci	}
173362306a36Sopenharmony_ci	spin_unlock(&space_info->lock);
173462306a36Sopenharmony_ci	if (!ret || !can_ticket(flush))
173562306a36Sopenharmony_ci		return ret;
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ci	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
173862306a36Sopenharmony_ci				     orig_bytes, flush);
173962306a36Sopenharmony_ci}
174062306a36Sopenharmony_ci
174162306a36Sopenharmony_ci/*
174262306a36Sopenharmony_ci * Try to reserve metadata bytes from the block_rsv's space.
174362306a36Sopenharmony_ci *
174462306a36Sopenharmony_ci * @fs_info:    the filesystem
174562306a36Sopenharmony_ci * @block_rsv:  block_rsv we're allocating for
174662306a36Sopenharmony_ci * @orig_bytes: number of bytes we want
174762306a36Sopenharmony_ci * @flush:      whether or not we can flush to make our reservation
174862306a36Sopenharmony_ci *
174962306a36Sopenharmony_ci * This will reserve orig_bytes number of bytes from the space info associated
175062306a36Sopenharmony_ci * with the block_rsv.  If there is not enough space it will make an attempt to
175162306a36Sopenharmony_ci * flush out space to make room.  It will do this by flushing delalloc if
175262306a36Sopenharmony_ci * possible or committing the transaction.  If flush is 0 then no attempts to
175362306a36Sopenharmony_ci * regain reservations will be made and this will fail if there is not enough
175462306a36Sopenharmony_ci * space already.
175562306a36Sopenharmony_ci */
175662306a36Sopenharmony_ciint btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
175762306a36Sopenharmony_ci				 struct btrfs_block_rsv *block_rsv,
175862306a36Sopenharmony_ci				 u64 orig_bytes,
175962306a36Sopenharmony_ci				 enum btrfs_reserve_flush_enum flush)
176062306a36Sopenharmony_ci{
176162306a36Sopenharmony_ci	int ret;
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_ci	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
176462306a36Sopenharmony_ci	if (ret == -ENOSPC) {
176562306a36Sopenharmony_ci		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
176662306a36Sopenharmony_ci					      block_rsv->space_info->flags,
176762306a36Sopenharmony_ci					      orig_bytes, 1);
176862306a36Sopenharmony_ci
176962306a36Sopenharmony_ci		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
177062306a36Sopenharmony_ci			btrfs_dump_space_info(fs_info, block_rsv->space_info,
177162306a36Sopenharmony_ci					      orig_bytes, 0);
177262306a36Sopenharmony_ci	}
177362306a36Sopenharmony_ci	return ret;
177462306a36Sopenharmony_ci}
177562306a36Sopenharmony_ci
177662306a36Sopenharmony_ci/*
177762306a36Sopenharmony_ci * Try to reserve data bytes for an allocation.
177862306a36Sopenharmony_ci *
177962306a36Sopenharmony_ci * @fs_info: the filesystem
178062306a36Sopenharmony_ci * @bytes:   number of bytes we need
178162306a36Sopenharmony_ci * @flush:   how we are allowed to flush
178262306a36Sopenharmony_ci *
178362306a36Sopenharmony_ci * This will reserve bytes from the data space info.  If there is not enough
178462306a36Sopenharmony_ci * space then we will attempt to flush space as specified by flush.
178562306a36Sopenharmony_ci */
178662306a36Sopenharmony_ciint btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
178762306a36Sopenharmony_ci			     enum btrfs_reserve_flush_enum flush)
178862306a36Sopenharmony_ci{
178962306a36Sopenharmony_ci	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
179062306a36Sopenharmony_ci	int ret;
179162306a36Sopenharmony_ci
179262306a36Sopenharmony_ci	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
179362306a36Sopenharmony_ci	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
179462306a36Sopenharmony_ci	       flush == BTRFS_RESERVE_NO_FLUSH);
179562306a36Sopenharmony_ci	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
179662306a36Sopenharmony_ci
179762306a36Sopenharmony_ci	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
179862306a36Sopenharmony_ci	if (ret == -ENOSPC) {
179962306a36Sopenharmony_ci		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
180062306a36Sopenharmony_ci					      data_sinfo->flags, bytes, 1);
180162306a36Sopenharmony_ci		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
180262306a36Sopenharmony_ci			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
180362306a36Sopenharmony_ci	}
180462306a36Sopenharmony_ci	return ret;
180562306a36Sopenharmony_ci}
180662306a36Sopenharmony_ci
180762306a36Sopenharmony_ci/* Dump all the space infos when we abort a transaction due to ENOSPC. */
180862306a36Sopenharmony_ci__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
180962306a36Sopenharmony_ci{
181062306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
181162306a36Sopenharmony_ci
181262306a36Sopenharmony_ci	btrfs_info(fs_info, "dumping space info:");
181362306a36Sopenharmony_ci	list_for_each_entry(space_info, &fs_info->space_info, list) {
181462306a36Sopenharmony_ci		spin_lock(&space_info->lock);
181562306a36Sopenharmony_ci		__btrfs_dump_space_info(fs_info, space_info);
181662306a36Sopenharmony_ci		spin_unlock(&space_info->lock);
181762306a36Sopenharmony_ci	}
181862306a36Sopenharmony_ci	dump_global_block_rsv(fs_info);
181962306a36Sopenharmony_ci}
182062306a36Sopenharmony_ci
182162306a36Sopenharmony_ci/*
182262306a36Sopenharmony_ci * Account the unused space of all the readonly block group in the space_info.
182362306a36Sopenharmony_ci * takes mirrors into account.
182462306a36Sopenharmony_ci */
182562306a36Sopenharmony_ciu64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
182662306a36Sopenharmony_ci{
182762306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
182862306a36Sopenharmony_ci	u64 free_bytes = 0;
182962306a36Sopenharmony_ci	int factor;
183062306a36Sopenharmony_ci
183162306a36Sopenharmony_ci	/* It's df, we don't care if it's racy */
183262306a36Sopenharmony_ci	if (list_empty(&sinfo->ro_bgs))
183362306a36Sopenharmony_ci		return 0;
183462306a36Sopenharmony_ci
183562306a36Sopenharmony_ci	spin_lock(&sinfo->lock);
183662306a36Sopenharmony_ci	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
183762306a36Sopenharmony_ci		spin_lock(&block_group->lock);
183862306a36Sopenharmony_ci
183962306a36Sopenharmony_ci		if (!block_group->ro) {
184062306a36Sopenharmony_ci			spin_unlock(&block_group->lock);
184162306a36Sopenharmony_ci			continue;
184262306a36Sopenharmony_ci		}
184362306a36Sopenharmony_ci
184462306a36Sopenharmony_ci		factor = btrfs_bg_type_to_factor(block_group->flags);
184562306a36Sopenharmony_ci		free_bytes += (block_group->length -
184662306a36Sopenharmony_ci			       block_group->used) * factor;
184762306a36Sopenharmony_ci
184862306a36Sopenharmony_ci		spin_unlock(&block_group->lock);
184962306a36Sopenharmony_ci	}
185062306a36Sopenharmony_ci	spin_unlock(&sinfo->lock);
185162306a36Sopenharmony_ci
185262306a36Sopenharmony_ci	return free_bytes;
185362306a36Sopenharmony_ci}
1854