18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci
38c2ecf20Sopenharmony_ci#include "misc.h"
48c2ecf20Sopenharmony_ci#include "ctree.h"
58c2ecf20Sopenharmony_ci#include "space-info.h"
68c2ecf20Sopenharmony_ci#include "sysfs.h"
78c2ecf20Sopenharmony_ci#include "volumes.h"
88c2ecf20Sopenharmony_ci#include "free-space-cache.h"
98c2ecf20Sopenharmony_ci#include "ordered-data.h"
108c2ecf20Sopenharmony_ci#include "transaction.h"
118c2ecf20Sopenharmony_ci#include "block-group.h"
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_ci/*
148c2ecf20Sopenharmony_ci * HOW DOES SPACE RESERVATION WORK
158c2ecf20Sopenharmony_ci *
168c2ecf20Sopenharmony_ci * If you want to know about delalloc specifically, there is a separate comment
178c2ecf20Sopenharmony_ci * for that with the delalloc code.  This comment is about how the whole system
188c2ecf20Sopenharmony_ci * works generally.
198c2ecf20Sopenharmony_ci *
208c2ecf20Sopenharmony_ci * BASIC CONCEPTS
218c2ecf20Sopenharmony_ci *
228c2ecf20Sopenharmony_ci *   1) space_info.  This is the ultimate arbiter of how much space we can use.
238c2ecf20Sopenharmony_ci *   There's a description of the bytes_ fields with the struct declaration,
248c2ecf20Sopenharmony_ci *   refer to that for specifics on each field.  Suffice it to say that for
258c2ecf20Sopenharmony_ci *   reservations we care about total_bytes - SUM(space_info->bytes_) when
268c2ecf20Sopenharmony_ci *   determining if there is space to make an allocation.  There is a space_info
278c2ecf20Sopenharmony_ci *   for METADATA, SYSTEM, and DATA areas.
288c2ecf20Sopenharmony_ci *
298c2ecf20Sopenharmony_ci *   2) block_rsv's.  These are basically buckets for every different type of
308c2ecf20Sopenharmony_ci *   metadata reservation we have.  You can see the comment in the block_rsv
318c2ecf20Sopenharmony_ci *   code on the rules for each type, but generally block_rsv->reserved is how
328c2ecf20Sopenharmony_ci *   much space is accounted for in space_info->bytes_may_use.
338c2ecf20Sopenharmony_ci *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use based
358c2ecf20Sopenharmony_ci *   on the number of items we will want to modify.  We have one for changing
368c2ecf20Sopenharmony_ci *   items, and one for inserting new items.  Generally we use these helpers to
378c2ecf20Sopenharmony_ci *   determine the size of the block reserves, and then use the actual bytes
388c2ecf20Sopenharmony_ci *   values to adjust the space_info counters.
398c2ecf20Sopenharmony_ci *
408c2ecf20Sopenharmony_ci * MAKING RESERVATIONS, THE NORMAL CASE
418c2ecf20Sopenharmony_ci *
428c2ecf20Sopenharmony_ci *   We call into either btrfs_reserve_data_bytes() or
438c2ecf20Sopenharmony_ci *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
448c2ecf20Sopenharmony_ci *   num_bytes we want to reserve.
458c2ecf20Sopenharmony_ci *
 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
538c2ecf20Sopenharmony_ci *
548c2ecf20Sopenharmony_ci *   ->insert reference
558c2ecf20Sopenharmony_ci *     Call btrfs_update_block_group() which does
568c2ecf20Sopenharmony_ci *     space_info->bytes_reserved -= extent_bytes
578c2ecf20Sopenharmony_ci *     space_info->bytes_used += extent_bytes
588c2ecf20Sopenharmony_ci *
598c2ecf20Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
608c2ecf20Sopenharmony_ci *
618c2ecf20Sopenharmony_ci *   Assume we are unable to simply make the reservation because we do not have
628c2ecf20Sopenharmony_ci *   enough space
638c2ecf20Sopenharmony_ci *
648c2ecf20Sopenharmony_ci *   -> __reserve_bytes
658c2ecf20Sopenharmony_ci *     create a reserve_ticket with ->bytes set to our reservation, add it to
668c2ecf20Sopenharmony_ci *     the tail of space_info->tickets, kick async flush thread
678c2ecf20Sopenharmony_ci *
688c2ecf20Sopenharmony_ci *   ->handle_reserve_ticket
698c2ecf20Sopenharmony_ci *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
708c2ecf20Sopenharmony_ci *     on the ticket.
718c2ecf20Sopenharmony_ci *
728c2ecf20Sopenharmony_ci *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
738c2ecf20Sopenharmony_ci *     Flushes various things attempting to free up space.
748c2ecf20Sopenharmony_ci *
758c2ecf20Sopenharmony_ci *   -> btrfs_try_granting_tickets()
768c2ecf20Sopenharmony_ci *     This is called by anything that either subtracts space from
778c2ecf20Sopenharmony_ci *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
788c2ecf20Sopenharmony_ci *     space_info->total_bytes.  This loops through the ->priority_tickets and
798c2ecf20Sopenharmony_ci *     then the ->tickets list checking to see if the reservation can be
808c2ecf20Sopenharmony_ci *     completed.  If it can the space is added to space_info->bytes_may_use and
818c2ecf20Sopenharmony_ci *     the ticket is woken up.
828c2ecf20Sopenharmony_ci *
838c2ecf20Sopenharmony_ci *   -> ticket wakeup
848c2ecf20Sopenharmony_ci *     Check if ->bytes == 0, if it does we got our reservation and we can carry
858c2ecf20Sopenharmony_ci *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
868c2ecf20Sopenharmony_ci *     were interrupted.)
878c2ecf20Sopenharmony_ci *
888c2ecf20Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
898c2ecf20Sopenharmony_ci *
908c2ecf20Sopenharmony_ci *   Same as the above, except we add ourselves to the
918c2ecf20Sopenharmony_ci *   space_info->priority_tickets, and we do not use ticket->wait, we simply
928c2ecf20Sopenharmony_ci *   call flush_space() ourselves for the states that are safe for us to call
938c2ecf20Sopenharmony_ci *   without deadlocking and hope for the best.
948c2ecf20Sopenharmony_ci *
958c2ecf20Sopenharmony_ci * THE FLUSHING STATES
968c2ecf20Sopenharmony_ci *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things, however, holds reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
1038c2ecf20Sopenharmony_ci *
1048c2ecf20Sopenharmony_ci *   FLUSH_DELAYED_ITEMS
1058c2ecf20Sopenharmony_ci *     Every inode has a delayed item to update the inode.  Take a simple write
1068c2ecf20Sopenharmony_ci *     for example, we would update the inode item at write time to update the
1078c2ecf20Sopenharmony_ci *     mtime, and then again at finish_ordered_io() time in order to update the
1088c2ecf20Sopenharmony_ci *     isize or bytes.  We keep these delayed items to coalesce these operations
1098c2ecf20Sopenharmony_ci *     into a single operation done on demand.  These are an easy way to reclaim
1108c2ecf20Sopenharmony_ci *     metadata space.
1118c2ecf20Sopenharmony_ci *
1128c2ecf20Sopenharmony_ci *   FLUSH_DELALLOC
1138c2ecf20Sopenharmony_ci *     Look at the delalloc comment to get an idea of how much space is reserved
1148c2ecf20Sopenharmony_ci *     for delayed allocation.  We can reclaim some of this space simply by
1158c2ecf20Sopenharmony_ci *     running delalloc, but usually we need to wait for ordered extents to
1168c2ecf20Sopenharmony_ci *     reclaim the bulk of this space.
1178c2ecf20Sopenharmony_ci *
1188c2ecf20Sopenharmony_ci *   FLUSH_DELAYED_REFS
1198c2ecf20Sopenharmony_ci *     We have a block reserve for the outstanding delayed refs space, and every
1208c2ecf20Sopenharmony_ci *     delayed ref operation holds a reservation.  Running these is a quick way
1218c2ecf20Sopenharmony_ci *     to reclaim space, but we want to hold this until the end because COW can
1228c2ecf20Sopenharmony_ci *     churn a lot and we can avoid making some extent tree modifications if we
1238c2ecf20Sopenharmony_ci *     are able to delay for as long as possible.
1248c2ecf20Sopenharmony_ci *
1258c2ecf20Sopenharmony_ci *   ALLOC_CHUNK
1268c2ecf20Sopenharmony_ci *     We will skip this the first time through space reservation, because of
1278c2ecf20Sopenharmony_ci *     overcommit and we don't want to have a lot of useless metadata space when
1288c2ecf20Sopenharmony_ci *     our worst case reservations will likely never come true.
1298c2ecf20Sopenharmony_ci *
1308c2ecf20Sopenharmony_ci *   RUN_DELAYED_IPUTS
1318c2ecf20Sopenharmony_ci *     If we're freeing inodes we're likely freeing checksums, file extent
1328c2ecf20Sopenharmony_ci *     items, and extent tree items.  Loads of space could be freed up by these
1338c2ecf20Sopenharmony_ci *     operations, however they won't be usable until the transaction commits.
1348c2ecf20Sopenharmony_ci *
1358c2ecf20Sopenharmony_ci *   COMMIT_TRANS
1368c2ecf20Sopenharmony_ci *     may_commit_transaction() is the ultimate arbiter on whether we commit the
1378c2ecf20Sopenharmony_ci *     transaction or not.  In order to avoid constantly churning we do all the
1388c2ecf20Sopenharmony_ci *     above flushing first and then commit the transaction as the last resort.
1398c2ecf20Sopenharmony_ci *     However we need to take into account things like pinned space that would
1408c2ecf20Sopenharmony_ci *     be freed, plus any delayed work we may not have gotten rid of in the case
1418c2ecf20Sopenharmony_ci *     of metadata.
1428c2ecf20Sopenharmony_ci *
1438c2ecf20Sopenharmony_ci * OVERCOMMIT
1448c2ecf20Sopenharmony_ci *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata, data does not allow
 *   overcommitting.
1498c2ecf20Sopenharmony_ci *
1508c2ecf20Sopenharmony_ci *   You can see the current logic for when we allow overcommit in
1518c2ecf20Sopenharmony_ci *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
1528c2ecf20Sopenharmony_ci *   is no unallocated space to be had, all reservations are kept within the
1538c2ecf20Sopenharmony_ci *   free space in the allocated metadata chunks.
1548c2ecf20Sopenharmony_ci *
1558c2ecf20Sopenharmony_ci *   Because of overcommitting, you generally want to use the
1568c2ecf20Sopenharmony_ci *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
1578c2ecf20Sopenharmony_ci *   thing with or without extra unallocated space.
1588c2ecf20Sopenharmony_ci */
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ciu64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
1618c2ecf20Sopenharmony_ci			  bool may_use_included)
1628c2ecf20Sopenharmony_ci{
1638c2ecf20Sopenharmony_ci	ASSERT(s_info);
1648c2ecf20Sopenharmony_ci	return s_info->bytes_used + s_info->bytes_reserved +
1658c2ecf20Sopenharmony_ci		s_info->bytes_pinned + s_info->bytes_readonly +
1668c2ecf20Sopenharmony_ci		(may_use_included ? s_info->bytes_may_use : 0);
1678c2ecf20Sopenharmony_ci}
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_ci/*
1708c2ecf20Sopenharmony_ci * after adding space to the filesystem, we need to clear the full flags
1718c2ecf20Sopenharmony_ci * on all the space infos.
1728c2ecf20Sopenharmony_ci */
1738c2ecf20Sopenharmony_civoid btrfs_clear_space_info_full(struct btrfs_fs_info *info)
1748c2ecf20Sopenharmony_ci{
1758c2ecf20Sopenharmony_ci	struct list_head *head = &info->space_info;
1768c2ecf20Sopenharmony_ci	struct btrfs_space_info *found;
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	list_for_each_entry(found, head, list)
1798c2ecf20Sopenharmony_ci		found->full = 0;
1808c2ecf20Sopenharmony_ci}
1818c2ecf20Sopenharmony_ci
1828c2ecf20Sopenharmony_cistatic int create_space_info(struct btrfs_fs_info *info, u64 flags)
1838c2ecf20Sopenharmony_ci{
1848c2ecf20Sopenharmony_ci
1858c2ecf20Sopenharmony_ci	struct btrfs_space_info *space_info;
1868c2ecf20Sopenharmony_ci	int i;
1878c2ecf20Sopenharmony_ci	int ret;
1888c2ecf20Sopenharmony_ci
1898c2ecf20Sopenharmony_ci	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
1908c2ecf20Sopenharmony_ci	if (!space_info)
1918c2ecf20Sopenharmony_ci		return -ENOMEM;
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
1948c2ecf20Sopenharmony_ci				 GFP_KERNEL);
1958c2ecf20Sopenharmony_ci	if (ret) {
1968c2ecf20Sopenharmony_ci		kfree(space_info);
1978c2ecf20Sopenharmony_ci		return ret;
1988c2ecf20Sopenharmony_ci	}
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2018c2ecf20Sopenharmony_ci		INIT_LIST_HEAD(&space_info->block_groups[i]);
2028c2ecf20Sopenharmony_ci	init_rwsem(&space_info->groups_sem);
2038c2ecf20Sopenharmony_ci	spin_lock_init(&space_info->lock);
2048c2ecf20Sopenharmony_ci	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2058c2ecf20Sopenharmony_ci	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
2068c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&space_info->ro_bgs);
2078c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&space_info->tickets);
2088c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&space_info->priority_tickets);
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci	ret = btrfs_sysfs_add_space_info_type(info, space_info);
2118c2ecf20Sopenharmony_ci	if (ret)
2128c2ecf20Sopenharmony_ci		return ret;
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	list_add(&space_info->list, &info->space_info);
2158c2ecf20Sopenharmony_ci	if (flags & BTRFS_BLOCK_GROUP_DATA)
2168c2ecf20Sopenharmony_ci		info->data_sinfo = space_info;
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	return ret;
2198c2ecf20Sopenharmony_ci}
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_ciint btrfs_init_space_info(struct btrfs_fs_info *fs_info)
2228c2ecf20Sopenharmony_ci{
2238c2ecf20Sopenharmony_ci	struct btrfs_super_block *disk_super;
2248c2ecf20Sopenharmony_ci	u64 features;
2258c2ecf20Sopenharmony_ci	u64 flags;
2268c2ecf20Sopenharmony_ci	int mixed = 0;
2278c2ecf20Sopenharmony_ci	int ret;
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ci	disk_super = fs_info->super_copy;
2308c2ecf20Sopenharmony_ci	if (!btrfs_super_root(disk_super))
2318c2ecf20Sopenharmony_ci		return -EINVAL;
2328c2ecf20Sopenharmony_ci
2338c2ecf20Sopenharmony_ci	features = btrfs_super_incompat_flags(disk_super);
2348c2ecf20Sopenharmony_ci	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2358c2ecf20Sopenharmony_ci		mixed = 1;
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci	flags = BTRFS_BLOCK_GROUP_SYSTEM;
2388c2ecf20Sopenharmony_ci	ret = create_space_info(fs_info, flags);
2398c2ecf20Sopenharmony_ci	if (ret)
2408c2ecf20Sopenharmony_ci		goto out;
2418c2ecf20Sopenharmony_ci
2428c2ecf20Sopenharmony_ci	if (mixed) {
2438c2ecf20Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
2448c2ecf20Sopenharmony_ci		ret = create_space_info(fs_info, flags);
2458c2ecf20Sopenharmony_ci	} else {
2468c2ecf20Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_METADATA;
2478c2ecf20Sopenharmony_ci		ret = create_space_info(fs_info, flags);
2488c2ecf20Sopenharmony_ci		if (ret)
2498c2ecf20Sopenharmony_ci			goto out;
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci		flags = BTRFS_BLOCK_GROUP_DATA;
2528c2ecf20Sopenharmony_ci		ret = create_space_info(fs_info, flags);
2538c2ecf20Sopenharmony_ci	}
2548c2ecf20Sopenharmony_ciout:
2558c2ecf20Sopenharmony_ci	return ret;
2568c2ecf20Sopenharmony_ci}
2578c2ecf20Sopenharmony_ci
2588c2ecf20Sopenharmony_civoid btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
2598c2ecf20Sopenharmony_ci			     u64 total_bytes, u64 bytes_used,
2608c2ecf20Sopenharmony_ci			     u64 bytes_readonly,
2618c2ecf20Sopenharmony_ci			     struct btrfs_space_info **space_info)
2628c2ecf20Sopenharmony_ci{
2638c2ecf20Sopenharmony_ci	struct btrfs_space_info *found;
2648c2ecf20Sopenharmony_ci	int factor;
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci	factor = btrfs_bg_type_to_factor(flags);
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci	found = btrfs_find_space_info(info, flags);
2698c2ecf20Sopenharmony_ci	ASSERT(found);
2708c2ecf20Sopenharmony_ci	spin_lock(&found->lock);
2718c2ecf20Sopenharmony_ci	found->total_bytes += total_bytes;
2728c2ecf20Sopenharmony_ci	found->disk_total += total_bytes * factor;
2738c2ecf20Sopenharmony_ci	found->bytes_used += bytes_used;
2748c2ecf20Sopenharmony_ci	found->disk_used += bytes_used * factor;
2758c2ecf20Sopenharmony_ci	found->bytes_readonly += bytes_readonly;
2768c2ecf20Sopenharmony_ci	if (total_bytes > 0)
2778c2ecf20Sopenharmony_ci		found->full = 0;
2788c2ecf20Sopenharmony_ci	btrfs_try_granting_tickets(info, found);
2798c2ecf20Sopenharmony_ci	spin_unlock(&found->lock);
2808c2ecf20Sopenharmony_ci	*space_info = found;
2818c2ecf20Sopenharmony_ci}
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_cistruct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
2848c2ecf20Sopenharmony_ci					       u64 flags)
2858c2ecf20Sopenharmony_ci{
2868c2ecf20Sopenharmony_ci	struct list_head *head = &info->space_info;
2878c2ecf20Sopenharmony_ci	struct btrfs_space_info *found;
2888c2ecf20Sopenharmony_ci
2898c2ecf20Sopenharmony_ci	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
2908c2ecf20Sopenharmony_ci
2918c2ecf20Sopenharmony_ci	list_for_each_entry(found, head, list) {
2928c2ecf20Sopenharmony_ci		if (found->flags & flags)
2938c2ecf20Sopenharmony_ci			return found;
2948c2ecf20Sopenharmony_ci	}
2958c2ecf20Sopenharmony_ci	return NULL;
2968c2ecf20Sopenharmony_ci}
2978c2ecf20Sopenharmony_ci
2988c2ecf20Sopenharmony_cistatic u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
2998c2ecf20Sopenharmony_ci			  struct btrfs_space_info *space_info,
3008c2ecf20Sopenharmony_ci			  enum btrfs_reserve_flush_enum flush)
3018c2ecf20Sopenharmony_ci{
3028c2ecf20Sopenharmony_ci	u64 profile;
3038c2ecf20Sopenharmony_ci	u64 avail;
3048c2ecf20Sopenharmony_ci	int factor;
3058c2ecf20Sopenharmony_ci
3068c2ecf20Sopenharmony_ci	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3078c2ecf20Sopenharmony_ci		profile = btrfs_system_alloc_profile(fs_info);
3088c2ecf20Sopenharmony_ci	else
3098c2ecf20Sopenharmony_ci		profile = btrfs_metadata_alloc_profile(fs_info);
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	avail = atomic64_read(&fs_info->free_chunk_space);
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	/*
3148c2ecf20Sopenharmony_ci	 * If we have dup, raid1 or raid10 then only half of the free
3158c2ecf20Sopenharmony_ci	 * space is actually usable.  For raid56, the space info used
3168c2ecf20Sopenharmony_ci	 * doesn't include the parity drive, so we don't have to
3178c2ecf20Sopenharmony_ci	 * change the math
3188c2ecf20Sopenharmony_ci	 */
3198c2ecf20Sopenharmony_ci	factor = btrfs_bg_type_to_factor(profile);
3208c2ecf20Sopenharmony_ci	avail = div_u64(avail, factor);
3218c2ecf20Sopenharmony_ci
3228c2ecf20Sopenharmony_ci	/*
3238c2ecf20Sopenharmony_ci	 * If we aren't flushing all things, let us overcommit up to
3248c2ecf20Sopenharmony_ci	 * 1/2th of the space. If we can flush, don't let us overcommit
3258c2ecf20Sopenharmony_ci	 * too much, let it overcommit up to 1/8 of the space.
3268c2ecf20Sopenharmony_ci	 */
3278c2ecf20Sopenharmony_ci	if (flush == BTRFS_RESERVE_FLUSH_ALL)
3288c2ecf20Sopenharmony_ci		avail >>= 3;
3298c2ecf20Sopenharmony_ci	else
3308c2ecf20Sopenharmony_ci		avail >>= 1;
3318c2ecf20Sopenharmony_ci	return avail;
3328c2ecf20Sopenharmony_ci}
3338c2ecf20Sopenharmony_ci
3348c2ecf20Sopenharmony_ciint btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
3358c2ecf20Sopenharmony_ci			 struct btrfs_space_info *space_info, u64 bytes,
3368c2ecf20Sopenharmony_ci			 enum btrfs_reserve_flush_enum flush)
3378c2ecf20Sopenharmony_ci{
3388c2ecf20Sopenharmony_ci	u64 avail;
3398c2ecf20Sopenharmony_ci	u64 used;
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	/* Don't overcommit when in mixed mode */
3428c2ecf20Sopenharmony_ci	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
3438c2ecf20Sopenharmony_ci		return 0;
3448c2ecf20Sopenharmony_ci
3458c2ecf20Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
3468c2ecf20Sopenharmony_ci	avail = calc_available_free_space(fs_info, space_info, flush);
3478c2ecf20Sopenharmony_ci
3488c2ecf20Sopenharmony_ci	if (used + bytes < space_info->total_bytes + avail)
3498c2ecf20Sopenharmony_ci		return 1;
3508c2ecf20Sopenharmony_ci	return 0;
3518c2ecf20Sopenharmony_ci}
3528c2ecf20Sopenharmony_ci
3538c2ecf20Sopenharmony_cistatic void remove_ticket(struct btrfs_space_info *space_info,
3548c2ecf20Sopenharmony_ci			  struct reserve_ticket *ticket)
3558c2ecf20Sopenharmony_ci{
3568c2ecf20Sopenharmony_ci	if (!list_empty(&ticket->list)) {
3578c2ecf20Sopenharmony_ci		list_del_init(&ticket->list);
3588c2ecf20Sopenharmony_ci		ASSERT(space_info->reclaim_size >= ticket->bytes);
3598c2ecf20Sopenharmony_ci		space_info->reclaim_size -= ticket->bytes;
3608c2ecf20Sopenharmony_ci	}
3618c2ecf20Sopenharmony_ci}
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_ci/*
3648c2ecf20Sopenharmony_ci * This is for space we already have accounted in space_info->bytes_may_use, so
3658c2ecf20Sopenharmony_ci * basically when we're returning space from block_rsv's.
3668c2ecf20Sopenharmony_ci */
3678c2ecf20Sopenharmony_civoid btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
3688c2ecf20Sopenharmony_ci				struct btrfs_space_info *space_info)
3698c2ecf20Sopenharmony_ci{
3708c2ecf20Sopenharmony_ci	struct list_head *head;
3718c2ecf20Sopenharmony_ci	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
3728c2ecf20Sopenharmony_ci
3738c2ecf20Sopenharmony_ci	lockdep_assert_held(&space_info->lock);
3748c2ecf20Sopenharmony_ci
3758c2ecf20Sopenharmony_ci	head = &space_info->priority_tickets;
3768c2ecf20Sopenharmony_ciagain:
3778c2ecf20Sopenharmony_ci	while (!list_empty(head)) {
3788c2ecf20Sopenharmony_ci		struct reserve_ticket *ticket;
3798c2ecf20Sopenharmony_ci		u64 used = btrfs_space_info_used(space_info, true);
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci		ticket = list_first_entry(head, struct reserve_ticket, list);
3828c2ecf20Sopenharmony_ci
3838c2ecf20Sopenharmony_ci		/* Check and see if our ticket can be satisified now. */
3848c2ecf20Sopenharmony_ci		if ((used + ticket->bytes <= space_info->total_bytes) ||
3858c2ecf20Sopenharmony_ci		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
3868c2ecf20Sopenharmony_ci					 flush)) {
3878c2ecf20Sopenharmony_ci			btrfs_space_info_update_bytes_may_use(fs_info,
3888c2ecf20Sopenharmony_ci							      space_info,
3898c2ecf20Sopenharmony_ci							      ticket->bytes);
3908c2ecf20Sopenharmony_ci			remove_ticket(space_info, ticket);
3918c2ecf20Sopenharmony_ci			ticket->bytes = 0;
3928c2ecf20Sopenharmony_ci			space_info->tickets_id++;
3938c2ecf20Sopenharmony_ci			wake_up(&ticket->wait);
3948c2ecf20Sopenharmony_ci		} else {
3958c2ecf20Sopenharmony_ci			break;
3968c2ecf20Sopenharmony_ci		}
3978c2ecf20Sopenharmony_ci	}
3988c2ecf20Sopenharmony_ci
3998c2ecf20Sopenharmony_ci	if (head == &space_info->priority_tickets) {
4008c2ecf20Sopenharmony_ci		head = &space_info->tickets;
4018c2ecf20Sopenharmony_ci		flush = BTRFS_RESERVE_FLUSH_ALL;
4028c2ecf20Sopenharmony_ci		goto again;
4038c2ecf20Sopenharmony_ci	}
4048c2ecf20Sopenharmony_ci}
4058c2ecf20Sopenharmony_ci
/*
 * Log the ->size and ->reserved counters of the block reserve member
 * @rsv_name of @fs_info.  Both values are read and printed while that block
 * reserve's spinlock is held; the member name itself is the message prefix.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__dumped = &(fs_info)->rsv_name;	\
									\
	spin_lock(&__dumped->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __dumped->size, __dumped->reserved);			\
	spin_unlock(&__dumped->lock);					\
} while (0)
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_cistatic void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
4168c2ecf20Sopenharmony_ci				    struct btrfs_space_info *info)
4178c2ecf20Sopenharmony_ci{
4188c2ecf20Sopenharmony_ci	lockdep_assert_held(&info->lock);
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_ci	/* The free space could be negative in case of overcommit */
4218c2ecf20Sopenharmony_ci	btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
4228c2ecf20Sopenharmony_ci		   info->flags,
4238c2ecf20Sopenharmony_ci		   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
4248c2ecf20Sopenharmony_ci		   info->full ? "" : "not ");
4258c2ecf20Sopenharmony_ci	btrfs_info(fs_info,
4268c2ecf20Sopenharmony_ci		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
4278c2ecf20Sopenharmony_ci		info->total_bytes, info->bytes_used, info->bytes_pinned,
4288c2ecf20Sopenharmony_ci		info->bytes_reserved, info->bytes_may_use,
4298c2ecf20Sopenharmony_ci		info->bytes_readonly);
4308c2ecf20Sopenharmony_ci
4318c2ecf20Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
4328c2ecf20Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
4338c2ecf20Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
4348c2ecf20Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
4358c2ecf20Sopenharmony_ci	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci}
4388c2ecf20Sopenharmony_ci
4398c2ecf20Sopenharmony_civoid btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
4408c2ecf20Sopenharmony_ci			   struct btrfs_space_info *info, u64 bytes,
4418c2ecf20Sopenharmony_ci			   int dump_block_groups)
4428c2ecf20Sopenharmony_ci{
4438c2ecf20Sopenharmony_ci	struct btrfs_block_group *cache;
4448c2ecf20Sopenharmony_ci	int index = 0;
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci	spin_lock(&info->lock);
4478c2ecf20Sopenharmony_ci	__btrfs_dump_space_info(fs_info, info);
4488c2ecf20Sopenharmony_ci	spin_unlock(&info->lock);
4498c2ecf20Sopenharmony_ci
4508c2ecf20Sopenharmony_ci	if (!dump_block_groups)
4518c2ecf20Sopenharmony_ci		return;
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci	down_read(&info->groups_sem);
4548c2ecf20Sopenharmony_ciagain:
4558c2ecf20Sopenharmony_ci	list_for_each_entry(cache, &info->block_groups[index], list) {
4568c2ecf20Sopenharmony_ci		spin_lock(&cache->lock);
4578c2ecf20Sopenharmony_ci		btrfs_info(fs_info,
4588c2ecf20Sopenharmony_ci			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
4598c2ecf20Sopenharmony_ci			cache->start, cache->length, cache->used, cache->pinned,
4608c2ecf20Sopenharmony_ci			cache->reserved, cache->ro ? "[readonly]" : "");
4618c2ecf20Sopenharmony_ci		spin_unlock(&cache->lock);
4628c2ecf20Sopenharmony_ci		btrfs_dump_free_space(cache, bytes);
4638c2ecf20Sopenharmony_ci	}
4648c2ecf20Sopenharmony_ci	if (++index < BTRFS_NR_RAID_TYPES)
4658c2ecf20Sopenharmony_ci		goto again;
4668c2ecf20Sopenharmony_ci	up_read(&info->groups_sem);
4678c2ecf20Sopenharmony_ci}
4688c2ecf20Sopenharmony_ci
4698c2ecf20Sopenharmony_cistatic inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4708c2ecf20Sopenharmony_ci					u64 to_reclaim)
4718c2ecf20Sopenharmony_ci{
4728c2ecf20Sopenharmony_ci	u64 bytes;
4738c2ecf20Sopenharmony_ci	u64 nr;
4748c2ecf20Sopenharmony_ci
4758c2ecf20Sopenharmony_ci	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4768c2ecf20Sopenharmony_ci	nr = div64_u64(to_reclaim, bytes);
4778c2ecf20Sopenharmony_ci	if (!nr)
4788c2ecf20Sopenharmony_ci		nr = 1;
4798c2ecf20Sopenharmony_ci	return nr;
4808c2ecf20Sopenharmony_ci}
4818c2ecf20Sopenharmony_ci
4828c2ecf20Sopenharmony_ci#define EXTENT_SIZE_PER_ITEM	SZ_256K
4838c2ecf20Sopenharmony_ci
4848c2ecf20Sopenharmony_ci/*
4858c2ecf20Sopenharmony_ci * shrink metadata reservation for delalloc
4868c2ecf20Sopenharmony_ci */
4878c2ecf20Sopenharmony_cistatic void shrink_delalloc(struct btrfs_fs_info *fs_info,
4888c2ecf20Sopenharmony_ci			    struct btrfs_space_info *space_info,
4898c2ecf20Sopenharmony_ci			    u64 to_reclaim, bool wait_ordered)
4908c2ecf20Sopenharmony_ci{
4918c2ecf20Sopenharmony_ci	struct btrfs_trans_handle *trans;
4928c2ecf20Sopenharmony_ci	u64 delalloc_bytes;
4938c2ecf20Sopenharmony_ci	u64 dio_bytes;
4948c2ecf20Sopenharmony_ci	u64 items;
4958c2ecf20Sopenharmony_ci	long time_left;
4968c2ecf20Sopenharmony_ci	int loops;
4978c2ecf20Sopenharmony_ci
4988c2ecf20Sopenharmony_ci	/* Calc the number of the pages we need flush for space reservation */
4998c2ecf20Sopenharmony_ci	if (to_reclaim == U64_MAX) {
5008c2ecf20Sopenharmony_ci		items = U64_MAX;
5018c2ecf20Sopenharmony_ci	} else {
5028c2ecf20Sopenharmony_ci		/*
5038c2ecf20Sopenharmony_ci		 * to_reclaim is set to however much metadata we need to
5048c2ecf20Sopenharmony_ci		 * reclaim, but reclaiming that much data doesn't really track
5058c2ecf20Sopenharmony_ci		 * exactly, so increase the amount to reclaim by 2x in order to
5068c2ecf20Sopenharmony_ci		 * make sure we're flushing enough delalloc to hopefully reclaim
5078c2ecf20Sopenharmony_ci		 * some metadata reservations.
5088c2ecf20Sopenharmony_ci		 */
5098c2ecf20Sopenharmony_ci		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
5108c2ecf20Sopenharmony_ci		to_reclaim = items * EXTENT_SIZE_PER_ITEM;
5118c2ecf20Sopenharmony_ci	}
5128c2ecf20Sopenharmony_ci
5138c2ecf20Sopenharmony_ci	trans = (struct btrfs_trans_handle *)current->journal_info;
5148c2ecf20Sopenharmony_ci
5158c2ecf20Sopenharmony_ci	delalloc_bytes = percpu_counter_sum_positive(
5168c2ecf20Sopenharmony_ci						&fs_info->delalloc_bytes);
5178c2ecf20Sopenharmony_ci	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
5188c2ecf20Sopenharmony_ci	if (delalloc_bytes == 0 && dio_bytes == 0) {
5198c2ecf20Sopenharmony_ci		if (trans)
5208c2ecf20Sopenharmony_ci			return;
5218c2ecf20Sopenharmony_ci		if (wait_ordered)
5228c2ecf20Sopenharmony_ci			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
5238c2ecf20Sopenharmony_ci		return;
5248c2ecf20Sopenharmony_ci	}
5258c2ecf20Sopenharmony_ci
5268c2ecf20Sopenharmony_ci	/*
5278c2ecf20Sopenharmony_ci	 * If we are doing more ordered than delalloc we need to just wait on
5288c2ecf20Sopenharmony_ci	 * ordered extents, otherwise we'll waste time trying to flush delalloc
5298c2ecf20Sopenharmony_ci	 * that likely won't give us the space back we need.
5308c2ecf20Sopenharmony_ci	 */
5318c2ecf20Sopenharmony_ci	if (dio_bytes > delalloc_bytes)
5328c2ecf20Sopenharmony_ci		wait_ordered = true;
5338c2ecf20Sopenharmony_ci
5348c2ecf20Sopenharmony_ci	loops = 0;
5358c2ecf20Sopenharmony_ci	while ((delalloc_bytes || dio_bytes) && loops < 3) {
5368c2ecf20Sopenharmony_ci		u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
5378c2ecf20Sopenharmony_ci
5388c2ecf20Sopenharmony_ci		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci		loops++;
5418c2ecf20Sopenharmony_ci		if (wait_ordered && !trans) {
5428c2ecf20Sopenharmony_ci			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
5438c2ecf20Sopenharmony_ci		} else {
5448c2ecf20Sopenharmony_ci			time_left = schedule_timeout_killable(1);
5458c2ecf20Sopenharmony_ci			if (time_left)
5468c2ecf20Sopenharmony_ci				break;
5478c2ecf20Sopenharmony_ci		}
5488c2ecf20Sopenharmony_ci
5498c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
5508c2ecf20Sopenharmony_ci		if (list_empty(&space_info->tickets) &&
5518c2ecf20Sopenharmony_ci		    list_empty(&space_info->priority_tickets)) {
5528c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
5538c2ecf20Sopenharmony_ci			break;
5548c2ecf20Sopenharmony_ci		}
5558c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_ci		delalloc_bytes = percpu_counter_sum_positive(
5588c2ecf20Sopenharmony_ci						&fs_info->delalloc_bytes);
5598c2ecf20Sopenharmony_ci		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
5608c2ecf20Sopenharmony_ci	}
5618c2ecf20Sopenharmony_ci}
5628c2ecf20Sopenharmony_ci
/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info: the filesystem the reservation is being made on
 * @space_info: the space_info whose first pending ticket we want to satisfy
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.  It returns -EAGAIN without doing anything if the
 * calling task already has a transaction handle in current->journal_info, and
 * 0 without committing if the first ticket already fits in the free space.
 */
5738c2ecf20Sopenharmony_cistatic int may_commit_transaction(struct btrfs_fs_info *fs_info,
5748c2ecf20Sopenharmony_ci				  struct btrfs_space_info *space_info)
5758c2ecf20Sopenharmony_ci{
5768c2ecf20Sopenharmony_ci	struct reserve_ticket *ticket = NULL;
5778c2ecf20Sopenharmony_ci	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
5788c2ecf20Sopenharmony_ci	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5798c2ecf20Sopenharmony_ci	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
5808c2ecf20Sopenharmony_ci	struct btrfs_trans_handle *trans;
5818c2ecf20Sopenharmony_ci	u64 reclaim_bytes = 0;
5828c2ecf20Sopenharmony_ci	u64 bytes_needed = 0;
5838c2ecf20Sopenharmony_ci	u64 cur_free_bytes = 0;
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	trans = (struct btrfs_trans_handle *)current->journal_info;
5868c2ecf20Sopenharmony_ci	if (trans)
5878c2ecf20Sopenharmony_ci		return -EAGAIN;
5888c2ecf20Sopenharmony_ci
5898c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
5908c2ecf20Sopenharmony_ci	cur_free_bytes = btrfs_space_info_used(space_info, true);
5918c2ecf20Sopenharmony_ci	if (cur_free_bytes < space_info->total_bytes)
5928c2ecf20Sopenharmony_ci		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
5938c2ecf20Sopenharmony_ci	else
5948c2ecf20Sopenharmony_ci		cur_free_bytes = 0;
5958c2ecf20Sopenharmony_ci
5968c2ecf20Sopenharmony_ci	if (!list_empty(&space_info->priority_tickets))
5978c2ecf20Sopenharmony_ci		ticket = list_first_entry(&space_info->priority_tickets,
5988c2ecf20Sopenharmony_ci					  struct reserve_ticket, list);
5998c2ecf20Sopenharmony_ci	else if (!list_empty(&space_info->tickets))
6008c2ecf20Sopenharmony_ci		ticket = list_first_entry(&space_info->tickets,
6018c2ecf20Sopenharmony_ci					  struct reserve_ticket, list);
6028c2ecf20Sopenharmony_ci	if (ticket)
6038c2ecf20Sopenharmony_ci		bytes_needed = ticket->bytes;
6048c2ecf20Sopenharmony_ci
6058c2ecf20Sopenharmony_ci	if (bytes_needed > cur_free_bytes)
6068c2ecf20Sopenharmony_ci		bytes_needed -= cur_free_bytes;
6078c2ecf20Sopenharmony_ci	else
6088c2ecf20Sopenharmony_ci		bytes_needed = 0;
6098c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
6108c2ecf20Sopenharmony_ci
6118c2ecf20Sopenharmony_ci	if (!bytes_needed)
6128c2ecf20Sopenharmony_ci		return 0;
6138c2ecf20Sopenharmony_ci
6148c2ecf20Sopenharmony_ci	trans = btrfs_join_transaction(fs_info->extent_root);
6158c2ecf20Sopenharmony_ci	if (IS_ERR(trans))
6168c2ecf20Sopenharmony_ci		return PTR_ERR(trans);
6178c2ecf20Sopenharmony_ci
6188c2ecf20Sopenharmony_ci	/*
6198c2ecf20Sopenharmony_ci	 * See if there is enough pinned space to make this reservation, or if
6208c2ecf20Sopenharmony_ci	 * we have block groups that are going to be freed, allowing us to
6218c2ecf20Sopenharmony_ci	 * possibly do a chunk allocation the next loop through.
6228c2ecf20Sopenharmony_ci	 */
6238c2ecf20Sopenharmony_ci	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
6248c2ecf20Sopenharmony_ci	    __percpu_counter_compare(&space_info->total_bytes_pinned,
6258c2ecf20Sopenharmony_ci				     bytes_needed,
6268c2ecf20Sopenharmony_ci				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
6278c2ecf20Sopenharmony_ci		goto commit;
6288c2ecf20Sopenharmony_ci
6298c2ecf20Sopenharmony_ci	/*
6308c2ecf20Sopenharmony_ci	 * See if there is some space in the delayed insertion reserve for this
6318c2ecf20Sopenharmony_ci	 * reservation.  If the space_info's don't match (like for DATA or
6328c2ecf20Sopenharmony_ci	 * SYSTEM) then just go enospc, reclaiming this space won't recover any
6338c2ecf20Sopenharmony_ci	 * space to satisfy those reservations.
6348c2ecf20Sopenharmony_ci	 */
6358c2ecf20Sopenharmony_ci	if (space_info != delayed_rsv->space_info)
6368c2ecf20Sopenharmony_ci		goto enospc;
6378c2ecf20Sopenharmony_ci
6388c2ecf20Sopenharmony_ci	spin_lock(&delayed_rsv->lock);
6398c2ecf20Sopenharmony_ci	reclaim_bytes += delayed_rsv->reserved;
6408c2ecf20Sopenharmony_ci	spin_unlock(&delayed_rsv->lock);
6418c2ecf20Sopenharmony_ci
6428c2ecf20Sopenharmony_ci	spin_lock(&delayed_refs_rsv->lock);
6438c2ecf20Sopenharmony_ci	reclaim_bytes += delayed_refs_rsv->reserved;
6448c2ecf20Sopenharmony_ci	spin_unlock(&delayed_refs_rsv->lock);
6458c2ecf20Sopenharmony_ci
6468c2ecf20Sopenharmony_ci	spin_lock(&trans_rsv->lock);
6478c2ecf20Sopenharmony_ci	reclaim_bytes += trans_rsv->reserved;
6488c2ecf20Sopenharmony_ci	spin_unlock(&trans_rsv->lock);
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	if (reclaim_bytes >= bytes_needed)
6518c2ecf20Sopenharmony_ci		goto commit;
6528c2ecf20Sopenharmony_ci	bytes_needed -= reclaim_bytes;
6538c2ecf20Sopenharmony_ci
6548c2ecf20Sopenharmony_ci	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
6558c2ecf20Sopenharmony_ci				   bytes_needed,
6568c2ecf20Sopenharmony_ci				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
6578c2ecf20Sopenharmony_ci		goto enospc;
6588c2ecf20Sopenharmony_ci
6598c2ecf20Sopenharmony_cicommit:
6608c2ecf20Sopenharmony_ci	return btrfs_commit_transaction(trans);
6618c2ecf20Sopenharmony_cienospc:
6628c2ecf20Sopenharmony_ci	btrfs_end_transaction(trans);
6638c2ecf20Sopenharmony_ci	return -ENOSPC;
6648c2ecf20Sopenharmony_ci}
6658c2ecf20Sopenharmony_ci
6668c2ecf20Sopenharmony_ci/*
6678c2ecf20Sopenharmony_ci * Try to flush some data based on policy set by @state. This is only advisory
6688c2ecf20Sopenharmony_ci * and may fail for various reasons. The caller is supposed to examine the
6698c2ecf20Sopenharmony_ci * state of @space_info to detect the outcome.
6708c2ecf20Sopenharmony_ci */
6718c2ecf20Sopenharmony_cistatic void flush_space(struct btrfs_fs_info *fs_info,
6728c2ecf20Sopenharmony_ci		       struct btrfs_space_info *space_info, u64 num_bytes,
6738c2ecf20Sopenharmony_ci		       int state)
6748c2ecf20Sopenharmony_ci{
6758c2ecf20Sopenharmony_ci	struct btrfs_root *root = fs_info->extent_root;
6768c2ecf20Sopenharmony_ci	struct btrfs_trans_handle *trans;
6778c2ecf20Sopenharmony_ci	int nr;
6788c2ecf20Sopenharmony_ci	int ret = 0;
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_ci	switch (state) {
6818c2ecf20Sopenharmony_ci	case FLUSH_DELAYED_ITEMS_NR:
6828c2ecf20Sopenharmony_ci	case FLUSH_DELAYED_ITEMS:
6838c2ecf20Sopenharmony_ci		if (state == FLUSH_DELAYED_ITEMS_NR)
6848c2ecf20Sopenharmony_ci			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
6858c2ecf20Sopenharmony_ci		else
6868c2ecf20Sopenharmony_ci			nr = -1;
6878c2ecf20Sopenharmony_ci
6888c2ecf20Sopenharmony_ci		trans = btrfs_join_transaction(root);
6898c2ecf20Sopenharmony_ci		if (IS_ERR(trans)) {
6908c2ecf20Sopenharmony_ci			ret = PTR_ERR(trans);
6918c2ecf20Sopenharmony_ci			break;
6928c2ecf20Sopenharmony_ci		}
6938c2ecf20Sopenharmony_ci		ret = btrfs_run_delayed_items_nr(trans, nr);
6948c2ecf20Sopenharmony_ci		btrfs_end_transaction(trans);
6958c2ecf20Sopenharmony_ci		break;
6968c2ecf20Sopenharmony_ci	case FLUSH_DELALLOC:
6978c2ecf20Sopenharmony_ci	case FLUSH_DELALLOC_WAIT:
6988c2ecf20Sopenharmony_ci		shrink_delalloc(fs_info, space_info, num_bytes,
6998c2ecf20Sopenharmony_ci				state == FLUSH_DELALLOC_WAIT);
7008c2ecf20Sopenharmony_ci		break;
7018c2ecf20Sopenharmony_ci	case FLUSH_DELAYED_REFS_NR:
7028c2ecf20Sopenharmony_ci	case FLUSH_DELAYED_REFS:
7038c2ecf20Sopenharmony_ci		trans = btrfs_join_transaction(root);
7048c2ecf20Sopenharmony_ci		if (IS_ERR(trans)) {
7058c2ecf20Sopenharmony_ci			ret = PTR_ERR(trans);
7068c2ecf20Sopenharmony_ci			break;
7078c2ecf20Sopenharmony_ci		}
7088c2ecf20Sopenharmony_ci		if (state == FLUSH_DELAYED_REFS_NR)
7098c2ecf20Sopenharmony_ci			nr = calc_reclaim_items_nr(fs_info, num_bytes);
7108c2ecf20Sopenharmony_ci		else
7118c2ecf20Sopenharmony_ci			nr = 0;
7128c2ecf20Sopenharmony_ci		btrfs_run_delayed_refs(trans, nr);
7138c2ecf20Sopenharmony_ci		btrfs_end_transaction(trans);
7148c2ecf20Sopenharmony_ci		break;
7158c2ecf20Sopenharmony_ci	case ALLOC_CHUNK:
7168c2ecf20Sopenharmony_ci	case ALLOC_CHUNK_FORCE:
7178c2ecf20Sopenharmony_ci		trans = btrfs_join_transaction(root);
7188c2ecf20Sopenharmony_ci		if (IS_ERR(trans)) {
7198c2ecf20Sopenharmony_ci			ret = PTR_ERR(trans);
7208c2ecf20Sopenharmony_ci			break;
7218c2ecf20Sopenharmony_ci		}
7228c2ecf20Sopenharmony_ci		ret = btrfs_chunk_alloc(trans,
7238c2ecf20Sopenharmony_ci				btrfs_get_alloc_profile(fs_info, space_info->flags),
7248c2ecf20Sopenharmony_ci				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
7258c2ecf20Sopenharmony_ci					CHUNK_ALLOC_FORCE);
7268c2ecf20Sopenharmony_ci		btrfs_end_transaction(trans);
7278c2ecf20Sopenharmony_ci		if (ret > 0 || ret == -ENOSPC)
7288c2ecf20Sopenharmony_ci			ret = 0;
7298c2ecf20Sopenharmony_ci		break;
7308c2ecf20Sopenharmony_ci	case RUN_DELAYED_IPUTS:
7318c2ecf20Sopenharmony_ci		/*
7328c2ecf20Sopenharmony_ci		 * If we have pending delayed iputs then we could free up a
7338c2ecf20Sopenharmony_ci		 * bunch of pinned space, so make sure we run the iputs before
7348c2ecf20Sopenharmony_ci		 * we do our pinned bytes check below.
7358c2ecf20Sopenharmony_ci		 */
7368c2ecf20Sopenharmony_ci		btrfs_run_delayed_iputs(fs_info);
7378c2ecf20Sopenharmony_ci		btrfs_wait_on_delayed_iputs(fs_info);
7388c2ecf20Sopenharmony_ci		break;
7398c2ecf20Sopenharmony_ci	case COMMIT_TRANS:
7408c2ecf20Sopenharmony_ci		ret = may_commit_transaction(fs_info, space_info);
7418c2ecf20Sopenharmony_ci		break;
7428c2ecf20Sopenharmony_ci	default:
7438c2ecf20Sopenharmony_ci		ret = -ENOSPC;
7448c2ecf20Sopenharmony_ci		break;
7458c2ecf20Sopenharmony_ci	}
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
7488c2ecf20Sopenharmony_ci				ret);
7498c2ecf20Sopenharmony_ci	return;
7508c2ecf20Sopenharmony_ci}
7518c2ecf20Sopenharmony_ci
7528c2ecf20Sopenharmony_cistatic inline u64
7538c2ecf20Sopenharmony_cibtrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
7548c2ecf20Sopenharmony_ci				 struct btrfs_space_info *space_info)
7558c2ecf20Sopenharmony_ci{
7568c2ecf20Sopenharmony_ci	u64 used;
7578c2ecf20Sopenharmony_ci	u64 avail;
7588c2ecf20Sopenharmony_ci	u64 expected;
7598c2ecf20Sopenharmony_ci	u64 to_reclaim = space_info->reclaim_size;
7608c2ecf20Sopenharmony_ci
7618c2ecf20Sopenharmony_ci	lockdep_assert_held(&space_info->lock);
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	avail = calc_available_free_space(fs_info, space_info,
7648c2ecf20Sopenharmony_ci					  BTRFS_RESERVE_FLUSH_ALL);
7658c2ecf20Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
7668c2ecf20Sopenharmony_ci
7678c2ecf20Sopenharmony_ci	/*
7688c2ecf20Sopenharmony_ci	 * We may be flushing because suddenly we have less space than we had
7698c2ecf20Sopenharmony_ci	 * before, and now we're well over-committed based on our current free
7708c2ecf20Sopenharmony_ci	 * space.  If that's the case add in our overage so we make sure to put
7718c2ecf20Sopenharmony_ci	 * appropriate pressure on the flushing state machine.
7728c2ecf20Sopenharmony_ci	 */
7738c2ecf20Sopenharmony_ci	if (space_info->total_bytes + avail < used)
7748c2ecf20Sopenharmony_ci		to_reclaim += used - (space_info->total_bytes + avail);
7758c2ecf20Sopenharmony_ci
7768c2ecf20Sopenharmony_ci	if (to_reclaim)
7778c2ecf20Sopenharmony_ci		return to_reclaim;
7788c2ecf20Sopenharmony_ci
7798c2ecf20Sopenharmony_ci	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
7808c2ecf20Sopenharmony_ci	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
7818c2ecf20Sopenharmony_ci				 BTRFS_RESERVE_FLUSH_ALL))
7828c2ecf20Sopenharmony_ci		return 0;
7838c2ecf20Sopenharmony_ci
7848c2ecf20Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
7858c2ecf20Sopenharmony_ci
7868c2ecf20Sopenharmony_ci	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
7878c2ecf20Sopenharmony_ci				 BTRFS_RESERVE_FLUSH_ALL))
7888c2ecf20Sopenharmony_ci		expected = div_factor_fine(space_info->total_bytes, 95);
7898c2ecf20Sopenharmony_ci	else
7908c2ecf20Sopenharmony_ci		expected = div_factor_fine(space_info->total_bytes, 90);
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci	if (used > expected)
7938c2ecf20Sopenharmony_ci		to_reclaim = used - expected;
7948c2ecf20Sopenharmony_ci	else
7958c2ecf20Sopenharmony_ci		to_reclaim = 0;
7968c2ecf20Sopenharmony_ci	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
7978c2ecf20Sopenharmony_ci				     space_info->bytes_reserved);
7988c2ecf20Sopenharmony_ci	return to_reclaim;
7998c2ecf20Sopenharmony_ci}
8008c2ecf20Sopenharmony_ci
8018c2ecf20Sopenharmony_cistatic inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
8028c2ecf20Sopenharmony_ci					struct btrfs_space_info *space_info,
8038c2ecf20Sopenharmony_ci					u64 used)
8048c2ecf20Sopenharmony_ci{
8058c2ecf20Sopenharmony_ci	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
8068c2ecf20Sopenharmony_ci
8078c2ecf20Sopenharmony_ci	/* If we're just plain full then async reclaim just slows us down. */
8088c2ecf20Sopenharmony_ci	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
8098c2ecf20Sopenharmony_ci		return 0;
8108c2ecf20Sopenharmony_ci
8118c2ecf20Sopenharmony_ci	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
8128c2ecf20Sopenharmony_ci		return 0;
8138c2ecf20Sopenharmony_ci
8148c2ecf20Sopenharmony_ci	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
8158c2ecf20Sopenharmony_ci		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
8168c2ecf20Sopenharmony_ci}
8178c2ecf20Sopenharmony_ci
8188c2ecf20Sopenharmony_cistatic bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
8198c2ecf20Sopenharmony_ci				  struct btrfs_space_info *space_info,
8208c2ecf20Sopenharmony_ci				  struct reserve_ticket *ticket)
8218c2ecf20Sopenharmony_ci{
8228c2ecf20Sopenharmony_ci	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8238c2ecf20Sopenharmony_ci	u64 min_bytes;
8248c2ecf20Sopenharmony_ci
8258c2ecf20Sopenharmony_ci	if (global_rsv->space_info != space_info)
8268c2ecf20Sopenharmony_ci		return false;
8278c2ecf20Sopenharmony_ci
8288c2ecf20Sopenharmony_ci	spin_lock(&global_rsv->lock);
8298c2ecf20Sopenharmony_ci	min_bytes = div_factor(global_rsv->size, 1);
8308c2ecf20Sopenharmony_ci	if (global_rsv->reserved < min_bytes + ticket->bytes) {
8318c2ecf20Sopenharmony_ci		spin_unlock(&global_rsv->lock);
8328c2ecf20Sopenharmony_ci		return false;
8338c2ecf20Sopenharmony_ci	}
8348c2ecf20Sopenharmony_ci	global_rsv->reserved -= ticket->bytes;
8358c2ecf20Sopenharmony_ci	remove_ticket(space_info, ticket);
8368c2ecf20Sopenharmony_ci	ticket->bytes = 0;
8378c2ecf20Sopenharmony_ci	wake_up(&ticket->wait);
8388c2ecf20Sopenharmony_ci	space_info->tickets_id++;
8398c2ecf20Sopenharmony_ci	if (global_rsv->reserved < global_rsv->size)
8408c2ecf20Sopenharmony_ci		global_rsv->full = 0;
8418c2ecf20Sopenharmony_ci	spin_unlock(&global_rsv->lock);
8428c2ecf20Sopenharmony_ci
8438c2ecf20Sopenharmony_ci	return true;
8448c2ecf20Sopenharmony_ci}
8458c2ecf20Sopenharmony_ci
8468c2ecf20Sopenharmony_ci/*
8478c2ecf20Sopenharmony_ci * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
8488c2ecf20Sopenharmony_ci * @fs_info - fs_info for this fs
8498c2ecf20Sopenharmony_ci * @space_info - the space info we were flushing
8508c2ecf20Sopenharmony_ci *
8518c2ecf20Sopenharmony_ci * We call this when we've exhausted our flushing ability and haven't made
8528c2ecf20Sopenharmony_ci * progress in satisfying tickets.  The reservation code handles tickets in
8538c2ecf20Sopenharmony_ci * order, so if there is a large ticket first and then smaller ones we could
8548c2ecf20Sopenharmony_ci * very well satisfy the smaller tickets.  This will attempt to wake up any
8558c2ecf20Sopenharmony_ci * tickets in the list to catch this case.
8568c2ecf20Sopenharmony_ci *
8578c2ecf20Sopenharmony_ci * This function returns true if it was able to make progress by clearing out
8588c2ecf20Sopenharmony_ci * other tickets, or if it stumbles across a ticket that was smaller than the
8598c2ecf20Sopenharmony_ci * first ticket.
8608c2ecf20Sopenharmony_ci */
8618c2ecf20Sopenharmony_cistatic bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
8628c2ecf20Sopenharmony_ci				   struct btrfs_space_info *space_info)
8638c2ecf20Sopenharmony_ci{
8648c2ecf20Sopenharmony_ci	struct reserve_ticket *ticket;
8658c2ecf20Sopenharmony_ci	u64 tickets_id = space_info->tickets_id;
8668c2ecf20Sopenharmony_ci	u64 first_ticket_bytes = 0;
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8698c2ecf20Sopenharmony_ci		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
8708c2ecf20Sopenharmony_ci		__btrfs_dump_space_info(fs_info, space_info);
8718c2ecf20Sopenharmony_ci	}
8728c2ecf20Sopenharmony_ci
8738c2ecf20Sopenharmony_ci	while (!list_empty(&space_info->tickets) &&
8748c2ecf20Sopenharmony_ci	       tickets_id == space_info->tickets_id) {
8758c2ecf20Sopenharmony_ci		ticket = list_first_entry(&space_info->tickets,
8768c2ecf20Sopenharmony_ci					  struct reserve_ticket, list);
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci		if (ticket->steal &&
8798c2ecf20Sopenharmony_ci		    steal_from_global_rsv(fs_info, space_info, ticket))
8808c2ecf20Sopenharmony_ci			return true;
8818c2ecf20Sopenharmony_ci
8828c2ecf20Sopenharmony_ci		/*
8838c2ecf20Sopenharmony_ci		 * may_commit_transaction will avoid committing the transaction
8848c2ecf20Sopenharmony_ci		 * if it doesn't feel like the space reclaimed by the commit
8858c2ecf20Sopenharmony_ci		 * would result in the ticket succeeding.  However if we have a
8868c2ecf20Sopenharmony_ci		 * smaller ticket in the queue it may be small enough to be
8878c2ecf20Sopenharmony_ci		 * satisified by committing the transaction, so if any
8888c2ecf20Sopenharmony_ci		 * subsequent ticket is smaller than the first ticket go ahead
8898c2ecf20Sopenharmony_ci		 * and send us back for another loop through the enospc flushing
8908c2ecf20Sopenharmony_ci		 * code.
8918c2ecf20Sopenharmony_ci		 */
8928c2ecf20Sopenharmony_ci		if (first_ticket_bytes == 0)
8938c2ecf20Sopenharmony_ci			first_ticket_bytes = ticket->bytes;
8948c2ecf20Sopenharmony_ci		else if (first_ticket_bytes > ticket->bytes)
8958c2ecf20Sopenharmony_ci			return true;
8968c2ecf20Sopenharmony_ci
8978c2ecf20Sopenharmony_ci		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
8988c2ecf20Sopenharmony_ci			btrfs_info(fs_info, "failing ticket with %llu bytes",
8998c2ecf20Sopenharmony_ci				   ticket->bytes);
9008c2ecf20Sopenharmony_ci
9018c2ecf20Sopenharmony_ci		remove_ticket(space_info, ticket);
9028c2ecf20Sopenharmony_ci		ticket->error = -ENOSPC;
9038c2ecf20Sopenharmony_ci		wake_up(&ticket->wait);
9048c2ecf20Sopenharmony_ci
9058c2ecf20Sopenharmony_ci		/*
9068c2ecf20Sopenharmony_ci		 * We're just throwing tickets away, so more flushing may not
9078c2ecf20Sopenharmony_ci		 * trip over btrfs_try_granting_tickets, so we need to call it
9088c2ecf20Sopenharmony_ci		 * here to see if we can make progress with the next ticket in
9098c2ecf20Sopenharmony_ci		 * the list.
9108c2ecf20Sopenharmony_ci		 */
9118c2ecf20Sopenharmony_ci		btrfs_try_granting_tickets(fs_info, space_info);
9128c2ecf20Sopenharmony_ci	}
9138c2ecf20Sopenharmony_ci	return (tickets_id != space_info->tickets_id);
9148c2ecf20Sopenharmony_ci}
9158c2ecf20Sopenharmony_ci
9168c2ecf20Sopenharmony_ci/*
9178c2ecf20Sopenharmony_ci * This is for normal flushers, we can wait all goddamned day if we want to.  We
9188c2ecf20Sopenharmony_ci * will loop and continuously try to flush as long as we are making progress.
9198c2ecf20Sopenharmony_ci * We count progress as clearing off tickets each time we have to loop.
9208c2ecf20Sopenharmony_ci */
9218c2ecf20Sopenharmony_cistatic void btrfs_async_reclaim_metadata_space(struct work_struct *work)
9228c2ecf20Sopenharmony_ci{
9238c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info;
9248c2ecf20Sopenharmony_ci	struct btrfs_space_info *space_info;
9258c2ecf20Sopenharmony_ci	u64 to_reclaim;
9268c2ecf20Sopenharmony_ci	int flush_state;
9278c2ecf20Sopenharmony_ci	int commit_cycles = 0;
9288c2ecf20Sopenharmony_ci	u64 last_tickets_id;
9298c2ecf20Sopenharmony_ci
9308c2ecf20Sopenharmony_ci	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
9318c2ecf20Sopenharmony_ci	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
9328c2ecf20Sopenharmony_ci
9338c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
9348c2ecf20Sopenharmony_ci	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
9358c2ecf20Sopenharmony_ci	if (!to_reclaim) {
9368c2ecf20Sopenharmony_ci		space_info->flush = 0;
9378c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
9388c2ecf20Sopenharmony_ci		return;
9398c2ecf20Sopenharmony_ci	}
9408c2ecf20Sopenharmony_ci	last_tickets_id = space_info->tickets_id;
9418c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
9428c2ecf20Sopenharmony_ci
9438c2ecf20Sopenharmony_ci	flush_state = FLUSH_DELAYED_ITEMS_NR;
9448c2ecf20Sopenharmony_ci	do {
9458c2ecf20Sopenharmony_ci		flush_space(fs_info, space_info, to_reclaim, flush_state);
9468c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
9478c2ecf20Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
9488c2ecf20Sopenharmony_ci			space_info->flush = 0;
9498c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
9508c2ecf20Sopenharmony_ci			return;
9518c2ecf20Sopenharmony_ci		}
9528c2ecf20Sopenharmony_ci		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
9538c2ecf20Sopenharmony_ci							      space_info);
9548c2ecf20Sopenharmony_ci		if (last_tickets_id == space_info->tickets_id) {
9558c2ecf20Sopenharmony_ci			flush_state++;
9568c2ecf20Sopenharmony_ci		} else {
9578c2ecf20Sopenharmony_ci			last_tickets_id = space_info->tickets_id;
9588c2ecf20Sopenharmony_ci			flush_state = FLUSH_DELAYED_ITEMS_NR;
9598c2ecf20Sopenharmony_ci			if (commit_cycles)
9608c2ecf20Sopenharmony_ci				commit_cycles--;
9618c2ecf20Sopenharmony_ci		}
9628c2ecf20Sopenharmony_ci
9638c2ecf20Sopenharmony_ci		/*
9648c2ecf20Sopenharmony_ci		 * We don't want to force a chunk allocation until we've tried
9658c2ecf20Sopenharmony_ci		 * pretty hard to reclaim space.  Think of the case where we
9668c2ecf20Sopenharmony_ci		 * freed up a bunch of space and so have a lot of pinned space
9678c2ecf20Sopenharmony_ci		 * to reclaim.  We would rather use that than possibly create a
9688c2ecf20Sopenharmony_ci		 * underutilized metadata chunk.  So if this is our first run
9698c2ecf20Sopenharmony_ci		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
9708c2ecf20Sopenharmony_ci		 * commit the transaction.  If nothing has changed the next go
9718c2ecf20Sopenharmony_ci		 * around then we can force a chunk allocation.
9728c2ecf20Sopenharmony_ci		 */
9738c2ecf20Sopenharmony_ci		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
9748c2ecf20Sopenharmony_ci			flush_state++;
9758c2ecf20Sopenharmony_ci
9768c2ecf20Sopenharmony_ci		if (flush_state > COMMIT_TRANS) {
9778c2ecf20Sopenharmony_ci			commit_cycles++;
9788c2ecf20Sopenharmony_ci			if (commit_cycles > 2) {
9798c2ecf20Sopenharmony_ci				if (maybe_fail_all_tickets(fs_info, space_info)) {
9808c2ecf20Sopenharmony_ci					flush_state = FLUSH_DELAYED_ITEMS_NR;
9818c2ecf20Sopenharmony_ci					commit_cycles--;
9828c2ecf20Sopenharmony_ci				} else {
9838c2ecf20Sopenharmony_ci					space_info->flush = 0;
9848c2ecf20Sopenharmony_ci				}
9858c2ecf20Sopenharmony_ci			} else {
9868c2ecf20Sopenharmony_ci				flush_state = FLUSH_DELAYED_ITEMS_NR;
9878c2ecf20Sopenharmony_ci			}
9888c2ecf20Sopenharmony_ci		}
9898c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
9908c2ecf20Sopenharmony_ci	} while (flush_state <= COMMIT_TRANS);
9918c2ecf20Sopenharmony_ci}
9928c2ecf20Sopenharmony_ci
9938c2ecf20Sopenharmony_ci/*
9948c2ecf20Sopenharmony_ci * FLUSH_DELALLOC_WAIT:
9958c2ecf20Sopenharmony_ci *   Space is freed from flushing delalloc in one of two ways.
9968c2ecf20Sopenharmony_ci *
9978c2ecf20Sopenharmony_ci *   1) compression is on and we allocate less space than we reserved
9988c2ecf20Sopenharmony_ci *   2) we are overwriting existing space
9998c2ecf20Sopenharmony_ci *
10008c2ecf20Sopenharmony_ci *   For #1 that extra space is reclaimed as soon as the delalloc pages are
10018c2ecf20Sopenharmony_ci *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
10028c2ecf20Sopenharmony_ci *   length to ->bytes_reserved, and subtracts the reserved space from
10038c2ecf20Sopenharmony_ci *   ->bytes_may_use.
10048c2ecf20Sopenharmony_ci *
10058c2ecf20Sopenharmony_ci *   For #2 this is trickier.  Once the ordered extent runs we will drop the
10068c2ecf20Sopenharmony_ci *   extent in the range we are overwriting, which creates a delayed ref for
10078c2ecf20Sopenharmony_ci *   that freed extent.  This however is not reclaimed until the transaction
10088c2ecf20Sopenharmony_ci *   commits, thus the next stages.
10098c2ecf20Sopenharmony_ci *
10108c2ecf20Sopenharmony_ci * RUN_DELAYED_IPUTS
10118c2ecf20Sopenharmony_ci *   If we are freeing inodes, we want to make sure all delayed iputs have
10128c2ecf20Sopenharmony_ci *   completed, because they could have been on an inode with i_nlink == 0, and
10138c2ecf20Sopenharmony_ci *   thus have been truncated and freed up space.  But again this space is not
10148c2ecf20Sopenharmony_ci *   immediately re-usable, it comes in the form of a delayed ref, which must be
10158c2ecf20Sopenharmony_ci *   run and then the transaction must be committed.
10168c2ecf20Sopenharmony_ci *
10178c2ecf20Sopenharmony_ci * FLUSH_DELAYED_REFS
10188c2ecf20Sopenharmony_ci *   The above two cases generate delayed refs that will affect
10198c2ecf20Sopenharmony_ci *   ->total_bytes_pinned.  However this counter can be inconsistent with
10208c2ecf20Sopenharmony_ci *   reality if there are outstanding delayed refs.  This is because we adjust
10218c2ecf20Sopenharmony_ci *   the counter based solely on the current set of delayed refs and disregard
10228c2ecf20Sopenharmony_ci *   any on-disk state which might include more refs.  So for example, if we
10238c2ecf20Sopenharmony_ci *   have an extent with 2 references, but we only drop 1, we'll see that there
10248c2ecf20Sopenharmony_ci *   is a negative delayed ref count for the extent and assume that the space
10258c2ecf20Sopenharmony_ci *   will be freed, and thus increase ->total_bytes_pinned.
10268c2ecf20Sopenharmony_ci *
10278c2ecf20Sopenharmony_ci *   Running the delayed refs gives us the actual real view of what will be
10288c2ecf20Sopenharmony_ci *   freed at the transaction commit time.  This stage will not actually free
10298c2ecf20Sopenharmony_ci *   space for us, it just makes sure that may_commit_transaction() has all of
10308c2ecf20Sopenharmony_ci *   the information it needs to make the right decision.
10318c2ecf20Sopenharmony_ci *
10328c2ecf20Sopenharmony_ci * COMMIT_TRANS
10338c2ecf20Sopenharmony_ci *   This is where we reclaim all of the pinned space generated by the previous
10348c2ecf20Sopenharmony_ci *   two stages.  We will not commit the transaction if we don't think we're
10358c2ecf20Sopenharmony_ci *   likely to satisfy our request, which means if our current free space +
10368c2ecf20Sopenharmony_ci *   total_bytes_pinned < reservation we will not commit.  This is why the
10378c2ecf20Sopenharmony_ci *   previous states are actually important, to make sure we know for sure
10388c2ecf20Sopenharmony_ci *   whether committing the transaction will allow us to make progress.
10398c2ecf20Sopenharmony_ci *
10408c2ecf20Sopenharmony_ci * ALLOC_CHUNK_FORCE
10418c2ecf20Sopenharmony_ci *   For data we start with alloc chunk force, however we could have been full
10428c2ecf20Sopenharmony_ci *   before, and then the transaction commit could have freed new block groups,
10438c2ecf20Sopenharmony_ci *   so if we now have space to allocate do the force chunk allocation.
10448c2ecf20Sopenharmony_ci */
10458c2ecf20Sopenharmony_cistatic const enum btrfs_flush_state data_flush_states[] = {
10468c2ecf20Sopenharmony_ci	FLUSH_DELALLOC_WAIT,
10478c2ecf20Sopenharmony_ci	RUN_DELAYED_IPUTS,
10488c2ecf20Sopenharmony_ci	FLUSH_DELAYED_REFS,
10498c2ecf20Sopenharmony_ci	COMMIT_TRANS,
10508c2ecf20Sopenharmony_ci	ALLOC_CHUNK_FORCE,
10518c2ecf20Sopenharmony_ci};
10528c2ecf20Sopenharmony_ci
10538c2ecf20Sopenharmony_cistatic void btrfs_async_reclaim_data_space(struct work_struct *work)
10548c2ecf20Sopenharmony_ci{
10558c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info;
10568c2ecf20Sopenharmony_ci	struct btrfs_space_info *space_info;
10578c2ecf20Sopenharmony_ci	u64 last_tickets_id;
10588c2ecf20Sopenharmony_ci	int flush_state = 0;
10598c2ecf20Sopenharmony_ci
10608c2ecf20Sopenharmony_ci	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
10618c2ecf20Sopenharmony_ci	space_info = fs_info->data_sinfo;
10628c2ecf20Sopenharmony_ci
10638c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
10648c2ecf20Sopenharmony_ci	if (list_empty(&space_info->tickets)) {
10658c2ecf20Sopenharmony_ci		space_info->flush = 0;
10668c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
10678c2ecf20Sopenharmony_ci		return;
10688c2ecf20Sopenharmony_ci	}
10698c2ecf20Sopenharmony_ci	last_tickets_id = space_info->tickets_id;
10708c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
10718c2ecf20Sopenharmony_ci
10728c2ecf20Sopenharmony_ci	while (!space_info->full) {
10738c2ecf20Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
10748c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
10758c2ecf20Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
10768c2ecf20Sopenharmony_ci			space_info->flush = 0;
10778c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
10788c2ecf20Sopenharmony_ci			return;
10798c2ecf20Sopenharmony_ci		}
10808c2ecf20Sopenharmony_ci		last_tickets_id = space_info->tickets_id;
10818c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
10828c2ecf20Sopenharmony_ci	}
10838c2ecf20Sopenharmony_ci
10848c2ecf20Sopenharmony_ci	while (flush_state < ARRAY_SIZE(data_flush_states)) {
10858c2ecf20Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX,
10868c2ecf20Sopenharmony_ci			    data_flush_states[flush_state]);
10878c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
10888c2ecf20Sopenharmony_ci		if (list_empty(&space_info->tickets)) {
10898c2ecf20Sopenharmony_ci			space_info->flush = 0;
10908c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
10918c2ecf20Sopenharmony_ci			return;
10928c2ecf20Sopenharmony_ci		}
10938c2ecf20Sopenharmony_ci
10948c2ecf20Sopenharmony_ci		if (last_tickets_id == space_info->tickets_id) {
10958c2ecf20Sopenharmony_ci			flush_state++;
10968c2ecf20Sopenharmony_ci		} else {
10978c2ecf20Sopenharmony_ci			last_tickets_id = space_info->tickets_id;
10988c2ecf20Sopenharmony_ci			flush_state = 0;
10998c2ecf20Sopenharmony_ci		}
11008c2ecf20Sopenharmony_ci
11018c2ecf20Sopenharmony_ci		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
11028c2ecf20Sopenharmony_ci			if (space_info->full) {
11038c2ecf20Sopenharmony_ci				if (maybe_fail_all_tickets(fs_info, space_info))
11048c2ecf20Sopenharmony_ci					flush_state = 0;
11058c2ecf20Sopenharmony_ci				else
11068c2ecf20Sopenharmony_ci					space_info->flush = 0;
11078c2ecf20Sopenharmony_ci			} else {
11088c2ecf20Sopenharmony_ci				flush_state = 0;
11098c2ecf20Sopenharmony_ci			}
11108c2ecf20Sopenharmony_ci		}
11118c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
11128c2ecf20Sopenharmony_ci	}
11138c2ecf20Sopenharmony_ci}
11148c2ecf20Sopenharmony_ci
/*
 * Set up the two space reclaim work items that live in @fs_info:
 * async_reclaim_work is bound to btrfs_async_reclaim_metadata_space() and
 * async_data_reclaim_work is bound to btrfs_async_reclaim_data_space().
 */
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
11208c2ecf20Sopenharmony_ci
/*
 * Flush states handed to priority_reclaim_metadata_space() when
 * handle_reserve_ticket() services a BTRFS_RESERVE_FLUSH_LIMIT ticket.  The
 * states are run in array order, starting at index 0.
 */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};
11268c2ecf20Sopenharmony_ci
/*
 * Flush states handed to priority_reclaim_metadata_space() when
 * handle_reserve_ticket() services a BTRFS_RESERVE_FLUSH_EVICT ticket.  The
 * states are run in array order, starting at index 0.
 */
static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
11378c2ecf20Sopenharmony_ci
11388c2ecf20Sopenharmony_cistatic void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
11398c2ecf20Sopenharmony_ci				struct btrfs_space_info *space_info,
11408c2ecf20Sopenharmony_ci				struct reserve_ticket *ticket,
11418c2ecf20Sopenharmony_ci				const enum btrfs_flush_state *states,
11428c2ecf20Sopenharmony_ci				int states_nr)
11438c2ecf20Sopenharmony_ci{
11448c2ecf20Sopenharmony_ci	u64 to_reclaim;
11458c2ecf20Sopenharmony_ci	int flush_state;
11468c2ecf20Sopenharmony_ci
11478c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
11488c2ecf20Sopenharmony_ci	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
11498c2ecf20Sopenharmony_ci	if (!to_reclaim) {
11508c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
11518c2ecf20Sopenharmony_ci		return;
11528c2ecf20Sopenharmony_ci	}
11538c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
11548c2ecf20Sopenharmony_ci
11558c2ecf20Sopenharmony_ci	flush_state = 0;
11568c2ecf20Sopenharmony_ci	do {
11578c2ecf20Sopenharmony_ci		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
11588c2ecf20Sopenharmony_ci		flush_state++;
11598c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
11608c2ecf20Sopenharmony_ci		if (ticket->bytes == 0) {
11618c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
11628c2ecf20Sopenharmony_ci			return;
11638c2ecf20Sopenharmony_ci		}
11648c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
11658c2ecf20Sopenharmony_ci	} while (flush_state < states_nr);
11668c2ecf20Sopenharmony_ci}
11678c2ecf20Sopenharmony_ci
11688c2ecf20Sopenharmony_cistatic void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
11698c2ecf20Sopenharmony_ci					struct btrfs_space_info *space_info,
11708c2ecf20Sopenharmony_ci					struct reserve_ticket *ticket)
11718c2ecf20Sopenharmony_ci{
11728c2ecf20Sopenharmony_ci	while (!space_info->full) {
11738c2ecf20Sopenharmony_ci		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
11748c2ecf20Sopenharmony_ci		spin_lock(&space_info->lock);
11758c2ecf20Sopenharmony_ci		if (ticket->bytes == 0) {
11768c2ecf20Sopenharmony_ci			spin_unlock(&space_info->lock);
11778c2ecf20Sopenharmony_ci			return;
11788c2ecf20Sopenharmony_ci		}
11798c2ecf20Sopenharmony_ci		spin_unlock(&space_info->lock);
11808c2ecf20Sopenharmony_ci	}
11818c2ecf20Sopenharmony_ci}
11828c2ecf20Sopenharmony_ci
/*
 * Sleep (TASK_KILLABLE) on ticket->wait until the ticket is resolved, i.e.
 * until ticket->bytes reaches zero or ticket->error is set.
 *
 * If prepare_to_wait_event() returns non-zero the wait is abandoned: the
 * ticket is removed from its list and ticket->error is set to -EINTR.
 * ticket->bytes and ticket->error are only examined with space_info->lock
 * held; the lock is dropped around schedule().
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that happened, the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			remove_ticket(space_info, ticket);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}
12168c2ecf20Sopenharmony_ci
12178c2ecf20Sopenharmony_ci/**
12188c2ecf20Sopenharmony_ci * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
12198c2ecf20Sopenharmony_ci * @fs_info - the fs
12208c2ecf20Sopenharmony_ci * @space_info - the space_info for the reservation
12218c2ecf20Sopenharmony_ci * @ticket - the ticket for the reservation
12228c2ecf20Sopenharmony_ci * @flush - how much we can flush
12238c2ecf20Sopenharmony_ci *
12248c2ecf20Sopenharmony_ci * This does the work of figuring out how to flush for the ticket, waiting for
12258c2ecf20Sopenharmony_ci * the reservation, and returning the appropriate error if there is one.
12268c2ecf20Sopenharmony_ci */
12278c2ecf20Sopenharmony_cistatic int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
12288c2ecf20Sopenharmony_ci				 struct btrfs_space_info *space_info,
12298c2ecf20Sopenharmony_ci				 struct reserve_ticket *ticket,
12308c2ecf20Sopenharmony_ci				 enum btrfs_reserve_flush_enum flush)
12318c2ecf20Sopenharmony_ci{
12328c2ecf20Sopenharmony_ci	int ret;
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_ci	switch (flush) {
12358c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_DATA:
12368c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_ALL:
12378c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
12388c2ecf20Sopenharmony_ci		wait_reserve_ticket(fs_info, space_info, ticket);
12398c2ecf20Sopenharmony_ci		break;
12408c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_LIMIT:
12418c2ecf20Sopenharmony_ci		priority_reclaim_metadata_space(fs_info, space_info, ticket,
12428c2ecf20Sopenharmony_ci						priority_flush_states,
12438c2ecf20Sopenharmony_ci						ARRAY_SIZE(priority_flush_states));
12448c2ecf20Sopenharmony_ci		break;
12458c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_EVICT:
12468c2ecf20Sopenharmony_ci		priority_reclaim_metadata_space(fs_info, space_info, ticket,
12478c2ecf20Sopenharmony_ci						evict_flush_states,
12488c2ecf20Sopenharmony_ci						ARRAY_SIZE(evict_flush_states));
12498c2ecf20Sopenharmony_ci		break;
12508c2ecf20Sopenharmony_ci	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
12518c2ecf20Sopenharmony_ci		priority_reclaim_data_space(fs_info, space_info, ticket);
12528c2ecf20Sopenharmony_ci		break;
12538c2ecf20Sopenharmony_ci	default:
12548c2ecf20Sopenharmony_ci		ASSERT(0);
12558c2ecf20Sopenharmony_ci		break;
12568c2ecf20Sopenharmony_ci	}
12578c2ecf20Sopenharmony_ci
12588c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
12598c2ecf20Sopenharmony_ci	ret = ticket->error;
12608c2ecf20Sopenharmony_ci	if (ticket->bytes || ticket->error) {
12618c2ecf20Sopenharmony_ci		/*
12628c2ecf20Sopenharmony_ci		 * We were a priority ticket, so we need to delete ourselves
12638c2ecf20Sopenharmony_ci		 * from the list.  Because we could have other priority tickets
12648c2ecf20Sopenharmony_ci		 * behind us that require less space, run
12658c2ecf20Sopenharmony_ci		 * btrfs_try_granting_tickets() to see if their reservations can
12668c2ecf20Sopenharmony_ci		 * now be made.
12678c2ecf20Sopenharmony_ci		 */
12688c2ecf20Sopenharmony_ci		if (!list_empty(&ticket->list)) {
12698c2ecf20Sopenharmony_ci			remove_ticket(space_info, ticket);
12708c2ecf20Sopenharmony_ci			btrfs_try_granting_tickets(fs_info, space_info);
12718c2ecf20Sopenharmony_ci		}
12728c2ecf20Sopenharmony_ci
12738c2ecf20Sopenharmony_ci		if (!ret)
12748c2ecf20Sopenharmony_ci			ret = -ENOSPC;
12758c2ecf20Sopenharmony_ci	}
12768c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
12778c2ecf20Sopenharmony_ci	ASSERT(list_empty(&ticket->list));
12788c2ecf20Sopenharmony_ci	/*
12798c2ecf20Sopenharmony_ci	 * Check that we can't have an error set if the reservation succeeded,
12808c2ecf20Sopenharmony_ci	 * as that would confuse tasks and lead them to error out without
12818c2ecf20Sopenharmony_ci	 * releasing reserved space (if an error happens the expectation is that
12828c2ecf20Sopenharmony_ci	 * space wasn't reserved at all).
12838c2ecf20Sopenharmony_ci	 */
12848c2ecf20Sopenharmony_ci	ASSERT(!(ticket->bytes == 0 && ticket->error));
12858c2ecf20Sopenharmony_ci	return ret;
12868c2ecf20Sopenharmony_ci}
12878c2ecf20Sopenharmony_ci
12888c2ecf20Sopenharmony_ci/*
12898c2ecf20Sopenharmony_ci * This returns true if this flush state will go through the ordinary flushing
12908c2ecf20Sopenharmony_ci * code.
12918c2ecf20Sopenharmony_ci */
12928c2ecf20Sopenharmony_cistatic inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
12938c2ecf20Sopenharmony_ci{
12948c2ecf20Sopenharmony_ci	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
12958c2ecf20Sopenharmony_ci		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
12968c2ecf20Sopenharmony_ci}
12978c2ecf20Sopenharmony_ci
12988c2ecf20Sopenharmony_ci/**
12998c2ecf20Sopenharmony_ci * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
13008c2ecf20Sopenharmony_ci * @root - the root we're allocating for
13018c2ecf20Sopenharmony_ci * @space_info - the space info we want to allocate from
13028c2ecf20Sopenharmony_ci * @orig_bytes - the number of bytes we want
13038c2ecf20Sopenharmony_ci * @flush - whether or not we can flush to make our reservation
13048c2ecf20Sopenharmony_ci *
13058c2ecf20Sopenharmony_ci * This will reserve orig_bytes number of bytes from the space info associated
13068c2ecf20Sopenharmony_ci * with the block_rsv.  If there is not enough space it will make an attempt to
13078c2ecf20Sopenharmony_ci * flush out space to make room.  It will do this by flushing delalloc if
13088c2ecf20Sopenharmony_ci * possible or committing the transaction.  If flush is 0 then no attempts to
13098c2ecf20Sopenharmony_ci * regain reservations will be made and this will fail if there is not enough
13108c2ecf20Sopenharmony_ci * space already.
13118c2ecf20Sopenharmony_ci */
13128c2ecf20Sopenharmony_cistatic int __reserve_bytes(struct btrfs_fs_info *fs_info,
13138c2ecf20Sopenharmony_ci			   struct btrfs_space_info *space_info, u64 orig_bytes,
13148c2ecf20Sopenharmony_ci			   enum btrfs_reserve_flush_enum flush)
13158c2ecf20Sopenharmony_ci{
13168c2ecf20Sopenharmony_ci	struct work_struct *async_work;
13178c2ecf20Sopenharmony_ci	struct reserve_ticket ticket;
13188c2ecf20Sopenharmony_ci	u64 used;
13198c2ecf20Sopenharmony_ci	int ret = 0;
13208c2ecf20Sopenharmony_ci	bool pending_tickets;
13218c2ecf20Sopenharmony_ci
13228c2ecf20Sopenharmony_ci	ASSERT(orig_bytes);
13238c2ecf20Sopenharmony_ci	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
13248c2ecf20Sopenharmony_ci
13258c2ecf20Sopenharmony_ci	if (flush == BTRFS_RESERVE_FLUSH_DATA)
13268c2ecf20Sopenharmony_ci		async_work = &fs_info->async_data_reclaim_work;
13278c2ecf20Sopenharmony_ci	else
13288c2ecf20Sopenharmony_ci		async_work = &fs_info->async_reclaim_work;
13298c2ecf20Sopenharmony_ci
13308c2ecf20Sopenharmony_ci	spin_lock(&space_info->lock);
13318c2ecf20Sopenharmony_ci	ret = -ENOSPC;
13328c2ecf20Sopenharmony_ci	used = btrfs_space_info_used(space_info, true);
13338c2ecf20Sopenharmony_ci
13348c2ecf20Sopenharmony_ci	/*
13358c2ecf20Sopenharmony_ci	 * We don't want NO_FLUSH allocations to jump everybody, they can
13368c2ecf20Sopenharmony_ci	 * generally handle ENOSPC in a different way, so treat them the same as
13378c2ecf20Sopenharmony_ci	 * normal flushers when it comes to skipping pending tickets.
13388c2ecf20Sopenharmony_ci	 */
13398c2ecf20Sopenharmony_ci	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
13408c2ecf20Sopenharmony_ci		pending_tickets = !list_empty(&space_info->tickets) ||
13418c2ecf20Sopenharmony_ci			!list_empty(&space_info->priority_tickets);
13428c2ecf20Sopenharmony_ci	else
13438c2ecf20Sopenharmony_ci		pending_tickets = !list_empty(&space_info->priority_tickets);
13448c2ecf20Sopenharmony_ci
13458c2ecf20Sopenharmony_ci	/*
13468c2ecf20Sopenharmony_ci	 * Carry on if we have enough space (short-circuit) OR call
13478c2ecf20Sopenharmony_ci	 * can_overcommit() to ensure we can overcommit to continue.
13488c2ecf20Sopenharmony_ci	 */
13498c2ecf20Sopenharmony_ci	if (!pending_tickets &&
13508c2ecf20Sopenharmony_ci	    ((used + orig_bytes <= space_info->total_bytes) ||
13518c2ecf20Sopenharmony_ci	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
13528c2ecf20Sopenharmony_ci		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
13538c2ecf20Sopenharmony_ci						      orig_bytes);
13548c2ecf20Sopenharmony_ci		ret = 0;
13558c2ecf20Sopenharmony_ci	}
13568c2ecf20Sopenharmony_ci
13578c2ecf20Sopenharmony_ci	/*
13588c2ecf20Sopenharmony_ci	 * If we couldn't make a reservation then setup our reservation ticket
13598c2ecf20Sopenharmony_ci	 * and kick the async worker if it's not already running.
13608c2ecf20Sopenharmony_ci	 *
13618c2ecf20Sopenharmony_ci	 * If we are a priority flusher then we just need to add our ticket to
13628c2ecf20Sopenharmony_ci	 * the list and we will do our own flushing further down.
13638c2ecf20Sopenharmony_ci	 */
13648c2ecf20Sopenharmony_ci	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
13658c2ecf20Sopenharmony_ci		ticket.bytes = orig_bytes;
13668c2ecf20Sopenharmony_ci		ticket.error = 0;
13678c2ecf20Sopenharmony_ci		space_info->reclaim_size += ticket.bytes;
13688c2ecf20Sopenharmony_ci		init_waitqueue_head(&ticket.wait);
13698c2ecf20Sopenharmony_ci		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
13708c2ecf20Sopenharmony_ci		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
13718c2ecf20Sopenharmony_ci		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
13728c2ecf20Sopenharmony_ci		    flush == BTRFS_RESERVE_FLUSH_DATA) {
13738c2ecf20Sopenharmony_ci			list_add_tail(&ticket.list, &space_info->tickets);
13748c2ecf20Sopenharmony_ci			if (!space_info->flush) {
13758c2ecf20Sopenharmony_ci				space_info->flush = 1;
13768c2ecf20Sopenharmony_ci				trace_btrfs_trigger_flush(fs_info,
13778c2ecf20Sopenharmony_ci							  space_info->flags,
13788c2ecf20Sopenharmony_ci							  orig_bytes, flush,
13798c2ecf20Sopenharmony_ci							  "enospc");
13808c2ecf20Sopenharmony_ci				queue_work(system_unbound_wq, async_work);
13818c2ecf20Sopenharmony_ci			}
13828c2ecf20Sopenharmony_ci		} else {
13838c2ecf20Sopenharmony_ci			list_add_tail(&ticket.list,
13848c2ecf20Sopenharmony_ci				      &space_info->priority_tickets);
13858c2ecf20Sopenharmony_ci		}
13868c2ecf20Sopenharmony_ci	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
13878c2ecf20Sopenharmony_ci		used += orig_bytes;
13888c2ecf20Sopenharmony_ci		/*
13898c2ecf20Sopenharmony_ci		 * We will do the space reservation dance during log replay,
13908c2ecf20Sopenharmony_ci		 * which means we won't have fs_info->fs_root set, so don't do
13918c2ecf20Sopenharmony_ci		 * the async reclaim as we will panic.
13928c2ecf20Sopenharmony_ci		 */
13938c2ecf20Sopenharmony_ci		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
13948c2ecf20Sopenharmony_ci		    need_do_async_reclaim(fs_info, space_info, used) &&
13958c2ecf20Sopenharmony_ci		    !work_busy(&fs_info->async_reclaim_work)) {
13968c2ecf20Sopenharmony_ci			trace_btrfs_trigger_flush(fs_info, space_info->flags,
13978c2ecf20Sopenharmony_ci						  orig_bytes, flush, "preempt");
13988c2ecf20Sopenharmony_ci			queue_work(system_unbound_wq,
13998c2ecf20Sopenharmony_ci				   &fs_info->async_reclaim_work);
14008c2ecf20Sopenharmony_ci		}
14018c2ecf20Sopenharmony_ci	}
14028c2ecf20Sopenharmony_ci	spin_unlock(&space_info->lock);
14038c2ecf20Sopenharmony_ci	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
14048c2ecf20Sopenharmony_ci		return ret;
14058c2ecf20Sopenharmony_ci
14068c2ecf20Sopenharmony_ci	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
14078c2ecf20Sopenharmony_ci}
14088c2ecf20Sopenharmony_ci
14098c2ecf20Sopenharmony_ci/**
14108c2ecf20Sopenharmony_ci * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
14118c2ecf20Sopenharmony_ci * @root - the root we're allocating for
14128c2ecf20Sopenharmony_ci * @block_rsv - the block_rsv we're allocating for
14138c2ecf20Sopenharmony_ci * @orig_bytes - the number of bytes we want
14148c2ecf20Sopenharmony_ci * @flush - whether or not we can flush to make our reservation
14158c2ecf20Sopenharmony_ci *
14168c2ecf20Sopenharmony_ci * This will reserve orig_bytes number of bytes from the space info associated
14178c2ecf20Sopenharmony_ci * with the block_rsv.  If there is not enough space it will make an attempt to
14188c2ecf20Sopenharmony_ci * flush out space to make room.  It will do this by flushing delalloc if
14198c2ecf20Sopenharmony_ci * possible or committing the transaction.  If flush is 0 then no attempts to
14208c2ecf20Sopenharmony_ci * regain reservations will be made and this will fail if there is not enough
14218c2ecf20Sopenharmony_ci * space already.
14228c2ecf20Sopenharmony_ci */
14238c2ecf20Sopenharmony_ciint btrfs_reserve_metadata_bytes(struct btrfs_root *root,
14248c2ecf20Sopenharmony_ci				 struct btrfs_block_rsv *block_rsv,
14258c2ecf20Sopenharmony_ci				 u64 orig_bytes,
14268c2ecf20Sopenharmony_ci				 enum btrfs_reserve_flush_enum flush)
14278c2ecf20Sopenharmony_ci{
14288c2ecf20Sopenharmony_ci	struct btrfs_fs_info *fs_info = root->fs_info;
14298c2ecf20Sopenharmony_ci	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
14308c2ecf20Sopenharmony_ci	int ret;
14318c2ecf20Sopenharmony_ci
14328c2ecf20Sopenharmony_ci	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
14338c2ecf20Sopenharmony_ci	if (ret == -ENOSPC &&
14348c2ecf20Sopenharmony_ci	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
14358c2ecf20Sopenharmony_ci		if (block_rsv != global_rsv &&
14368c2ecf20Sopenharmony_ci		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
14378c2ecf20Sopenharmony_ci			ret = 0;
14388c2ecf20Sopenharmony_ci	}
14398c2ecf20Sopenharmony_ci	if (ret == -ENOSPC) {
14408c2ecf20Sopenharmony_ci		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
14418c2ecf20Sopenharmony_ci					      block_rsv->space_info->flags,
14428c2ecf20Sopenharmony_ci					      orig_bytes, 1);
14438c2ecf20Sopenharmony_ci
14448c2ecf20Sopenharmony_ci		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
14458c2ecf20Sopenharmony_ci			btrfs_dump_space_info(fs_info, block_rsv->space_info,
14468c2ecf20Sopenharmony_ci					      orig_bytes, 0);
14478c2ecf20Sopenharmony_ci	}
14488c2ecf20Sopenharmony_ci	return ret;
14498c2ecf20Sopenharmony_ci}
14508c2ecf20Sopenharmony_ci
14518c2ecf20Sopenharmony_ci/**
14528c2ecf20Sopenharmony_ci * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
14538c2ecf20Sopenharmony_ci * @fs_info - the filesystem
14548c2ecf20Sopenharmony_ci * @bytes - the number of bytes we need
14558c2ecf20Sopenharmony_ci * @flush - how we are allowed to flush
14568c2ecf20Sopenharmony_ci *
14578c2ecf20Sopenharmony_ci * This will reserve bytes from the data space info.  If there is not enough
14588c2ecf20Sopenharmony_ci * space then we will attempt to flush space as specified by flush.
14598c2ecf20Sopenharmony_ci */
14608c2ecf20Sopenharmony_ciint btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
14618c2ecf20Sopenharmony_ci			     enum btrfs_reserve_flush_enum flush)
14628c2ecf20Sopenharmony_ci{
14638c2ecf20Sopenharmony_ci	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
14648c2ecf20Sopenharmony_ci	int ret;
14658c2ecf20Sopenharmony_ci
14668c2ecf20Sopenharmony_ci	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
14678c2ecf20Sopenharmony_ci	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
14688c2ecf20Sopenharmony_ci	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
14698c2ecf20Sopenharmony_ci
14708c2ecf20Sopenharmony_ci	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
14718c2ecf20Sopenharmony_ci	if (ret == -ENOSPC) {
14728c2ecf20Sopenharmony_ci		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
14738c2ecf20Sopenharmony_ci					      data_sinfo->flags, bytes, 1);
14748c2ecf20Sopenharmony_ci		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
14758c2ecf20Sopenharmony_ci			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
14768c2ecf20Sopenharmony_ci	}
14778c2ecf20Sopenharmony_ci	return ret;
14788c2ecf20Sopenharmony_ci}
1479