162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci 362306a36Sopenharmony_ci#include "misc.h" 462306a36Sopenharmony_ci#include "ctree.h" 562306a36Sopenharmony_ci#include "space-info.h" 662306a36Sopenharmony_ci#include "sysfs.h" 762306a36Sopenharmony_ci#include "volumes.h" 862306a36Sopenharmony_ci#include "free-space-cache.h" 962306a36Sopenharmony_ci#include "ordered-data.h" 1062306a36Sopenharmony_ci#include "transaction.h" 1162306a36Sopenharmony_ci#include "block-group.h" 1262306a36Sopenharmony_ci#include "zoned.h" 1362306a36Sopenharmony_ci#include "fs.h" 1462306a36Sopenharmony_ci#include "accessors.h" 1562306a36Sopenharmony_ci#include "extent-tree.h" 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ci/* 1862306a36Sopenharmony_ci * HOW DOES SPACE RESERVATION WORK 1962306a36Sopenharmony_ci * 2062306a36Sopenharmony_ci * If you want to know about delalloc specifically, there is a separate comment 2162306a36Sopenharmony_ci * for that with the delalloc code. This comment is about how the whole system 2262306a36Sopenharmony_ci * works generally. 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * BASIC CONCEPTS 2562306a36Sopenharmony_ci * 2662306a36Sopenharmony_ci * 1) space_info. This is the ultimate arbiter of how much space we can use. 2762306a36Sopenharmony_ci * There's a description of the bytes_ fields with the struct declaration, 2862306a36Sopenharmony_ci * refer to that for specifics on each field. Suffice it to say that for 2962306a36Sopenharmony_ci * reservations we care about total_bytes - SUM(space_info->bytes_) when 3062306a36Sopenharmony_ci * determining if there is space to make an allocation. There is a space_info 3162306a36Sopenharmony_ci * for METADATA, SYSTEM, and DATA areas. 3262306a36Sopenharmony_ci * 3362306a36Sopenharmony_ci * 2) block_rsv's. These are basically buckets for every different type of 3462306a36Sopenharmony_ci * metadata reservation we have. 
You can see the comment in the block_rsv 3562306a36Sopenharmony_ci * code on the rules for each type, but generally block_rsv->reserved is how 3662306a36Sopenharmony_ci * much space is accounted for in space_info->bytes_may_use. 3762306a36Sopenharmony_ci * 3862306a36Sopenharmony_ci * 3) btrfs_calc*_size. These are the worst case calculations we used based 3962306a36Sopenharmony_ci * on the number of items we will want to modify. We have one for changing 4062306a36Sopenharmony_ci * items, and one for inserting new items. Generally we use these helpers to 4162306a36Sopenharmony_ci * determine the size of the block reserves, and then use the actual bytes 4262306a36Sopenharmony_ci * values to adjust the space_info counters. 4362306a36Sopenharmony_ci * 4462306a36Sopenharmony_ci * MAKING RESERVATIONS, THE NORMAL CASE 4562306a36Sopenharmony_ci * 4662306a36Sopenharmony_ci * We call into either btrfs_reserve_data_bytes() or 4762306a36Sopenharmony_ci * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with 4862306a36Sopenharmony_ci * num_bytes we want to reserve. 
4962306a36Sopenharmony_ci * 5062306a36Sopenharmony_ci * ->reserve 5162306a36Sopenharmony_ci * space_info->bytes_may_reserve += num_bytes 5262306a36Sopenharmony_ci * 5362306a36Sopenharmony_ci * ->extent allocation 5462306a36Sopenharmony_ci * Call btrfs_add_reserved_bytes() which does 5562306a36Sopenharmony_ci * space_info->bytes_may_reserve -= num_bytes 5662306a36Sopenharmony_ci * space_info->bytes_reserved += extent_bytes 5762306a36Sopenharmony_ci * 5862306a36Sopenharmony_ci * ->insert reference 5962306a36Sopenharmony_ci * Call btrfs_update_block_group() which does 6062306a36Sopenharmony_ci * space_info->bytes_reserved -= extent_bytes 6162306a36Sopenharmony_ci * space_info->bytes_used += extent_bytes 6262306a36Sopenharmony_ci * 6362306a36Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) 6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * Assume we are unable to simply make the reservation because we do not have 6662306a36Sopenharmony_ci * enough space 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * -> __reserve_bytes 6962306a36Sopenharmony_ci * create a reserve_ticket with ->bytes set to our reservation, add it to 7062306a36Sopenharmony_ci * the tail of space_info->tickets, kick async flush thread 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * ->handle_reserve_ticket 7362306a36Sopenharmony_ci * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set 7462306a36Sopenharmony_ci * on the ticket. 7562306a36Sopenharmony_ci * 7662306a36Sopenharmony_ci * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space 7762306a36Sopenharmony_ci * Flushes various things attempting to free up space. 
7862306a36Sopenharmony_ci * 7962306a36Sopenharmony_ci * -> btrfs_try_granting_tickets() 8062306a36Sopenharmony_ci * This is called by anything that either subtracts space from 8162306a36Sopenharmony_ci * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the 8262306a36Sopenharmony_ci * space_info->total_bytes. This loops through the ->priority_tickets and 8362306a36Sopenharmony_ci * then the ->tickets list checking to see if the reservation can be 8462306a36Sopenharmony_ci * completed. If it can the space is added to space_info->bytes_may_use and 8562306a36Sopenharmony_ci * the ticket is woken up. 8662306a36Sopenharmony_ci * 8762306a36Sopenharmony_ci * -> ticket wakeup 8862306a36Sopenharmony_ci * Check if ->bytes == 0, if it does we got our reservation and we can carry 8962306a36Sopenharmony_ci * on, if not return the appropriate error (ENOSPC, but can be EINTR if we 9062306a36Sopenharmony_ci * were interrupted.) 9162306a36Sopenharmony_ci * 9262306a36Sopenharmony_ci * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY 9362306a36Sopenharmony_ci * 9462306a36Sopenharmony_ci * Same as the above, except we add ourselves to the 9562306a36Sopenharmony_ci * space_info->priority_tickets, and we do not use ticket->wait, we simply 9662306a36Sopenharmony_ci * call flush_space() ourselves for the states that are safe for us to call 9762306a36Sopenharmony_ci * without deadlocking and hope for the best. 9862306a36Sopenharmony_ci * 9962306a36Sopenharmony_ci * THE FLUSHING STATES 10062306a36Sopenharmony_ci * 10162306a36Sopenharmony_ci * Generally speaking we will have two cases for each state, a "nice" state 10262306a36Sopenharmony_ci * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to 10362306a36Sopenharmony_ci * reduce the locking over head on the various trees, and even to keep from 10462306a36Sopenharmony_ci * doing any work at all in the case of delayed refs. 
Each of these delayed 10562306a36Sopenharmony_ci * things however hold reservations, and so letting them run allows us to 10662306a36Sopenharmony_ci * reclaim space so we can make new reservations. 10762306a36Sopenharmony_ci * 10862306a36Sopenharmony_ci * FLUSH_DELAYED_ITEMS 10962306a36Sopenharmony_ci * Every inode has a delayed item to update the inode. Take a simple write 11062306a36Sopenharmony_ci * for example, we would update the inode item at write time to update the 11162306a36Sopenharmony_ci * mtime, and then again at finish_ordered_io() time in order to update the 11262306a36Sopenharmony_ci * isize or bytes. We keep these delayed items to coalesce these operations 11362306a36Sopenharmony_ci * into a single operation done on demand. These are an easy way to reclaim 11462306a36Sopenharmony_ci * metadata space. 11562306a36Sopenharmony_ci * 11662306a36Sopenharmony_ci * FLUSH_DELALLOC 11762306a36Sopenharmony_ci * Look at the delalloc comment to get an idea of how much space is reserved 11862306a36Sopenharmony_ci * for delayed allocation. We can reclaim some of this space simply by 11962306a36Sopenharmony_ci * running delalloc, but usually we need to wait for ordered extents to 12062306a36Sopenharmony_ci * reclaim the bulk of this space. 12162306a36Sopenharmony_ci * 12262306a36Sopenharmony_ci * FLUSH_DELAYED_REFS 12362306a36Sopenharmony_ci * We have a block reserve for the outstanding delayed refs space, and every 12462306a36Sopenharmony_ci * delayed ref operation holds a reservation. Running these is a quick way 12562306a36Sopenharmony_ci * to reclaim space, but we want to hold this until the end because COW can 12662306a36Sopenharmony_ci * churn a lot and we can avoid making some extent tree modifications if we 12762306a36Sopenharmony_ci * are able to delay for as long as possible. 
12862306a36Sopenharmony_ci * 12962306a36Sopenharmony_ci * ALLOC_CHUNK 13062306a36Sopenharmony_ci * We will skip this the first time through space reservation, because of 13162306a36Sopenharmony_ci * overcommit and we don't want to have a lot of useless metadata space when 13262306a36Sopenharmony_ci * our worst case reservations will likely never come true. 13362306a36Sopenharmony_ci * 13462306a36Sopenharmony_ci * RUN_DELAYED_IPUTS 13562306a36Sopenharmony_ci * If we're freeing inodes we're likely freeing checksums, file extent 13662306a36Sopenharmony_ci * items, and extent tree items. Loads of space could be freed up by these 13762306a36Sopenharmony_ci * operations, however they won't be usable until the transaction commits. 13862306a36Sopenharmony_ci * 13962306a36Sopenharmony_ci * COMMIT_TRANS 14062306a36Sopenharmony_ci * This will commit the transaction. Historically we had a lot of logic 14162306a36Sopenharmony_ci * surrounding whether or not we'd commit the transaction, but this waits born 14262306a36Sopenharmony_ci * out of a pre-tickets era where we could end up committing the transaction 14362306a36Sopenharmony_ci * thousands of times in a row without making progress. Now thanks to our 14462306a36Sopenharmony_ci * ticketing system we know if we're not making progress and can error 14562306a36Sopenharmony_ci * everybody out after a few commits rather than burning the disk hoping for 14662306a36Sopenharmony_ci * a different answer. 14762306a36Sopenharmony_ci * 14862306a36Sopenharmony_ci * OVERCOMMIT 14962306a36Sopenharmony_ci * 15062306a36Sopenharmony_ci * Because we hold so many reservations for metadata we will allow you to 15162306a36Sopenharmony_ci * reserve more space than is currently free in the currently allocate 15262306a36Sopenharmony_ci * metadata space. This only happens with metadata, data does not allow 15362306a36Sopenharmony_ci * overcommitting. 
15462306a36Sopenharmony_ci * 15562306a36Sopenharmony_ci * You can see the current logic for when we allow overcommit in 15662306a36Sopenharmony_ci * btrfs_can_overcommit(), but it only applies to unallocated space. If there 15762306a36Sopenharmony_ci * is no unallocated space to be had, all reservations are kept within the 15862306a36Sopenharmony_ci * free space in the allocated metadata chunks. 15962306a36Sopenharmony_ci * 16062306a36Sopenharmony_ci * Because of overcommitting, you generally want to use the 16162306a36Sopenharmony_ci * btrfs_can_overcommit() logic for metadata allocations, as it does the right 16262306a36Sopenharmony_ci * thing with or without extra unallocated space. 16362306a36Sopenharmony_ci */ 16462306a36Sopenharmony_ci 16562306a36Sopenharmony_ciu64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, 16662306a36Sopenharmony_ci bool may_use_included) 16762306a36Sopenharmony_ci{ 16862306a36Sopenharmony_ci ASSERT(s_info); 16962306a36Sopenharmony_ci return s_info->bytes_used + s_info->bytes_reserved + 17062306a36Sopenharmony_ci s_info->bytes_pinned + s_info->bytes_readonly + 17162306a36Sopenharmony_ci s_info->bytes_zone_unusable + 17262306a36Sopenharmony_ci (may_use_included ? s_info->bytes_may_use : 0); 17362306a36Sopenharmony_ci} 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci/* 17662306a36Sopenharmony_ci * after adding space to the filesystem, we need to clear the full flags 17762306a36Sopenharmony_ci * on all the space infos. 
17862306a36Sopenharmony_ci */ 17962306a36Sopenharmony_civoid btrfs_clear_space_info_full(struct btrfs_fs_info *info) 18062306a36Sopenharmony_ci{ 18162306a36Sopenharmony_ci struct list_head *head = &info->space_info; 18262306a36Sopenharmony_ci struct btrfs_space_info *found; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci list_for_each_entry(found, head, list) 18562306a36Sopenharmony_ci found->full = 0; 18662306a36Sopenharmony_ci} 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci/* 18962306a36Sopenharmony_ci * Block groups with more than this value (percents) of unusable space will be 19062306a36Sopenharmony_ci * scheduled for background reclaim. 19162306a36Sopenharmony_ci */ 19262306a36Sopenharmony_ci#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) 19362306a36Sopenharmony_ci 19462306a36Sopenharmony_ci/* 19562306a36Sopenharmony_ci * Calculate chunk size depending on volume type (regular or zoned). 19662306a36Sopenharmony_ci */ 19762306a36Sopenharmony_cistatic u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) 19862306a36Sopenharmony_ci{ 19962306a36Sopenharmony_ci if (btrfs_is_zoned(fs_info)) 20062306a36Sopenharmony_ci return fs_info->zone_size; 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_ci ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA) 20562306a36Sopenharmony_ci return BTRFS_MAX_DATA_CHUNK_SIZE; 20662306a36Sopenharmony_ci else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 20762306a36Sopenharmony_ci return SZ_32M; 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci /* Handle BTRFS_BLOCK_GROUP_METADATA */ 21062306a36Sopenharmony_ci if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) 21162306a36Sopenharmony_ci return SZ_1G; 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci return SZ_256M; 21462306a36Sopenharmony_ci} 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_ci/* 21762306a36Sopenharmony_ci * Update default chunk size. 
21862306a36Sopenharmony_ci */ 21962306a36Sopenharmony_civoid btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, 22062306a36Sopenharmony_ci u64 chunk_size) 22162306a36Sopenharmony_ci{ 22262306a36Sopenharmony_ci WRITE_ONCE(space_info->chunk_size, chunk_size); 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_cistatic int create_space_info(struct btrfs_fs_info *info, u64 flags) 22662306a36Sopenharmony_ci{ 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ci struct btrfs_space_info *space_info; 22962306a36Sopenharmony_ci int i; 23062306a36Sopenharmony_ci int ret; 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 23362306a36Sopenharmony_ci if (!space_info) 23462306a36Sopenharmony_ci return -ENOMEM; 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 23762306a36Sopenharmony_ci INIT_LIST_HEAD(&space_info->block_groups[i]); 23862306a36Sopenharmony_ci init_rwsem(&space_info->groups_sem); 23962306a36Sopenharmony_ci spin_lock_init(&space_info->lock); 24062306a36Sopenharmony_ci space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 24162306a36Sopenharmony_ci space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 24262306a36Sopenharmony_ci INIT_LIST_HEAD(&space_info->ro_bgs); 24362306a36Sopenharmony_ci INIT_LIST_HEAD(&space_info->tickets); 24462306a36Sopenharmony_ci INIT_LIST_HEAD(&space_info->priority_tickets); 24562306a36Sopenharmony_ci space_info->clamp = 1; 24662306a36Sopenharmony_ci btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci if (btrfs_is_zoned(info)) 24962306a36Sopenharmony_ci space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; 25062306a36Sopenharmony_ci 25162306a36Sopenharmony_ci ret = btrfs_sysfs_add_space_info_type(info, space_info); 25262306a36Sopenharmony_ci if (ret) 25362306a36Sopenharmony_ci return ret; 
25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci list_add(&space_info->list, &info->space_info); 25662306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA) 25762306a36Sopenharmony_ci info->data_sinfo = space_info; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci return ret; 26062306a36Sopenharmony_ci} 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_ciint btrfs_init_space_info(struct btrfs_fs_info *fs_info) 26362306a36Sopenharmony_ci{ 26462306a36Sopenharmony_ci struct btrfs_super_block *disk_super; 26562306a36Sopenharmony_ci u64 features; 26662306a36Sopenharmony_ci u64 flags; 26762306a36Sopenharmony_ci int mixed = 0; 26862306a36Sopenharmony_ci int ret; 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci disk_super = fs_info->super_copy; 27162306a36Sopenharmony_ci if (!btrfs_super_root(disk_super)) 27262306a36Sopenharmony_ci return -EINVAL; 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci features = btrfs_super_incompat_flags(disk_super); 27562306a36Sopenharmony_ci if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 27662306a36Sopenharmony_ci mixed = 1; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci flags = BTRFS_BLOCK_GROUP_SYSTEM; 27962306a36Sopenharmony_ci ret = create_space_info(fs_info, flags); 28062306a36Sopenharmony_ci if (ret) 28162306a36Sopenharmony_ci goto out; 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_ci if (mixed) { 28462306a36Sopenharmony_ci flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 28562306a36Sopenharmony_ci ret = create_space_info(fs_info, flags); 28662306a36Sopenharmony_ci } else { 28762306a36Sopenharmony_ci flags = BTRFS_BLOCK_GROUP_METADATA; 28862306a36Sopenharmony_ci ret = create_space_info(fs_info, flags); 28962306a36Sopenharmony_ci if (ret) 29062306a36Sopenharmony_ci goto out; 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci flags = BTRFS_BLOCK_GROUP_DATA; 29362306a36Sopenharmony_ci ret = create_space_info(fs_info, flags); 29462306a36Sopenharmony_ci } 29562306a36Sopenharmony_ciout: 
29662306a36Sopenharmony_ci return ret; 29762306a36Sopenharmony_ci} 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_civoid btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, 30062306a36Sopenharmony_ci struct btrfs_block_group *block_group) 30162306a36Sopenharmony_ci{ 30262306a36Sopenharmony_ci struct btrfs_space_info *found; 30362306a36Sopenharmony_ci int factor, index; 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci factor = btrfs_bg_type_to_factor(block_group->flags); 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_ci found = btrfs_find_space_info(info, block_group->flags); 30862306a36Sopenharmony_ci ASSERT(found); 30962306a36Sopenharmony_ci spin_lock(&found->lock); 31062306a36Sopenharmony_ci found->total_bytes += block_group->length; 31162306a36Sopenharmony_ci found->disk_total += block_group->length * factor; 31262306a36Sopenharmony_ci found->bytes_used += block_group->used; 31362306a36Sopenharmony_ci found->disk_used += block_group->used * factor; 31462306a36Sopenharmony_ci found->bytes_readonly += block_group->bytes_super; 31562306a36Sopenharmony_ci found->bytes_zone_unusable += block_group->zone_unusable; 31662306a36Sopenharmony_ci if (block_group->length > 0) 31762306a36Sopenharmony_ci found->full = 0; 31862306a36Sopenharmony_ci btrfs_try_granting_tickets(info, found); 31962306a36Sopenharmony_ci spin_unlock(&found->lock); 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci block_group->space_info = found; 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci index = btrfs_bg_flags_to_raid_index(block_group->flags); 32462306a36Sopenharmony_ci down_write(&found->groups_sem); 32562306a36Sopenharmony_ci list_add_tail(&block_group->list, &found->block_groups[index]); 32662306a36Sopenharmony_ci up_write(&found->groups_sem); 32762306a36Sopenharmony_ci} 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_cistruct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 33062306a36Sopenharmony_ci u64 flags) 33162306a36Sopenharmony_ci{ 
33262306a36Sopenharmony_ci struct list_head *head = &info->space_info; 33362306a36Sopenharmony_ci struct btrfs_space_info *found; 33462306a36Sopenharmony_ci 33562306a36Sopenharmony_ci flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ci list_for_each_entry(found, head, list) { 33862306a36Sopenharmony_ci if (found->flags & flags) 33962306a36Sopenharmony_ci return found; 34062306a36Sopenharmony_ci } 34162306a36Sopenharmony_ci return NULL; 34262306a36Sopenharmony_ci} 34362306a36Sopenharmony_ci 34462306a36Sopenharmony_cistatic u64 calc_available_free_space(struct btrfs_fs_info *fs_info, 34562306a36Sopenharmony_ci struct btrfs_space_info *space_info, 34662306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush) 34762306a36Sopenharmony_ci{ 34862306a36Sopenharmony_ci u64 profile; 34962306a36Sopenharmony_ci u64 avail; 35062306a36Sopenharmony_ci int factor; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) 35362306a36Sopenharmony_ci profile = btrfs_system_alloc_profile(fs_info); 35462306a36Sopenharmony_ci else 35562306a36Sopenharmony_ci profile = btrfs_metadata_alloc_profile(fs_info); 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci avail = atomic64_read(&fs_info->free_chunk_space); 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci /* 36062306a36Sopenharmony_ci * If we have dup, raid1 or raid10 then only half of the free 36162306a36Sopenharmony_ci * space is actually usable. 
For raid56, the space info used 36262306a36Sopenharmony_ci * doesn't include the parity drive, so we don't have to 36362306a36Sopenharmony_ci * change the math 36462306a36Sopenharmony_ci */ 36562306a36Sopenharmony_ci factor = btrfs_bg_type_to_factor(profile); 36662306a36Sopenharmony_ci avail = div_u64(avail, factor); 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci /* 36962306a36Sopenharmony_ci * If we aren't flushing all things, let us overcommit up to 37062306a36Sopenharmony_ci * 1/2th of the space. If we can flush, don't let us overcommit 37162306a36Sopenharmony_ci * too much, let it overcommit up to 1/8 of the space. 37262306a36Sopenharmony_ci */ 37362306a36Sopenharmony_ci if (flush == BTRFS_RESERVE_FLUSH_ALL) 37462306a36Sopenharmony_ci avail >>= 3; 37562306a36Sopenharmony_ci else 37662306a36Sopenharmony_ci avail >>= 1; 37762306a36Sopenharmony_ci return avail; 37862306a36Sopenharmony_ci} 37962306a36Sopenharmony_ci 38062306a36Sopenharmony_ciint btrfs_can_overcommit(struct btrfs_fs_info *fs_info, 38162306a36Sopenharmony_ci struct btrfs_space_info *space_info, u64 bytes, 38262306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush) 38362306a36Sopenharmony_ci{ 38462306a36Sopenharmony_ci u64 avail; 38562306a36Sopenharmony_ci u64 used; 38662306a36Sopenharmony_ci 38762306a36Sopenharmony_ci /* Don't overcommit when in mixed mode */ 38862306a36Sopenharmony_ci if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 38962306a36Sopenharmony_ci return 0; 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci used = btrfs_space_info_used(space_info, true); 39262306a36Sopenharmony_ci avail = calc_available_free_space(fs_info, space_info, flush); 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_ci if (used + bytes < space_info->total_bytes + avail) 39562306a36Sopenharmony_ci return 1; 39662306a36Sopenharmony_ci return 0; 39762306a36Sopenharmony_ci} 39862306a36Sopenharmony_ci 39962306a36Sopenharmony_cistatic void remove_ticket(struct btrfs_space_info *space_info, 
40062306a36Sopenharmony_ci struct reserve_ticket *ticket) 40162306a36Sopenharmony_ci{ 40262306a36Sopenharmony_ci if (!list_empty(&ticket->list)) { 40362306a36Sopenharmony_ci list_del_init(&ticket->list); 40462306a36Sopenharmony_ci ASSERT(space_info->reclaim_size >= ticket->bytes); 40562306a36Sopenharmony_ci space_info->reclaim_size -= ticket->bytes; 40662306a36Sopenharmony_ci } 40762306a36Sopenharmony_ci} 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci/* 41062306a36Sopenharmony_ci * This is for space we already have accounted in space_info->bytes_may_use, so 41162306a36Sopenharmony_ci * basically when we're returning space from block_rsv's. 41262306a36Sopenharmony_ci */ 41362306a36Sopenharmony_civoid btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, 41462306a36Sopenharmony_ci struct btrfs_space_info *space_info) 41562306a36Sopenharmony_ci{ 41662306a36Sopenharmony_ci struct list_head *head; 41762306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci lockdep_assert_held(&space_info->lock); 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_ci head = &space_info->priority_tickets; 42262306a36Sopenharmony_ciagain: 42362306a36Sopenharmony_ci while (!list_empty(head)) { 42462306a36Sopenharmony_ci struct reserve_ticket *ticket; 42562306a36Sopenharmony_ci u64 used = btrfs_space_info_used(space_info, true); 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci ticket = list_first_entry(head, struct reserve_ticket, list); 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_ci /* Check and see if our ticket can be satisfied now. 
*/ 43062306a36Sopenharmony_ci if ((used + ticket->bytes <= space_info->total_bytes) || 43162306a36Sopenharmony_ci btrfs_can_overcommit(fs_info, space_info, ticket->bytes, 43262306a36Sopenharmony_ci flush)) { 43362306a36Sopenharmony_ci btrfs_space_info_update_bytes_may_use(fs_info, 43462306a36Sopenharmony_ci space_info, 43562306a36Sopenharmony_ci ticket->bytes); 43662306a36Sopenharmony_ci remove_ticket(space_info, ticket); 43762306a36Sopenharmony_ci ticket->bytes = 0; 43862306a36Sopenharmony_ci space_info->tickets_id++; 43962306a36Sopenharmony_ci wake_up(&ticket->wait); 44062306a36Sopenharmony_ci } else { 44162306a36Sopenharmony_ci break; 44262306a36Sopenharmony_ci } 44362306a36Sopenharmony_ci } 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci if (head == &space_info->priority_tickets) { 44662306a36Sopenharmony_ci head = &space_info->tickets; 44762306a36Sopenharmony_ci flush = BTRFS_RESERVE_FLUSH_ALL; 44862306a36Sopenharmony_ci goto again; 44962306a36Sopenharmony_ci } 45062306a36Sopenharmony_ci} 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 45362306a36Sopenharmony_cido { \ 45462306a36Sopenharmony_ci struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 45562306a36Sopenharmony_ci spin_lock(&__rsv->lock); \ 45662306a36Sopenharmony_ci btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 45762306a36Sopenharmony_ci __rsv->size, __rsv->reserved); \ 45862306a36Sopenharmony_ci spin_unlock(&__rsv->lock); \ 45962306a36Sopenharmony_ci} while (0) 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_cistatic const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) 46262306a36Sopenharmony_ci{ 46362306a36Sopenharmony_ci switch (space_info->flags) { 46462306a36Sopenharmony_ci case BTRFS_BLOCK_GROUP_SYSTEM: 46562306a36Sopenharmony_ci return "SYSTEM"; 46662306a36Sopenharmony_ci case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: 46762306a36Sopenharmony_ci return "DATA+METADATA"; 
46862306a36Sopenharmony_ci case BTRFS_BLOCK_GROUP_DATA: 46962306a36Sopenharmony_ci return "DATA"; 47062306a36Sopenharmony_ci case BTRFS_BLOCK_GROUP_METADATA: 47162306a36Sopenharmony_ci return "METADATA"; 47262306a36Sopenharmony_ci default: 47362306a36Sopenharmony_ci return "UNKNOWN"; 47462306a36Sopenharmony_ci } 47562306a36Sopenharmony_ci} 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_cistatic void dump_global_block_rsv(struct btrfs_fs_info *fs_info) 47862306a36Sopenharmony_ci{ 47962306a36Sopenharmony_ci DUMP_BLOCK_RSV(fs_info, global_block_rsv); 48062306a36Sopenharmony_ci DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 48162306a36Sopenharmony_ci DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 48262306a36Sopenharmony_ci DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 48362306a36Sopenharmony_ci DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 48462306a36Sopenharmony_ci} 48562306a36Sopenharmony_ci 48662306a36Sopenharmony_cistatic void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 48762306a36Sopenharmony_ci struct btrfs_space_info *info) 48862306a36Sopenharmony_ci{ 48962306a36Sopenharmony_ci const char *flag_str = space_info_flag_to_str(info); 49062306a36Sopenharmony_ci lockdep_assert_held(&info->lock); 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_ci /* The free space could be negative in case of overcommit */ 49362306a36Sopenharmony_ci btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", 49462306a36Sopenharmony_ci flag_str, 49562306a36Sopenharmony_ci (s64)(info->total_bytes - btrfs_space_info_used(info, true)), 49662306a36Sopenharmony_ci info->full ? 
"" : "not "); 49762306a36Sopenharmony_ci btrfs_info(fs_info, 49862306a36Sopenharmony_ci"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", 49962306a36Sopenharmony_ci info->total_bytes, info->bytes_used, info->bytes_pinned, 50062306a36Sopenharmony_ci info->bytes_reserved, info->bytes_may_use, 50162306a36Sopenharmony_ci info->bytes_readonly, info->bytes_zone_unusable); 50262306a36Sopenharmony_ci} 50362306a36Sopenharmony_ci 50462306a36Sopenharmony_civoid btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 50562306a36Sopenharmony_ci struct btrfs_space_info *info, u64 bytes, 50662306a36Sopenharmony_ci int dump_block_groups) 50762306a36Sopenharmony_ci{ 50862306a36Sopenharmony_ci struct btrfs_block_group *cache; 50962306a36Sopenharmony_ci u64 total_avail = 0; 51062306a36Sopenharmony_ci int index = 0; 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_ci spin_lock(&info->lock); 51362306a36Sopenharmony_ci __btrfs_dump_space_info(fs_info, info); 51462306a36Sopenharmony_ci dump_global_block_rsv(fs_info); 51562306a36Sopenharmony_ci spin_unlock(&info->lock); 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci if (!dump_block_groups) 51862306a36Sopenharmony_ci return; 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci down_read(&info->groups_sem); 52162306a36Sopenharmony_ciagain: 52262306a36Sopenharmony_ci list_for_each_entry(cache, &info->block_groups[index], list) { 52362306a36Sopenharmony_ci u64 avail; 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci spin_lock(&cache->lock); 52662306a36Sopenharmony_ci avail = cache->length - cache->used - cache->pinned - 52762306a36Sopenharmony_ci cache->reserved - cache->delalloc_bytes - 52862306a36Sopenharmony_ci cache->bytes_super - cache->zone_unusable; 52962306a36Sopenharmony_ci btrfs_info(fs_info, 53062306a36Sopenharmony_ci"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", 
53162306a36Sopenharmony_ci cache->start, cache->length, cache->used, cache->pinned, 53262306a36Sopenharmony_ci cache->reserved, cache->delalloc_bytes, 53362306a36Sopenharmony_ci cache->bytes_super, cache->zone_unusable, 53462306a36Sopenharmony_ci avail, cache->ro ? "[readonly]" : ""); 53562306a36Sopenharmony_ci spin_unlock(&cache->lock); 53662306a36Sopenharmony_ci btrfs_dump_free_space(cache, bytes); 53762306a36Sopenharmony_ci total_avail += avail; 53862306a36Sopenharmony_ci } 53962306a36Sopenharmony_ci if (++index < BTRFS_NR_RAID_TYPES) 54062306a36Sopenharmony_ci goto again; 54162306a36Sopenharmony_ci up_read(&info->groups_sem); 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_ci btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); 54462306a36Sopenharmony_ci} 54562306a36Sopenharmony_ci 54662306a36Sopenharmony_cistatic inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, 54762306a36Sopenharmony_ci u64 to_reclaim) 54862306a36Sopenharmony_ci{ 54962306a36Sopenharmony_ci u64 bytes; 55062306a36Sopenharmony_ci u64 nr; 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_ci bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 55362306a36Sopenharmony_ci nr = div64_u64(to_reclaim, bytes); 55462306a36Sopenharmony_ci if (!nr) 55562306a36Sopenharmony_ci nr = 1; 55662306a36Sopenharmony_ci return nr; 55762306a36Sopenharmony_ci} 55862306a36Sopenharmony_ci 55962306a36Sopenharmony_cistatic inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, 56062306a36Sopenharmony_ci u64 to_reclaim) 56162306a36Sopenharmony_ci{ 56262306a36Sopenharmony_ci const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); 56362306a36Sopenharmony_ci u64 nr; 56462306a36Sopenharmony_ci 56562306a36Sopenharmony_ci nr = div64_u64(to_reclaim, bytes); 56662306a36Sopenharmony_ci if (!nr) 56762306a36Sopenharmony_ci nr = 1; 56862306a36Sopenharmony_ci return nr; 56962306a36Sopenharmony_ci} 57062306a36Sopenharmony_ci 57162306a36Sopenharmony_ci#define 
EXTENT_SIZE_PER_ITEM SZ_256K 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci/* 57462306a36Sopenharmony_ci * shrink metadata reservation for delalloc 57562306a36Sopenharmony_ci */ 57662306a36Sopenharmony_cistatic void shrink_delalloc(struct btrfs_fs_info *fs_info, 57762306a36Sopenharmony_ci struct btrfs_space_info *space_info, 57862306a36Sopenharmony_ci u64 to_reclaim, bool wait_ordered, 57962306a36Sopenharmony_ci bool for_preempt) 58062306a36Sopenharmony_ci{ 58162306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 58262306a36Sopenharmony_ci u64 delalloc_bytes; 58362306a36Sopenharmony_ci u64 ordered_bytes; 58462306a36Sopenharmony_ci u64 items; 58562306a36Sopenharmony_ci long time_left; 58662306a36Sopenharmony_ci int loops; 58762306a36Sopenharmony_ci 58862306a36Sopenharmony_ci delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); 58962306a36Sopenharmony_ci ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); 59062306a36Sopenharmony_ci if (delalloc_bytes == 0 && ordered_bytes == 0) 59162306a36Sopenharmony_ci return; 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_ci /* Calc the number of the pages we need flush for space reservation */ 59462306a36Sopenharmony_ci if (to_reclaim == U64_MAX) { 59562306a36Sopenharmony_ci items = U64_MAX; 59662306a36Sopenharmony_ci } else { 59762306a36Sopenharmony_ci /* 59862306a36Sopenharmony_ci * to_reclaim is set to however much metadata we need to 59962306a36Sopenharmony_ci * reclaim, but reclaiming that much data doesn't really track 60062306a36Sopenharmony_ci * exactly. What we really want to do is reclaim full inode's 60162306a36Sopenharmony_ci * worth of reservations, however that's not available to us 60262306a36Sopenharmony_ci * here. We will take a fraction of the delalloc bytes for our 60362306a36Sopenharmony_ci * flushing loops and hope for the best. 
Delalloc will expand 60462306a36Sopenharmony_ci * the amount we write to cover an entire dirty extent, which 60562306a36Sopenharmony_ci * will reclaim the metadata reservation for that range. If 60662306a36Sopenharmony_ci * it's not enough subsequent flush stages will be more 60762306a36Sopenharmony_ci * aggressive. 60862306a36Sopenharmony_ci */ 60962306a36Sopenharmony_ci to_reclaim = max(to_reclaim, delalloc_bytes >> 3); 61062306a36Sopenharmony_ci items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; 61162306a36Sopenharmony_ci } 61262306a36Sopenharmony_ci 61362306a36Sopenharmony_ci trans = current->journal_info; 61462306a36Sopenharmony_ci 61562306a36Sopenharmony_ci /* 61662306a36Sopenharmony_ci * If we are doing more ordered than delalloc we need to just wait on 61762306a36Sopenharmony_ci * ordered extents, otherwise we'll waste time trying to flush delalloc 61862306a36Sopenharmony_ci * that likely won't give us the space back we need. 61962306a36Sopenharmony_ci */ 62062306a36Sopenharmony_ci if (ordered_bytes > delalloc_bytes && !for_preempt) 62162306a36Sopenharmony_ci wait_ordered = true; 62262306a36Sopenharmony_ci 62362306a36Sopenharmony_ci loops = 0; 62462306a36Sopenharmony_ci while ((delalloc_bytes || ordered_bytes) && loops < 3) { 62562306a36Sopenharmony_ci u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 62662306a36Sopenharmony_ci long nr_pages = min_t(u64, temp, LONG_MAX); 62762306a36Sopenharmony_ci int async_pages; 62862306a36Sopenharmony_ci 62962306a36Sopenharmony_ci btrfs_start_delalloc_roots(fs_info, nr_pages, true); 63062306a36Sopenharmony_ci 63162306a36Sopenharmony_ci /* 63262306a36Sopenharmony_ci * We need to make sure any outstanding async pages are now 63362306a36Sopenharmony_ci * processed before we continue. This is because things like 63462306a36Sopenharmony_ci * sync_inode() try to be smart and skip writing if the inode is 63562306a36Sopenharmony_ci * marked clean. 
We don't use filemap_fwrite for flushing 63662306a36Sopenharmony_ci * because we want to control how many pages we write out at a 63762306a36Sopenharmony_ci * time, thus this is the only safe way to make sure we've 63862306a36Sopenharmony_ci * waited for outstanding compressed workers to have started 63962306a36Sopenharmony_ci * their jobs and thus have ordered extents set up properly. 64062306a36Sopenharmony_ci * 64162306a36Sopenharmony_ci * This exists because we do not want to wait for each 64262306a36Sopenharmony_ci * individual inode to finish its async work, we simply want to 64362306a36Sopenharmony_ci * start the IO on everybody, and then come back here and wait 64462306a36Sopenharmony_ci * for all of the async work to catch up. Once we're done with 64562306a36Sopenharmony_ci * that we know we'll have ordered extents for everything and we 64662306a36Sopenharmony_ci * can decide if we wait for that or not. 64762306a36Sopenharmony_ci * 64862306a36Sopenharmony_ci * If we choose to replace this in the future, make absolutely 64962306a36Sopenharmony_ci * sure that the proper waiting is being done in the async case, 65062306a36Sopenharmony_ci * as there have been bugs in that area before. 65162306a36Sopenharmony_ci */ 65262306a36Sopenharmony_ci async_pages = atomic_read(&fs_info->async_delalloc_pages); 65362306a36Sopenharmony_ci if (!async_pages) 65462306a36Sopenharmony_ci goto skip_async; 65562306a36Sopenharmony_ci 65662306a36Sopenharmony_ci /* 65762306a36Sopenharmony_ci * We don't want to wait forever, if we wrote less pages in this 65862306a36Sopenharmony_ci * loop than we have outstanding, only wait for that number of 65962306a36Sopenharmony_ci * pages, otherwise we can wait for all async pages to finish 66062306a36Sopenharmony_ci * before continuing. 
66162306a36Sopenharmony_ci */ 66262306a36Sopenharmony_ci if (async_pages > nr_pages) 66362306a36Sopenharmony_ci async_pages -= nr_pages; 66462306a36Sopenharmony_ci else 66562306a36Sopenharmony_ci async_pages = 0; 66662306a36Sopenharmony_ci wait_event(fs_info->async_submit_wait, 66762306a36Sopenharmony_ci atomic_read(&fs_info->async_delalloc_pages) <= 66862306a36Sopenharmony_ci async_pages); 66962306a36Sopenharmony_ciskip_async: 67062306a36Sopenharmony_ci loops++; 67162306a36Sopenharmony_ci if (wait_ordered && !trans) { 67262306a36Sopenharmony_ci btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 67362306a36Sopenharmony_ci } else { 67462306a36Sopenharmony_ci time_left = schedule_timeout_killable(1); 67562306a36Sopenharmony_ci if (time_left) 67662306a36Sopenharmony_ci break; 67762306a36Sopenharmony_ci } 67862306a36Sopenharmony_ci 67962306a36Sopenharmony_ci /* 68062306a36Sopenharmony_ci * If we are for preemption we just want a one-shot of delalloc 68162306a36Sopenharmony_ci * flushing so we can stop flushing if we decide we don't need 68262306a36Sopenharmony_ci * to anymore. 
68362306a36Sopenharmony_ci */ 68462306a36Sopenharmony_ci if (for_preempt) 68562306a36Sopenharmony_ci break; 68662306a36Sopenharmony_ci 68762306a36Sopenharmony_ci spin_lock(&space_info->lock); 68862306a36Sopenharmony_ci if (list_empty(&space_info->tickets) && 68962306a36Sopenharmony_ci list_empty(&space_info->priority_tickets)) { 69062306a36Sopenharmony_ci spin_unlock(&space_info->lock); 69162306a36Sopenharmony_ci break; 69262306a36Sopenharmony_ci } 69362306a36Sopenharmony_ci spin_unlock(&space_info->lock); 69462306a36Sopenharmony_ci 69562306a36Sopenharmony_ci delalloc_bytes = percpu_counter_sum_positive( 69662306a36Sopenharmony_ci &fs_info->delalloc_bytes); 69762306a36Sopenharmony_ci ordered_bytes = percpu_counter_sum_positive( 69862306a36Sopenharmony_ci &fs_info->ordered_bytes); 69962306a36Sopenharmony_ci } 70062306a36Sopenharmony_ci} 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_ci/* 70362306a36Sopenharmony_ci * Try to flush some data based on policy set by @state. This is only advisory 70462306a36Sopenharmony_ci * and may fail for various reasons. The caller is supposed to examine the 70562306a36Sopenharmony_ci * state of @space_info to detect the outcome. 
70662306a36Sopenharmony_ci */ 70762306a36Sopenharmony_cistatic void flush_space(struct btrfs_fs_info *fs_info, 70862306a36Sopenharmony_ci struct btrfs_space_info *space_info, u64 num_bytes, 70962306a36Sopenharmony_ci enum btrfs_flush_state state, bool for_preempt) 71062306a36Sopenharmony_ci{ 71162306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 71262306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 71362306a36Sopenharmony_ci int nr; 71462306a36Sopenharmony_ci int ret = 0; 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_ci switch (state) { 71762306a36Sopenharmony_ci case FLUSH_DELAYED_ITEMS_NR: 71862306a36Sopenharmony_ci case FLUSH_DELAYED_ITEMS: 71962306a36Sopenharmony_ci if (state == FLUSH_DELAYED_ITEMS_NR) 72062306a36Sopenharmony_ci nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 72162306a36Sopenharmony_ci else 72262306a36Sopenharmony_ci nr = -1; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci trans = btrfs_join_transaction_nostart(root); 72562306a36Sopenharmony_ci if (IS_ERR(trans)) { 72662306a36Sopenharmony_ci ret = PTR_ERR(trans); 72762306a36Sopenharmony_ci if (ret == -ENOENT) 72862306a36Sopenharmony_ci ret = 0; 72962306a36Sopenharmony_ci break; 73062306a36Sopenharmony_ci } 73162306a36Sopenharmony_ci ret = btrfs_run_delayed_items_nr(trans, nr); 73262306a36Sopenharmony_ci btrfs_end_transaction(trans); 73362306a36Sopenharmony_ci break; 73462306a36Sopenharmony_ci case FLUSH_DELALLOC: 73562306a36Sopenharmony_ci case FLUSH_DELALLOC_WAIT: 73662306a36Sopenharmony_ci case FLUSH_DELALLOC_FULL: 73762306a36Sopenharmony_ci if (state == FLUSH_DELALLOC_FULL) 73862306a36Sopenharmony_ci num_bytes = U64_MAX; 73962306a36Sopenharmony_ci shrink_delalloc(fs_info, space_info, num_bytes, 74062306a36Sopenharmony_ci state != FLUSH_DELALLOC, for_preempt); 74162306a36Sopenharmony_ci break; 74262306a36Sopenharmony_ci case FLUSH_DELAYED_REFS_NR: 74362306a36Sopenharmony_ci case FLUSH_DELAYED_REFS: 74462306a36Sopenharmony_ci trans = 
btrfs_join_transaction_nostart(root); 74562306a36Sopenharmony_ci if (IS_ERR(trans)) { 74662306a36Sopenharmony_ci ret = PTR_ERR(trans); 74762306a36Sopenharmony_ci if (ret == -ENOENT) 74862306a36Sopenharmony_ci ret = 0; 74962306a36Sopenharmony_ci break; 75062306a36Sopenharmony_ci } 75162306a36Sopenharmony_ci if (state == FLUSH_DELAYED_REFS_NR) 75262306a36Sopenharmony_ci nr = calc_delayed_refs_nr(fs_info, num_bytes); 75362306a36Sopenharmony_ci else 75462306a36Sopenharmony_ci nr = 0; 75562306a36Sopenharmony_ci btrfs_run_delayed_refs(trans, nr); 75662306a36Sopenharmony_ci btrfs_end_transaction(trans); 75762306a36Sopenharmony_ci break; 75862306a36Sopenharmony_ci case ALLOC_CHUNK: 75962306a36Sopenharmony_ci case ALLOC_CHUNK_FORCE: 76062306a36Sopenharmony_ci trans = btrfs_join_transaction(root); 76162306a36Sopenharmony_ci if (IS_ERR(trans)) { 76262306a36Sopenharmony_ci ret = PTR_ERR(trans); 76362306a36Sopenharmony_ci break; 76462306a36Sopenharmony_ci } 76562306a36Sopenharmony_ci ret = btrfs_chunk_alloc(trans, 76662306a36Sopenharmony_ci btrfs_get_alloc_profile(fs_info, space_info->flags), 76762306a36Sopenharmony_ci (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : 76862306a36Sopenharmony_ci CHUNK_ALLOC_FORCE); 76962306a36Sopenharmony_ci btrfs_end_transaction(trans); 77062306a36Sopenharmony_ci 77162306a36Sopenharmony_ci if (ret > 0 || ret == -ENOSPC) 77262306a36Sopenharmony_ci ret = 0; 77362306a36Sopenharmony_ci break; 77462306a36Sopenharmony_ci case RUN_DELAYED_IPUTS: 77562306a36Sopenharmony_ci /* 77662306a36Sopenharmony_ci * If we have pending delayed iputs then we could free up a 77762306a36Sopenharmony_ci * bunch of pinned space, so make sure we run the iputs before 77862306a36Sopenharmony_ci * we do our pinned bytes check below. 
77962306a36Sopenharmony_ci */ 78062306a36Sopenharmony_ci btrfs_run_delayed_iputs(fs_info); 78162306a36Sopenharmony_ci btrfs_wait_on_delayed_iputs(fs_info); 78262306a36Sopenharmony_ci break; 78362306a36Sopenharmony_ci case COMMIT_TRANS: 78462306a36Sopenharmony_ci ASSERT(current->journal_info == NULL); 78562306a36Sopenharmony_ci /* 78662306a36Sopenharmony_ci * We don't want to start a new transaction, just attach to the 78762306a36Sopenharmony_ci * current one or wait it fully commits in case its commit is 78862306a36Sopenharmony_ci * happening at the moment. Note: we don't use a nostart join 78962306a36Sopenharmony_ci * because that does not wait for a transaction to fully commit 79062306a36Sopenharmony_ci * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). 79162306a36Sopenharmony_ci */ 79262306a36Sopenharmony_ci trans = btrfs_attach_transaction_barrier(root); 79362306a36Sopenharmony_ci if (IS_ERR(trans)) { 79462306a36Sopenharmony_ci ret = PTR_ERR(trans); 79562306a36Sopenharmony_ci if (ret == -ENOENT) 79662306a36Sopenharmony_ci ret = 0; 79762306a36Sopenharmony_ci break; 79862306a36Sopenharmony_ci } 79962306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 80062306a36Sopenharmony_ci break; 80162306a36Sopenharmony_ci default: 80262306a36Sopenharmony_ci ret = -ENOSPC; 80362306a36Sopenharmony_ci break; 80462306a36Sopenharmony_ci } 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_ci trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 80762306a36Sopenharmony_ci ret, for_preempt); 80862306a36Sopenharmony_ci return; 80962306a36Sopenharmony_ci} 81062306a36Sopenharmony_ci 81162306a36Sopenharmony_cistatic inline u64 81262306a36Sopenharmony_cibtrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 81362306a36Sopenharmony_ci struct btrfs_space_info *space_info) 81462306a36Sopenharmony_ci{ 81562306a36Sopenharmony_ci u64 used; 81662306a36Sopenharmony_ci u64 avail; 81762306a36Sopenharmony_ci u64 to_reclaim = space_info->reclaim_size; 
81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci lockdep_assert_held(&space_info->lock); 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci avail = calc_available_free_space(fs_info, space_info, 82262306a36Sopenharmony_ci BTRFS_RESERVE_FLUSH_ALL); 82362306a36Sopenharmony_ci used = btrfs_space_info_used(space_info, true); 82462306a36Sopenharmony_ci 82562306a36Sopenharmony_ci /* 82662306a36Sopenharmony_ci * We may be flushing because suddenly we have less space than we had 82762306a36Sopenharmony_ci * before, and now we're well over-committed based on our current free 82862306a36Sopenharmony_ci * space. If that's the case add in our overage so we make sure to put 82962306a36Sopenharmony_ci * appropriate pressure on the flushing state machine. 83062306a36Sopenharmony_ci */ 83162306a36Sopenharmony_ci if (space_info->total_bytes + avail < used) 83262306a36Sopenharmony_ci to_reclaim += used - (space_info->total_bytes + avail); 83362306a36Sopenharmony_ci 83462306a36Sopenharmony_ci return to_reclaim; 83562306a36Sopenharmony_ci} 83662306a36Sopenharmony_ci 83762306a36Sopenharmony_cistatic bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, 83862306a36Sopenharmony_ci struct btrfs_space_info *space_info) 83962306a36Sopenharmony_ci{ 84062306a36Sopenharmony_ci const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); 84162306a36Sopenharmony_ci u64 ordered, delalloc; 84262306a36Sopenharmony_ci u64 thresh; 84362306a36Sopenharmony_ci u64 used; 84462306a36Sopenharmony_ci 84562306a36Sopenharmony_ci thresh = mult_perc(space_info->total_bytes, 90); 84662306a36Sopenharmony_ci 84762306a36Sopenharmony_ci lockdep_assert_held(&space_info->lock); 84862306a36Sopenharmony_ci 84962306a36Sopenharmony_ci /* If we're just plain full then async reclaim just slows us down. 
*/ 85062306a36Sopenharmony_ci if ((space_info->bytes_used + space_info->bytes_reserved + 85162306a36Sopenharmony_ci global_rsv_size) >= thresh) 85262306a36Sopenharmony_ci return false; 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci used = space_info->bytes_may_use + space_info->bytes_pinned; 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci /* The total flushable belongs to the global rsv, don't flush. */ 85762306a36Sopenharmony_ci if (global_rsv_size >= used) 85862306a36Sopenharmony_ci return false; 85962306a36Sopenharmony_ci 86062306a36Sopenharmony_ci /* 86162306a36Sopenharmony_ci * 128MiB is 1/4 of the maximum global rsv size. If we have less than 86262306a36Sopenharmony_ci * that devoted to other reservations then there's no sense in flushing, 86362306a36Sopenharmony_ci * we don't have a lot of things that need flushing. 86462306a36Sopenharmony_ci */ 86562306a36Sopenharmony_ci if (used - global_rsv_size <= SZ_128M) 86662306a36Sopenharmony_ci return false; 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci /* 86962306a36Sopenharmony_ci * We have tickets queued, bail so we don't compete with the async 87062306a36Sopenharmony_ci * flushers. 87162306a36Sopenharmony_ci */ 87262306a36Sopenharmony_ci if (space_info->reclaim_size) 87362306a36Sopenharmony_ci return false; 87462306a36Sopenharmony_ci 87562306a36Sopenharmony_ci /* 87662306a36Sopenharmony_ci * If we have over half of the free space occupied by reservations or 87762306a36Sopenharmony_ci * pinned then we want to start flushing. 87862306a36Sopenharmony_ci * 87962306a36Sopenharmony_ci * We do not do the traditional thing here, which is to say 88062306a36Sopenharmony_ci * 88162306a36Sopenharmony_ci * if (used >= ((total_bytes + avail) / 2)) 88262306a36Sopenharmony_ci * return 1; 88362306a36Sopenharmony_ci * 88462306a36Sopenharmony_ci * because this doesn't quite work how we want. 
If we had more than 50% 88562306a36Sopenharmony_ci * of the space_info used by bytes_used and we had 0 available we'd just 88662306a36Sopenharmony_ci * constantly run the background flusher. Instead we want it to kick in 88762306a36Sopenharmony_ci * if our reclaimable space exceeds our clamped free space. 88862306a36Sopenharmony_ci * 88962306a36Sopenharmony_ci * Our clamping range is 2^1 -> 2^8. Practically speaking that means 89062306a36Sopenharmony_ci * the following: 89162306a36Sopenharmony_ci * 89262306a36Sopenharmony_ci * Amount of RAM Minimum threshold Maximum threshold 89362306a36Sopenharmony_ci * 89462306a36Sopenharmony_ci * 256GiB 1GiB 128GiB 89562306a36Sopenharmony_ci * 128GiB 512MiB 64GiB 89662306a36Sopenharmony_ci * 64GiB 256MiB 32GiB 89762306a36Sopenharmony_ci * 32GiB 128MiB 16GiB 89862306a36Sopenharmony_ci * 16GiB 64MiB 8GiB 89962306a36Sopenharmony_ci * 90062306a36Sopenharmony_ci * These are the range our thresholds will fall in, corresponding to how 90162306a36Sopenharmony_ci * much delalloc we need for the background flusher to kick in. 90262306a36Sopenharmony_ci */ 90362306a36Sopenharmony_ci 90462306a36Sopenharmony_ci thresh = calc_available_free_space(fs_info, space_info, 90562306a36Sopenharmony_ci BTRFS_RESERVE_FLUSH_ALL); 90662306a36Sopenharmony_ci used = space_info->bytes_used + space_info->bytes_reserved + 90762306a36Sopenharmony_ci space_info->bytes_readonly + global_rsv_size; 90862306a36Sopenharmony_ci if (used < space_info->total_bytes) 90962306a36Sopenharmony_ci thresh += space_info->total_bytes - used; 91062306a36Sopenharmony_ci thresh >>= space_info->clamp; 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ci used = space_info->bytes_pinned; 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci /* 91562306a36Sopenharmony_ci * If we have more ordered bytes than delalloc bytes then we're either 91662306a36Sopenharmony_ci * doing a lot of DIO, or we simply don't have a lot of delalloc waiting 91762306a36Sopenharmony_ci * around. 
Preemptive flushing is only useful in that it can free up 91862306a36Sopenharmony_ci * space before tickets need to wait for things to finish. In the case 91962306a36Sopenharmony_ci * of ordered extents, preemptively waiting on ordered extents gets us 92062306a36Sopenharmony_ci * nothing, if our reservations are tied up in ordered extents we'll 92162306a36Sopenharmony_ci * simply have to slow down writers by forcing them to wait on ordered 92262306a36Sopenharmony_ci * extents. 92362306a36Sopenharmony_ci * 92462306a36Sopenharmony_ci * In the case that ordered is larger than delalloc, only include the 92562306a36Sopenharmony_ci * block reserves that we would actually be able to directly reclaim 92662306a36Sopenharmony_ci * from. In this case if we're heavy on metadata operations this will 92762306a36Sopenharmony_ci * clearly be heavy enough to warrant preemptive flushing. In the case 92862306a36Sopenharmony_ci * of heavy DIO or ordered reservations, preemptive flushing will just 92962306a36Sopenharmony_ci * waste time and cause us to slow down. 93062306a36Sopenharmony_ci * 93162306a36Sopenharmony_ci * We want to make sure we truly are maxed out on ordered however, so 93262306a36Sopenharmony_ci * cut ordered in half, and if it's still higher than delalloc then we 93362306a36Sopenharmony_ci * can keep flushing. This is to avoid the case where we start 93462306a36Sopenharmony_ci * flushing, and now delalloc == ordered and we stop preemptively 93562306a36Sopenharmony_ci * flushing when we could still have several gigs of delalloc to flush. 
93662306a36Sopenharmony_ci */ 93762306a36Sopenharmony_ci ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; 93862306a36Sopenharmony_ci delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); 93962306a36Sopenharmony_ci if (ordered >= delalloc) 94062306a36Sopenharmony_ci used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) + 94162306a36Sopenharmony_ci btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv); 94262306a36Sopenharmony_ci else 94362306a36Sopenharmony_ci used += space_info->bytes_may_use - global_rsv_size; 94462306a36Sopenharmony_ci 94562306a36Sopenharmony_ci return (used >= thresh && !btrfs_fs_closing(fs_info) && 94662306a36Sopenharmony_ci !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 94762306a36Sopenharmony_ci} 94862306a36Sopenharmony_ci 94962306a36Sopenharmony_cistatic bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, 95062306a36Sopenharmony_ci struct btrfs_space_info *space_info, 95162306a36Sopenharmony_ci struct reserve_ticket *ticket) 95262306a36Sopenharmony_ci{ 95362306a36Sopenharmony_ci struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 95462306a36Sopenharmony_ci u64 min_bytes; 95562306a36Sopenharmony_ci 95662306a36Sopenharmony_ci if (!ticket->steal) 95762306a36Sopenharmony_ci return false; 95862306a36Sopenharmony_ci 95962306a36Sopenharmony_ci if (global_rsv->space_info != space_info) 96062306a36Sopenharmony_ci return false; 96162306a36Sopenharmony_ci 96262306a36Sopenharmony_ci spin_lock(&global_rsv->lock); 96362306a36Sopenharmony_ci min_bytes = mult_perc(global_rsv->size, 10); 96462306a36Sopenharmony_ci if (global_rsv->reserved < min_bytes + ticket->bytes) { 96562306a36Sopenharmony_ci spin_unlock(&global_rsv->lock); 96662306a36Sopenharmony_ci return false; 96762306a36Sopenharmony_ci } 96862306a36Sopenharmony_ci global_rsv->reserved -= ticket->bytes; 96962306a36Sopenharmony_ci remove_ticket(space_info, ticket); 97062306a36Sopenharmony_ci ticket->bytes = 0; 
97162306a36Sopenharmony_ci wake_up(&ticket->wait); 97262306a36Sopenharmony_ci space_info->tickets_id++; 97362306a36Sopenharmony_ci if (global_rsv->reserved < global_rsv->size) 97462306a36Sopenharmony_ci global_rsv->full = 0; 97562306a36Sopenharmony_ci spin_unlock(&global_rsv->lock); 97662306a36Sopenharmony_ci 97762306a36Sopenharmony_ci return true; 97862306a36Sopenharmony_ci} 97962306a36Sopenharmony_ci 98062306a36Sopenharmony_ci/* 98162306a36Sopenharmony_ci * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets 98262306a36Sopenharmony_ci * @fs_info - fs_info for this fs 98362306a36Sopenharmony_ci * @space_info - the space info we were flushing 98462306a36Sopenharmony_ci * 98562306a36Sopenharmony_ci * We call this when we've exhausted our flushing ability and haven't made 98662306a36Sopenharmony_ci * progress in satisfying tickets. The reservation code handles tickets in 98762306a36Sopenharmony_ci * order, so if there is a large ticket first and then smaller ones we could 98862306a36Sopenharmony_ci * very well satisfy the smaller tickets. This will attempt to wake up any 98962306a36Sopenharmony_ci * tickets in the list to catch this case. 99062306a36Sopenharmony_ci * 99162306a36Sopenharmony_ci * This function returns true if it was able to make progress by clearing out 99262306a36Sopenharmony_ci * other tickets, or if it stumbles across a ticket that was smaller than the 99362306a36Sopenharmony_ci * first ticket. 
99462306a36Sopenharmony_ci */ 99562306a36Sopenharmony_cistatic bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, 99662306a36Sopenharmony_ci struct btrfs_space_info *space_info) 99762306a36Sopenharmony_ci{ 99862306a36Sopenharmony_ci struct reserve_ticket *ticket; 99962306a36Sopenharmony_ci u64 tickets_id = space_info->tickets_id; 100062306a36Sopenharmony_ci const bool aborted = BTRFS_FS_ERROR(fs_info); 100162306a36Sopenharmony_ci 100262306a36Sopenharmony_ci trace_btrfs_fail_all_tickets(fs_info, space_info); 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 100562306a36Sopenharmony_ci btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); 100662306a36Sopenharmony_ci __btrfs_dump_space_info(fs_info, space_info); 100762306a36Sopenharmony_ci } 100862306a36Sopenharmony_ci 100962306a36Sopenharmony_ci while (!list_empty(&space_info->tickets) && 101062306a36Sopenharmony_ci tickets_id == space_info->tickets_id) { 101162306a36Sopenharmony_ci ticket = list_first_entry(&space_info->tickets, 101262306a36Sopenharmony_ci struct reserve_ticket, list); 101362306a36Sopenharmony_ci 101462306a36Sopenharmony_ci if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) 101562306a36Sopenharmony_ci return true; 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 101862306a36Sopenharmony_ci btrfs_info(fs_info, "failing ticket with %llu bytes", 101962306a36Sopenharmony_ci ticket->bytes); 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ci remove_ticket(space_info, ticket); 102262306a36Sopenharmony_ci if (aborted) 102362306a36Sopenharmony_ci ticket->error = -EIO; 102462306a36Sopenharmony_ci else 102562306a36Sopenharmony_ci ticket->error = -ENOSPC; 102662306a36Sopenharmony_ci wake_up(&ticket->wait); 102762306a36Sopenharmony_ci 102862306a36Sopenharmony_ci /* 102962306a36Sopenharmony_ci * We're just throwing tickets away, so more flushing may not 
103062306a36Sopenharmony_ci * trip over btrfs_try_granting_tickets, so we need to call it 103162306a36Sopenharmony_ci * here to see if we can make progress with the next ticket in 103262306a36Sopenharmony_ci * the list. 103362306a36Sopenharmony_ci */ 103462306a36Sopenharmony_ci if (!aborted) 103562306a36Sopenharmony_ci btrfs_try_granting_tickets(fs_info, space_info); 103662306a36Sopenharmony_ci } 103762306a36Sopenharmony_ci return (tickets_id != space_info->tickets_id); 103862306a36Sopenharmony_ci} 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci/* 104162306a36Sopenharmony_ci * This is for normal flushers, we can wait all goddamned day if we want to. We 104262306a36Sopenharmony_ci * will loop and continuously try to flush as long as we are making progress. 104362306a36Sopenharmony_ci * We count progress as clearing off tickets each time we have to loop. 104462306a36Sopenharmony_ci */ 104562306a36Sopenharmony_cistatic void btrfs_async_reclaim_metadata_space(struct work_struct *work) 104662306a36Sopenharmony_ci{ 104762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info; 104862306a36Sopenharmony_ci struct btrfs_space_info *space_info; 104962306a36Sopenharmony_ci u64 to_reclaim; 105062306a36Sopenharmony_ci enum btrfs_flush_state flush_state; 105162306a36Sopenharmony_ci int commit_cycles = 0; 105262306a36Sopenharmony_ci u64 last_tickets_id; 105362306a36Sopenharmony_ci 105462306a36Sopenharmony_ci fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 105562306a36Sopenharmony_ci space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 105662306a36Sopenharmony_ci 105762306a36Sopenharmony_ci spin_lock(&space_info->lock); 105862306a36Sopenharmony_ci to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); 105962306a36Sopenharmony_ci if (!to_reclaim) { 106062306a36Sopenharmony_ci space_info->flush = 0; 106162306a36Sopenharmony_ci spin_unlock(&space_info->lock); 106262306a36Sopenharmony_ci return; 106362306a36Sopenharmony_ci } 
106462306a36Sopenharmony_ci last_tickets_id = space_info->tickets_id; 106562306a36Sopenharmony_ci spin_unlock(&space_info->lock); 106662306a36Sopenharmony_ci 106762306a36Sopenharmony_ci flush_state = FLUSH_DELAYED_ITEMS_NR; 106862306a36Sopenharmony_ci do { 106962306a36Sopenharmony_ci flush_space(fs_info, space_info, to_reclaim, flush_state, false); 107062306a36Sopenharmony_ci spin_lock(&space_info->lock); 107162306a36Sopenharmony_ci if (list_empty(&space_info->tickets)) { 107262306a36Sopenharmony_ci space_info->flush = 0; 107362306a36Sopenharmony_ci spin_unlock(&space_info->lock); 107462306a36Sopenharmony_ci return; 107562306a36Sopenharmony_ci } 107662306a36Sopenharmony_ci to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 107762306a36Sopenharmony_ci space_info); 107862306a36Sopenharmony_ci if (last_tickets_id == space_info->tickets_id) { 107962306a36Sopenharmony_ci flush_state++; 108062306a36Sopenharmony_ci } else { 108162306a36Sopenharmony_ci last_tickets_id = space_info->tickets_id; 108262306a36Sopenharmony_ci flush_state = FLUSH_DELAYED_ITEMS_NR; 108362306a36Sopenharmony_ci if (commit_cycles) 108462306a36Sopenharmony_ci commit_cycles--; 108562306a36Sopenharmony_ci } 108662306a36Sopenharmony_ci 108762306a36Sopenharmony_ci /* 108862306a36Sopenharmony_ci * We do not want to empty the system of delalloc unless we're 108962306a36Sopenharmony_ci * under heavy pressure, so allow one trip through the flushing 109062306a36Sopenharmony_ci * logic before we start doing a FLUSH_DELALLOC_FULL. 109162306a36Sopenharmony_ci */ 109262306a36Sopenharmony_ci if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) 109362306a36Sopenharmony_ci flush_state++; 109462306a36Sopenharmony_ci 109562306a36Sopenharmony_ci /* 109662306a36Sopenharmony_ci * We don't want to force a chunk allocation until we've tried 109762306a36Sopenharmony_ci * pretty hard to reclaim space. 
Think of the case where we 109862306a36Sopenharmony_ci * freed up a bunch of space and so have a lot of pinned space 109962306a36Sopenharmony_ci * to reclaim. We would rather use that than possibly create a 110062306a36Sopenharmony_ci * underutilized metadata chunk. So if this is our first run 110162306a36Sopenharmony_ci * through the flushing state machine skip ALLOC_CHUNK_FORCE and 110262306a36Sopenharmony_ci * commit the transaction. If nothing has changed the next go 110362306a36Sopenharmony_ci * around then we can force a chunk allocation. 110462306a36Sopenharmony_ci */ 110562306a36Sopenharmony_ci if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 110662306a36Sopenharmony_ci flush_state++; 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_ci if (flush_state > COMMIT_TRANS) { 110962306a36Sopenharmony_ci commit_cycles++; 111062306a36Sopenharmony_ci if (commit_cycles > 2) { 111162306a36Sopenharmony_ci if (maybe_fail_all_tickets(fs_info, space_info)) { 111262306a36Sopenharmony_ci flush_state = FLUSH_DELAYED_ITEMS_NR; 111362306a36Sopenharmony_ci commit_cycles--; 111462306a36Sopenharmony_ci } else { 111562306a36Sopenharmony_ci space_info->flush = 0; 111662306a36Sopenharmony_ci } 111762306a36Sopenharmony_ci } else { 111862306a36Sopenharmony_ci flush_state = FLUSH_DELAYED_ITEMS_NR; 111962306a36Sopenharmony_ci } 112062306a36Sopenharmony_ci } 112162306a36Sopenharmony_ci spin_unlock(&space_info->lock); 112262306a36Sopenharmony_ci } while (flush_state <= COMMIT_TRANS); 112362306a36Sopenharmony_ci} 112462306a36Sopenharmony_ci 112562306a36Sopenharmony_ci/* 112662306a36Sopenharmony_ci * This handles pre-flushing of metadata space before we get to the point that 112762306a36Sopenharmony_ci * we need to start blocking threads on tickets. 
The logic here is different
 * from the other flush paths because it doesn't rely on tickets to tell us how
 * much we need to flush, instead it attempts to keep us below the 80% full
 * watermark of space by flushing whichever reservation pool is currently the
 * largest.
 *
 * This is the work function for fs_info->preempt_reclaim_work.  It keeps
 * looping for as long as need_preemptive_reclaim() returns true, holding
 * space_info->lock while it sizes the pools and dropping it around
 * flush_space() and cond_resched().
 */
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv;
	struct btrfs_block_rsv *trans_rsv;
	/* Number of flush iterations performed, used to relax the clamp below. */
	int loops = 0;

	fs_info = container_of(work, struct btrfs_fs_info,
			       preempt_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	delayed_block_rsv = &fs_info->delayed_block_rsv;
	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	global_rsv = &fs_info->global_block_rsv;
	trans_rsv = &fs_info->trans_block_rsv;

	spin_lock(&space_info->lock);
	while (need_preemptive_reclaim(fs_info, space_info)) {
		enum btrfs_flush_state flush;
		u64 delalloc_size = 0;
		u64 to_reclaim, block_rsv_size;
		/*
		 * Read the global reserve once so that the value added to
		 * block_rsv_size is exactly the value subtracted again later.
		 */
		const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);

		loops++;

		/*
		 * We don't have a precise counter for the metadata being
		 * reserved for delalloc, so we'll approximate it by subtracting
		 * out the block rsv's space from the bytes_may_use.  If that
		 * amount is higher than the individual reserves, then we can
		 * assume it's tied up in delalloc reservations.
		 */
		block_rsv_size = global_rsv_size +
			btrfs_block_rsv_reserved(delayed_block_rsv) +
			btrfs_block_rsv_reserved(delayed_refs_rsv) +
			btrfs_block_rsv_reserved(trans_rsv);
		/* Guarded so the unsigned subtraction can not wrap. */
		if (block_rsv_size < space_info->bytes_may_use)
			delalloc_size = space_info->bytes_may_use - block_rsv_size;

		/*
		 * We don't want to include the global_rsv in our calculation,
		 * because that's space we can't touch.  Subtract it from the
		 * block_rsv_size for the next checks.
		 */
		block_rsv_size -= global_rsv_size;

		/*
		 * We really want to avoid flushing delalloc too much, as it
		 * could result in poor allocation patterns, so only flush it if
		 * it's larger than the rest of the pools combined.
		 *
		 * Otherwise pick, in order: COMMIT_TRANS when pinned bytes
		 * exceed the delayed items and delayed refs reserves combined,
		 * then whichever of those two reserves is the larger one.
		 */
		if (delalloc_size > block_rsv_size) {
			to_reclaim = delalloc_size;
			flush = FLUSH_DELALLOC;
		} else if (space_info->bytes_pinned >
			   (btrfs_block_rsv_reserved(delayed_block_rsv) +
			    btrfs_block_rsv_reserved(delayed_refs_rsv))) {
			to_reclaim = space_info->bytes_pinned;
			flush = COMMIT_TRANS;
		} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
			   btrfs_block_rsv_reserved(delayed_refs_rsv)) {
			to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
			flush = FLUSH_DELAYED_ITEMS_NR;
		} else {
			to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
			flush = FLUSH_DELAYED_REFS_NR;
		}

		spin_unlock(&space_info->lock);

		/*
		 * We don't want to reclaim everything, just a portion, so scale
		 * to_reclaim down to 1/4 of its value.  If that takes us down
		 * to 0, reclaim one item's worth instead.
		 */
		to_reclaim >>= 2;
		if (!to_reclaim)
			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
		/*
		 * NOTE(review): the final 'true' presumably marks this as a
		 * preemptive flush - confirm against flush_space().
		 */
		flush_space(fs_info, space_info, to_reclaim, flush, true);
		cond_resched();
		spin_lock(&space_info->lock);
	}

	/*
	 * We only went through once and reclaim_size is zero, back off our
	 * clamping.  The clamp never drops below 1.
	 */
	if (loops == 1 && !space_info->reclaim_size)
		space_info->clamp = max(1, space_info->clamp - 1);
	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
	spin_unlock(&space_info->lock);
}

/*
 * FLUSH_DELALLOC_WAIT:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent.  This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0, and
 *   thus have been truncated and freed up space.
But again this space is not
 *   immediately re-usable, it comes in the form of a delayed ref, which must be
 *   run and then the transaction must be committed.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by running the
 *   iputs.
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block groups,
 *   so if we now have space to allocate do the force chunk allocation.
 *
 * NOTE(review): the first entry of the table below is FLUSH_DELALLOC_FULL,
 * while the first heading of this comment names FLUSH_DELALLOC_WAIT - confirm
 * which one is intended.
 */
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_FULL,
	RUN_DELAYED_IPUTS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
};

/*
 * Work function for fs_info->async_data_reclaim_work.
 *
 * Operates on fs_info->data_sinfo.  As long as there are tickets on
 * space_info->tickets it first calls flush_space() with ALLOC_CHUNK_FORCE
 * until space_info->full is set, then walks data_flush_states.  The walk
 * restarts from the first state whenever space_info->tickets_id differs from
 * the value sampled on the previous pass, and advances to the next state when
 * it did not change.
 *
 * Every return path in this function clears space_info->flush first.
 */
static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	/* tickets_id as sampled on the previous pass, see above. */
	u64 last_tickets_id;
	/* Index into data_flush_states[]. */
	enum btrfs_flush_state flush_state = 0;

	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
	space_info = fs_info->data_sinfo;

	spin_lock(&space_info->lock);
	if (list_empty(&space_info->tickets)) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	/* Note that ->full is tested here without space_info->lock held. */
	while (!space_info->full) {
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		/* Something happened, fail everything and bail. */
		if (BTRFS_FS_ERROR(fs_info))
			goto aborted_fs;
		last_tickets_id = space_info->tickets_id;
		spin_unlock(&space_info->lock);
	}

	while (flush_state < ARRAY_SIZE(data_flush_states)) {
		flush_space(fs_info, space_info, U64_MAX,
			    data_flush_states[flush_state], false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = 0;
		}

		/*
		 * Ran out of states.  If the space is full, ask
		 * maybe_fail_all_tickets() whether another round is worth it
		 * (true restarts the walk, false stops flushing).  If it is not
		 * full, simply start over.
		 */
		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
			if (space_info->full) {
				if (maybe_fail_all_tickets(fs_info, space_info))
					flush_state = 0;
				else
					space_info->flush = 0;
			} else {
				flush_state = 0;
			}

			/* Something happened, fail everything and bail. */
			if (BTRFS_FS_ERROR(fs_info))
				goto aborted_fs;

		}
		spin_unlock(&space_info->lock);
	}
	return;

aborted_fs:
	/* Reached with space_info->lock held. */
	maybe_fail_all_tickets(fs_info, space_info);
	space_info->flush = 0;
	spin_unlock(&space_info->lock);
}

/* Bind the three reclaim work items embedded in @fs_info to their handlers. */
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
	INIT_WORK(&fs_info->preempt_reclaim_work,
		  btrfs_preempt_reclaim_metadata_space);
}

/*
 * States walked, in order, by priority_reclaim_metadata_space() for
 * BTRFS_RESERVE_FLUSH_LIMIT reservations (see handle_reserve_ticket()).
 */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

/*
 * States walked, in order, by priority_reclaim_metadata_space() for
 * BTRFS_RESERVE_FLUSH_EVICT reservations (see handle_reserve_ticket()).
 */
static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELALLOC_FULL,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/*
 * Synchronously flush on behalf of a single priority ticket.
 *
 * @fs_info:    the filesystem
 * @space_info: space info the ticket belongs to
 * @ticket:     the ticket we are trying to satisfy
 * @states:     flush states to run, in order
 * @states_nr:  number of entries in @states
 *
 * Each state is passed to flush_space() once, with space_info->lock dropped
 * around the call, and we return as soon as ticket->bytes reaches 0.  If the
 * ticket is still unsatisfied after the last state it is either failed (with
 * the fs error or -ENOSPC) or satisfied by steal_from_global_rsv().
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state = 0;

	spin_lock(&space_info->lock);
	/* Computed once up front and reused for every state below. */
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	/*
	 * This is the priority reclaim path, so to_reclaim could be >0 still
	 * because we may have only satisfied the priority tickets and still
	 * left non priority tickets on the list.  We would then have
	 * to_reclaim but ->bytes == 0.
	 */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (flush_state < states_nr) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
			    false);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	/*
	 * Attempt to steal from the global rsv if we can, except if the fs was
	 * turned into error mode due to a transaction abort when flushing space
	 * above, in that case fail with the abort error instead of returning
	 * success to the caller if we can steal from the global rsv - this is
	 * just to have the caller fail immediately instead of later when trying
	 * to modify the fs, making it easier to debug -ENOSPC problems.
	 */
	if (BTRFS_FS_ERROR(fs_info)) {
		ticket->error = BTRFS_FS_ERROR(fs_info);
		remove_ticket(space_info, ticket);
	} else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
		ticket->error = -ENOSPC;
		remove_ticket(space_info, ticket);
	}

	/*
	 * We must run try_granting_tickets here because we could be a large
	 * ticket in front of a smaller ticket that can now be satisfied with
	 * the available space.
	 */
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}

/*
 * Synchronously flush on behalf of a single priority data ticket.
 *
 * Keeps calling flush_space() with ALLOC_CHUNK_FORCE until either the ticket
 * is satisfied (ticket->bytes == 0) or space_info->full is set.  In the latter
 * case the ticket is failed with -ENOSPC and removed.
 */
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					struct reserve_ticket *ticket)
{
	spin_lock(&space_info->lock);

	/* We could have been granted before we got here. */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (!space_info->full) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	ticket->error = -ENOSPC;
	remove_ticket(space_info, ticket);
	/* Give the remaining tickets a chance now that ours is gone. */
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}

/*
 * Sleep (TASK_KILLABLE) until @ticket is either satisfied (ticket->bytes == 0)
 * or failed (ticket->error != 0).
 *
 * If prepare_to_wait_event() returns non-zero the ticket is removed from its
 * list and failed with -EINTR.
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		/*
		 * NOTE(review): a non-zero return presumably means a fatal
		 * signal is pending, given TASK_KILLABLE - confirm against the
		 * wait queue API.
		 */
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list.  After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket.  If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			remove_ticket(space_info, ticket);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/*
 * Do the appropriate flushing and waiting for a ticket.
 *
 * @fs_info:    the filesystem
 * @space_info: space info for the reservation
 * @ticket:     ticket for the reservation
 * @start_ns:   timestamp when the reservation started
 * @orig_bytes: amount of bytes originally reserved
 * @flush:      how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
150162306a36Sopenharmony_ci */ 150262306a36Sopenharmony_cistatic int handle_reserve_ticket(struct btrfs_fs_info *fs_info, 150362306a36Sopenharmony_ci struct btrfs_space_info *space_info, 150462306a36Sopenharmony_ci struct reserve_ticket *ticket, 150562306a36Sopenharmony_ci u64 start_ns, u64 orig_bytes, 150662306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush) 150762306a36Sopenharmony_ci{ 150862306a36Sopenharmony_ci int ret; 150962306a36Sopenharmony_ci 151062306a36Sopenharmony_ci switch (flush) { 151162306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_DATA: 151262306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_ALL: 151362306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_ALL_STEAL: 151462306a36Sopenharmony_ci wait_reserve_ticket(fs_info, space_info, ticket); 151562306a36Sopenharmony_ci break; 151662306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_LIMIT: 151762306a36Sopenharmony_ci priority_reclaim_metadata_space(fs_info, space_info, ticket, 151862306a36Sopenharmony_ci priority_flush_states, 151962306a36Sopenharmony_ci ARRAY_SIZE(priority_flush_states)); 152062306a36Sopenharmony_ci break; 152162306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_EVICT: 152262306a36Sopenharmony_ci priority_reclaim_metadata_space(fs_info, space_info, ticket, 152362306a36Sopenharmony_ci evict_flush_states, 152462306a36Sopenharmony_ci ARRAY_SIZE(evict_flush_states)); 152562306a36Sopenharmony_ci break; 152662306a36Sopenharmony_ci case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: 152762306a36Sopenharmony_ci priority_reclaim_data_space(fs_info, space_info, ticket); 152862306a36Sopenharmony_ci break; 152962306a36Sopenharmony_ci default: 153062306a36Sopenharmony_ci ASSERT(0); 153162306a36Sopenharmony_ci break; 153262306a36Sopenharmony_ci } 153362306a36Sopenharmony_ci 153462306a36Sopenharmony_ci ret = ticket->error; 153562306a36Sopenharmony_ci ASSERT(list_empty(&ticket->list)); 153662306a36Sopenharmony_ci /* 153762306a36Sopenharmony_ci * Check that we can't have an error set if the reservation succeeded, 
153862306a36Sopenharmony_ci * as that would confuse tasks and lead them to error out without 153962306a36Sopenharmony_ci * releasing reserved space (if an error happens the expectation is that 154062306a36Sopenharmony_ci * space wasn't reserved at all). 154162306a36Sopenharmony_ci */ 154262306a36Sopenharmony_ci ASSERT(!(ticket->bytes == 0 && ticket->error)); 154362306a36Sopenharmony_ci trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, 154462306a36Sopenharmony_ci start_ns, flush, ticket->error); 154562306a36Sopenharmony_ci return ret; 154662306a36Sopenharmony_ci} 154762306a36Sopenharmony_ci 154862306a36Sopenharmony_ci/* 154962306a36Sopenharmony_ci * This returns true if this flush state will go through the ordinary flushing 155062306a36Sopenharmony_ci * code. 155162306a36Sopenharmony_ci */ 155262306a36Sopenharmony_cistatic inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) 155362306a36Sopenharmony_ci{ 155462306a36Sopenharmony_ci return (flush == BTRFS_RESERVE_FLUSH_ALL) || 155562306a36Sopenharmony_ci (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); 155662306a36Sopenharmony_ci} 155762306a36Sopenharmony_ci 155862306a36Sopenharmony_cistatic inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, 155962306a36Sopenharmony_ci struct btrfs_space_info *space_info) 156062306a36Sopenharmony_ci{ 156162306a36Sopenharmony_ci u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); 156262306a36Sopenharmony_ci u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); 156362306a36Sopenharmony_ci 156462306a36Sopenharmony_ci /* 156562306a36Sopenharmony_ci * If we're heavy on ordered operations then clamping won't help us. 
We 156662306a36Sopenharmony_ci * need to clamp specifically to keep up with dirty'ing buffered 156762306a36Sopenharmony_ci * writers, because there's not a 1:1 correlation of writing delalloc 156862306a36Sopenharmony_ci * and freeing space, like there is with flushing delayed refs or 156962306a36Sopenharmony_ci * delayed nodes. If we're already more ordered than delalloc then 157062306a36Sopenharmony_ci * we're keeping up, otherwise we aren't and should probably clamp. 157162306a36Sopenharmony_ci */ 157262306a36Sopenharmony_ci if (ordered < delalloc) 157362306a36Sopenharmony_ci space_info->clamp = min(space_info->clamp + 1, 8); 157462306a36Sopenharmony_ci} 157562306a36Sopenharmony_ci 157662306a36Sopenharmony_cistatic inline bool can_steal(enum btrfs_reserve_flush_enum flush) 157762306a36Sopenharmony_ci{ 157862306a36Sopenharmony_ci return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || 157962306a36Sopenharmony_ci flush == BTRFS_RESERVE_FLUSH_EVICT); 158062306a36Sopenharmony_ci} 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci/* 158362306a36Sopenharmony_ci * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to 158462306a36Sopenharmony_ci * fail as quickly as possible. 158562306a36Sopenharmony_ci */ 158662306a36Sopenharmony_cistatic inline bool can_ticket(enum btrfs_reserve_flush_enum flush) 158762306a36Sopenharmony_ci{ 158862306a36Sopenharmony_ci return (flush != BTRFS_RESERVE_NO_FLUSH && 158962306a36Sopenharmony_ci flush != BTRFS_RESERVE_FLUSH_EMERGENCY); 159062306a36Sopenharmony_ci} 159162306a36Sopenharmony_ci 159262306a36Sopenharmony_ci/* 159362306a36Sopenharmony_ci * Try to reserve bytes from the block_rsv's space. 
*
 * @fs_info:    the filesystem
 * @space_info: space info we want to allocate from
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from @space_info.  If there is
 * not enough space it will make an attempt to flush out space to make room.
 * It will do this by flushing delalloc if possible or committing the
 * transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH or
 * BTRFS_RESERVE_FLUSH_EMERGENCY no ticket is created (see can_ticket()), so no
 * attempts to regain reservations will be made and this will fail if there is
 * not enough space already.
 *
 * Return: 0 if the bytes were reserved, -ENOSPC if they were not and @flush
 * does not allow a ticket, otherwise the result of handle_reserve_ticket().
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *space_info, u64 orig_bytes,
			   enum btrfs_reserve_flush_enum flush)
{
	struct work_struct *async_work;
	/* Lives on our stack, it is linked into a space_info list below. */
	struct reserve_ticket ticket;
	u64 start_ns = 0;
	u64 used;
	int ret = -ENOSPC;
	bool pending_tickets;

	ASSERT(orig_bytes);
	/*
	 * If we have a transaction handle (current->journal_info != NULL), then
	 * the flush method must be neither BTRFS_RESERVE_FLUSH_ALL* nor
	 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
	 * flushing methods can trigger transaction commits.
	 */
	if (current->journal_info) {
		/* One assert per line for easier debugging. */
		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
		ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
	}

	/* Data reservations are serviced by their own async worker. */
	if (flush == BTRFS_RESERVE_FLUSH_DATA)
		async_work = &fs_info->async_data_reclaim_work;
	else
		async_work = &fs_info->async_reclaim_work;

	spin_lock(&space_info->lock);
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We don't want NO_FLUSH allocations to jump everybody, they can
	 * generally handle ENOSPC in a different way, so treat them the same as
	 * normal flushers when it comes to skipping pending tickets.
	 *
	 * All other flush modes only have to queue behind priority tickets.
	 */
	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
		pending_tickets = !list_empty(&space_info->tickets) ||
			!list_empty(&space_info->priority_tickets);
	else
		pending_tickets = !list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * Things are dire, we need to make a reservation so we don't abort.  We
	 * will let this reservation go through as long as we have actual space
	 * left to allocate for the block.
	 *
	 * Note that 'used' is recomputed with 'false' as the second argument
	 * here, unlike the 'true' used for the regular check above.
	 */
	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
		used = btrfs_space_info_used(space_info, false);
		if (used + orig_bytes <= space_info->total_bytes) {
			btrfs_space_info_update_bytes_may_use(fs_info, space_info,
							      orig_bytes);
			ret = 0;
		}
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && can_ticket(flush)) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		space_info->reclaim_size += ticket.bytes;
		init_waitqueue_head(&ticket.wait);
		ticket.steal = can_steal(flush);
		/* Only pay for the timestamp if the tracepoint is enabled. */
		if (trace_btrfs_reserve_ticket_enabled())
			start_ns = ktime_get_ns();

		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		    flush == BTRFS_RESERVE_FLUSH_DATA) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				/*
				 * We were forced to add a reserve ticket, so
				 * our preemptive flushing is unable to keep
				 * up.  Clamp down on the threshold for the
				 * preemptive flushing in order to keep up with
				 * the workload.
				 */
				maybe_clamp_preempt(fs_info, space_info);

				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq, async_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	/* '&' binds tighter than '&&': this tests !ret and the METADATA bit. */
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    !work_busy(&fs_info->preempt_reclaim_work) &&
		    need_preemptive_reclaim(fs_info, space_info)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->preempt_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	/* Either we got our bytes, or we are not allowed to wait for them. */
	if (!ret || !can_ticket(flush))
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
				     orig_bytes, flush);
}

/*
 * Try to reserve metadata bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @block_rsv:  block_rsv we're allocating for
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction.
If flush is 0 then no attempts to 175362306a36Sopenharmony_ci * regain reservations will be made and this will fail if there is not enough 175462306a36Sopenharmony_ci * space already. 175562306a36Sopenharmony_ci */ 175662306a36Sopenharmony_ciint btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 175762306a36Sopenharmony_ci struct btrfs_block_rsv *block_rsv, 175862306a36Sopenharmony_ci u64 orig_bytes, 175962306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush) 176062306a36Sopenharmony_ci{ 176162306a36Sopenharmony_ci int ret; 176262306a36Sopenharmony_ci 176362306a36Sopenharmony_ci ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); 176462306a36Sopenharmony_ci if (ret == -ENOSPC) { 176562306a36Sopenharmony_ci trace_btrfs_space_reservation(fs_info, "space_info:enospc", 176662306a36Sopenharmony_ci block_rsv->space_info->flags, 176762306a36Sopenharmony_ci orig_bytes, 1); 176862306a36Sopenharmony_ci 176962306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 177062306a36Sopenharmony_ci btrfs_dump_space_info(fs_info, block_rsv->space_info, 177162306a36Sopenharmony_ci orig_bytes, 0); 177262306a36Sopenharmony_ci } 177362306a36Sopenharmony_ci return ret; 177462306a36Sopenharmony_ci} 177562306a36Sopenharmony_ci 177662306a36Sopenharmony_ci/* 177762306a36Sopenharmony_ci * Try to reserve data bytes for an allocation. 177862306a36Sopenharmony_ci * 177962306a36Sopenharmony_ci * @fs_info: the filesystem 178062306a36Sopenharmony_ci * @bytes: number of bytes we need 178162306a36Sopenharmony_ci * @flush: how we are allowed to flush 178262306a36Sopenharmony_ci * 178362306a36Sopenharmony_ci * This will reserve bytes from the data space info. If there is not enough 178462306a36Sopenharmony_ci * space then we will attempt to flush space as specified by flush. 
178562306a36Sopenharmony_ci */ 178662306a36Sopenharmony_ciint btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, 178762306a36Sopenharmony_ci enum btrfs_reserve_flush_enum flush) 178862306a36Sopenharmony_ci{ 178962306a36Sopenharmony_ci struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 179062306a36Sopenharmony_ci int ret; 179162306a36Sopenharmony_ci 179262306a36Sopenharmony_ci ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || 179362306a36Sopenharmony_ci flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || 179462306a36Sopenharmony_ci flush == BTRFS_RESERVE_NO_FLUSH); 179562306a36Sopenharmony_ci ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); 179662306a36Sopenharmony_ci 179762306a36Sopenharmony_ci ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); 179862306a36Sopenharmony_ci if (ret == -ENOSPC) { 179962306a36Sopenharmony_ci trace_btrfs_space_reservation(fs_info, "space_info:enospc", 180062306a36Sopenharmony_ci data_sinfo->flags, bytes, 1); 180162306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 180262306a36Sopenharmony_ci btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); 180362306a36Sopenharmony_ci } 180462306a36Sopenharmony_ci return ret; 180562306a36Sopenharmony_ci} 180662306a36Sopenharmony_ci 180762306a36Sopenharmony_ci/* Dump all the space infos when we abort a transaction due to ENOSPC. 
*/ 180862306a36Sopenharmony_ci__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) 180962306a36Sopenharmony_ci{ 181062306a36Sopenharmony_ci struct btrfs_space_info *space_info; 181162306a36Sopenharmony_ci 181262306a36Sopenharmony_ci btrfs_info(fs_info, "dumping space info:"); 181362306a36Sopenharmony_ci list_for_each_entry(space_info, &fs_info->space_info, list) { 181462306a36Sopenharmony_ci spin_lock(&space_info->lock); 181562306a36Sopenharmony_ci __btrfs_dump_space_info(fs_info, space_info); 181662306a36Sopenharmony_ci spin_unlock(&space_info->lock); 181762306a36Sopenharmony_ci } 181862306a36Sopenharmony_ci dump_global_block_rsv(fs_info); 181962306a36Sopenharmony_ci} 182062306a36Sopenharmony_ci 182162306a36Sopenharmony_ci/* 182262306a36Sopenharmony_ci * Account the unused space of all the readonly block group in the space_info. 182362306a36Sopenharmony_ci * takes mirrors into account. 182462306a36Sopenharmony_ci */ 182562306a36Sopenharmony_ciu64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 182662306a36Sopenharmony_ci{ 182762306a36Sopenharmony_ci struct btrfs_block_group *block_group; 182862306a36Sopenharmony_ci u64 free_bytes = 0; 182962306a36Sopenharmony_ci int factor; 183062306a36Sopenharmony_ci 183162306a36Sopenharmony_ci /* It's df, we don't care if it's racy */ 183262306a36Sopenharmony_ci if (list_empty(&sinfo->ro_bgs)) 183362306a36Sopenharmony_ci return 0; 183462306a36Sopenharmony_ci 183562306a36Sopenharmony_ci spin_lock(&sinfo->lock); 183662306a36Sopenharmony_ci list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 183762306a36Sopenharmony_ci spin_lock(&block_group->lock); 183862306a36Sopenharmony_ci 183962306a36Sopenharmony_ci if (!block_group->ro) { 184062306a36Sopenharmony_ci spin_unlock(&block_group->lock); 184162306a36Sopenharmony_ci continue; 184262306a36Sopenharmony_ci } 184362306a36Sopenharmony_ci 184462306a36Sopenharmony_ci factor = btrfs_bg_type_to_factor(block_group->flags); 
184562306a36Sopenharmony_ci free_bytes += (block_group->length - 184662306a36Sopenharmony_ci block_group->used) * factor; 184762306a36Sopenharmony_ci 184862306a36Sopenharmony_ci spin_unlock(&block_group->lock); 184962306a36Sopenharmony_ci } 185062306a36Sopenharmony_ci spin_unlock(&sinfo->lock); 185162306a36Sopenharmony_ci 185262306a36Sopenharmony_ci return free_bytes; 185362306a36Sopenharmony_ci} 1854