162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci 362306a36Sopenharmony_ci#include <linux/sizes.h> 462306a36Sopenharmony_ci#include <linux/list_sort.h> 562306a36Sopenharmony_ci#include "misc.h" 662306a36Sopenharmony_ci#include "ctree.h" 762306a36Sopenharmony_ci#include "block-group.h" 862306a36Sopenharmony_ci#include "space-info.h" 962306a36Sopenharmony_ci#include "disk-io.h" 1062306a36Sopenharmony_ci#include "free-space-cache.h" 1162306a36Sopenharmony_ci#include "free-space-tree.h" 1262306a36Sopenharmony_ci#include "volumes.h" 1362306a36Sopenharmony_ci#include "transaction.h" 1462306a36Sopenharmony_ci#include "ref-verify.h" 1562306a36Sopenharmony_ci#include "sysfs.h" 1662306a36Sopenharmony_ci#include "tree-log.h" 1762306a36Sopenharmony_ci#include "delalloc-space.h" 1862306a36Sopenharmony_ci#include "discard.h" 1962306a36Sopenharmony_ci#include "raid56.h" 2062306a36Sopenharmony_ci#include "zoned.h" 2162306a36Sopenharmony_ci#include "fs.h" 2262306a36Sopenharmony_ci#include "accessors.h" 2362306a36Sopenharmony_ci#include "extent-tree.h" 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_ci#ifdef CONFIG_BTRFS_DEBUG 2662306a36Sopenharmony_ciint btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) 2762306a36Sopenharmony_ci{ 2862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_ci return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && 3162306a36Sopenharmony_ci block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || 3262306a36Sopenharmony_ci (btrfs_test_opt(fs_info, FRAGMENT_DATA) && 3362306a36Sopenharmony_ci block_group->flags & BTRFS_BLOCK_GROUP_DATA); 3462306a36Sopenharmony_ci} 3562306a36Sopenharmony_ci#endif 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci/* 3862306a36Sopenharmony_ci * Return target flags in extended format or 0 if restripe for this chunk_type 3962306a36Sopenharmony_ci * is not in progress 4062306a36Sopenharmony_ci * 
4162306a36Sopenharmony_ci * Should be called with balance_lock held 4262306a36Sopenharmony_ci */ 4362306a36Sopenharmony_cistatic u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4462306a36Sopenharmony_ci{ 4562306a36Sopenharmony_ci struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4662306a36Sopenharmony_ci u64 target = 0; 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci if (!bctl) 4962306a36Sopenharmony_ci return 0; 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA && 5262306a36Sopenharmony_ci bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 5362306a36Sopenharmony_ci target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 5462306a36Sopenharmony_ci } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 5562306a36Sopenharmony_ci bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 5662306a36Sopenharmony_ci target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 5762306a36Sopenharmony_ci } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 5862306a36Sopenharmony_ci bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 5962306a36Sopenharmony_ci target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 6062306a36Sopenharmony_ci } 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci return target; 6362306a36Sopenharmony_ci} 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci/* 6662306a36Sopenharmony_ci * @flags: available profiles in extended format (see ctree.h) 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * Return reduced profile in chunk format. If profile changing is in progress 6962306a36Sopenharmony_ci * (either running or paused) picks the target profile (if it's already 7062306a36Sopenharmony_ci * available), otherwise falls back to plain reducing. 
7162306a36Sopenharmony_ci */ 7262306a36Sopenharmony_cistatic u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 7362306a36Sopenharmony_ci{ 7462306a36Sopenharmony_ci u64 num_devices = fs_info->fs_devices->rw_devices; 7562306a36Sopenharmony_ci u64 target; 7662306a36Sopenharmony_ci u64 raid_type; 7762306a36Sopenharmony_ci u64 allowed = 0; 7862306a36Sopenharmony_ci 7962306a36Sopenharmony_ci /* 8062306a36Sopenharmony_ci * See if restripe for this chunk_type is in progress, if so try to 8162306a36Sopenharmony_ci * reduce to the target profile 8262306a36Sopenharmony_ci */ 8362306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 8462306a36Sopenharmony_ci target = get_restripe_target(fs_info, flags); 8562306a36Sopenharmony_ci if (target) { 8662306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 8762306a36Sopenharmony_ci return extended_to_chunk(target); 8862306a36Sopenharmony_ci } 8962306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci /* First, mask out the RAID levels which aren't possible */ 9262306a36Sopenharmony_ci for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 9362306a36Sopenharmony_ci if (num_devices >= btrfs_raid_array[raid_type].devs_min) 9462306a36Sopenharmony_ci allowed |= btrfs_raid_array[raid_type].bg_flag; 9562306a36Sopenharmony_ci } 9662306a36Sopenharmony_ci allowed &= flags; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci /* Select the highest-redundancy RAID level. 
*/ 9962306a36Sopenharmony_ci if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) 10062306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID1C4; 10162306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID6) 10262306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID6; 10362306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) 10462306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID1C3; 10562306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 10662306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID5; 10762306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 10862306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID10; 10962306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 11062306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID1; 11162306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_DUP) 11262306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_DUP; 11362306a36Sopenharmony_ci else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 11462306a36Sopenharmony_ci allowed = BTRFS_BLOCK_GROUP_RAID0; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci return extended_to_chunk(flags | allowed); 11962306a36Sopenharmony_ci} 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ciu64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 12262306a36Sopenharmony_ci{ 12362306a36Sopenharmony_ci unsigned seq; 12462306a36Sopenharmony_ci u64 flags; 12562306a36Sopenharmony_ci 12662306a36Sopenharmony_ci do { 12762306a36Sopenharmony_ci flags = orig_flags; 12862306a36Sopenharmony_ci seq = read_seqbegin(&fs_info->profiles_lock); 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA) 13162306a36Sopenharmony_ci flags |= fs_info->avail_data_alloc_bits; 13262306a36Sopenharmony_ci else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 13362306a36Sopenharmony_ci flags |= fs_info->avail_system_alloc_bits; 
13462306a36Sopenharmony_ci else if (flags & BTRFS_BLOCK_GROUP_METADATA) 13562306a36Sopenharmony_ci flags |= fs_info->avail_metadata_alloc_bits; 13662306a36Sopenharmony_ci } while (read_seqretry(&fs_info->profiles_lock, seq)); 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci return btrfs_reduce_alloc_profile(fs_info, flags); 13962306a36Sopenharmony_ci} 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_civoid btrfs_get_block_group(struct btrfs_block_group *cache) 14262306a36Sopenharmony_ci{ 14362306a36Sopenharmony_ci refcount_inc(&cache->refs); 14462306a36Sopenharmony_ci} 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_civoid btrfs_put_block_group(struct btrfs_block_group *cache) 14762306a36Sopenharmony_ci{ 14862306a36Sopenharmony_ci if (refcount_dec_and_test(&cache->refs)) { 14962306a36Sopenharmony_ci WARN_ON(cache->pinned > 0); 15062306a36Sopenharmony_ci /* 15162306a36Sopenharmony_ci * If there was a failure to cleanup a log tree, very likely due 15262306a36Sopenharmony_ci * to an IO failure on a writeback attempt of one or more of its 15362306a36Sopenharmony_ci * extent buffers, we could not do proper (and cheap) unaccounting 15462306a36Sopenharmony_ci * of their reserved space, so don't warn on reserved > 0 in that 15562306a36Sopenharmony_ci * case. 15662306a36Sopenharmony_ci */ 15762306a36Sopenharmony_ci if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || 15862306a36Sopenharmony_ci !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) 15962306a36Sopenharmony_ci WARN_ON(cache->reserved > 0); 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci /* 16262306a36Sopenharmony_ci * A block_group shouldn't be on the discard_list anymore. 16362306a36Sopenharmony_ci * Remove the block_group from the discard_list to prevent us 16462306a36Sopenharmony_ci * from causing a panic due to NULL pointer dereference. 
16562306a36Sopenharmony_ci */ 16662306a36Sopenharmony_ci if (WARN_ON(!list_empty(&cache->discard_list))) 16762306a36Sopenharmony_ci btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, 16862306a36Sopenharmony_ci cache); 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci kfree(cache->free_space_ctl); 17162306a36Sopenharmony_ci kfree(cache->physical_map); 17262306a36Sopenharmony_ci kfree(cache); 17362306a36Sopenharmony_ci } 17462306a36Sopenharmony_ci} 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_ci/* 17762306a36Sopenharmony_ci * This adds the block group to the fs_info rb tree for the block group cache 17862306a36Sopenharmony_ci */ 17962306a36Sopenharmony_cistatic int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 18062306a36Sopenharmony_ci struct btrfs_block_group *block_group) 18162306a36Sopenharmony_ci{ 18262306a36Sopenharmony_ci struct rb_node **p; 18362306a36Sopenharmony_ci struct rb_node *parent = NULL; 18462306a36Sopenharmony_ci struct btrfs_block_group *cache; 18562306a36Sopenharmony_ci bool leftmost = true; 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci ASSERT(block_group->length != 0); 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci write_lock(&info->block_group_cache_lock); 19062306a36Sopenharmony_ci p = &info->block_group_cache_tree.rb_root.rb_node; 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci while (*p) { 19362306a36Sopenharmony_ci parent = *p; 19462306a36Sopenharmony_ci cache = rb_entry(parent, struct btrfs_block_group, cache_node); 19562306a36Sopenharmony_ci if (block_group->start < cache->start) { 19662306a36Sopenharmony_ci p = &(*p)->rb_left; 19762306a36Sopenharmony_ci } else if (block_group->start > cache->start) { 19862306a36Sopenharmony_ci p = &(*p)->rb_right; 19962306a36Sopenharmony_ci leftmost = false; 20062306a36Sopenharmony_ci } else { 20162306a36Sopenharmony_ci write_unlock(&info->block_group_cache_lock); 20262306a36Sopenharmony_ci return -EEXIST; 20362306a36Sopenharmony_ci } 
20462306a36Sopenharmony_ci } 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci rb_link_node(&block_group->cache_node, parent, p); 20762306a36Sopenharmony_ci rb_insert_color_cached(&block_group->cache_node, 20862306a36Sopenharmony_ci &info->block_group_cache_tree, leftmost); 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci write_unlock(&info->block_group_cache_lock); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci return 0; 21362306a36Sopenharmony_ci} 21462306a36Sopenharmony_ci 21562306a36Sopenharmony_ci/* 21662306a36Sopenharmony_ci * This will return the block group at or after bytenr if contains is 0, else 21762306a36Sopenharmony_ci * it will return the block group that contains the bytenr 21862306a36Sopenharmony_ci */ 21962306a36Sopenharmony_cistatic struct btrfs_block_group *block_group_cache_tree_search( 22062306a36Sopenharmony_ci struct btrfs_fs_info *info, u64 bytenr, int contains) 22162306a36Sopenharmony_ci{ 22262306a36Sopenharmony_ci struct btrfs_block_group *cache, *ret = NULL; 22362306a36Sopenharmony_ci struct rb_node *n; 22462306a36Sopenharmony_ci u64 end, start; 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci read_lock(&info->block_group_cache_lock); 22762306a36Sopenharmony_ci n = info->block_group_cache_tree.rb_root.rb_node; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci while (n) { 23062306a36Sopenharmony_ci cache = rb_entry(n, struct btrfs_block_group, cache_node); 23162306a36Sopenharmony_ci end = cache->start + cache->length - 1; 23262306a36Sopenharmony_ci start = cache->start; 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci if (bytenr < start) { 23562306a36Sopenharmony_ci if (!contains && (!ret || start < ret->start)) 23662306a36Sopenharmony_ci ret = cache; 23762306a36Sopenharmony_ci n = n->rb_left; 23862306a36Sopenharmony_ci } else if (bytenr > start) { 23962306a36Sopenharmony_ci if (contains && bytenr <= end) { 24062306a36Sopenharmony_ci ret = cache; 24162306a36Sopenharmony_ci break; 24262306a36Sopenharmony_ci } 
24362306a36Sopenharmony_ci n = n->rb_right; 24462306a36Sopenharmony_ci } else { 24562306a36Sopenharmony_ci ret = cache; 24662306a36Sopenharmony_ci break; 24762306a36Sopenharmony_ci } 24862306a36Sopenharmony_ci } 24962306a36Sopenharmony_ci if (ret) 25062306a36Sopenharmony_ci btrfs_get_block_group(ret); 25162306a36Sopenharmony_ci read_unlock(&info->block_group_cache_lock); 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci return ret; 25462306a36Sopenharmony_ci} 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci/* 25762306a36Sopenharmony_ci * Return the block group that starts at or after bytenr 25862306a36Sopenharmony_ci */ 25962306a36Sopenharmony_cistruct btrfs_block_group *btrfs_lookup_first_block_group( 26062306a36Sopenharmony_ci struct btrfs_fs_info *info, u64 bytenr) 26162306a36Sopenharmony_ci{ 26262306a36Sopenharmony_ci return block_group_cache_tree_search(info, bytenr, 0); 26362306a36Sopenharmony_ci} 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci/* 26662306a36Sopenharmony_ci * Return the block group that contains the given bytenr 26762306a36Sopenharmony_ci */ 26862306a36Sopenharmony_cistruct btrfs_block_group *btrfs_lookup_block_group( 26962306a36Sopenharmony_ci struct btrfs_fs_info *info, u64 bytenr) 27062306a36Sopenharmony_ci{ 27162306a36Sopenharmony_ci return block_group_cache_tree_search(info, bytenr, 1); 27262306a36Sopenharmony_ci} 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_cistruct btrfs_block_group *btrfs_next_block_group( 27562306a36Sopenharmony_ci struct btrfs_block_group *cache) 27662306a36Sopenharmony_ci{ 27762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 27862306a36Sopenharmony_ci struct rb_node *node; 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci read_lock(&fs_info->block_group_cache_lock); 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci /* If our block group was removed, we need a full search. 
*/ 28362306a36Sopenharmony_ci if (RB_EMPTY_NODE(&cache->cache_node)) { 28462306a36Sopenharmony_ci const u64 next_bytenr = cache->start + cache->length; 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci read_unlock(&fs_info->block_group_cache_lock); 28762306a36Sopenharmony_ci btrfs_put_block_group(cache); 28862306a36Sopenharmony_ci return btrfs_lookup_first_block_group(fs_info, next_bytenr); 28962306a36Sopenharmony_ci } 29062306a36Sopenharmony_ci node = rb_next(&cache->cache_node); 29162306a36Sopenharmony_ci btrfs_put_block_group(cache); 29262306a36Sopenharmony_ci if (node) { 29362306a36Sopenharmony_ci cache = rb_entry(node, struct btrfs_block_group, cache_node); 29462306a36Sopenharmony_ci btrfs_get_block_group(cache); 29562306a36Sopenharmony_ci } else 29662306a36Sopenharmony_ci cache = NULL; 29762306a36Sopenharmony_ci read_unlock(&fs_info->block_group_cache_lock); 29862306a36Sopenharmony_ci return cache; 29962306a36Sopenharmony_ci} 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci/* 30262306a36Sopenharmony_ci * Check if we can do a NOCOW write for a given extent. 30362306a36Sopenharmony_ci * 30462306a36Sopenharmony_ci * @fs_info: The filesystem information object. 30562306a36Sopenharmony_ci * @bytenr: Logical start address of the extent. 30662306a36Sopenharmony_ci * 30762306a36Sopenharmony_ci * Check if we can do a NOCOW write for the given extent, and increments the 30862306a36Sopenharmony_ci * number of NOCOW writers in the block group that contains the extent, as long 30962306a36Sopenharmony_ci * as the block group exists and it's currently not in read-only mode. 31062306a36Sopenharmony_ci * 31162306a36Sopenharmony_ci * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller 31262306a36Sopenharmony_ci * is responsible for calling btrfs_dec_nocow_writers() later. 
31362306a36Sopenharmony_ci * 31462306a36Sopenharmony_ci * Or NULL if we can not do a NOCOW write 31562306a36Sopenharmony_ci */ 31662306a36Sopenharmony_cistruct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, 31762306a36Sopenharmony_ci u64 bytenr) 31862306a36Sopenharmony_ci{ 31962306a36Sopenharmony_ci struct btrfs_block_group *bg; 32062306a36Sopenharmony_ci bool can_nocow = true; 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_ci bg = btrfs_lookup_block_group(fs_info, bytenr); 32362306a36Sopenharmony_ci if (!bg) 32462306a36Sopenharmony_ci return NULL; 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci spin_lock(&bg->lock); 32762306a36Sopenharmony_ci if (bg->ro) 32862306a36Sopenharmony_ci can_nocow = false; 32962306a36Sopenharmony_ci else 33062306a36Sopenharmony_ci atomic_inc(&bg->nocow_writers); 33162306a36Sopenharmony_ci spin_unlock(&bg->lock); 33262306a36Sopenharmony_ci 33362306a36Sopenharmony_ci if (!can_nocow) { 33462306a36Sopenharmony_ci btrfs_put_block_group(bg); 33562306a36Sopenharmony_ci return NULL; 33662306a36Sopenharmony_ci } 33762306a36Sopenharmony_ci 33862306a36Sopenharmony_ci /* No put on block group, done by btrfs_dec_nocow_writers(). */ 33962306a36Sopenharmony_ci return bg; 34062306a36Sopenharmony_ci} 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_ci/* 34362306a36Sopenharmony_ci * Decrement the number of NOCOW writers in a block group. 34462306a36Sopenharmony_ci * 34562306a36Sopenharmony_ci * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), 34662306a36Sopenharmony_ci * and on the block group returned by that call. Typically this is called after 34762306a36Sopenharmony_ci * creating an ordered extent for a NOCOW write, to prevent races with scrub and 34862306a36Sopenharmony_ci * relocation. 34962306a36Sopenharmony_ci * 35062306a36Sopenharmony_ci * After this call, the caller should not use the block group anymore. 
It it wants 35162306a36Sopenharmony_ci * to use it, then it should get a reference on it before calling this function. 35262306a36Sopenharmony_ci */ 35362306a36Sopenharmony_civoid btrfs_dec_nocow_writers(struct btrfs_block_group *bg) 35462306a36Sopenharmony_ci{ 35562306a36Sopenharmony_ci if (atomic_dec_and_test(&bg->nocow_writers)) 35662306a36Sopenharmony_ci wake_up_var(&bg->nocow_writers); 35762306a36Sopenharmony_ci 35862306a36Sopenharmony_ci /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ 35962306a36Sopenharmony_ci btrfs_put_block_group(bg); 36062306a36Sopenharmony_ci} 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_civoid btrfs_wait_nocow_writers(struct btrfs_block_group *bg) 36362306a36Sopenharmony_ci{ 36462306a36Sopenharmony_ci wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 36562306a36Sopenharmony_ci} 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_civoid btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 36862306a36Sopenharmony_ci const u64 start) 36962306a36Sopenharmony_ci{ 37062306a36Sopenharmony_ci struct btrfs_block_group *bg; 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci bg = btrfs_lookup_block_group(fs_info, start); 37362306a36Sopenharmony_ci ASSERT(bg); 37462306a36Sopenharmony_ci if (atomic_dec_and_test(&bg->reservations)) 37562306a36Sopenharmony_ci wake_up_var(&bg->reservations); 37662306a36Sopenharmony_ci btrfs_put_block_group(bg); 37762306a36Sopenharmony_ci} 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_civoid btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) 38062306a36Sopenharmony_ci{ 38162306a36Sopenharmony_ci struct btrfs_space_info *space_info = bg->space_info; 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci ASSERT(bg->ro); 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 38662306a36Sopenharmony_ci return; 38762306a36Sopenharmony_ci 38862306a36Sopenharmony_ci /* 38962306a36Sopenharmony_ci * 
Our block group is read only but before we set it to read only, 39062306a36Sopenharmony_ci * some task might have had allocated an extent from it already, but it 39162306a36Sopenharmony_ci * has not yet created a respective ordered extent (and added it to a 39262306a36Sopenharmony_ci * root's list of ordered extents). 39362306a36Sopenharmony_ci * Therefore wait for any task currently allocating extents, since the 39462306a36Sopenharmony_ci * block group's reservations counter is incremented while a read lock 39562306a36Sopenharmony_ci * on the groups' semaphore is held and decremented after releasing 39662306a36Sopenharmony_ci * the read access on that semaphore and creating the ordered extent. 39762306a36Sopenharmony_ci */ 39862306a36Sopenharmony_ci down_write(&space_info->groups_sem); 39962306a36Sopenharmony_ci up_write(&space_info->groups_sem); 40062306a36Sopenharmony_ci 40162306a36Sopenharmony_ci wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 40262306a36Sopenharmony_ci} 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_cistruct btrfs_caching_control *btrfs_get_caching_control( 40562306a36Sopenharmony_ci struct btrfs_block_group *cache) 40662306a36Sopenharmony_ci{ 40762306a36Sopenharmony_ci struct btrfs_caching_control *ctl; 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci spin_lock(&cache->lock); 41062306a36Sopenharmony_ci if (!cache->caching_ctl) { 41162306a36Sopenharmony_ci spin_unlock(&cache->lock); 41262306a36Sopenharmony_ci return NULL; 41362306a36Sopenharmony_ci } 41462306a36Sopenharmony_ci 41562306a36Sopenharmony_ci ctl = cache->caching_ctl; 41662306a36Sopenharmony_ci refcount_inc(&ctl->count); 41762306a36Sopenharmony_ci spin_unlock(&cache->lock); 41862306a36Sopenharmony_ci return ctl; 41962306a36Sopenharmony_ci} 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_civoid btrfs_put_caching_control(struct btrfs_caching_control *ctl) 42262306a36Sopenharmony_ci{ 42362306a36Sopenharmony_ci if (refcount_dec_and_test(&ctl->count)) 
42462306a36Sopenharmony_ci kfree(ctl); 42562306a36Sopenharmony_ci} 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci/* 42862306a36Sopenharmony_ci * When we wait for progress in the block group caching, its because our 42962306a36Sopenharmony_ci * allocation attempt failed at least once. So, we must sleep and let some 43062306a36Sopenharmony_ci * progress happen before we try again. 43162306a36Sopenharmony_ci * 43262306a36Sopenharmony_ci * This function will sleep at least once waiting for new free space to show 43362306a36Sopenharmony_ci * up, and then it will check the block group free space numbers for our min 43462306a36Sopenharmony_ci * num_bytes. Another option is to have it go ahead and look in the rbtree for 43562306a36Sopenharmony_ci * a free extent of a given size, but this is a good start. 43662306a36Sopenharmony_ci * 43762306a36Sopenharmony_ci * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 43862306a36Sopenharmony_ci * any of the information in this block group. 43962306a36Sopenharmony_ci */ 44062306a36Sopenharmony_civoid btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, 44162306a36Sopenharmony_ci u64 num_bytes) 44262306a36Sopenharmony_ci{ 44362306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl; 44462306a36Sopenharmony_ci int progress; 44562306a36Sopenharmony_ci 44662306a36Sopenharmony_ci caching_ctl = btrfs_get_caching_control(cache); 44762306a36Sopenharmony_ci if (!caching_ctl) 44862306a36Sopenharmony_ci return; 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci /* 45162306a36Sopenharmony_ci * We've already failed to allocate from this block group, so even if 45262306a36Sopenharmony_ci * there's enough space in the block group it isn't contiguous enough to 45362306a36Sopenharmony_ci * allow for an allocation, so wait for at least the next wakeup tick, 45462306a36Sopenharmony_ci * or for the thing to be done. 
45562306a36Sopenharmony_ci */ 45662306a36Sopenharmony_ci progress = atomic_read(&caching_ctl->progress); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || 45962306a36Sopenharmony_ci (progress != atomic_read(&caching_ctl->progress) && 46062306a36Sopenharmony_ci (cache->free_space_ctl->free_space >= num_bytes))); 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 46362306a36Sopenharmony_ci} 46462306a36Sopenharmony_ci 46562306a36Sopenharmony_cistatic int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, 46662306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl) 46762306a36Sopenharmony_ci{ 46862306a36Sopenharmony_ci wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); 46962306a36Sopenharmony_ci return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; 47062306a36Sopenharmony_ci} 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_cistatic int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) 47362306a36Sopenharmony_ci{ 47462306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl; 47562306a36Sopenharmony_ci int ret; 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_ci caching_ctl = btrfs_get_caching_control(cache); 47862306a36Sopenharmony_ci if (!caching_ctl) 47962306a36Sopenharmony_ci return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 48062306a36Sopenharmony_ci ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); 48162306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 48262306a36Sopenharmony_ci return ret; 48362306a36Sopenharmony_ci} 48462306a36Sopenharmony_ci 48562306a36Sopenharmony_ci#ifdef CONFIG_BTRFS_DEBUG 48662306a36Sopenharmony_cistatic void fragment_free_space(struct btrfs_block_group *block_group) 48762306a36Sopenharmony_ci{ 48862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 48962306a36Sopenharmony_ci u64 start = block_group->start; 49062306a36Sopenharmony_ci u64 len = block_group->length; 49162306a36Sopenharmony_ci u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 49262306a36Sopenharmony_ci fs_info->nodesize : fs_info->sectorsize; 49362306a36Sopenharmony_ci u64 step = chunk << 1; 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci while (len > chunk) { 49662306a36Sopenharmony_ci btrfs_remove_free_space(block_group, start, chunk); 49762306a36Sopenharmony_ci start += step; 49862306a36Sopenharmony_ci if (len < step) 49962306a36Sopenharmony_ci len = 0; 50062306a36Sopenharmony_ci else 50162306a36Sopenharmony_ci len -= step; 50262306a36Sopenharmony_ci } 50362306a36Sopenharmony_ci} 50462306a36Sopenharmony_ci#endif 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci/* 50762306a36Sopenharmony_ci * Add a free space range to the in memory free space cache of a block group. 50862306a36Sopenharmony_ci * This checks if the range contains super block locations and any such 50962306a36Sopenharmony_ci * locations are not added to the free space cache. 51062306a36Sopenharmony_ci * 51162306a36Sopenharmony_ci * @block_group: The target block group. 51262306a36Sopenharmony_ci * @start: Start offset of the range. 51362306a36Sopenharmony_ci * @end: End offset of the range (exclusive). 
51462306a36Sopenharmony_ci * @total_added_ret: Optional pointer to return the total amount of space 51562306a36Sopenharmony_ci * added to the block group's free space cache. 51662306a36Sopenharmony_ci * 51762306a36Sopenharmony_ci * Returns 0 on success or < 0 on error. 51862306a36Sopenharmony_ci */ 51962306a36Sopenharmony_ciint btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, 52062306a36Sopenharmony_ci u64 end, u64 *total_added_ret) 52162306a36Sopenharmony_ci{ 52262306a36Sopenharmony_ci struct btrfs_fs_info *info = block_group->fs_info; 52362306a36Sopenharmony_ci u64 extent_start, extent_end, size; 52462306a36Sopenharmony_ci int ret; 52562306a36Sopenharmony_ci 52662306a36Sopenharmony_ci if (total_added_ret) 52762306a36Sopenharmony_ci *total_added_ret = 0; 52862306a36Sopenharmony_ci 52962306a36Sopenharmony_ci while (start < end) { 53062306a36Sopenharmony_ci if (!find_first_extent_bit(&info->excluded_extents, start, 53162306a36Sopenharmony_ci &extent_start, &extent_end, 53262306a36Sopenharmony_ci EXTENT_DIRTY | EXTENT_UPTODATE, 53362306a36Sopenharmony_ci NULL)) 53462306a36Sopenharmony_ci break; 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci if (extent_start <= start) { 53762306a36Sopenharmony_ci start = extent_end + 1; 53862306a36Sopenharmony_ci } else if (extent_start > start && extent_start < end) { 53962306a36Sopenharmony_ci size = extent_start - start; 54062306a36Sopenharmony_ci ret = btrfs_add_free_space_async_trimmed(block_group, 54162306a36Sopenharmony_ci start, size); 54262306a36Sopenharmony_ci if (ret) 54362306a36Sopenharmony_ci return ret; 54462306a36Sopenharmony_ci if (total_added_ret) 54562306a36Sopenharmony_ci *total_added_ret += size; 54662306a36Sopenharmony_ci start = extent_end + 1; 54762306a36Sopenharmony_ci } else { 54862306a36Sopenharmony_ci break; 54962306a36Sopenharmony_ci } 55062306a36Sopenharmony_ci } 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_ci if (start < end) { 55362306a36Sopenharmony_ci size = end - 
start; 55462306a36Sopenharmony_ci ret = btrfs_add_free_space_async_trimmed(block_group, start, 55562306a36Sopenharmony_ci size); 55662306a36Sopenharmony_ci if (ret) 55762306a36Sopenharmony_ci return ret; 55862306a36Sopenharmony_ci if (total_added_ret) 55962306a36Sopenharmony_ci *total_added_ret += size; 56062306a36Sopenharmony_ci } 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_ci return 0; 56362306a36Sopenharmony_ci} 56462306a36Sopenharmony_ci 56562306a36Sopenharmony_ci/* 56662306a36Sopenharmony_ci * Get an arbitrary extent item index / max_index through the block group 56762306a36Sopenharmony_ci * 56862306a36Sopenharmony_ci * @block_group the block group to sample from 56962306a36Sopenharmony_ci * @index: the integral step through the block group to grab from 57062306a36Sopenharmony_ci * @max_index: the granularity of the sampling 57162306a36Sopenharmony_ci * @key: return value parameter for the item we find 57262306a36Sopenharmony_ci * 57362306a36Sopenharmony_ci * Pre-conditions on indices: 57462306a36Sopenharmony_ci * 0 <= index <= max_index 57562306a36Sopenharmony_ci * 0 < max_index 57662306a36Sopenharmony_ci * 57762306a36Sopenharmony_ci * Returns: 0 on success, 1 if the search didn't yield a useful item, negative 57862306a36Sopenharmony_ci * error code on error. 
57962306a36Sopenharmony_ci */ 58062306a36Sopenharmony_cistatic int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, 58162306a36Sopenharmony_ci struct btrfs_block_group *block_group, 58262306a36Sopenharmony_ci int index, int max_index, 58362306a36Sopenharmony_ci struct btrfs_key *found_key) 58462306a36Sopenharmony_ci{ 58562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 58662306a36Sopenharmony_ci struct btrfs_root *extent_root; 58762306a36Sopenharmony_ci u64 search_offset; 58862306a36Sopenharmony_ci u64 search_end = block_group->start + block_group->length; 58962306a36Sopenharmony_ci struct btrfs_path *path; 59062306a36Sopenharmony_ci struct btrfs_key search_key; 59162306a36Sopenharmony_ci int ret = 0; 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_ci ASSERT(index >= 0); 59462306a36Sopenharmony_ci ASSERT(index <= max_index); 59562306a36Sopenharmony_ci ASSERT(max_index > 0); 59662306a36Sopenharmony_ci lockdep_assert_held(&caching_ctl->mutex); 59762306a36Sopenharmony_ci lockdep_assert_held_read(&fs_info->commit_root_sem); 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_ci path = btrfs_alloc_path(); 60062306a36Sopenharmony_ci if (!path) 60162306a36Sopenharmony_ci return -ENOMEM; 60262306a36Sopenharmony_ci 60362306a36Sopenharmony_ci extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, 60462306a36Sopenharmony_ci BTRFS_SUPER_INFO_OFFSET)); 60562306a36Sopenharmony_ci 60662306a36Sopenharmony_ci path->skip_locking = 1; 60762306a36Sopenharmony_ci path->search_commit_root = 1; 60862306a36Sopenharmony_ci path->reada = READA_FORWARD; 60962306a36Sopenharmony_ci 61062306a36Sopenharmony_ci search_offset = index * div_u64(block_group->length, max_index); 61162306a36Sopenharmony_ci search_key.objectid = block_group->start + search_offset; 61262306a36Sopenharmony_ci search_key.type = BTRFS_EXTENT_ITEM_KEY; 61362306a36Sopenharmony_ci search_key.offset = 0; 61462306a36Sopenharmony_ci 
61562306a36Sopenharmony_ci btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { 61662306a36Sopenharmony_ci /* Success; sampled an extent item in the block group */ 61762306a36Sopenharmony_ci if (found_key->type == BTRFS_EXTENT_ITEM_KEY && 61862306a36Sopenharmony_ci found_key->objectid >= block_group->start && 61962306a36Sopenharmony_ci found_key->objectid + found_key->offset <= search_end) 62062306a36Sopenharmony_ci break; 62162306a36Sopenharmony_ci 62262306a36Sopenharmony_ci /* We can't possibly find a valid extent item anymore */ 62362306a36Sopenharmony_ci if (found_key->objectid >= search_end) { 62462306a36Sopenharmony_ci ret = 1; 62562306a36Sopenharmony_ci break; 62662306a36Sopenharmony_ci } 62762306a36Sopenharmony_ci } 62862306a36Sopenharmony_ci 62962306a36Sopenharmony_ci lockdep_assert_held(&caching_ctl->mutex); 63062306a36Sopenharmony_ci lockdep_assert_held_read(&fs_info->commit_root_sem); 63162306a36Sopenharmony_ci btrfs_free_path(path); 63262306a36Sopenharmony_ci return ret; 63362306a36Sopenharmony_ci} 63462306a36Sopenharmony_ci 63562306a36Sopenharmony_ci/* 63662306a36Sopenharmony_ci * Best effort attempt to compute a block group's size class while caching it. 63762306a36Sopenharmony_ci * 63862306a36Sopenharmony_ci * @block_group: the block group we are caching 63962306a36Sopenharmony_ci * 64062306a36Sopenharmony_ci * We cannot infer the size class while adding free space extents, because that 64162306a36Sopenharmony_ci * logic doesn't care about contiguous file extents (it doesn't differentiate 64262306a36Sopenharmony_ci * between a 100M extent and 100 contiguous 1M extents). So we need to read the 64362306a36Sopenharmony_ci * file extent items. Reading all of them is quite wasteful, because usually 64462306a36Sopenharmony_ci * only a handful are enough to give a good answer. 
Therefore, we just grab 5 of 64562306a36Sopenharmony_ci * them at even steps through the block group and pick the smallest size class 64662306a36Sopenharmony_ci * we see. Since size class is best effort, and not guaranteed in general, 64762306a36Sopenharmony_ci * inaccuracy is acceptable. 64862306a36Sopenharmony_ci * 64962306a36Sopenharmony_ci * To be more explicit about why this algorithm makes sense: 65062306a36Sopenharmony_ci * 65162306a36Sopenharmony_ci * If we are caching in a block group from disk, then there are three major cases 65262306a36Sopenharmony_ci * to consider: 65362306a36Sopenharmony_ci * 1. the block group is well behaved and all extents in it are the same size 65462306a36Sopenharmony_ci * class. 65562306a36Sopenharmony_ci * 2. the block group is mostly one size class with rare exceptions for last 65662306a36Sopenharmony_ci * ditch allocations 65762306a36Sopenharmony_ci * 3. the block group was populated before size classes and can have a totally 65862306a36Sopenharmony_ci * arbitrary mix of size classes. 65962306a36Sopenharmony_ci * 66062306a36Sopenharmony_ci * In case 1, looking at any extent in the block group will yield the correct 66162306a36Sopenharmony_ci * result. For the mixed cases, taking the minimum size class seems like a good 66262306a36Sopenharmony_ci * approximation, since gaps from frees will be usable to the size class. For 66362306a36Sopenharmony_ci * 2., a small handful of file extents is likely to yield the right answer. For 66462306a36Sopenharmony_ci * 3, we can either read every file extent, or admit that this is best effort 66562306a36Sopenharmony_ci * anyway and try to stay fast. 66662306a36Sopenharmony_ci * 66762306a36Sopenharmony_ci * Returns: 0 on success, negative error code on error. 
66862306a36Sopenharmony_ci */ 66962306a36Sopenharmony_cistatic int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, 67062306a36Sopenharmony_ci struct btrfs_block_group *block_group) 67162306a36Sopenharmony_ci{ 67262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 67362306a36Sopenharmony_ci struct btrfs_key key; 67462306a36Sopenharmony_ci int i; 67562306a36Sopenharmony_ci u64 min_size = block_group->length; 67662306a36Sopenharmony_ci enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; 67762306a36Sopenharmony_ci int ret; 67862306a36Sopenharmony_ci 67962306a36Sopenharmony_ci if (!btrfs_block_group_should_use_size_class(block_group)) 68062306a36Sopenharmony_ci return 0; 68162306a36Sopenharmony_ci 68262306a36Sopenharmony_ci lockdep_assert_held(&caching_ctl->mutex); 68362306a36Sopenharmony_ci lockdep_assert_held_read(&fs_info->commit_root_sem); 68462306a36Sopenharmony_ci for (i = 0; i < 5; ++i) { 68562306a36Sopenharmony_ci ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); 68662306a36Sopenharmony_ci if (ret < 0) 68762306a36Sopenharmony_ci goto out; 68862306a36Sopenharmony_ci if (ret > 0) 68962306a36Sopenharmony_ci continue; 69062306a36Sopenharmony_ci min_size = min_t(u64, min_size, key.offset); 69162306a36Sopenharmony_ci size_class = btrfs_calc_block_group_size_class(min_size); 69262306a36Sopenharmony_ci } 69362306a36Sopenharmony_ci if (size_class != BTRFS_BG_SZ_NONE) { 69462306a36Sopenharmony_ci spin_lock(&block_group->lock); 69562306a36Sopenharmony_ci block_group->size_class = size_class; 69662306a36Sopenharmony_ci spin_unlock(&block_group->lock); 69762306a36Sopenharmony_ci } 69862306a36Sopenharmony_ciout: 69962306a36Sopenharmony_ci return ret; 70062306a36Sopenharmony_ci} 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_cistatic int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 70362306a36Sopenharmony_ci{ 70462306a36Sopenharmony_ci struct btrfs_block_group 
*block_group = caching_ctl->block_group; 70562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 70662306a36Sopenharmony_ci struct btrfs_root *extent_root; 70762306a36Sopenharmony_ci struct btrfs_path *path; 70862306a36Sopenharmony_ci struct extent_buffer *leaf; 70962306a36Sopenharmony_ci struct btrfs_key key; 71062306a36Sopenharmony_ci u64 total_found = 0; 71162306a36Sopenharmony_ci u64 last = 0; 71262306a36Sopenharmony_ci u32 nritems; 71362306a36Sopenharmony_ci int ret; 71462306a36Sopenharmony_ci bool wakeup = true; 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_ci path = btrfs_alloc_path(); 71762306a36Sopenharmony_ci if (!path) 71862306a36Sopenharmony_ci return -ENOMEM; 71962306a36Sopenharmony_ci 72062306a36Sopenharmony_ci last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); 72162306a36Sopenharmony_ci extent_root = btrfs_extent_root(fs_info, last); 72262306a36Sopenharmony_ci 72362306a36Sopenharmony_ci#ifdef CONFIG_BTRFS_DEBUG 72462306a36Sopenharmony_ci /* 72562306a36Sopenharmony_ci * If we're fragmenting we don't want to make anybody think we can 72662306a36Sopenharmony_ci * allocate from this block group until we've had a chance to fragment 72762306a36Sopenharmony_ci * the free space. 72862306a36Sopenharmony_ci */ 72962306a36Sopenharmony_ci if (btrfs_should_fragment_free_space(block_group)) 73062306a36Sopenharmony_ci wakeup = false; 73162306a36Sopenharmony_ci#endif 73262306a36Sopenharmony_ci /* 73362306a36Sopenharmony_ci * We don't want to deadlock with somebody trying to allocate a new 73462306a36Sopenharmony_ci * extent for the extent root while also trying to search the extent 73562306a36Sopenharmony_ci * root to add free space. 
So we skip locking and search the commit 73662306a36Sopenharmony_ci * root, since its read-only 73762306a36Sopenharmony_ci */ 73862306a36Sopenharmony_ci path->skip_locking = 1; 73962306a36Sopenharmony_ci path->search_commit_root = 1; 74062306a36Sopenharmony_ci path->reada = READA_FORWARD; 74162306a36Sopenharmony_ci 74262306a36Sopenharmony_ci key.objectid = last; 74362306a36Sopenharmony_ci key.offset = 0; 74462306a36Sopenharmony_ci key.type = BTRFS_EXTENT_ITEM_KEY; 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_cinext: 74762306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 74862306a36Sopenharmony_ci if (ret < 0) 74962306a36Sopenharmony_ci goto out; 75062306a36Sopenharmony_ci 75162306a36Sopenharmony_ci leaf = path->nodes[0]; 75262306a36Sopenharmony_ci nritems = btrfs_header_nritems(leaf); 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci while (1) { 75562306a36Sopenharmony_ci if (btrfs_fs_closing(fs_info) > 1) { 75662306a36Sopenharmony_ci last = (u64)-1; 75762306a36Sopenharmony_ci break; 75862306a36Sopenharmony_ci } 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci if (path->slots[0] < nritems) { 76162306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 76262306a36Sopenharmony_ci } else { 76362306a36Sopenharmony_ci ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); 76462306a36Sopenharmony_ci if (ret) 76562306a36Sopenharmony_ci break; 76662306a36Sopenharmony_ci 76762306a36Sopenharmony_ci if (need_resched() || 76862306a36Sopenharmony_ci rwsem_is_contended(&fs_info->commit_root_sem)) { 76962306a36Sopenharmony_ci btrfs_release_path(path); 77062306a36Sopenharmony_ci up_read(&fs_info->commit_root_sem); 77162306a36Sopenharmony_ci mutex_unlock(&caching_ctl->mutex); 77262306a36Sopenharmony_ci cond_resched(); 77362306a36Sopenharmony_ci mutex_lock(&caching_ctl->mutex); 77462306a36Sopenharmony_ci down_read(&fs_info->commit_root_sem); 77562306a36Sopenharmony_ci goto next; 77662306a36Sopenharmony_ci } 
77762306a36Sopenharmony_ci 77862306a36Sopenharmony_ci ret = btrfs_next_leaf(extent_root, path); 77962306a36Sopenharmony_ci if (ret < 0) 78062306a36Sopenharmony_ci goto out; 78162306a36Sopenharmony_ci if (ret) 78262306a36Sopenharmony_ci break; 78362306a36Sopenharmony_ci leaf = path->nodes[0]; 78462306a36Sopenharmony_ci nritems = btrfs_header_nritems(leaf); 78562306a36Sopenharmony_ci continue; 78662306a36Sopenharmony_ci } 78762306a36Sopenharmony_ci 78862306a36Sopenharmony_ci if (key.objectid < last) { 78962306a36Sopenharmony_ci key.objectid = last; 79062306a36Sopenharmony_ci key.offset = 0; 79162306a36Sopenharmony_ci key.type = BTRFS_EXTENT_ITEM_KEY; 79262306a36Sopenharmony_ci btrfs_release_path(path); 79362306a36Sopenharmony_ci goto next; 79462306a36Sopenharmony_ci } 79562306a36Sopenharmony_ci 79662306a36Sopenharmony_ci if (key.objectid < block_group->start) { 79762306a36Sopenharmony_ci path->slots[0]++; 79862306a36Sopenharmony_ci continue; 79962306a36Sopenharmony_ci } 80062306a36Sopenharmony_ci 80162306a36Sopenharmony_ci if (key.objectid >= block_group->start + block_group->length) 80262306a36Sopenharmony_ci break; 80362306a36Sopenharmony_ci 80462306a36Sopenharmony_ci if (key.type == BTRFS_EXTENT_ITEM_KEY || 80562306a36Sopenharmony_ci key.type == BTRFS_METADATA_ITEM_KEY) { 80662306a36Sopenharmony_ci u64 space_added; 80762306a36Sopenharmony_ci 80862306a36Sopenharmony_ci ret = btrfs_add_new_free_space(block_group, last, 80962306a36Sopenharmony_ci key.objectid, &space_added); 81062306a36Sopenharmony_ci if (ret) 81162306a36Sopenharmony_ci goto out; 81262306a36Sopenharmony_ci total_found += space_added; 81362306a36Sopenharmony_ci if (key.type == BTRFS_METADATA_ITEM_KEY) 81462306a36Sopenharmony_ci last = key.objectid + 81562306a36Sopenharmony_ci fs_info->nodesize; 81662306a36Sopenharmony_ci else 81762306a36Sopenharmony_ci last = key.objectid + key.offset; 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci if (total_found > CACHING_CTL_WAKE_UP) { 
82062306a36Sopenharmony_ci total_found = 0; 82162306a36Sopenharmony_ci if (wakeup) { 82262306a36Sopenharmony_ci atomic_inc(&caching_ctl->progress); 82362306a36Sopenharmony_ci wake_up(&caching_ctl->wait); 82462306a36Sopenharmony_ci } 82562306a36Sopenharmony_ci } 82662306a36Sopenharmony_ci } 82762306a36Sopenharmony_ci path->slots[0]++; 82862306a36Sopenharmony_ci } 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci ret = btrfs_add_new_free_space(block_group, last, 83162306a36Sopenharmony_ci block_group->start + block_group->length, 83262306a36Sopenharmony_ci NULL); 83362306a36Sopenharmony_ciout: 83462306a36Sopenharmony_ci btrfs_free_path(path); 83562306a36Sopenharmony_ci return ret; 83662306a36Sopenharmony_ci} 83762306a36Sopenharmony_ci 83862306a36Sopenharmony_cistatic inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) 83962306a36Sopenharmony_ci{ 84062306a36Sopenharmony_ci clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, 84162306a36Sopenharmony_ci bg->start + bg->length - 1, EXTENT_UPTODATE); 84262306a36Sopenharmony_ci} 84362306a36Sopenharmony_ci 84462306a36Sopenharmony_cistatic noinline void caching_thread(struct btrfs_work *work) 84562306a36Sopenharmony_ci{ 84662306a36Sopenharmony_ci struct btrfs_block_group *block_group; 84762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info; 84862306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl; 84962306a36Sopenharmony_ci int ret; 85062306a36Sopenharmony_ci 85162306a36Sopenharmony_ci caching_ctl = container_of(work, struct btrfs_caching_control, work); 85262306a36Sopenharmony_ci block_group = caching_ctl->block_group; 85362306a36Sopenharmony_ci fs_info = block_group->fs_info; 85462306a36Sopenharmony_ci 85562306a36Sopenharmony_ci mutex_lock(&caching_ctl->mutex); 85662306a36Sopenharmony_ci down_read(&fs_info->commit_root_sem); 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci load_block_group_size_class(caching_ctl, block_group); 85962306a36Sopenharmony_ci if 
(btrfs_test_opt(fs_info, SPACE_CACHE)) { 86062306a36Sopenharmony_ci ret = load_free_space_cache(block_group); 86162306a36Sopenharmony_ci if (ret == 1) { 86262306a36Sopenharmony_ci ret = 0; 86362306a36Sopenharmony_ci goto done; 86462306a36Sopenharmony_ci } 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci /* 86762306a36Sopenharmony_ci * We failed to load the space cache, set ourselves to 86862306a36Sopenharmony_ci * CACHE_STARTED and carry on. 86962306a36Sopenharmony_ci */ 87062306a36Sopenharmony_ci spin_lock(&block_group->lock); 87162306a36Sopenharmony_ci block_group->cached = BTRFS_CACHE_STARTED; 87262306a36Sopenharmony_ci spin_unlock(&block_group->lock); 87362306a36Sopenharmony_ci wake_up(&caching_ctl->wait); 87462306a36Sopenharmony_ci } 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci /* 87762306a36Sopenharmony_ci * If we are in the transaction that populated the free space tree we 87862306a36Sopenharmony_ci * can't actually cache from the free space tree as our commit root and 87962306a36Sopenharmony_ci * real root are the same, so we could change the contents of the blocks 88062306a36Sopenharmony_ci * while caching. Instead do the slow caching in this case, and after 88162306a36Sopenharmony_ci * the transaction has committed we will be safe. 88262306a36Sopenharmony_ci */ 88362306a36Sopenharmony_ci if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && 88462306a36Sopenharmony_ci !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) 88562306a36Sopenharmony_ci ret = load_free_space_tree(caching_ctl); 88662306a36Sopenharmony_ci else 88762306a36Sopenharmony_ci ret = load_extent_tree_free(caching_ctl); 88862306a36Sopenharmony_cidone: 88962306a36Sopenharmony_ci spin_lock(&block_group->lock); 89062306a36Sopenharmony_ci block_group->caching_ctl = NULL; 89162306a36Sopenharmony_ci block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; 89262306a36Sopenharmony_ci spin_unlock(&block_group->lock); 89362306a36Sopenharmony_ci 89462306a36Sopenharmony_ci#ifdef CONFIG_BTRFS_DEBUG 89562306a36Sopenharmony_ci if (btrfs_should_fragment_free_space(block_group)) { 89662306a36Sopenharmony_ci u64 bytes_used; 89762306a36Sopenharmony_ci 89862306a36Sopenharmony_ci spin_lock(&block_group->space_info->lock); 89962306a36Sopenharmony_ci spin_lock(&block_group->lock); 90062306a36Sopenharmony_ci bytes_used = block_group->length - block_group->used; 90162306a36Sopenharmony_ci block_group->space_info->bytes_used += bytes_used >> 1; 90262306a36Sopenharmony_ci spin_unlock(&block_group->lock); 90362306a36Sopenharmony_ci spin_unlock(&block_group->space_info->lock); 90462306a36Sopenharmony_ci fragment_free_space(block_group); 90562306a36Sopenharmony_ci } 90662306a36Sopenharmony_ci#endif 90762306a36Sopenharmony_ci 90862306a36Sopenharmony_ci up_read(&fs_info->commit_root_sem); 90962306a36Sopenharmony_ci btrfs_free_excluded_extents(block_group); 91062306a36Sopenharmony_ci mutex_unlock(&caching_ctl->mutex); 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ci wake_up(&caching_ctl->wait); 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 91562306a36Sopenharmony_ci btrfs_put_block_group(block_group); 91662306a36Sopenharmony_ci} 91762306a36Sopenharmony_ci 91862306a36Sopenharmony_ciint btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) 91962306a36Sopenharmony_ci{ 92062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 92162306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl = NULL; 92262306a36Sopenharmony_ci int ret = 0; 92362306a36Sopenharmony_ci 92462306a36Sopenharmony_ci /* Allocator for zoned filesystems does not use the cache at all */ 92562306a36Sopenharmony_ci if (btrfs_is_zoned(fs_info)) 92662306a36Sopenharmony_ci return 0; 92762306a36Sopenharmony_ci 92862306a36Sopenharmony_ci caching_ctl = 
kzalloc(sizeof(*caching_ctl), GFP_NOFS); 92962306a36Sopenharmony_ci if (!caching_ctl) 93062306a36Sopenharmony_ci return -ENOMEM; 93162306a36Sopenharmony_ci 93262306a36Sopenharmony_ci INIT_LIST_HEAD(&caching_ctl->list); 93362306a36Sopenharmony_ci mutex_init(&caching_ctl->mutex); 93462306a36Sopenharmony_ci init_waitqueue_head(&caching_ctl->wait); 93562306a36Sopenharmony_ci caching_ctl->block_group = cache; 93662306a36Sopenharmony_ci refcount_set(&caching_ctl->count, 2); 93762306a36Sopenharmony_ci atomic_set(&caching_ctl->progress, 0); 93862306a36Sopenharmony_ci btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); 93962306a36Sopenharmony_ci 94062306a36Sopenharmony_ci spin_lock(&cache->lock); 94162306a36Sopenharmony_ci if (cache->cached != BTRFS_CACHE_NO) { 94262306a36Sopenharmony_ci kfree(caching_ctl); 94362306a36Sopenharmony_ci 94462306a36Sopenharmony_ci caching_ctl = cache->caching_ctl; 94562306a36Sopenharmony_ci if (caching_ctl) 94662306a36Sopenharmony_ci refcount_inc(&caching_ctl->count); 94762306a36Sopenharmony_ci spin_unlock(&cache->lock); 94862306a36Sopenharmony_ci goto out; 94962306a36Sopenharmony_ci } 95062306a36Sopenharmony_ci WARN_ON(cache->caching_ctl); 95162306a36Sopenharmony_ci cache->caching_ctl = caching_ctl; 95262306a36Sopenharmony_ci cache->cached = BTRFS_CACHE_STARTED; 95362306a36Sopenharmony_ci spin_unlock(&cache->lock); 95462306a36Sopenharmony_ci 95562306a36Sopenharmony_ci write_lock(&fs_info->block_group_cache_lock); 95662306a36Sopenharmony_ci refcount_inc(&caching_ctl->count); 95762306a36Sopenharmony_ci list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 95862306a36Sopenharmony_ci write_unlock(&fs_info->block_group_cache_lock); 95962306a36Sopenharmony_ci 96062306a36Sopenharmony_ci btrfs_get_block_group(cache); 96162306a36Sopenharmony_ci 96262306a36Sopenharmony_ci btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 96362306a36Sopenharmony_ciout: 96462306a36Sopenharmony_ci if (wait && caching_ctl) 
96562306a36Sopenharmony_ci ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); 96662306a36Sopenharmony_ci if (caching_ctl) 96762306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 96862306a36Sopenharmony_ci 96962306a36Sopenharmony_ci return ret; 97062306a36Sopenharmony_ci} 97162306a36Sopenharmony_ci 97262306a36Sopenharmony_cistatic void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 97362306a36Sopenharmony_ci{ 97462306a36Sopenharmony_ci u64 extra_flags = chunk_to_extended(flags) & 97562306a36Sopenharmony_ci BTRFS_EXTENDED_PROFILE_MASK; 97662306a36Sopenharmony_ci 97762306a36Sopenharmony_ci write_seqlock(&fs_info->profiles_lock); 97862306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA) 97962306a36Sopenharmony_ci fs_info->avail_data_alloc_bits &= ~extra_flags; 98062306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_METADATA) 98162306a36Sopenharmony_ci fs_info->avail_metadata_alloc_bits &= ~extra_flags; 98262306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 98362306a36Sopenharmony_ci fs_info->avail_system_alloc_bits &= ~extra_flags; 98462306a36Sopenharmony_ci write_sequnlock(&fs_info->profiles_lock); 98562306a36Sopenharmony_ci} 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci/* 98862306a36Sopenharmony_ci * Clear incompat bits for the following feature(s): 98962306a36Sopenharmony_ci * 99062306a36Sopenharmony_ci * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group 99162306a36Sopenharmony_ci * in the whole filesystem 99262306a36Sopenharmony_ci * 99362306a36Sopenharmony_ci * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups 99462306a36Sopenharmony_ci */ 99562306a36Sopenharmony_cistatic void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) 99662306a36Sopenharmony_ci{ 99762306a36Sopenharmony_ci bool found_raid56 = false; 99862306a36Sopenharmony_ci bool found_raid1c34 = false; 99962306a36Sopenharmony_ci 100062306a36Sopenharmony_ci if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) 
|| 100162306a36Sopenharmony_ci (flags & BTRFS_BLOCK_GROUP_RAID1C3) || 100262306a36Sopenharmony_ci (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { 100362306a36Sopenharmony_ci struct list_head *head = &fs_info->space_info; 100462306a36Sopenharmony_ci struct btrfs_space_info *sinfo; 100562306a36Sopenharmony_ci 100662306a36Sopenharmony_ci list_for_each_entry_rcu(sinfo, head, list) { 100762306a36Sopenharmony_ci down_read(&sinfo->groups_sem); 100862306a36Sopenharmony_ci if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) 100962306a36Sopenharmony_ci found_raid56 = true; 101062306a36Sopenharmony_ci if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) 101162306a36Sopenharmony_ci found_raid56 = true; 101262306a36Sopenharmony_ci if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) 101362306a36Sopenharmony_ci found_raid1c34 = true; 101462306a36Sopenharmony_ci if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) 101562306a36Sopenharmony_ci found_raid1c34 = true; 101662306a36Sopenharmony_ci up_read(&sinfo->groups_sem); 101762306a36Sopenharmony_ci } 101862306a36Sopenharmony_ci if (!found_raid56) 101962306a36Sopenharmony_ci btrfs_clear_fs_incompat(fs_info, RAID56); 102062306a36Sopenharmony_ci if (!found_raid1c34) 102162306a36Sopenharmony_ci btrfs_clear_fs_incompat(fs_info, RAID1C34); 102262306a36Sopenharmony_ci } 102362306a36Sopenharmony_ci} 102462306a36Sopenharmony_ci 102562306a36Sopenharmony_cistatic int remove_block_group_item(struct btrfs_trans_handle *trans, 102662306a36Sopenharmony_ci struct btrfs_path *path, 102762306a36Sopenharmony_ci struct btrfs_block_group *block_group) 102862306a36Sopenharmony_ci{ 102962306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 103062306a36Sopenharmony_ci struct btrfs_root *root; 103162306a36Sopenharmony_ci struct btrfs_key key; 103262306a36Sopenharmony_ci int ret; 103362306a36Sopenharmony_ci 103462306a36Sopenharmony_ci root = btrfs_block_group_root(fs_info); 103562306a36Sopenharmony_ci key.objectid = 
block_group->start; 103662306a36Sopenharmony_ci key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 103762306a36Sopenharmony_ci key.offset = block_group->length; 103862306a36Sopenharmony_ci 103962306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 104062306a36Sopenharmony_ci if (ret > 0) 104162306a36Sopenharmony_ci ret = -ENOENT; 104262306a36Sopenharmony_ci if (ret < 0) 104362306a36Sopenharmony_ci return ret; 104462306a36Sopenharmony_ci 104562306a36Sopenharmony_ci ret = btrfs_del_item(trans, root, path); 104662306a36Sopenharmony_ci return ret; 104762306a36Sopenharmony_ci} 104862306a36Sopenharmony_ci 104962306a36Sopenharmony_ciint btrfs_remove_block_group(struct btrfs_trans_handle *trans, 105062306a36Sopenharmony_ci u64 group_start, struct extent_map *em) 105162306a36Sopenharmony_ci{ 105262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 105362306a36Sopenharmony_ci struct btrfs_path *path; 105462306a36Sopenharmony_ci struct btrfs_block_group *block_group; 105562306a36Sopenharmony_ci struct btrfs_free_cluster *cluster; 105662306a36Sopenharmony_ci struct inode *inode; 105762306a36Sopenharmony_ci struct kobject *kobj = NULL; 105862306a36Sopenharmony_ci int ret; 105962306a36Sopenharmony_ci int index; 106062306a36Sopenharmony_ci int factor; 106162306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl = NULL; 106262306a36Sopenharmony_ci bool remove_em; 106362306a36Sopenharmony_ci bool remove_rsv = false; 106462306a36Sopenharmony_ci 106562306a36Sopenharmony_ci block_group = btrfs_lookup_block_group(fs_info, group_start); 106662306a36Sopenharmony_ci BUG_ON(!block_group); 106762306a36Sopenharmony_ci BUG_ON(!block_group->ro); 106862306a36Sopenharmony_ci 106962306a36Sopenharmony_ci trace_btrfs_remove_block_group(block_group); 107062306a36Sopenharmony_ci /* 107162306a36Sopenharmony_ci * Free the reserved super bytes from this block group before 107262306a36Sopenharmony_ci * remove it. 
107362306a36Sopenharmony_ci */ 107462306a36Sopenharmony_ci btrfs_free_excluded_extents(block_group); 107562306a36Sopenharmony_ci btrfs_free_ref_tree_range(fs_info, block_group->start, 107662306a36Sopenharmony_ci block_group->length); 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci index = btrfs_bg_flags_to_raid_index(block_group->flags); 107962306a36Sopenharmony_ci factor = btrfs_bg_type_to_factor(block_group->flags); 108062306a36Sopenharmony_ci 108162306a36Sopenharmony_ci /* make sure this block group isn't part of an allocation cluster */ 108262306a36Sopenharmony_ci cluster = &fs_info->data_alloc_cluster; 108362306a36Sopenharmony_ci spin_lock(&cluster->refill_lock); 108462306a36Sopenharmony_ci btrfs_return_cluster_to_free_space(block_group, cluster); 108562306a36Sopenharmony_ci spin_unlock(&cluster->refill_lock); 108662306a36Sopenharmony_ci 108762306a36Sopenharmony_ci /* 108862306a36Sopenharmony_ci * make sure this block group isn't part of a metadata 108962306a36Sopenharmony_ci * allocation cluster 109062306a36Sopenharmony_ci */ 109162306a36Sopenharmony_ci cluster = &fs_info->meta_alloc_cluster; 109262306a36Sopenharmony_ci spin_lock(&cluster->refill_lock); 109362306a36Sopenharmony_ci btrfs_return_cluster_to_free_space(block_group, cluster); 109462306a36Sopenharmony_ci spin_unlock(&cluster->refill_lock); 109562306a36Sopenharmony_ci 109662306a36Sopenharmony_ci btrfs_clear_treelog_bg(block_group); 109762306a36Sopenharmony_ci btrfs_clear_data_reloc_bg(block_group); 109862306a36Sopenharmony_ci 109962306a36Sopenharmony_ci path = btrfs_alloc_path(); 110062306a36Sopenharmony_ci if (!path) { 110162306a36Sopenharmony_ci ret = -ENOMEM; 110262306a36Sopenharmony_ci goto out; 110362306a36Sopenharmony_ci } 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci /* 110662306a36Sopenharmony_ci * get the inode first so any iput calls done for the io_list 110762306a36Sopenharmony_ci * aren't the final iput (no unlinks allowed now) 110862306a36Sopenharmony_ci */ 
110962306a36Sopenharmony_ci inode = lookup_free_space_inode(block_group, path); 111062306a36Sopenharmony_ci 111162306a36Sopenharmony_ci mutex_lock(&trans->transaction->cache_write_mutex); 111262306a36Sopenharmony_ci /* 111362306a36Sopenharmony_ci * Make sure our free space cache IO is done before removing the 111462306a36Sopenharmony_ci * free space inode 111562306a36Sopenharmony_ci */ 111662306a36Sopenharmony_ci spin_lock(&trans->transaction->dirty_bgs_lock); 111762306a36Sopenharmony_ci if (!list_empty(&block_group->io_list)) { 111862306a36Sopenharmony_ci list_del_init(&block_group->io_list); 111962306a36Sopenharmony_ci 112062306a36Sopenharmony_ci WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci spin_unlock(&trans->transaction->dirty_bgs_lock); 112362306a36Sopenharmony_ci btrfs_wait_cache_io(trans, block_group, path); 112462306a36Sopenharmony_ci btrfs_put_block_group(block_group); 112562306a36Sopenharmony_ci spin_lock(&trans->transaction->dirty_bgs_lock); 112662306a36Sopenharmony_ci } 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_ci if (!list_empty(&block_group->dirty_list)) { 112962306a36Sopenharmony_ci list_del_init(&block_group->dirty_list); 113062306a36Sopenharmony_ci remove_rsv = true; 113162306a36Sopenharmony_ci btrfs_put_block_group(block_group); 113262306a36Sopenharmony_ci } 113362306a36Sopenharmony_ci spin_unlock(&trans->transaction->dirty_bgs_lock); 113462306a36Sopenharmony_ci mutex_unlock(&trans->transaction->cache_write_mutex); 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci ret = btrfs_remove_free_space_inode(trans, inode, block_group); 113762306a36Sopenharmony_ci if (ret) 113862306a36Sopenharmony_ci goto out; 113962306a36Sopenharmony_ci 114062306a36Sopenharmony_ci write_lock(&fs_info->block_group_cache_lock); 114162306a36Sopenharmony_ci rb_erase_cached(&block_group->cache_node, 114262306a36Sopenharmony_ci &fs_info->block_group_cache_tree); 114362306a36Sopenharmony_ci 
RB_CLEAR_NODE(&block_group->cache_node); 114462306a36Sopenharmony_ci 114562306a36Sopenharmony_ci /* Once for the block groups rbtree */ 114662306a36Sopenharmony_ci btrfs_put_block_group(block_group); 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci write_unlock(&fs_info->block_group_cache_lock); 114962306a36Sopenharmony_ci 115062306a36Sopenharmony_ci down_write(&block_group->space_info->groups_sem); 115162306a36Sopenharmony_ci /* 115262306a36Sopenharmony_ci * we must use list_del_init so people can check to see if they 115362306a36Sopenharmony_ci * are still on the list after taking the semaphore 115462306a36Sopenharmony_ci */ 115562306a36Sopenharmony_ci list_del_init(&block_group->list); 115662306a36Sopenharmony_ci if (list_empty(&block_group->space_info->block_groups[index])) { 115762306a36Sopenharmony_ci kobj = block_group->space_info->block_group_kobjs[index]; 115862306a36Sopenharmony_ci block_group->space_info->block_group_kobjs[index] = NULL; 115962306a36Sopenharmony_ci clear_avail_alloc_bits(fs_info, block_group->flags); 116062306a36Sopenharmony_ci } 116162306a36Sopenharmony_ci up_write(&block_group->space_info->groups_sem); 116262306a36Sopenharmony_ci clear_incompat_bg_bits(fs_info, block_group->flags); 116362306a36Sopenharmony_ci if (kobj) { 116462306a36Sopenharmony_ci kobject_del(kobj); 116562306a36Sopenharmony_ci kobject_put(kobj); 116662306a36Sopenharmony_ci } 116762306a36Sopenharmony_ci 116862306a36Sopenharmony_ci if (block_group->cached == BTRFS_CACHE_STARTED) 116962306a36Sopenharmony_ci btrfs_wait_block_group_cache_done(block_group); 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci write_lock(&fs_info->block_group_cache_lock); 117262306a36Sopenharmony_ci caching_ctl = btrfs_get_caching_control(block_group); 117362306a36Sopenharmony_ci if (!caching_ctl) { 117462306a36Sopenharmony_ci struct btrfs_caching_control *ctl; 117562306a36Sopenharmony_ci 117662306a36Sopenharmony_ci list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { 
117762306a36Sopenharmony_ci if (ctl->block_group == block_group) { 117862306a36Sopenharmony_ci caching_ctl = ctl; 117962306a36Sopenharmony_ci refcount_inc(&caching_ctl->count); 118062306a36Sopenharmony_ci break; 118162306a36Sopenharmony_ci } 118262306a36Sopenharmony_ci } 118362306a36Sopenharmony_ci } 118462306a36Sopenharmony_ci if (caching_ctl) 118562306a36Sopenharmony_ci list_del_init(&caching_ctl->list); 118662306a36Sopenharmony_ci write_unlock(&fs_info->block_group_cache_lock); 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci if (caching_ctl) { 118962306a36Sopenharmony_ci /* Once for the caching bgs list and once for us. */ 119062306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 119162306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 119262306a36Sopenharmony_ci } 119362306a36Sopenharmony_ci 119462306a36Sopenharmony_ci spin_lock(&trans->transaction->dirty_bgs_lock); 119562306a36Sopenharmony_ci WARN_ON(!list_empty(&block_group->dirty_list)); 119662306a36Sopenharmony_ci WARN_ON(!list_empty(&block_group->io_list)); 119762306a36Sopenharmony_ci spin_unlock(&trans->transaction->dirty_bgs_lock); 119862306a36Sopenharmony_ci 119962306a36Sopenharmony_ci btrfs_remove_free_space_cache(block_group); 120062306a36Sopenharmony_ci 120162306a36Sopenharmony_ci spin_lock(&block_group->space_info->lock); 120262306a36Sopenharmony_ci list_del_init(&block_group->ro_list); 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 120562306a36Sopenharmony_ci WARN_ON(block_group->space_info->total_bytes 120662306a36Sopenharmony_ci < block_group->length); 120762306a36Sopenharmony_ci WARN_ON(block_group->space_info->bytes_readonly 120862306a36Sopenharmony_ci < block_group->length - block_group->zone_unusable); 120962306a36Sopenharmony_ci WARN_ON(block_group->space_info->bytes_zone_unusable 121062306a36Sopenharmony_ci < block_group->zone_unusable); 121162306a36Sopenharmony_ci WARN_ON(block_group->space_info->disk_total 
121262306a36Sopenharmony_ci < block_group->length * factor); 121362306a36Sopenharmony_ci } 121462306a36Sopenharmony_ci block_group->space_info->total_bytes -= block_group->length; 121562306a36Sopenharmony_ci block_group->space_info->bytes_readonly -= 121662306a36Sopenharmony_ci (block_group->length - block_group->zone_unusable); 121762306a36Sopenharmony_ci block_group->space_info->bytes_zone_unusable -= 121862306a36Sopenharmony_ci block_group->zone_unusable; 121962306a36Sopenharmony_ci block_group->space_info->disk_total -= block_group->length * factor; 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_ci spin_unlock(&block_group->space_info->lock); 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci /* 122462306a36Sopenharmony_ci * Remove the free space for the block group from the free space tree 122562306a36Sopenharmony_ci * and the block group's item from the extent tree before marking the 122662306a36Sopenharmony_ci * block group as removed. This is to prevent races with tasks that 122762306a36Sopenharmony_ci * freeze and unfreeze a block group, this task and another task 122862306a36Sopenharmony_ci * allocating a new block group - the unfreeze task ends up removing 122962306a36Sopenharmony_ci * the block group's extent map before the task calling this function 123062306a36Sopenharmony_ci * deletes the block group item from the extent tree, allowing for 123162306a36Sopenharmony_ci * another task to attempt to create another block group with the same 123262306a36Sopenharmony_ci * item key (and failing with -EEXIST and a transaction abort). 
123362306a36Sopenharmony_ci */ 123462306a36Sopenharmony_ci ret = remove_block_group_free_space(trans, block_group); 123562306a36Sopenharmony_ci if (ret) 123662306a36Sopenharmony_ci goto out; 123762306a36Sopenharmony_ci 123862306a36Sopenharmony_ci ret = remove_block_group_item(trans, path, block_group); 123962306a36Sopenharmony_ci if (ret < 0) 124062306a36Sopenharmony_ci goto out; 124162306a36Sopenharmony_ci 124262306a36Sopenharmony_ci spin_lock(&block_group->lock); 124362306a36Sopenharmony_ci set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci /* 124662306a36Sopenharmony_ci * At this point trimming or scrub can't start on this block group, 124762306a36Sopenharmony_ci * because we removed the block group from the rbtree 124862306a36Sopenharmony_ci * fs_info->block_group_cache_tree so no one can't find it anymore and 124962306a36Sopenharmony_ci * even if someone already got this block group before we removed it 125062306a36Sopenharmony_ci * from the rbtree, they have already incremented block_group->frozen - 125162306a36Sopenharmony_ci * if they didn't, for the trimming case they won't find any free space 125262306a36Sopenharmony_ci * entries because we already removed them all when we called 125362306a36Sopenharmony_ci * btrfs_remove_free_space_cache(). 125462306a36Sopenharmony_ci * 125562306a36Sopenharmony_ci * And we must not remove the extent map from the fs_info->mapping_tree 125662306a36Sopenharmony_ci * to prevent the same logical address range and physical device space 125762306a36Sopenharmony_ci * ranges from being reused for a new block group. This is needed to 125862306a36Sopenharmony_ci * avoid races with trimming and scrub. 
125962306a36Sopenharmony_ci * 126062306a36Sopenharmony_ci * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 126162306a36Sopenharmony_ci * completely transactionless, so while it is trimming a range the 126262306a36Sopenharmony_ci * currently running transaction might finish and a new one start, 126362306a36Sopenharmony_ci * allowing for new block groups to be created that can reuse the same 126462306a36Sopenharmony_ci * physical device locations unless we take this special care. 126562306a36Sopenharmony_ci * 126662306a36Sopenharmony_ci * There may also be an implicit trim operation if the file system 126762306a36Sopenharmony_ci * is mounted with -odiscard. The same protections must remain 126862306a36Sopenharmony_ci * in place until the extents have been discarded completely when 126962306a36Sopenharmony_ci * the transaction commit has completed. 127062306a36Sopenharmony_ci */ 127162306a36Sopenharmony_ci remove_em = (atomic_read(&block_group->frozen) == 0); 127262306a36Sopenharmony_ci spin_unlock(&block_group->lock); 127362306a36Sopenharmony_ci 127462306a36Sopenharmony_ci if (remove_em) { 127562306a36Sopenharmony_ci struct extent_map_tree *em_tree; 127662306a36Sopenharmony_ci 127762306a36Sopenharmony_ci em_tree = &fs_info->mapping_tree; 127862306a36Sopenharmony_ci write_lock(&em_tree->lock); 127962306a36Sopenharmony_ci remove_extent_mapping(em_tree, em); 128062306a36Sopenharmony_ci write_unlock(&em_tree->lock); 128162306a36Sopenharmony_ci /* once for the tree */ 128262306a36Sopenharmony_ci free_extent_map(em); 128362306a36Sopenharmony_ci } 128462306a36Sopenharmony_ci 128562306a36Sopenharmony_ciout: 128662306a36Sopenharmony_ci /* Once for the lookup reference */ 128762306a36Sopenharmony_ci btrfs_put_block_group(block_group); 128862306a36Sopenharmony_ci if (remove_rsv) 128962306a36Sopenharmony_ci btrfs_delayed_refs_rsv_release(fs_info, 1); 129062306a36Sopenharmony_ci btrfs_free_path(path); 129162306a36Sopenharmony_ci return ret; 
129262306a36Sopenharmony_ci} 129362306a36Sopenharmony_ci 129462306a36Sopenharmony_cistruct btrfs_trans_handle *btrfs_start_trans_remove_block_group( 129562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info, const u64 chunk_offset) 129662306a36Sopenharmony_ci{ 129762306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(fs_info); 129862306a36Sopenharmony_ci struct extent_map_tree *em_tree = &fs_info->mapping_tree; 129962306a36Sopenharmony_ci struct extent_map *em; 130062306a36Sopenharmony_ci struct map_lookup *map; 130162306a36Sopenharmony_ci unsigned int num_items; 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_ci read_lock(&em_tree->lock); 130462306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, chunk_offset, 1); 130562306a36Sopenharmony_ci read_unlock(&em_tree->lock); 130662306a36Sopenharmony_ci ASSERT(em && em->start == chunk_offset); 130762306a36Sopenharmony_ci 130862306a36Sopenharmony_ci /* 130962306a36Sopenharmony_ci * We need to reserve 3 + N units from the metadata space info in order 131062306a36Sopenharmony_ci * to remove a block group (done at btrfs_remove_chunk() and at 131162306a36Sopenharmony_ci * btrfs_remove_block_group()), which are used for: 131262306a36Sopenharmony_ci * 131362306a36Sopenharmony_ci * 1 unit for adding the free space inode's orphan (located in the tree 131462306a36Sopenharmony_ci * of tree roots). 131562306a36Sopenharmony_ci * 1 unit for deleting the block group item (located in the extent 131662306a36Sopenharmony_ci * tree). 131762306a36Sopenharmony_ci * 1 unit for deleting the free space item (located in tree of tree 131862306a36Sopenharmony_ci * roots). 131962306a36Sopenharmony_ci * N units for deleting N device extent items corresponding to each 132062306a36Sopenharmony_ci * stripe (located in the device tree). 
132162306a36Sopenharmony_ci * 132262306a36Sopenharmony_ci * In order to remove a block group we also need to reserve units in the 132362306a36Sopenharmony_ci * system space info in order to update the chunk tree (update one or 132462306a36Sopenharmony_ci * more device items and remove one chunk item), but this is done at 132562306a36Sopenharmony_ci * btrfs_remove_chunk() through a call to check_system_chunk(). 132662306a36Sopenharmony_ci */ 132762306a36Sopenharmony_ci map = em->map_lookup; 132862306a36Sopenharmony_ci num_items = 3 + map->num_stripes; 132962306a36Sopenharmony_ci free_extent_map(em); 133062306a36Sopenharmony_ci 133162306a36Sopenharmony_ci return btrfs_start_transaction_fallback_global_rsv(root, num_items); 133262306a36Sopenharmony_ci} 133362306a36Sopenharmony_ci 133462306a36Sopenharmony_ci/* 133562306a36Sopenharmony_ci * Mark block group @cache read-only, so later write won't happen to block 133662306a36Sopenharmony_ci * group @cache. 133762306a36Sopenharmony_ci * 133862306a36Sopenharmony_ci * If @force is not set, this function will only mark the block group readonly 133962306a36Sopenharmony_ci * if we have enough free space (1M) in other metadata/system block groups. 134062306a36Sopenharmony_ci * If @force is not set, this function will mark the block group readonly 134162306a36Sopenharmony_ci * without checking free space. 134262306a36Sopenharmony_ci * 134362306a36Sopenharmony_ci * NOTE: This function doesn't care if other block groups can contain all the 134462306a36Sopenharmony_ci * data in this block group. That check should be done by relocation routine, 134562306a36Sopenharmony_ci * not this function. 
134662306a36Sopenharmony_ci */ 134762306a36Sopenharmony_cistatic int inc_block_group_ro(struct btrfs_block_group *cache, int force) 134862306a36Sopenharmony_ci{ 134962306a36Sopenharmony_ci struct btrfs_space_info *sinfo = cache->space_info; 135062306a36Sopenharmony_ci u64 num_bytes; 135162306a36Sopenharmony_ci int ret = -ENOSPC; 135262306a36Sopenharmony_ci 135362306a36Sopenharmony_ci spin_lock(&sinfo->lock); 135462306a36Sopenharmony_ci spin_lock(&cache->lock); 135562306a36Sopenharmony_ci 135662306a36Sopenharmony_ci if (cache->swap_extents) { 135762306a36Sopenharmony_ci ret = -ETXTBSY; 135862306a36Sopenharmony_ci goto out; 135962306a36Sopenharmony_ci } 136062306a36Sopenharmony_ci 136162306a36Sopenharmony_ci if (cache->ro) { 136262306a36Sopenharmony_ci cache->ro++; 136362306a36Sopenharmony_ci ret = 0; 136462306a36Sopenharmony_ci goto out; 136562306a36Sopenharmony_ci } 136662306a36Sopenharmony_ci 136762306a36Sopenharmony_ci num_bytes = cache->length - cache->reserved - cache->pinned - 136862306a36Sopenharmony_ci cache->bytes_super - cache->zone_unusable - cache->used; 136962306a36Sopenharmony_ci 137062306a36Sopenharmony_ci /* 137162306a36Sopenharmony_ci * Data never overcommits, even in mixed mode, so do just the straight 137262306a36Sopenharmony_ci * check of left over space in how much we have allocated. 137362306a36Sopenharmony_ci */ 137462306a36Sopenharmony_ci if (force) { 137562306a36Sopenharmony_ci ret = 0; 137662306a36Sopenharmony_ci } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { 137762306a36Sopenharmony_ci u64 sinfo_used = btrfs_space_info_used(sinfo, true); 137862306a36Sopenharmony_ci 137962306a36Sopenharmony_ci /* 138062306a36Sopenharmony_ci * Here we make sure if we mark this bg RO, we still have enough 138162306a36Sopenharmony_ci * free space as buffer. 
138262306a36Sopenharmony_ci */ 138362306a36Sopenharmony_ci if (sinfo_used + num_bytes <= sinfo->total_bytes) 138462306a36Sopenharmony_ci ret = 0; 138562306a36Sopenharmony_ci } else { 138662306a36Sopenharmony_ci /* 138762306a36Sopenharmony_ci * We overcommit metadata, so we need to do the 138862306a36Sopenharmony_ci * btrfs_can_overcommit check here, and we need to pass in 138962306a36Sopenharmony_ci * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of 139062306a36Sopenharmony_ci * leeway to allow us to mark this block group as read only. 139162306a36Sopenharmony_ci */ 139262306a36Sopenharmony_ci if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, 139362306a36Sopenharmony_ci BTRFS_RESERVE_NO_FLUSH)) 139462306a36Sopenharmony_ci ret = 0; 139562306a36Sopenharmony_ci } 139662306a36Sopenharmony_ci 139762306a36Sopenharmony_ci if (!ret) { 139862306a36Sopenharmony_ci sinfo->bytes_readonly += num_bytes; 139962306a36Sopenharmony_ci if (btrfs_is_zoned(cache->fs_info)) { 140062306a36Sopenharmony_ci /* Migrate zone_unusable bytes to readonly */ 140162306a36Sopenharmony_ci sinfo->bytes_readonly += cache->zone_unusable; 140262306a36Sopenharmony_ci sinfo->bytes_zone_unusable -= cache->zone_unusable; 140362306a36Sopenharmony_ci cache->zone_unusable = 0; 140462306a36Sopenharmony_ci } 140562306a36Sopenharmony_ci cache->ro++; 140662306a36Sopenharmony_ci list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 140762306a36Sopenharmony_ci } 140862306a36Sopenharmony_ciout: 140962306a36Sopenharmony_ci spin_unlock(&cache->lock); 141062306a36Sopenharmony_ci spin_unlock(&sinfo->lock); 141162306a36Sopenharmony_ci if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 141262306a36Sopenharmony_ci btrfs_info(cache->fs_info, 141362306a36Sopenharmony_ci "unable to make block group %llu ro", cache->start); 141462306a36Sopenharmony_ci btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 141562306a36Sopenharmony_ci } 141662306a36Sopenharmony_ci return ret; 
141762306a36Sopenharmony_ci} 141862306a36Sopenharmony_ci 141962306a36Sopenharmony_cistatic bool clean_pinned_extents(struct btrfs_trans_handle *trans, 142062306a36Sopenharmony_ci struct btrfs_block_group *bg) 142162306a36Sopenharmony_ci{ 142262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = bg->fs_info; 142362306a36Sopenharmony_ci struct btrfs_transaction *prev_trans = NULL; 142462306a36Sopenharmony_ci const u64 start = bg->start; 142562306a36Sopenharmony_ci const u64 end = start + bg->length - 1; 142662306a36Sopenharmony_ci int ret; 142762306a36Sopenharmony_ci 142862306a36Sopenharmony_ci spin_lock(&fs_info->trans_lock); 142962306a36Sopenharmony_ci if (trans->transaction->list.prev != &fs_info->trans_list) { 143062306a36Sopenharmony_ci prev_trans = list_last_entry(&trans->transaction->list, 143162306a36Sopenharmony_ci struct btrfs_transaction, list); 143262306a36Sopenharmony_ci refcount_inc(&prev_trans->use_count); 143362306a36Sopenharmony_ci } 143462306a36Sopenharmony_ci spin_unlock(&fs_info->trans_lock); 143562306a36Sopenharmony_ci 143662306a36Sopenharmony_ci /* 143762306a36Sopenharmony_ci * Hold the unused_bg_unpin_mutex lock to avoid racing with 143862306a36Sopenharmony_ci * btrfs_finish_extent_commit(). If we are at transaction N, another 143962306a36Sopenharmony_ci * task might be running finish_extent_commit() for the previous 144062306a36Sopenharmony_ci * transaction N - 1, and have seen a range belonging to the block 144162306a36Sopenharmony_ci * group in pinned_extents before we were able to clear the whole block 144262306a36Sopenharmony_ci * group range from pinned_extents. This means that task can lookup for 144362306a36Sopenharmony_ci * the block group after we unpinned it from pinned_extents and removed 144462306a36Sopenharmony_ci * it, leading to a BUG_ON() at unpin_extent_range(). 
144562306a36Sopenharmony_ci */ 144662306a36Sopenharmony_ci mutex_lock(&fs_info->unused_bg_unpin_mutex); 144762306a36Sopenharmony_ci if (prev_trans) { 144862306a36Sopenharmony_ci ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, 144962306a36Sopenharmony_ci EXTENT_DIRTY); 145062306a36Sopenharmony_ci if (ret) 145162306a36Sopenharmony_ci goto out; 145262306a36Sopenharmony_ci } 145362306a36Sopenharmony_ci 145462306a36Sopenharmony_ci ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, 145562306a36Sopenharmony_ci EXTENT_DIRTY); 145662306a36Sopenharmony_ciout: 145762306a36Sopenharmony_ci mutex_unlock(&fs_info->unused_bg_unpin_mutex); 145862306a36Sopenharmony_ci if (prev_trans) 145962306a36Sopenharmony_ci btrfs_put_transaction(prev_trans); 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_ci return ret == 0; 146262306a36Sopenharmony_ci} 146362306a36Sopenharmony_ci 146462306a36Sopenharmony_ci/* 146562306a36Sopenharmony_ci * Process the unused_bgs list and remove any that don't have any allocated 146662306a36Sopenharmony_ci * space inside of them. 
146762306a36Sopenharmony_ci */ 146862306a36Sopenharmony_civoid btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 146962306a36Sopenharmony_ci{ 147062306a36Sopenharmony_ci LIST_HEAD(retry_list); 147162306a36Sopenharmony_ci struct btrfs_block_group *block_group; 147262306a36Sopenharmony_ci struct btrfs_space_info *space_info; 147362306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 147462306a36Sopenharmony_ci const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); 147562306a36Sopenharmony_ci int ret = 0; 147662306a36Sopenharmony_ci 147762306a36Sopenharmony_ci if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 147862306a36Sopenharmony_ci return; 147962306a36Sopenharmony_ci 148062306a36Sopenharmony_ci if (btrfs_fs_closing(fs_info)) 148162306a36Sopenharmony_ci return; 148262306a36Sopenharmony_ci 148362306a36Sopenharmony_ci /* 148462306a36Sopenharmony_ci * Long running balances can keep us blocked here for eternity, so 148562306a36Sopenharmony_ci * simply skip deletion if we're unable to get the mutex. 
148662306a36Sopenharmony_ci */ 148762306a36Sopenharmony_ci if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 148862306a36Sopenharmony_ci return; 148962306a36Sopenharmony_ci 149062306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 149162306a36Sopenharmony_ci while (!list_empty(&fs_info->unused_bgs)) { 149262306a36Sopenharmony_ci u64 used; 149362306a36Sopenharmony_ci int trimming; 149462306a36Sopenharmony_ci 149562306a36Sopenharmony_ci block_group = list_first_entry(&fs_info->unused_bgs, 149662306a36Sopenharmony_ci struct btrfs_block_group, 149762306a36Sopenharmony_ci bg_list); 149862306a36Sopenharmony_ci list_del_init(&block_group->bg_list); 149962306a36Sopenharmony_ci 150062306a36Sopenharmony_ci space_info = block_group->space_info; 150162306a36Sopenharmony_ci 150262306a36Sopenharmony_ci if (ret || btrfs_mixed_space_info(space_info)) { 150362306a36Sopenharmony_ci btrfs_put_block_group(block_group); 150462306a36Sopenharmony_ci continue; 150562306a36Sopenharmony_ci } 150662306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 150762306a36Sopenharmony_ci 150862306a36Sopenharmony_ci btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 150962306a36Sopenharmony_ci 151062306a36Sopenharmony_ci /* Don't want to race with allocators so take the groups_sem */ 151162306a36Sopenharmony_ci down_write(&space_info->groups_sem); 151262306a36Sopenharmony_ci 151362306a36Sopenharmony_ci /* 151462306a36Sopenharmony_ci * Async discard moves the final block group discard to be prior 151562306a36Sopenharmony_ci * to the unused_bgs code path. Therefore, if it's not fully 151662306a36Sopenharmony_ci * trimmed, punt it back to the async discard lists. 
151762306a36Sopenharmony_ci */ 151862306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && 151962306a36Sopenharmony_ci !btrfs_is_free_space_trimmed(block_group)) { 152062306a36Sopenharmony_ci trace_btrfs_skip_unused_block_group(block_group); 152162306a36Sopenharmony_ci up_write(&space_info->groups_sem); 152262306a36Sopenharmony_ci /* Requeue if we failed because of async discard */ 152362306a36Sopenharmony_ci btrfs_discard_queue_work(&fs_info->discard_ctl, 152462306a36Sopenharmony_ci block_group); 152562306a36Sopenharmony_ci goto next; 152662306a36Sopenharmony_ci } 152762306a36Sopenharmony_ci 152862306a36Sopenharmony_ci spin_lock(&space_info->lock); 152962306a36Sopenharmony_ci spin_lock(&block_group->lock); 153062306a36Sopenharmony_ci if (btrfs_is_block_group_used(block_group) || block_group->ro || 153162306a36Sopenharmony_ci list_is_singular(&block_group->list)) { 153262306a36Sopenharmony_ci /* 153362306a36Sopenharmony_ci * We want to bail if we made new allocations or have 153462306a36Sopenharmony_ci * outstanding allocations in this block group. We do 153562306a36Sopenharmony_ci * the ro check in case balance is currently acting on 153662306a36Sopenharmony_ci * this block group. 153762306a36Sopenharmony_ci */ 153862306a36Sopenharmony_ci trace_btrfs_skip_unused_block_group(block_group); 153962306a36Sopenharmony_ci spin_unlock(&block_group->lock); 154062306a36Sopenharmony_ci spin_unlock(&space_info->lock); 154162306a36Sopenharmony_ci up_write(&space_info->groups_sem); 154262306a36Sopenharmony_ci goto next; 154362306a36Sopenharmony_ci } 154462306a36Sopenharmony_ci 154562306a36Sopenharmony_ci /* 154662306a36Sopenharmony_ci * The block group may be unused but there may be space reserved 154762306a36Sopenharmony_ci * accounting with the existence of that block group, that is, 154862306a36Sopenharmony_ci * space_info->bytes_may_use was incremented by a task but no 154962306a36Sopenharmony_ci * space was yet allocated from the block group by the task. 
155062306a36Sopenharmony_ci * That space may or may not be allocated, as we are generally 155162306a36Sopenharmony_ci * pessimistic about space reservation for metadata as well as 155262306a36Sopenharmony_ci * for data when using compression (as we reserve space based on 155362306a36Sopenharmony_ci * the worst case, when data can't be compressed, and before 155462306a36Sopenharmony_ci * actually attempting compression, before starting writeback). 155562306a36Sopenharmony_ci * 155662306a36Sopenharmony_ci * So check if the total space of the space_info minus the size 155762306a36Sopenharmony_ci * of this block group is less than the used space of the 155862306a36Sopenharmony_ci * space_info - if that's the case, then it means we have tasks 155962306a36Sopenharmony_ci * that might be relying on the block group in order to allocate 156062306a36Sopenharmony_ci * extents, and add back the block group to the unused list when 156162306a36Sopenharmony_ci * we finish, so that we retry later in case no tasks ended up 156262306a36Sopenharmony_ci * needing to allocate extents from the block group. 156362306a36Sopenharmony_ci */ 156462306a36Sopenharmony_ci used = btrfs_space_info_used(space_info, true); 156562306a36Sopenharmony_ci if (space_info->total_bytes - block_group->length < used) { 156662306a36Sopenharmony_ci /* 156762306a36Sopenharmony_ci * Add a reference for the list, compensate for the ref 156862306a36Sopenharmony_ci * drop under the "next" label for the 156962306a36Sopenharmony_ci * fs_info->unused_bgs list. 
157062306a36Sopenharmony_ci */ 157162306a36Sopenharmony_ci btrfs_get_block_group(block_group); 157262306a36Sopenharmony_ci list_add_tail(&block_group->bg_list, &retry_list); 157362306a36Sopenharmony_ci 157462306a36Sopenharmony_ci trace_btrfs_skip_unused_block_group(block_group); 157562306a36Sopenharmony_ci spin_unlock(&block_group->lock); 157662306a36Sopenharmony_ci spin_unlock(&space_info->lock); 157762306a36Sopenharmony_ci up_write(&space_info->groups_sem); 157862306a36Sopenharmony_ci goto next; 157962306a36Sopenharmony_ci } 158062306a36Sopenharmony_ci 158162306a36Sopenharmony_ci spin_unlock(&block_group->lock); 158262306a36Sopenharmony_ci spin_unlock(&space_info->lock); 158362306a36Sopenharmony_ci 158462306a36Sopenharmony_ci /* We don't want to force the issue, only flip if it's ok. */ 158562306a36Sopenharmony_ci ret = inc_block_group_ro(block_group, 0); 158662306a36Sopenharmony_ci up_write(&space_info->groups_sem); 158762306a36Sopenharmony_ci if (ret < 0) { 158862306a36Sopenharmony_ci ret = 0; 158962306a36Sopenharmony_ci goto next; 159062306a36Sopenharmony_ci } 159162306a36Sopenharmony_ci 159262306a36Sopenharmony_ci ret = btrfs_zone_finish(block_group); 159362306a36Sopenharmony_ci if (ret < 0) { 159462306a36Sopenharmony_ci btrfs_dec_block_group_ro(block_group); 159562306a36Sopenharmony_ci if (ret == -EAGAIN) 159662306a36Sopenharmony_ci ret = 0; 159762306a36Sopenharmony_ci goto next; 159862306a36Sopenharmony_ci } 159962306a36Sopenharmony_ci 160062306a36Sopenharmony_ci /* 160162306a36Sopenharmony_ci * Want to do this before we do anything else so we can recover 160262306a36Sopenharmony_ci * properly if we fail to join the transaction. 
160362306a36Sopenharmony_ci */ 160462306a36Sopenharmony_ci trans = btrfs_start_trans_remove_block_group(fs_info, 160562306a36Sopenharmony_ci block_group->start); 160662306a36Sopenharmony_ci if (IS_ERR(trans)) { 160762306a36Sopenharmony_ci btrfs_dec_block_group_ro(block_group); 160862306a36Sopenharmony_ci ret = PTR_ERR(trans); 160962306a36Sopenharmony_ci goto next; 161062306a36Sopenharmony_ci } 161162306a36Sopenharmony_ci 161262306a36Sopenharmony_ci /* 161362306a36Sopenharmony_ci * We could have pending pinned extents for this block group, 161462306a36Sopenharmony_ci * just delete them, we don't care about them anymore. 161562306a36Sopenharmony_ci */ 161662306a36Sopenharmony_ci if (!clean_pinned_extents(trans, block_group)) { 161762306a36Sopenharmony_ci btrfs_dec_block_group_ro(block_group); 161862306a36Sopenharmony_ci goto end_trans; 161962306a36Sopenharmony_ci } 162062306a36Sopenharmony_ci 162162306a36Sopenharmony_ci /* 162262306a36Sopenharmony_ci * At this point, the block_group is read only and should fail 162362306a36Sopenharmony_ci * new allocations. However, btrfs_finish_extent_commit() can 162462306a36Sopenharmony_ci * cause this block_group to be placed back on the discard 162562306a36Sopenharmony_ci * lists because now the block_group isn't fully discarded. 162662306a36Sopenharmony_ci * Bail here and try again later after discarding everything. 
162762306a36Sopenharmony_ci */ 162862306a36Sopenharmony_ci spin_lock(&fs_info->discard_ctl.lock); 162962306a36Sopenharmony_ci if (!list_empty(&block_group->discard_list)) { 163062306a36Sopenharmony_ci spin_unlock(&fs_info->discard_ctl.lock); 163162306a36Sopenharmony_ci btrfs_dec_block_group_ro(block_group); 163262306a36Sopenharmony_ci btrfs_discard_queue_work(&fs_info->discard_ctl, 163362306a36Sopenharmony_ci block_group); 163462306a36Sopenharmony_ci goto end_trans; 163562306a36Sopenharmony_ci } 163662306a36Sopenharmony_ci spin_unlock(&fs_info->discard_ctl.lock); 163762306a36Sopenharmony_ci 163862306a36Sopenharmony_ci /* Reset pinned so btrfs_put_block_group doesn't complain */ 163962306a36Sopenharmony_ci spin_lock(&space_info->lock); 164062306a36Sopenharmony_ci spin_lock(&block_group->lock); 164162306a36Sopenharmony_ci 164262306a36Sopenharmony_ci btrfs_space_info_update_bytes_pinned(fs_info, space_info, 164362306a36Sopenharmony_ci -block_group->pinned); 164462306a36Sopenharmony_ci space_info->bytes_readonly += block_group->pinned; 164562306a36Sopenharmony_ci block_group->pinned = 0; 164662306a36Sopenharmony_ci 164762306a36Sopenharmony_ci spin_unlock(&block_group->lock); 164862306a36Sopenharmony_ci spin_unlock(&space_info->lock); 164962306a36Sopenharmony_ci 165062306a36Sopenharmony_ci /* 165162306a36Sopenharmony_ci * The normal path here is an unused block group is passed here, 165262306a36Sopenharmony_ci * then trimming is handled in the transaction commit path. 165362306a36Sopenharmony_ci * Async discard interposes before this to do the trimming 165462306a36Sopenharmony_ci * before coming down the unused block group path as trimming 165562306a36Sopenharmony_ci * will no longer be done later in the transaction commit path. 
165662306a36Sopenharmony_ci */ 165762306a36Sopenharmony_ci if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) 165862306a36Sopenharmony_ci goto flip_async; 165962306a36Sopenharmony_ci 166062306a36Sopenharmony_ci /* 166162306a36Sopenharmony_ci * DISCARD can flip during remount. On zoned filesystems, we 166262306a36Sopenharmony_ci * need to reset sequential-required zones. 166362306a36Sopenharmony_ci */ 166462306a36Sopenharmony_ci trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || 166562306a36Sopenharmony_ci btrfs_is_zoned(fs_info); 166662306a36Sopenharmony_ci 166762306a36Sopenharmony_ci /* Implicit trim during transaction commit. */ 166862306a36Sopenharmony_ci if (trimming) 166962306a36Sopenharmony_ci btrfs_freeze_block_group(block_group); 167062306a36Sopenharmony_ci 167162306a36Sopenharmony_ci /* 167262306a36Sopenharmony_ci * Btrfs_remove_chunk will abort the transaction if things go 167362306a36Sopenharmony_ci * horribly wrong. 167462306a36Sopenharmony_ci */ 167562306a36Sopenharmony_ci ret = btrfs_remove_chunk(trans, block_group->start); 167662306a36Sopenharmony_ci 167762306a36Sopenharmony_ci if (ret) { 167862306a36Sopenharmony_ci if (trimming) 167962306a36Sopenharmony_ci btrfs_unfreeze_block_group(block_group); 168062306a36Sopenharmony_ci goto end_trans; 168162306a36Sopenharmony_ci } 168262306a36Sopenharmony_ci 168362306a36Sopenharmony_ci /* 168462306a36Sopenharmony_ci * If we're not mounted with -odiscard, we can just forget 168562306a36Sopenharmony_ci * about this block group. Otherwise we'll need to wait 168662306a36Sopenharmony_ci * until transaction commit to do the actual discard. 
168762306a36Sopenharmony_ci */ 168862306a36Sopenharmony_ci if (trimming) { 168962306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 169062306a36Sopenharmony_ci /* 169162306a36Sopenharmony_ci * A concurrent scrub might have added us to the list 169262306a36Sopenharmony_ci * fs_info->unused_bgs, so use a list_move operation 169362306a36Sopenharmony_ci * to add the block group to the deleted_bgs list. 169462306a36Sopenharmony_ci */ 169562306a36Sopenharmony_ci list_move(&block_group->bg_list, 169662306a36Sopenharmony_ci &trans->transaction->deleted_bgs); 169762306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 169862306a36Sopenharmony_ci btrfs_get_block_group(block_group); 169962306a36Sopenharmony_ci } 170062306a36Sopenharmony_ciend_trans: 170162306a36Sopenharmony_ci btrfs_end_transaction(trans); 170262306a36Sopenharmony_cinext: 170362306a36Sopenharmony_ci btrfs_put_block_group(block_group); 170462306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 170562306a36Sopenharmony_ci } 170662306a36Sopenharmony_ci list_splice_tail(&retry_list, &fs_info->unused_bgs); 170762306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 170862306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 170962306a36Sopenharmony_ci return; 171062306a36Sopenharmony_ci 171162306a36Sopenharmony_ciflip_async: 171262306a36Sopenharmony_ci btrfs_end_transaction(trans); 171362306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 171462306a36Sopenharmony_ci list_splice_tail(&retry_list, &fs_info->unused_bgs); 171562306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 171662306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 171762306a36Sopenharmony_ci btrfs_put_block_group(block_group); 171862306a36Sopenharmony_ci btrfs_discard_punt_unused_bgs_list(fs_info); 171962306a36Sopenharmony_ci} 172062306a36Sopenharmony_ci 172162306a36Sopenharmony_civoid btrfs_mark_bg_unused(struct btrfs_block_group *bg) 172262306a36Sopenharmony_ci{ 
172362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = bg->fs_info; 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 172662306a36Sopenharmony_ci if (list_empty(&bg->bg_list)) { 172762306a36Sopenharmony_ci btrfs_get_block_group(bg); 172862306a36Sopenharmony_ci trace_btrfs_add_unused_block_group(bg); 172962306a36Sopenharmony_ci list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 173062306a36Sopenharmony_ci } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { 173162306a36Sopenharmony_ci /* Pull out the block group from the reclaim_bgs list. */ 173262306a36Sopenharmony_ci trace_btrfs_add_unused_block_group(bg); 173362306a36Sopenharmony_ci list_move_tail(&bg->bg_list, &fs_info->unused_bgs); 173462306a36Sopenharmony_ci } 173562306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 173662306a36Sopenharmony_ci} 173762306a36Sopenharmony_ci 173862306a36Sopenharmony_ci/* 173962306a36Sopenharmony_ci * We want block groups with a low number of used bytes to be in the beginning 174062306a36Sopenharmony_ci * of the list, so they will get reclaimed first. 
174162306a36Sopenharmony_ci */ 174262306a36Sopenharmony_cistatic int reclaim_bgs_cmp(void *unused, const struct list_head *a, 174362306a36Sopenharmony_ci const struct list_head *b) 174462306a36Sopenharmony_ci{ 174562306a36Sopenharmony_ci const struct btrfs_block_group *bg1, *bg2; 174662306a36Sopenharmony_ci 174762306a36Sopenharmony_ci bg1 = list_entry(a, struct btrfs_block_group, bg_list); 174862306a36Sopenharmony_ci bg2 = list_entry(b, struct btrfs_block_group, bg_list); 174962306a36Sopenharmony_ci 175062306a36Sopenharmony_ci return bg1->used > bg2->used; 175162306a36Sopenharmony_ci} 175262306a36Sopenharmony_ci 175362306a36Sopenharmony_cistatic inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) 175462306a36Sopenharmony_ci{ 175562306a36Sopenharmony_ci if (btrfs_is_zoned(fs_info)) 175662306a36Sopenharmony_ci return btrfs_zoned_should_reclaim(fs_info); 175762306a36Sopenharmony_ci return true; 175862306a36Sopenharmony_ci} 175962306a36Sopenharmony_ci 176062306a36Sopenharmony_cistatic bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) 176162306a36Sopenharmony_ci{ 176262306a36Sopenharmony_ci const struct btrfs_space_info *space_info = bg->space_info; 176362306a36Sopenharmony_ci const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); 176462306a36Sopenharmony_ci const u64 new_val = bg->used; 176562306a36Sopenharmony_ci const u64 old_val = new_val + bytes_freed; 176662306a36Sopenharmony_ci u64 thresh; 176762306a36Sopenharmony_ci 176862306a36Sopenharmony_ci if (reclaim_thresh == 0) 176962306a36Sopenharmony_ci return false; 177062306a36Sopenharmony_ci 177162306a36Sopenharmony_ci thresh = mult_perc(bg->length, reclaim_thresh); 177262306a36Sopenharmony_ci 177362306a36Sopenharmony_ci /* 177462306a36Sopenharmony_ci * If we were below the threshold before don't reclaim, we are likely a 177562306a36Sopenharmony_ci * brand new block group and we don't want to relocate new block groups. 
177662306a36Sopenharmony_ci */ 177762306a36Sopenharmony_ci if (old_val < thresh) 177862306a36Sopenharmony_ci return false; 177962306a36Sopenharmony_ci if (new_val >= thresh) 178062306a36Sopenharmony_ci return false; 178162306a36Sopenharmony_ci return true; 178262306a36Sopenharmony_ci} 178362306a36Sopenharmony_ci 178462306a36Sopenharmony_civoid btrfs_reclaim_bgs_work(struct work_struct *work) 178562306a36Sopenharmony_ci{ 178662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = 178762306a36Sopenharmony_ci container_of(work, struct btrfs_fs_info, reclaim_bgs_work); 178862306a36Sopenharmony_ci struct btrfs_block_group *bg; 178962306a36Sopenharmony_ci struct btrfs_space_info *space_info; 179062306a36Sopenharmony_ci 179162306a36Sopenharmony_ci if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 179262306a36Sopenharmony_ci return; 179362306a36Sopenharmony_ci 179462306a36Sopenharmony_ci if (btrfs_fs_closing(fs_info)) 179562306a36Sopenharmony_ci return; 179662306a36Sopenharmony_ci 179762306a36Sopenharmony_ci if (!btrfs_should_reclaim(fs_info)) 179862306a36Sopenharmony_ci return; 179962306a36Sopenharmony_ci 180062306a36Sopenharmony_ci sb_start_write(fs_info->sb); 180162306a36Sopenharmony_ci 180262306a36Sopenharmony_ci if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 180362306a36Sopenharmony_ci sb_end_write(fs_info->sb); 180462306a36Sopenharmony_ci return; 180562306a36Sopenharmony_ci } 180662306a36Sopenharmony_ci 180762306a36Sopenharmony_ci /* 180862306a36Sopenharmony_ci * Long running balances can keep us blocked here for eternity, so 180962306a36Sopenharmony_ci * simply skip reclaim if we're unable to get the mutex. 
181062306a36Sopenharmony_ci */ 181162306a36Sopenharmony_ci if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { 181262306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 181362306a36Sopenharmony_ci sb_end_write(fs_info->sb); 181462306a36Sopenharmony_ci return; 181562306a36Sopenharmony_ci } 181662306a36Sopenharmony_ci 181762306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 181862306a36Sopenharmony_ci /* 181962306a36Sopenharmony_ci * Sort happens under lock because we can't simply splice it and sort. 182062306a36Sopenharmony_ci * The block groups might still be in use and reachable via bg_list, 182162306a36Sopenharmony_ci * and their presence in the reclaim_bgs list must be preserved. 182262306a36Sopenharmony_ci */ 182362306a36Sopenharmony_ci list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); 182462306a36Sopenharmony_ci while (!list_empty(&fs_info->reclaim_bgs)) { 182562306a36Sopenharmony_ci u64 zone_unusable; 182662306a36Sopenharmony_ci int ret = 0; 182762306a36Sopenharmony_ci 182862306a36Sopenharmony_ci bg = list_first_entry(&fs_info->reclaim_bgs, 182962306a36Sopenharmony_ci struct btrfs_block_group, 183062306a36Sopenharmony_ci bg_list); 183162306a36Sopenharmony_ci list_del_init(&bg->bg_list); 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci space_info = bg->space_info; 183462306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 183562306a36Sopenharmony_ci 183662306a36Sopenharmony_ci /* Don't race with allocators so take the groups_sem */ 183762306a36Sopenharmony_ci down_write(&space_info->groups_sem); 183862306a36Sopenharmony_ci 183962306a36Sopenharmony_ci spin_lock(&bg->lock); 184062306a36Sopenharmony_ci if (bg->reserved || bg->pinned || bg->ro) { 184162306a36Sopenharmony_ci /* 184262306a36Sopenharmony_ci * We want to bail if we made new allocations or have 184362306a36Sopenharmony_ci * outstanding allocations in this block group. 
We do 184462306a36Sopenharmony_ci * the ro check in case balance is currently acting on 184562306a36Sopenharmony_ci * this block group. 184662306a36Sopenharmony_ci */ 184762306a36Sopenharmony_ci spin_unlock(&bg->lock); 184862306a36Sopenharmony_ci up_write(&space_info->groups_sem); 184962306a36Sopenharmony_ci goto next; 185062306a36Sopenharmony_ci } 185162306a36Sopenharmony_ci if (bg->used == 0) { 185262306a36Sopenharmony_ci /* 185362306a36Sopenharmony_ci * It is possible that we trigger relocation on a block 185462306a36Sopenharmony_ci * group as its extents are deleted and it first goes 185562306a36Sopenharmony_ci * below the threshold, then shortly after goes empty. 185662306a36Sopenharmony_ci * 185762306a36Sopenharmony_ci * In this case, relocating it does delete it, but has 185862306a36Sopenharmony_ci * some overhead in relocation specific metadata, looking 185962306a36Sopenharmony_ci * for the non-existent extents and running some extra 186062306a36Sopenharmony_ci * transactions, which we can avoid by using one of the 186162306a36Sopenharmony_ci * other mechanisms for dealing with empty block groups. 186262306a36Sopenharmony_ci */ 186362306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) 186462306a36Sopenharmony_ci btrfs_mark_bg_unused(bg); 186562306a36Sopenharmony_ci spin_unlock(&bg->lock); 186662306a36Sopenharmony_ci up_write(&space_info->groups_sem); 186762306a36Sopenharmony_ci goto next; 186862306a36Sopenharmony_ci 186962306a36Sopenharmony_ci } 187062306a36Sopenharmony_ci /* 187162306a36Sopenharmony_ci * The block group might no longer meet the reclaim condition by 187262306a36Sopenharmony_ci * the time we get around to reclaiming it, so to avoid 187362306a36Sopenharmony_ci * reclaiming overly full block_groups, skip reclaiming them. 
187462306a36Sopenharmony_ci * 187562306a36Sopenharmony_ci * Since the decision making process also depends on the amount 187662306a36Sopenharmony_ci * being freed, pass in a fake giant value to skip that extra 187762306a36Sopenharmony_ci * check, which is more meaningful when adding to the list in 187862306a36Sopenharmony_ci * the first place. 187962306a36Sopenharmony_ci */ 188062306a36Sopenharmony_ci if (!should_reclaim_block_group(bg, bg->length)) { 188162306a36Sopenharmony_ci spin_unlock(&bg->lock); 188262306a36Sopenharmony_ci up_write(&space_info->groups_sem); 188362306a36Sopenharmony_ci goto next; 188462306a36Sopenharmony_ci } 188562306a36Sopenharmony_ci spin_unlock(&bg->lock); 188662306a36Sopenharmony_ci 188762306a36Sopenharmony_ci /* 188862306a36Sopenharmony_ci * Get out fast, in case we're read-only or unmounting the 188962306a36Sopenharmony_ci * filesystem. It is OK to drop block groups from the list even 189062306a36Sopenharmony_ci * for the read-only case. As we did sb_start_write(), 189162306a36Sopenharmony_ci * "mount -o remount,ro" won't happen and read-only filesystem 189262306a36Sopenharmony_ci * means it is forced read-only due to a fatal error. So, it 189362306a36Sopenharmony_ci * never gets back to read-write to let us reclaim again. 189462306a36Sopenharmony_ci */ 189562306a36Sopenharmony_ci if (btrfs_need_cleaner_sleep(fs_info)) { 189662306a36Sopenharmony_ci up_write(&space_info->groups_sem); 189762306a36Sopenharmony_ci goto next; 189862306a36Sopenharmony_ci } 189962306a36Sopenharmony_ci 190062306a36Sopenharmony_ci /* 190162306a36Sopenharmony_ci * Cache the zone_unusable value before turning the block group 190262306a36Sopenharmony_ci * to read only. As soon as the blog group is read only it's 190362306a36Sopenharmony_ci * zone_unusable value gets moved to the block group's read-only 190462306a36Sopenharmony_ci * bytes and isn't available for calculations anymore. 
190562306a36Sopenharmony_ci */ 190662306a36Sopenharmony_ci zone_unusable = bg->zone_unusable; 190762306a36Sopenharmony_ci ret = inc_block_group_ro(bg, 0); 190862306a36Sopenharmony_ci up_write(&space_info->groups_sem); 190962306a36Sopenharmony_ci if (ret < 0) 191062306a36Sopenharmony_ci goto next; 191162306a36Sopenharmony_ci 191262306a36Sopenharmony_ci btrfs_info(fs_info, 191362306a36Sopenharmony_ci "reclaiming chunk %llu with %llu%% used %llu%% unusable", 191462306a36Sopenharmony_ci bg->start, 191562306a36Sopenharmony_ci div64_u64(bg->used * 100, bg->length), 191662306a36Sopenharmony_ci div64_u64(zone_unusable * 100, bg->length)); 191762306a36Sopenharmony_ci trace_btrfs_reclaim_block_group(bg); 191862306a36Sopenharmony_ci ret = btrfs_relocate_chunk(fs_info, bg->start); 191962306a36Sopenharmony_ci if (ret) { 192062306a36Sopenharmony_ci btrfs_dec_block_group_ro(bg); 192162306a36Sopenharmony_ci btrfs_err(fs_info, "error relocating chunk %llu", 192262306a36Sopenharmony_ci bg->start); 192362306a36Sopenharmony_ci } 192462306a36Sopenharmony_ci 192562306a36Sopenharmony_cinext: 192662306a36Sopenharmony_ci if (ret) 192762306a36Sopenharmony_ci btrfs_mark_bg_to_reclaim(bg); 192862306a36Sopenharmony_ci btrfs_put_block_group(bg); 192962306a36Sopenharmony_ci 193062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 193162306a36Sopenharmony_ci /* 193262306a36Sopenharmony_ci * Reclaiming all the block groups in the list can take really 193362306a36Sopenharmony_ci * long. Prioritize cleaning up unused block groups. 193462306a36Sopenharmony_ci */ 193562306a36Sopenharmony_ci btrfs_delete_unused_bgs(fs_info); 193662306a36Sopenharmony_ci /* 193762306a36Sopenharmony_ci * If we are interrupted by a balance, we can just bail out. The 193862306a36Sopenharmony_ci * cleaner thread restart again if necessary. 
193962306a36Sopenharmony_ci */ 194062306a36Sopenharmony_ci if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 194162306a36Sopenharmony_ci goto end; 194262306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 194362306a36Sopenharmony_ci } 194462306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 194562306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 194662306a36Sopenharmony_ciend: 194762306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 194862306a36Sopenharmony_ci sb_end_write(fs_info->sb); 194962306a36Sopenharmony_ci} 195062306a36Sopenharmony_ci 195162306a36Sopenharmony_civoid btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) 195262306a36Sopenharmony_ci{ 195362306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 195462306a36Sopenharmony_ci if (!list_empty(&fs_info->reclaim_bgs)) 195562306a36Sopenharmony_ci queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); 195662306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 195762306a36Sopenharmony_ci} 195862306a36Sopenharmony_ci 195962306a36Sopenharmony_civoid btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) 196062306a36Sopenharmony_ci{ 196162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = bg->fs_info; 196262306a36Sopenharmony_ci 196362306a36Sopenharmony_ci spin_lock(&fs_info->unused_bgs_lock); 196462306a36Sopenharmony_ci if (list_empty(&bg->bg_list)) { 196562306a36Sopenharmony_ci btrfs_get_block_group(bg); 196662306a36Sopenharmony_ci trace_btrfs_add_reclaim_block_group(bg); 196762306a36Sopenharmony_ci list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); 196862306a36Sopenharmony_ci } 196962306a36Sopenharmony_ci spin_unlock(&fs_info->unused_bgs_lock); 197062306a36Sopenharmony_ci} 197162306a36Sopenharmony_ci 197262306a36Sopenharmony_cistatic int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 197362306a36Sopenharmony_ci struct btrfs_path *path) 197462306a36Sopenharmony_ci{ 197562306a36Sopenharmony_ci struct extent_map_tree *em_tree; 
197662306a36Sopenharmony_ci struct extent_map *em; 197762306a36Sopenharmony_ci struct btrfs_block_group_item bg; 197862306a36Sopenharmony_ci struct extent_buffer *leaf; 197962306a36Sopenharmony_ci int slot; 198062306a36Sopenharmony_ci u64 flags; 198162306a36Sopenharmony_ci int ret = 0; 198262306a36Sopenharmony_ci 198362306a36Sopenharmony_ci slot = path->slots[0]; 198462306a36Sopenharmony_ci leaf = path->nodes[0]; 198562306a36Sopenharmony_ci 198662306a36Sopenharmony_ci em_tree = &fs_info->mapping_tree; 198762306a36Sopenharmony_ci read_lock(&em_tree->lock); 198862306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, key->objectid, key->offset); 198962306a36Sopenharmony_ci read_unlock(&em_tree->lock); 199062306a36Sopenharmony_ci if (!em) { 199162306a36Sopenharmony_ci btrfs_err(fs_info, 199262306a36Sopenharmony_ci "logical %llu len %llu found bg but no related chunk", 199362306a36Sopenharmony_ci key->objectid, key->offset); 199462306a36Sopenharmony_ci return -ENOENT; 199562306a36Sopenharmony_ci } 199662306a36Sopenharmony_ci 199762306a36Sopenharmony_ci if (em->start != key->objectid || em->len != key->offset) { 199862306a36Sopenharmony_ci btrfs_err(fs_info, 199962306a36Sopenharmony_ci "block group %llu len %llu mismatch with chunk %llu len %llu", 200062306a36Sopenharmony_ci key->objectid, key->offset, em->start, em->len); 200162306a36Sopenharmony_ci ret = -EUCLEAN; 200262306a36Sopenharmony_ci goto out_free_em; 200362306a36Sopenharmony_ci } 200462306a36Sopenharmony_ci 200562306a36Sopenharmony_ci read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), 200662306a36Sopenharmony_ci sizeof(bg)); 200762306a36Sopenharmony_ci flags = btrfs_stack_block_group_flags(&bg) & 200862306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_TYPE_MASK; 200962306a36Sopenharmony_ci 201062306a36Sopenharmony_ci if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 201162306a36Sopenharmony_ci btrfs_err(fs_info, 201262306a36Sopenharmony_ci"block group %llu len %llu type flags 0x%llx 
mismatch with chunk type flags 0x%llx", 201362306a36Sopenharmony_ci key->objectid, key->offset, flags, 201462306a36Sopenharmony_ci (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); 201562306a36Sopenharmony_ci ret = -EUCLEAN; 201662306a36Sopenharmony_ci } 201762306a36Sopenharmony_ci 201862306a36Sopenharmony_ciout_free_em: 201962306a36Sopenharmony_ci free_extent_map(em); 202062306a36Sopenharmony_ci return ret; 202162306a36Sopenharmony_ci} 202262306a36Sopenharmony_ci 202362306a36Sopenharmony_cistatic int find_first_block_group(struct btrfs_fs_info *fs_info, 202462306a36Sopenharmony_ci struct btrfs_path *path, 202562306a36Sopenharmony_ci struct btrfs_key *key) 202662306a36Sopenharmony_ci{ 202762306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(fs_info); 202862306a36Sopenharmony_ci int ret; 202962306a36Sopenharmony_ci struct btrfs_key found_key; 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_ci btrfs_for_each_slot(root, key, &found_key, path, ret) { 203262306a36Sopenharmony_ci if (found_key.objectid >= key->objectid && 203362306a36Sopenharmony_ci found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 203462306a36Sopenharmony_ci return read_bg_from_eb(fs_info, &found_key, path); 203562306a36Sopenharmony_ci } 203662306a36Sopenharmony_ci } 203762306a36Sopenharmony_ci return ret; 203862306a36Sopenharmony_ci} 203962306a36Sopenharmony_ci 204062306a36Sopenharmony_cistatic void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 204162306a36Sopenharmony_ci{ 204262306a36Sopenharmony_ci u64 extra_flags = chunk_to_extended(flags) & 204362306a36Sopenharmony_ci BTRFS_EXTENDED_PROFILE_MASK; 204462306a36Sopenharmony_ci 204562306a36Sopenharmony_ci write_seqlock(&fs_info->profiles_lock); 204662306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA) 204762306a36Sopenharmony_ci fs_info->avail_data_alloc_bits |= extra_flags; 204862306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_METADATA) 204962306a36Sopenharmony_ci 
fs_info->avail_metadata_alloc_bits |= extra_flags; 205062306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 205162306a36Sopenharmony_ci fs_info->avail_system_alloc_bits |= extra_flags; 205262306a36Sopenharmony_ci write_sequnlock(&fs_info->profiles_lock); 205362306a36Sopenharmony_ci} 205462306a36Sopenharmony_ci 205562306a36Sopenharmony_ci/* 205662306a36Sopenharmony_ci * Map a physical disk address to a list of logical addresses. 205762306a36Sopenharmony_ci * 205862306a36Sopenharmony_ci * @fs_info: the filesystem 205962306a36Sopenharmony_ci * @chunk_start: logical address of block group 206062306a36Sopenharmony_ci * @physical: physical address to map to logical addresses 206162306a36Sopenharmony_ci * @logical: return array of logical addresses which map to @physical 206262306a36Sopenharmony_ci * @naddrs: length of @logical 206362306a36Sopenharmony_ci * @stripe_len: size of IO stripe for the given block group 206462306a36Sopenharmony_ci * 206562306a36Sopenharmony_ci * Maps a particular @physical disk address to a list of @logical addresses. 206662306a36Sopenharmony_ci * Used primarily to exclude those portions of a block group that contain super 206762306a36Sopenharmony_ci * block copies. 
206862306a36Sopenharmony_ci */ 206962306a36Sopenharmony_ciint btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 207062306a36Sopenharmony_ci u64 physical, u64 **logical, int *naddrs, int *stripe_len) 207162306a36Sopenharmony_ci{ 207262306a36Sopenharmony_ci struct extent_map *em; 207362306a36Sopenharmony_ci struct map_lookup *map; 207462306a36Sopenharmony_ci u64 *buf; 207562306a36Sopenharmony_ci u64 bytenr; 207662306a36Sopenharmony_ci u64 data_stripe_length; 207762306a36Sopenharmony_ci u64 io_stripe_size; 207862306a36Sopenharmony_ci int i, nr = 0; 207962306a36Sopenharmony_ci int ret = 0; 208062306a36Sopenharmony_ci 208162306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, chunk_start, 1); 208262306a36Sopenharmony_ci if (IS_ERR(em)) 208362306a36Sopenharmony_ci return -EIO; 208462306a36Sopenharmony_ci 208562306a36Sopenharmony_ci map = em->map_lookup; 208662306a36Sopenharmony_ci data_stripe_length = em->orig_block_len; 208762306a36Sopenharmony_ci io_stripe_size = BTRFS_STRIPE_LEN; 208862306a36Sopenharmony_ci chunk_start = em->start; 208962306a36Sopenharmony_ci 209062306a36Sopenharmony_ci /* For RAID5/6 adjust to a full IO stripe length */ 209162306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 209262306a36Sopenharmony_ci io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 209362306a36Sopenharmony_ci 209462306a36Sopenharmony_ci buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 209562306a36Sopenharmony_ci if (!buf) { 209662306a36Sopenharmony_ci ret = -ENOMEM; 209762306a36Sopenharmony_ci goto out; 209862306a36Sopenharmony_ci } 209962306a36Sopenharmony_ci 210062306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 210162306a36Sopenharmony_ci bool already_inserted = false; 210262306a36Sopenharmony_ci u32 stripe_nr; 210362306a36Sopenharmony_ci u32 offset; 210462306a36Sopenharmony_ci int j; 210562306a36Sopenharmony_ci 210662306a36Sopenharmony_ci if (!in_range(physical, map->stripes[i].physical, 
210762306a36Sopenharmony_ci data_stripe_length)) 210862306a36Sopenharmony_ci continue; 210962306a36Sopenharmony_ci 211062306a36Sopenharmony_ci stripe_nr = (physical - map->stripes[i].physical) >> 211162306a36Sopenharmony_ci BTRFS_STRIPE_LEN_SHIFT; 211262306a36Sopenharmony_ci offset = (physical - map->stripes[i].physical) & 211362306a36Sopenharmony_ci BTRFS_STRIPE_LEN_MASK; 211462306a36Sopenharmony_ci 211562306a36Sopenharmony_ci if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 211662306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID10)) 211762306a36Sopenharmony_ci stripe_nr = div_u64(stripe_nr * map->num_stripes + i, 211862306a36Sopenharmony_ci map->sub_stripes); 211962306a36Sopenharmony_ci /* 212062306a36Sopenharmony_ci * The remaining case would be for RAID56, multiply by 212162306a36Sopenharmony_ci * nr_data_stripes(). Alternatively, just use rmap_len below 212262306a36Sopenharmony_ci * instead of map->stripe_len 212362306a36Sopenharmony_ci */ 212462306a36Sopenharmony_ci bytenr = chunk_start + stripe_nr * io_stripe_size + offset; 212562306a36Sopenharmony_ci 212662306a36Sopenharmony_ci /* Ensure we don't add duplicate addresses */ 212762306a36Sopenharmony_ci for (j = 0; j < nr; j++) { 212862306a36Sopenharmony_ci if (buf[j] == bytenr) { 212962306a36Sopenharmony_ci already_inserted = true; 213062306a36Sopenharmony_ci break; 213162306a36Sopenharmony_ci } 213262306a36Sopenharmony_ci } 213362306a36Sopenharmony_ci 213462306a36Sopenharmony_ci if (!already_inserted) 213562306a36Sopenharmony_ci buf[nr++] = bytenr; 213662306a36Sopenharmony_ci } 213762306a36Sopenharmony_ci 213862306a36Sopenharmony_ci *logical = buf; 213962306a36Sopenharmony_ci *naddrs = nr; 214062306a36Sopenharmony_ci *stripe_len = io_stripe_size; 214162306a36Sopenharmony_ciout: 214262306a36Sopenharmony_ci free_extent_map(em); 214362306a36Sopenharmony_ci return ret; 214462306a36Sopenharmony_ci} 214562306a36Sopenharmony_ci 214662306a36Sopenharmony_cistatic int exclude_super_stripes(struct btrfs_block_group *cache) 
214762306a36Sopenharmony_ci{ 214862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 214962306a36Sopenharmony_ci const bool zoned = btrfs_is_zoned(fs_info); 215062306a36Sopenharmony_ci u64 bytenr; 215162306a36Sopenharmony_ci u64 *logical; 215262306a36Sopenharmony_ci int stripe_len; 215362306a36Sopenharmony_ci int i, nr, ret; 215462306a36Sopenharmony_ci 215562306a36Sopenharmony_ci if (cache->start < BTRFS_SUPER_INFO_OFFSET) { 215662306a36Sopenharmony_ci stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; 215762306a36Sopenharmony_ci cache->bytes_super += stripe_len; 215862306a36Sopenharmony_ci ret = set_extent_bit(&fs_info->excluded_extents, cache->start, 215962306a36Sopenharmony_ci cache->start + stripe_len - 1, 216062306a36Sopenharmony_ci EXTENT_UPTODATE, NULL); 216162306a36Sopenharmony_ci if (ret) 216262306a36Sopenharmony_ci return ret; 216362306a36Sopenharmony_ci } 216462306a36Sopenharmony_ci 216562306a36Sopenharmony_ci for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 216662306a36Sopenharmony_ci bytenr = btrfs_sb_offset(i); 216762306a36Sopenharmony_ci ret = btrfs_rmap_block(fs_info, cache->start, 216862306a36Sopenharmony_ci bytenr, &logical, &nr, &stripe_len); 216962306a36Sopenharmony_ci if (ret) 217062306a36Sopenharmony_ci return ret; 217162306a36Sopenharmony_ci 217262306a36Sopenharmony_ci /* Shouldn't have super stripes in sequential zones */ 217362306a36Sopenharmony_ci if (zoned && nr) { 217462306a36Sopenharmony_ci kfree(logical); 217562306a36Sopenharmony_ci btrfs_err(fs_info, 217662306a36Sopenharmony_ci "zoned: block group %llu must not contain super block", 217762306a36Sopenharmony_ci cache->start); 217862306a36Sopenharmony_ci return -EUCLEAN; 217962306a36Sopenharmony_ci } 218062306a36Sopenharmony_ci 218162306a36Sopenharmony_ci while (nr--) { 218262306a36Sopenharmony_ci u64 len = min_t(u64, stripe_len, 218362306a36Sopenharmony_ci cache->start + cache->length - logical[nr]); 218462306a36Sopenharmony_ci 218562306a36Sopenharmony_ci 
cache->bytes_super += len; 218662306a36Sopenharmony_ci ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], 218762306a36Sopenharmony_ci logical[nr] + len - 1, 218862306a36Sopenharmony_ci EXTENT_UPTODATE, NULL); 218962306a36Sopenharmony_ci if (ret) { 219062306a36Sopenharmony_ci kfree(logical); 219162306a36Sopenharmony_ci return ret; 219262306a36Sopenharmony_ci } 219362306a36Sopenharmony_ci } 219462306a36Sopenharmony_ci 219562306a36Sopenharmony_ci kfree(logical); 219662306a36Sopenharmony_ci } 219762306a36Sopenharmony_ci return 0; 219862306a36Sopenharmony_ci} 219962306a36Sopenharmony_ci 220062306a36Sopenharmony_cistatic struct btrfs_block_group *btrfs_create_block_group_cache( 220162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info, u64 start) 220262306a36Sopenharmony_ci{ 220362306a36Sopenharmony_ci struct btrfs_block_group *cache; 220462306a36Sopenharmony_ci 220562306a36Sopenharmony_ci cache = kzalloc(sizeof(*cache), GFP_NOFS); 220662306a36Sopenharmony_ci if (!cache) 220762306a36Sopenharmony_ci return NULL; 220862306a36Sopenharmony_ci 220962306a36Sopenharmony_ci cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 221062306a36Sopenharmony_ci GFP_NOFS); 221162306a36Sopenharmony_ci if (!cache->free_space_ctl) { 221262306a36Sopenharmony_ci kfree(cache); 221362306a36Sopenharmony_ci return NULL; 221462306a36Sopenharmony_ci } 221562306a36Sopenharmony_ci 221662306a36Sopenharmony_ci cache->start = start; 221762306a36Sopenharmony_ci 221862306a36Sopenharmony_ci cache->fs_info = fs_info; 221962306a36Sopenharmony_ci cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 222062306a36Sopenharmony_ci 222162306a36Sopenharmony_ci cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; 222262306a36Sopenharmony_ci 222362306a36Sopenharmony_ci refcount_set(&cache->refs, 1); 222462306a36Sopenharmony_ci spin_lock_init(&cache->lock); 222562306a36Sopenharmony_ci init_rwsem(&cache->data_rwsem); 222662306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->list); 
222762306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->cluster_list); 222862306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->bg_list); 222962306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->ro_list); 223062306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->discard_list); 223162306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->dirty_list); 223262306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->io_list); 223362306a36Sopenharmony_ci INIT_LIST_HEAD(&cache->active_bg_list); 223462306a36Sopenharmony_ci btrfs_init_free_space_ctl(cache, cache->free_space_ctl); 223562306a36Sopenharmony_ci atomic_set(&cache->frozen, 0); 223662306a36Sopenharmony_ci mutex_init(&cache->free_space_lock); 223762306a36Sopenharmony_ci 223862306a36Sopenharmony_ci return cache; 223962306a36Sopenharmony_ci} 224062306a36Sopenharmony_ci 224162306a36Sopenharmony_ci/* 224262306a36Sopenharmony_ci * Iterate all chunks and verify that each of them has the corresponding block 224362306a36Sopenharmony_ci * group 224462306a36Sopenharmony_ci */ 224562306a36Sopenharmony_cistatic int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 224662306a36Sopenharmony_ci{ 224762306a36Sopenharmony_ci struct extent_map_tree *map_tree = &fs_info->mapping_tree; 224862306a36Sopenharmony_ci struct extent_map *em; 224962306a36Sopenharmony_ci struct btrfs_block_group *bg; 225062306a36Sopenharmony_ci u64 start = 0; 225162306a36Sopenharmony_ci int ret = 0; 225262306a36Sopenharmony_ci 225362306a36Sopenharmony_ci while (1) { 225462306a36Sopenharmony_ci read_lock(&map_tree->lock); 225562306a36Sopenharmony_ci /* 225662306a36Sopenharmony_ci * lookup_extent_mapping will return the first extent map 225762306a36Sopenharmony_ci * intersecting the range, so setting @len to 1 is enough to 225862306a36Sopenharmony_ci * get the first chunk. 
225962306a36Sopenharmony_ci */ 226062306a36Sopenharmony_ci em = lookup_extent_mapping(map_tree, start, 1); 226162306a36Sopenharmony_ci read_unlock(&map_tree->lock); 226262306a36Sopenharmony_ci if (!em) 226362306a36Sopenharmony_ci break; 226462306a36Sopenharmony_ci 226562306a36Sopenharmony_ci bg = btrfs_lookup_block_group(fs_info, em->start); 226662306a36Sopenharmony_ci if (!bg) { 226762306a36Sopenharmony_ci btrfs_err(fs_info, 226862306a36Sopenharmony_ci "chunk start=%llu len=%llu doesn't have corresponding block group", 226962306a36Sopenharmony_ci em->start, em->len); 227062306a36Sopenharmony_ci ret = -EUCLEAN; 227162306a36Sopenharmony_ci free_extent_map(em); 227262306a36Sopenharmony_ci break; 227362306a36Sopenharmony_ci } 227462306a36Sopenharmony_ci if (bg->start != em->start || bg->length != em->len || 227562306a36Sopenharmony_ci (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 227662306a36Sopenharmony_ci (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 227762306a36Sopenharmony_ci btrfs_err(fs_info, 227862306a36Sopenharmony_ci"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 227962306a36Sopenharmony_ci em->start, em->len, 228062306a36Sopenharmony_ci em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 228162306a36Sopenharmony_ci bg->start, bg->length, 228262306a36Sopenharmony_ci bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 228362306a36Sopenharmony_ci ret = -EUCLEAN; 228462306a36Sopenharmony_ci free_extent_map(em); 228562306a36Sopenharmony_ci btrfs_put_block_group(bg); 228662306a36Sopenharmony_ci break; 228762306a36Sopenharmony_ci } 228862306a36Sopenharmony_ci start = em->start + em->len; 228962306a36Sopenharmony_ci free_extent_map(em); 229062306a36Sopenharmony_ci btrfs_put_block_group(bg); 229162306a36Sopenharmony_ci } 229262306a36Sopenharmony_ci return ret; 229362306a36Sopenharmony_ci} 229462306a36Sopenharmony_ci 229562306a36Sopenharmony_cistatic int read_one_block_group(struct btrfs_fs_info *info, 
229662306a36Sopenharmony_ci struct btrfs_block_group_item *bgi, 229762306a36Sopenharmony_ci const struct btrfs_key *key, 229862306a36Sopenharmony_ci int need_clear) 229962306a36Sopenharmony_ci{ 230062306a36Sopenharmony_ci struct btrfs_block_group *cache; 230162306a36Sopenharmony_ci const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 230262306a36Sopenharmony_ci int ret; 230362306a36Sopenharmony_ci 230462306a36Sopenharmony_ci ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 230562306a36Sopenharmony_ci 230662306a36Sopenharmony_ci cache = btrfs_create_block_group_cache(info, key->objectid); 230762306a36Sopenharmony_ci if (!cache) 230862306a36Sopenharmony_ci return -ENOMEM; 230962306a36Sopenharmony_ci 231062306a36Sopenharmony_ci cache->length = key->offset; 231162306a36Sopenharmony_ci cache->used = btrfs_stack_block_group_used(bgi); 231262306a36Sopenharmony_ci cache->commit_used = cache->used; 231362306a36Sopenharmony_ci cache->flags = btrfs_stack_block_group_flags(bgi); 231462306a36Sopenharmony_ci cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 231562306a36Sopenharmony_ci 231662306a36Sopenharmony_ci set_free_space_tree_thresholds(cache); 231762306a36Sopenharmony_ci 231862306a36Sopenharmony_ci if (need_clear) { 231962306a36Sopenharmony_ci /* 232062306a36Sopenharmony_ci * When we mount with old space cache, we need to 232162306a36Sopenharmony_ci * set BTRFS_DC_CLEAR and set dirty flag. 232262306a36Sopenharmony_ci * 232362306a36Sopenharmony_ci * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 232462306a36Sopenharmony_ci * truncate the old free space cache inode and 232562306a36Sopenharmony_ci * setup a new one. 232662306a36Sopenharmony_ci * b) Setting 'dirty flag' makes sure that we flush 232762306a36Sopenharmony_ci * the new space cache info onto disk. 
232862306a36Sopenharmony_ci */ 232962306a36Sopenharmony_ci if (btrfs_test_opt(info, SPACE_CACHE)) 233062306a36Sopenharmony_ci cache->disk_cache_state = BTRFS_DC_CLEAR; 233162306a36Sopenharmony_ci } 233262306a36Sopenharmony_ci if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 233362306a36Sopenharmony_ci (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 233462306a36Sopenharmony_ci btrfs_err(info, 233562306a36Sopenharmony_ci"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 233662306a36Sopenharmony_ci cache->start); 233762306a36Sopenharmony_ci ret = -EINVAL; 233862306a36Sopenharmony_ci goto error; 233962306a36Sopenharmony_ci } 234062306a36Sopenharmony_ci 234162306a36Sopenharmony_ci ret = btrfs_load_block_group_zone_info(cache, false); 234262306a36Sopenharmony_ci if (ret) { 234362306a36Sopenharmony_ci btrfs_err(info, "zoned: failed to load zone info of bg %llu", 234462306a36Sopenharmony_ci cache->start); 234562306a36Sopenharmony_ci goto error; 234662306a36Sopenharmony_ci } 234762306a36Sopenharmony_ci 234862306a36Sopenharmony_ci /* 234962306a36Sopenharmony_ci * We need to exclude the super stripes now so that the space info has 235062306a36Sopenharmony_ci * super bytes accounted for, otherwise we'll think we have more space 235162306a36Sopenharmony_ci * than we actually do. 235262306a36Sopenharmony_ci */ 235362306a36Sopenharmony_ci ret = exclude_super_stripes(cache); 235462306a36Sopenharmony_ci if (ret) { 235562306a36Sopenharmony_ci /* We may have excluded something, so call this just in case. */ 235662306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 235762306a36Sopenharmony_ci goto error; 235862306a36Sopenharmony_ci } 235962306a36Sopenharmony_ci 236062306a36Sopenharmony_ci /* 236162306a36Sopenharmony_ci * For zoned filesystem, space after the allocation offset is the only 236262306a36Sopenharmony_ci * free space for a block group. So, we don't need any caching work. 
236362306a36Sopenharmony_ci * btrfs_calc_zone_unusable() will set the amount of free space and 236462306a36Sopenharmony_ci * zone_unusable space. 236562306a36Sopenharmony_ci * 236662306a36Sopenharmony_ci * For regular filesystem, check for two cases, either we are full, and 236762306a36Sopenharmony_ci * therefore don't need to bother with the caching work since we won't 236862306a36Sopenharmony_ci * find any space, or we are empty, and we can just add all the space 236962306a36Sopenharmony_ci * in and be done with it. This saves us _a_lot_ of time, particularly 237062306a36Sopenharmony_ci * in the full case. 237162306a36Sopenharmony_ci */ 237262306a36Sopenharmony_ci if (btrfs_is_zoned(info)) { 237362306a36Sopenharmony_ci btrfs_calc_zone_unusable(cache); 237462306a36Sopenharmony_ci /* Should not have any excluded extents. Just in case, though. */ 237562306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 237662306a36Sopenharmony_ci } else if (cache->length == cache->used) { 237762306a36Sopenharmony_ci cache->cached = BTRFS_CACHE_FINISHED; 237862306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 237962306a36Sopenharmony_ci } else if (cache->used == 0) { 238062306a36Sopenharmony_ci cache->cached = BTRFS_CACHE_FINISHED; 238162306a36Sopenharmony_ci ret = btrfs_add_new_free_space(cache, cache->start, 238262306a36Sopenharmony_ci cache->start + cache->length, NULL); 238362306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 238462306a36Sopenharmony_ci if (ret) 238562306a36Sopenharmony_ci goto error; 238662306a36Sopenharmony_ci } 238762306a36Sopenharmony_ci 238862306a36Sopenharmony_ci ret = btrfs_add_block_group_cache(info, cache); 238962306a36Sopenharmony_ci if (ret) { 239062306a36Sopenharmony_ci btrfs_remove_free_space_cache(cache); 239162306a36Sopenharmony_ci goto error; 239262306a36Sopenharmony_ci } 239362306a36Sopenharmony_ci trace_btrfs_add_block_group(info, cache, 0); 239462306a36Sopenharmony_ci btrfs_add_bg_to_space_info(info, cache); 
239562306a36Sopenharmony_ci 239662306a36Sopenharmony_ci set_avail_alloc_bits(info, cache->flags); 239762306a36Sopenharmony_ci if (btrfs_chunk_writeable(info, cache->start)) { 239862306a36Sopenharmony_ci if (cache->used == 0) { 239962306a36Sopenharmony_ci ASSERT(list_empty(&cache->bg_list)); 240062306a36Sopenharmony_ci if (btrfs_test_opt(info, DISCARD_ASYNC)) 240162306a36Sopenharmony_ci btrfs_discard_queue_work(&info->discard_ctl, cache); 240262306a36Sopenharmony_ci else 240362306a36Sopenharmony_ci btrfs_mark_bg_unused(cache); 240462306a36Sopenharmony_ci } 240562306a36Sopenharmony_ci } else { 240662306a36Sopenharmony_ci inc_block_group_ro(cache, 1); 240762306a36Sopenharmony_ci } 240862306a36Sopenharmony_ci 240962306a36Sopenharmony_ci return 0; 241062306a36Sopenharmony_cierror: 241162306a36Sopenharmony_ci btrfs_put_block_group(cache); 241262306a36Sopenharmony_ci return ret; 241362306a36Sopenharmony_ci} 241462306a36Sopenharmony_ci 241562306a36Sopenharmony_cistatic int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 241662306a36Sopenharmony_ci{ 241762306a36Sopenharmony_ci struct extent_map_tree *em_tree = &fs_info->mapping_tree; 241862306a36Sopenharmony_ci struct rb_node *node; 241962306a36Sopenharmony_ci int ret = 0; 242062306a36Sopenharmony_ci 242162306a36Sopenharmony_ci for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 242262306a36Sopenharmony_ci struct extent_map *em; 242362306a36Sopenharmony_ci struct map_lookup *map; 242462306a36Sopenharmony_ci struct btrfs_block_group *bg; 242562306a36Sopenharmony_ci 242662306a36Sopenharmony_ci em = rb_entry(node, struct extent_map, rb_node); 242762306a36Sopenharmony_ci map = em->map_lookup; 242862306a36Sopenharmony_ci bg = btrfs_create_block_group_cache(fs_info, em->start); 242962306a36Sopenharmony_ci if (!bg) { 243062306a36Sopenharmony_ci ret = -ENOMEM; 243162306a36Sopenharmony_ci break; 243262306a36Sopenharmony_ci } 243362306a36Sopenharmony_ci 243462306a36Sopenharmony_ci /* Fill dummy cache as FULL */ 
243562306a36Sopenharmony_ci bg->length = em->len; 243662306a36Sopenharmony_ci bg->flags = map->type; 243762306a36Sopenharmony_ci bg->cached = BTRFS_CACHE_FINISHED; 243862306a36Sopenharmony_ci bg->used = em->len; 243962306a36Sopenharmony_ci bg->flags = map->type; 244062306a36Sopenharmony_ci ret = btrfs_add_block_group_cache(fs_info, bg); 244162306a36Sopenharmony_ci /* 244262306a36Sopenharmony_ci * We may have some valid block group cache added already, in 244362306a36Sopenharmony_ci * that case we skip to the next one. 244462306a36Sopenharmony_ci */ 244562306a36Sopenharmony_ci if (ret == -EEXIST) { 244662306a36Sopenharmony_ci ret = 0; 244762306a36Sopenharmony_ci btrfs_put_block_group(bg); 244862306a36Sopenharmony_ci continue; 244962306a36Sopenharmony_ci } 245062306a36Sopenharmony_ci 245162306a36Sopenharmony_ci if (ret) { 245262306a36Sopenharmony_ci btrfs_remove_free_space_cache(bg); 245362306a36Sopenharmony_ci btrfs_put_block_group(bg); 245462306a36Sopenharmony_ci break; 245562306a36Sopenharmony_ci } 245662306a36Sopenharmony_ci 245762306a36Sopenharmony_ci btrfs_add_bg_to_space_info(fs_info, bg); 245862306a36Sopenharmony_ci 245962306a36Sopenharmony_ci set_avail_alloc_bits(fs_info, bg->flags); 246062306a36Sopenharmony_ci } 246162306a36Sopenharmony_ci if (!ret) 246262306a36Sopenharmony_ci btrfs_init_global_block_rsv(fs_info); 246362306a36Sopenharmony_ci return ret; 246462306a36Sopenharmony_ci} 246562306a36Sopenharmony_ci 246662306a36Sopenharmony_ciint btrfs_read_block_groups(struct btrfs_fs_info *info) 246762306a36Sopenharmony_ci{ 246862306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(info); 246962306a36Sopenharmony_ci struct btrfs_path *path; 247062306a36Sopenharmony_ci int ret; 247162306a36Sopenharmony_ci struct btrfs_block_group *cache; 247262306a36Sopenharmony_ci struct btrfs_space_info *space_info; 247362306a36Sopenharmony_ci struct btrfs_key key; 247462306a36Sopenharmony_ci int need_clear = 0; 247562306a36Sopenharmony_ci u64 cache_gen; 
247662306a36Sopenharmony_ci 247762306a36Sopenharmony_ci /* 247862306a36Sopenharmony_ci * Either no extent root (with ibadroots rescue option) or we have 247962306a36Sopenharmony_ci * unsupported RO options. The fs can never be mounted read-write, so no 248062306a36Sopenharmony_ci * need to waste time searching block group items. 248162306a36Sopenharmony_ci * 248262306a36Sopenharmony_ci * This also allows new extent tree related changes to be RO compat, 248362306a36Sopenharmony_ci * no need for a full incompat flag. 248462306a36Sopenharmony_ci */ 248562306a36Sopenharmony_ci if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & 248662306a36Sopenharmony_ci ~BTRFS_FEATURE_COMPAT_RO_SUPP)) 248762306a36Sopenharmony_ci return fill_dummy_bgs(info); 248862306a36Sopenharmony_ci 248962306a36Sopenharmony_ci key.objectid = 0; 249062306a36Sopenharmony_ci key.offset = 0; 249162306a36Sopenharmony_ci key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 249262306a36Sopenharmony_ci path = btrfs_alloc_path(); 249362306a36Sopenharmony_ci if (!path) 249462306a36Sopenharmony_ci return -ENOMEM; 249562306a36Sopenharmony_ci 249662306a36Sopenharmony_ci cache_gen = btrfs_super_cache_generation(info->super_copy); 249762306a36Sopenharmony_ci if (btrfs_test_opt(info, SPACE_CACHE) && 249862306a36Sopenharmony_ci btrfs_super_generation(info->super_copy) != cache_gen) 249962306a36Sopenharmony_ci need_clear = 1; 250062306a36Sopenharmony_ci if (btrfs_test_opt(info, CLEAR_CACHE)) 250162306a36Sopenharmony_ci need_clear = 1; 250262306a36Sopenharmony_ci 250362306a36Sopenharmony_ci while (1) { 250462306a36Sopenharmony_ci struct btrfs_block_group_item bgi; 250562306a36Sopenharmony_ci struct extent_buffer *leaf; 250662306a36Sopenharmony_ci int slot; 250762306a36Sopenharmony_ci 250862306a36Sopenharmony_ci ret = find_first_block_group(info, path, &key); 250962306a36Sopenharmony_ci if (ret > 0) 251062306a36Sopenharmony_ci break; 251162306a36Sopenharmony_ci if (ret != 0) 251262306a36Sopenharmony_ci goto error; 
251362306a36Sopenharmony_ci 251462306a36Sopenharmony_ci leaf = path->nodes[0]; 251562306a36Sopenharmony_ci slot = path->slots[0]; 251662306a36Sopenharmony_ci 251762306a36Sopenharmony_ci read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 251862306a36Sopenharmony_ci sizeof(bgi)); 251962306a36Sopenharmony_ci 252062306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, slot); 252162306a36Sopenharmony_ci btrfs_release_path(path); 252262306a36Sopenharmony_ci ret = read_one_block_group(info, &bgi, &key, need_clear); 252362306a36Sopenharmony_ci if (ret < 0) 252462306a36Sopenharmony_ci goto error; 252562306a36Sopenharmony_ci key.objectid += key.offset; 252662306a36Sopenharmony_ci key.offset = 0; 252762306a36Sopenharmony_ci } 252862306a36Sopenharmony_ci btrfs_release_path(path); 252962306a36Sopenharmony_ci 253062306a36Sopenharmony_ci list_for_each_entry(space_info, &info->space_info, list) { 253162306a36Sopenharmony_ci int i; 253262306a36Sopenharmony_ci 253362306a36Sopenharmony_ci for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 253462306a36Sopenharmony_ci if (list_empty(&space_info->block_groups[i])) 253562306a36Sopenharmony_ci continue; 253662306a36Sopenharmony_ci cache = list_first_entry(&space_info->block_groups[i], 253762306a36Sopenharmony_ci struct btrfs_block_group, 253862306a36Sopenharmony_ci list); 253962306a36Sopenharmony_ci btrfs_sysfs_add_block_group_type(cache); 254062306a36Sopenharmony_ci } 254162306a36Sopenharmony_ci 254262306a36Sopenharmony_ci if (!(btrfs_get_alloc_profile(info, space_info->flags) & 254362306a36Sopenharmony_ci (BTRFS_BLOCK_GROUP_RAID10 | 254462306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID1_MASK | 254562306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID56_MASK | 254662306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_DUP))) 254762306a36Sopenharmony_ci continue; 254862306a36Sopenharmony_ci /* 254962306a36Sopenharmony_ci * Avoid allocating from un-mirrored block group if there are 255062306a36Sopenharmony_ci * mirrored block groups. 
255162306a36Sopenharmony_ci */ 255262306a36Sopenharmony_ci list_for_each_entry(cache, 255362306a36Sopenharmony_ci &space_info->block_groups[BTRFS_RAID_RAID0], 255462306a36Sopenharmony_ci list) 255562306a36Sopenharmony_ci inc_block_group_ro(cache, 1); 255662306a36Sopenharmony_ci list_for_each_entry(cache, 255762306a36Sopenharmony_ci &space_info->block_groups[BTRFS_RAID_SINGLE], 255862306a36Sopenharmony_ci list) 255962306a36Sopenharmony_ci inc_block_group_ro(cache, 1); 256062306a36Sopenharmony_ci } 256162306a36Sopenharmony_ci 256262306a36Sopenharmony_ci btrfs_init_global_block_rsv(info); 256362306a36Sopenharmony_ci ret = check_chunk_block_group_mappings(info); 256462306a36Sopenharmony_cierror: 256562306a36Sopenharmony_ci btrfs_free_path(path); 256662306a36Sopenharmony_ci /* 256762306a36Sopenharmony_ci * We've hit some error while reading the extent tree, and have 256862306a36Sopenharmony_ci * rescue=ibadroots mount option. 256962306a36Sopenharmony_ci * Try to fill the tree using dummy block groups so that the user can 257062306a36Sopenharmony_ci * continue to mount and grab their data. 257162306a36Sopenharmony_ci */ 257262306a36Sopenharmony_ci if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) 257362306a36Sopenharmony_ci ret = fill_dummy_bgs(info); 257462306a36Sopenharmony_ci return ret; 257562306a36Sopenharmony_ci} 257662306a36Sopenharmony_ci 257762306a36Sopenharmony_ci/* 257862306a36Sopenharmony_ci * This function, insert_block_group_item(), belongs to the phase 2 of chunk 257962306a36Sopenharmony_ci * allocation. 258062306a36Sopenharmony_ci * 258162306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 258262306a36Sopenharmony_ci * phases. 
258362306a36Sopenharmony_ci */ 258462306a36Sopenharmony_cistatic int insert_block_group_item(struct btrfs_trans_handle *trans, 258562306a36Sopenharmony_ci struct btrfs_block_group *block_group) 258662306a36Sopenharmony_ci{ 258762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 258862306a36Sopenharmony_ci struct btrfs_block_group_item bgi; 258962306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(fs_info); 259062306a36Sopenharmony_ci struct btrfs_key key; 259162306a36Sopenharmony_ci u64 old_commit_used; 259262306a36Sopenharmony_ci int ret; 259362306a36Sopenharmony_ci 259462306a36Sopenharmony_ci spin_lock(&block_group->lock); 259562306a36Sopenharmony_ci btrfs_set_stack_block_group_used(&bgi, block_group->used); 259662306a36Sopenharmony_ci btrfs_set_stack_block_group_chunk_objectid(&bgi, 259762306a36Sopenharmony_ci block_group->global_root_id); 259862306a36Sopenharmony_ci btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 259962306a36Sopenharmony_ci old_commit_used = block_group->commit_used; 260062306a36Sopenharmony_ci block_group->commit_used = block_group->used; 260162306a36Sopenharmony_ci key.objectid = block_group->start; 260262306a36Sopenharmony_ci key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 260362306a36Sopenharmony_ci key.offset = block_group->length; 260462306a36Sopenharmony_ci spin_unlock(&block_group->lock); 260562306a36Sopenharmony_ci 260662306a36Sopenharmony_ci ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 260762306a36Sopenharmony_ci if (ret < 0) { 260862306a36Sopenharmony_ci spin_lock(&block_group->lock); 260962306a36Sopenharmony_ci block_group->commit_used = old_commit_used; 261062306a36Sopenharmony_ci spin_unlock(&block_group->lock); 261162306a36Sopenharmony_ci } 261262306a36Sopenharmony_ci 261362306a36Sopenharmony_ci return ret; 261462306a36Sopenharmony_ci} 261562306a36Sopenharmony_ci 261662306a36Sopenharmony_cistatic int insert_dev_extent(struct btrfs_trans_handle *trans, 
261762306a36Sopenharmony_ci struct btrfs_device *device, u64 chunk_offset, 261862306a36Sopenharmony_ci u64 start, u64 num_bytes) 261962306a36Sopenharmony_ci{ 262062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 262162306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 262262306a36Sopenharmony_ci struct btrfs_path *path; 262362306a36Sopenharmony_ci struct btrfs_dev_extent *extent; 262462306a36Sopenharmony_ci struct extent_buffer *leaf; 262562306a36Sopenharmony_ci struct btrfs_key key; 262662306a36Sopenharmony_ci int ret; 262762306a36Sopenharmony_ci 262862306a36Sopenharmony_ci WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 262962306a36Sopenharmony_ci WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 263062306a36Sopenharmony_ci path = btrfs_alloc_path(); 263162306a36Sopenharmony_ci if (!path) 263262306a36Sopenharmony_ci return -ENOMEM; 263362306a36Sopenharmony_ci 263462306a36Sopenharmony_ci key.objectid = device->devid; 263562306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 263662306a36Sopenharmony_ci key.offset = start; 263762306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); 263862306a36Sopenharmony_ci if (ret) 263962306a36Sopenharmony_ci goto out; 264062306a36Sopenharmony_ci 264162306a36Sopenharmony_ci leaf = path->nodes[0]; 264262306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 264362306a36Sopenharmony_ci btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); 264462306a36Sopenharmony_ci btrfs_set_dev_extent_chunk_objectid(leaf, extent, 264562306a36Sopenharmony_ci BTRFS_FIRST_CHUNK_TREE_OBJECTID); 264662306a36Sopenharmony_ci btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 264762306a36Sopenharmony_ci 264862306a36Sopenharmony_ci btrfs_set_dev_extent_length(leaf, extent, num_bytes); 264962306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 
265062306a36Sopenharmony_ciout: 265162306a36Sopenharmony_ci btrfs_free_path(path); 265262306a36Sopenharmony_ci return ret; 265362306a36Sopenharmony_ci} 265462306a36Sopenharmony_ci 265562306a36Sopenharmony_ci/* 265662306a36Sopenharmony_ci * This function belongs to phase 2. 265762306a36Sopenharmony_ci * 265862306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 265962306a36Sopenharmony_ci * phases. 266062306a36Sopenharmony_ci */ 266162306a36Sopenharmony_cistatic int insert_dev_extents(struct btrfs_trans_handle *trans, 266262306a36Sopenharmony_ci u64 chunk_offset, u64 chunk_size) 266362306a36Sopenharmony_ci{ 266462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 266562306a36Sopenharmony_ci struct btrfs_device *device; 266662306a36Sopenharmony_ci struct extent_map *em; 266762306a36Sopenharmony_ci struct map_lookup *map; 266862306a36Sopenharmony_ci u64 dev_offset; 266962306a36Sopenharmony_ci u64 stripe_size; 267062306a36Sopenharmony_ci int i; 267162306a36Sopenharmony_ci int ret = 0; 267262306a36Sopenharmony_ci 267362306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 267462306a36Sopenharmony_ci if (IS_ERR(em)) 267562306a36Sopenharmony_ci return PTR_ERR(em); 267662306a36Sopenharmony_ci 267762306a36Sopenharmony_ci map = em->map_lookup; 267862306a36Sopenharmony_ci stripe_size = em->orig_block_len; 267962306a36Sopenharmony_ci 268062306a36Sopenharmony_ci /* 268162306a36Sopenharmony_ci * Take the device list mutex to prevent races with the final phase of 268262306a36Sopenharmony_ci * a device replace operation that replaces the device object associated 268362306a36Sopenharmony_ci * with the map's stripes, because the device object's id can change 268462306a36Sopenharmony_ci * at any time during that final phase of the device replace operation 268562306a36Sopenharmony_ci * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 268662306a36Sopenharmony_ci * 
replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 268762306a36Sopenharmony_ci * resulting in persisting a device extent item with such ID. 268862306a36Sopenharmony_ci */ 268962306a36Sopenharmony_ci mutex_lock(&fs_info->fs_devices->device_list_mutex); 269062306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 269162306a36Sopenharmony_ci device = map->stripes[i].dev; 269262306a36Sopenharmony_ci dev_offset = map->stripes[i].physical; 269362306a36Sopenharmony_ci 269462306a36Sopenharmony_ci ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, 269562306a36Sopenharmony_ci stripe_size); 269662306a36Sopenharmony_ci if (ret) 269762306a36Sopenharmony_ci break; 269862306a36Sopenharmony_ci } 269962306a36Sopenharmony_ci mutex_unlock(&fs_info->fs_devices->device_list_mutex); 270062306a36Sopenharmony_ci 270162306a36Sopenharmony_ci free_extent_map(em); 270262306a36Sopenharmony_ci return ret; 270362306a36Sopenharmony_ci} 270462306a36Sopenharmony_ci 270562306a36Sopenharmony_ci/* 270662306a36Sopenharmony_ci * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of 270762306a36Sopenharmony_ci * chunk allocation. 270862306a36Sopenharmony_ci * 270962306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 271062306a36Sopenharmony_ci * phases. 
271162306a36Sopenharmony_ci */ 271262306a36Sopenharmony_civoid btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 271362306a36Sopenharmony_ci{ 271462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 271562306a36Sopenharmony_ci struct btrfs_block_group *block_group; 271662306a36Sopenharmony_ci int ret = 0; 271762306a36Sopenharmony_ci 271862306a36Sopenharmony_ci while (!list_empty(&trans->new_bgs)) { 271962306a36Sopenharmony_ci int index; 272062306a36Sopenharmony_ci 272162306a36Sopenharmony_ci block_group = list_first_entry(&trans->new_bgs, 272262306a36Sopenharmony_ci struct btrfs_block_group, 272362306a36Sopenharmony_ci bg_list); 272462306a36Sopenharmony_ci if (ret) 272562306a36Sopenharmony_ci goto next; 272662306a36Sopenharmony_ci 272762306a36Sopenharmony_ci index = btrfs_bg_flags_to_raid_index(block_group->flags); 272862306a36Sopenharmony_ci 272962306a36Sopenharmony_ci ret = insert_block_group_item(trans, block_group); 273062306a36Sopenharmony_ci if (ret) 273162306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 273262306a36Sopenharmony_ci if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, 273362306a36Sopenharmony_ci &block_group->runtime_flags)) { 273462306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 273562306a36Sopenharmony_ci ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); 273662306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 273762306a36Sopenharmony_ci if (ret) 273862306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 273962306a36Sopenharmony_ci } 274062306a36Sopenharmony_ci ret = insert_dev_extents(trans, block_group->start, 274162306a36Sopenharmony_ci block_group->length); 274262306a36Sopenharmony_ci if (ret) 274362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 274462306a36Sopenharmony_ci add_block_group_free_space(trans, block_group); 274562306a36Sopenharmony_ci 274662306a36Sopenharmony_ci /* 274762306a36Sopenharmony_ci * If we restriped during balance, we may 
have added a new raid 274862306a36Sopenharmony_ci * type, so now add the sysfs entries when it is safe to do so. 274962306a36Sopenharmony_ci * We don't have to worry about locking here as it's handled in 275062306a36Sopenharmony_ci * btrfs_sysfs_add_block_group_type. 275162306a36Sopenharmony_ci */ 275262306a36Sopenharmony_ci if (block_group->space_info->block_group_kobjs[index] == NULL) 275362306a36Sopenharmony_ci btrfs_sysfs_add_block_group_type(block_group); 275462306a36Sopenharmony_ci 275562306a36Sopenharmony_ci /* Already aborted the transaction if it failed. */ 275662306a36Sopenharmony_cinext: 275762306a36Sopenharmony_ci btrfs_delayed_refs_rsv_release(fs_info, 1); 275862306a36Sopenharmony_ci list_del_init(&block_group->bg_list); 275962306a36Sopenharmony_ci clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); 276062306a36Sopenharmony_ci } 276162306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 276262306a36Sopenharmony_ci} 276362306a36Sopenharmony_ci 276462306a36Sopenharmony_ci/* 276562306a36Sopenharmony_ci * For extent tree v2 we use the block_group_item->chunk_offset to point at our 276662306a36Sopenharmony_ci * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 276762306a36Sopenharmony_ci */ 276862306a36Sopenharmony_cistatic u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 276962306a36Sopenharmony_ci{ 277062306a36Sopenharmony_ci u64 div = SZ_1G; 277162306a36Sopenharmony_ci u64 index; 277262306a36Sopenharmony_ci 277362306a36Sopenharmony_ci if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 277462306a36Sopenharmony_ci return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 277562306a36Sopenharmony_ci 277662306a36Sopenharmony_ci /* If we have a smaller fs index based on 128MiB. 
*/ 277762306a36Sopenharmony_ci if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 277862306a36Sopenharmony_ci div = SZ_128M; 277962306a36Sopenharmony_ci 278062306a36Sopenharmony_ci offset = div64_u64(offset, div); 278162306a36Sopenharmony_ci div64_u64_rem(offset, fs_info->nr_global_roots, &index); 278262306a36Sopenharmony_ci return index; 278362306a36Sopenharmony_ci} 278462306a36Sopenharmony_ci 278562306a36Sopenharmony_cistruct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 278662306a36Sopenharmony_ci u64 type, 278762306a36Sopenharmony_ci u64 chunk_offset, u64 size) 278862306a36Sopenharmony_ci{ 278962306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 279062306a36Sopenharmony_ci struct btrfs_block_group *cache; 279162306a36Sopenharmony_ci int ret; 279262306a36Sopenharmony_ci 279362306a36Sopenharmony_ci btrfs_set_log_full_commit(trans); 279462306a36Sopenharmony_ci 279562306a36Sopenharmony_ci cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 279662306a36Sopenharmony_ci if (!cache) 279762306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 279862306a36Sopenharmony_ci 279962306a36Sopenharmony_ci /* 280062306a36Sopenharmony_ci * Mark it as new before adding it to the rbtree of block groups or any 280162306a36Sopenharmony_ci * list, so that no other task finds it and calls btrfs_mark_bg_unused() 280262306a36Sopenharmony_ci * before the new flag is set. 
280362306a36Sopenharmony_ci */ 280462306a36Sopenharmony_ci set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); 280562306a36Sopenharmony_ci 280662306a36Sopenharmony_ci cache->length = size; 280762306a36Sopenharmony_ci set_free_space_tree_thresholds(cache); 280862306a36Sopenharmony_ci cache->flags = type; 280962306a36Sopenharmony_ci cache->cached = BTRFS_CACHE_FINISHED; 281062306a36Sopenharmony_ci cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 281162306a36Sopenharmony_ci 281262306a36Sopenharmony_ci if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 281362306a36Sopenharmony_ci set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); 281462306a36Sopenharmony_ci 281562306a36Sopenharmony_ci ret = btrfs_load_block_group_zone_info(cache, true); 281662306a36Sopenharmony_ci if (ret) { 281762306a36Sopenharmony_ci btrfs_put_block_group(cache); 281862306a36Sopenharmony_ci return ERR_PTR(ret); 281962306a36Sopenharmony_ci } 282062306a36Sopenharmony_ci 282162306a36Sopenharmony_ci ret = exclude_super_stripes(cache); 282262306a36Sopenharmony_ci if (ret) { 282362306a36Sopenharmony_ci /* We may have excluded something, so call this just in case */ 282462306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 282562306a36Sopenharmony_ci btrfs_put_block_group(cache); 282662306a36Sopenharmony_ci return ERR_PTR(ret); 282762306a36Sopenharmony_ci } 282862306a36Sopenharmony_ci 282962306a36Sopenharmony_ci ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); 283062306a36Sopenharmony_ci btrfs_free_excluded_extents(cache); 283162306a36Sopenharmony_ci if (ret) { 283262306a36Sopenharmony_ci btrfs_put_block_group(cache); 283362306a36Sopenharmony_ci return ERR_PTR(ret); 283462306a36Sopenharmony_ci } 283562306a36Sopenharmony_ci 283662306a36Sopenharmony_ci /* 283762306a36Sopenharmony_ci * Ensure the corresponding space_info object is created and 283862306a36Sopenharmony_ci * assigned to our block group. 
We want our bg to be added to the rbtree 283962306a36Sopenharmony_ci * with its ->space_info set. 284062306a36Sopenharmony_ci */ 284162306a36Sopenharmony_ci cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 284262306a36Sopenharmony_ci ASSERT(cache->space_info); 284362306a36Sopenharmony_ci 284462306a36Sopenharmony_ci ret = btrfs_add_block_group_cache(fs_info, cache); 284562306a36Sopenharmony_ci if (ret) { 284662306a36Sopenharmony_ci btrfs_remove_free_space_cache(cache); 284762306a36Sopenharmony_ci btrfs_put_block_group(cache); 284862306a36Sopenharmony_ci return ERR_PTR(ret); 284962306a36Sopenharmony_ci } 285062306a36Sopenharmony_ci 285162306a36Sopenharmony_ci /* 285262306a36Sopenharmony_ci * Now that our block group has its ->space_info set and is inserted in 285362306a36Sopenharmony_ci * the rbtree, update the space info's counters. 285462306a36Sopenharmony_ci */ 285562306a36Sopenharmony_ci trace_btrfs_add_block_group(fs_info, cache, 1); 285662306a36Sopenharmony_ci btrfs_add_bg_to_space_info(fs_info, cache); 285762306a36Sopenharmony_ci btrfs_update_global_block_rsv(fs_info); 285862306a36Sopenharmony_ci 285962306a36Sopenharmony_ci#ifdef CONFIG_BTRFS_DEBUG 286062306a36Sopenharmony_ci if (btrfs_should_fragment_free_space(cache)) { 286162306a36Sopenharmony_ci cache->space_info->bytes_used += size >> 1; 286262306a36Sopenharmony_ci fragment_free_space(cache); 286362306a36Sopenharmony_ci } 286462306a36Sopenharmony_ci#endif 286562306a36Sopenharmony_ci 286662306a36Sopenharmony_ci list_add_tail(&cache->bg_list, &trans->new_bgs); 286762306a36Sopenharmony_ci trans->delayed_ref_updates++; 286862306a36Sopenharmony_ci btrfs_update_delayed_refs_rsv(trans); 286962306a36Sopenharmony_ci 287062306a36Sopenharmony_ci set_avail_alloc_bits(fs_info, type); 287162306a36Sopenharmony_ci return cache; 287262306a36Sopenharmony_ci} 287362306a36Sopenharmony_ci 287462306a36Sopenharmony_ci/* 287562306a36Sopenharmony_ci * Mark one block group RO, can be called several times for the 
same block 287662306a36Sopenharmony_ci * group. 287762306a36Sopenharmony_ci * 287862306a36Sopenharmony_ci * @cache: the destination block group 287962306a36Sopenharmony_ci * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 288062306a36Sopenharmony_ci * ensure we still have some free space after marking this 288162306a36Sopenharmony_ci * block group RO. 288262306a36Sopenharmony_ci */ 288362306a36Sopenharmony_ciint btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 288462306a36Sopenharmony_ci bool do_chunk_alloc) 288562306a36Sopenharmony_ci{ 288662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 288762306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 288862306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(fs_info); 288962306a36Sopenharmony_ci u64 alloc_flags; 289062306a36Sopenharmony_ci int ret; 289162306a36Sopenharmony_ci bool dirty_bg_running; 289262306a36Sopenharmony_ci 289362306a36Sopenharmony_ci /* 289462306a36Sopenharmony_ci * This can only happen when we are doing read-only scrub on read-only 289562306a36Sopenharmony_ci * mount. 289662306a36Sopenharmony_ci * In that case we should not start a new transaction on read-only fs. 289762306a36Sopenharmony_ci * Thus here we skip all chunk allocations. 
289862306a36Sopenharmony_ci */ 289962306a36Sopenharmony_ci if (sb_rdonly(fs_info->sb)) { 290062306a36Sopenharmony_ci mutex_lock(&fs_info->ro_block_group_mutex); 290162306a36Sopenharmony_ci ret = inc_block_group_ro(cache, 0); 290262306a36Sopenharmony_ci mutex_unlock(&fs_info->ro_block_group_mutex); 290362306a36Sopenharmony_ci return ret; 290462306a36Sopenharmony_ci } 290562306a36Sopenharmony_ci 290662306a36Sopenharmony_ci do { 290762306a36Sopenharmony_ci trans = btrfs_join_transaction(root); 290862306a36Sopenharmony_ci if (IS_ERR(trans)) 290962306a36Sopenharmony_ci return PTR_ERR(trans); 291062306a36Sopenharmony_ci 291162306a36Sopenharmony_ci dirty_bg_running = false; 291262306a36Sopenharmony_ci 291362306a36Sopenharmony_ci /* 291462306a36Sopenharmony_ci * We're not allowed to set block groups readonly after the dirty 291562306a36Sopenharmony_ci * block group cache has started writing. If it already started, 291662306a36Sopenharmony_ci * back off and let this transaction commit. 291762306a36Sopenharmony_ci */ 291862306a36Sopenharmony_ci mutex_lock(&fs_info->ro_block_group_mutex); 291962306a36Sopenharmony_ci if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 292062306a36Sopenharmony_ci u64 transid = trans->transid; 292162306a36Sopenharmony_ci 292262306a36Sopenharmony_ci mutex_unlock(&fs_info->ro_block_group_mutex); 292362306a36Sopenharmony_ci btrfs_end_transaction(trans); 292462306a36Sopenharmony_ci 292562306a36Sopenharmony_ci ret = btrfs_wait_for_commit(fs_info, transid); 292662306a36Sopenharmony_ci if (ret) 292762306a36Sopenharmony_ci return ret; 292862306a36Sopenharmony_ci dirty_bg_running = true; 292962306a36Sopenharmony_ci } 293062306a36Sopenharmony_ci } while (dirty_bg_running); 293162306a36Sopenharmony_ci 293262306a36Sopenharmony_ci if (do_chunk_alloc) { 293362306a36Sopenharmony_ci /* 293462306a36Sopenharmony_ci * If we are changing raid levels, try to allocate a 293562306a36Sopenharmony_ci * corresponding block group with the new raid level. 
293662306a36Sopenharmony_ci */ 293762306a36Sopenharmony_ci alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 293862306a36Sopenharmony_ci if (alloc_flags != cache->flags) { 293962306a36Sopenharmony_ci ret = btrfs_chunk_alloc(trans, alloc_flags, 294062306a36Sopenharmony_ci CHUNK_ALLOC_FORCE); 294162306a36Sopenharmony_ci /* 294262306a36Sopenharmony_ci * ENOSPC is allowed here, we may have enough space 294362306a36Sopenharmony_ci * already allocated at the new raid level to carry on 294462306a36Sopenharmony_ci */ 294562306a36Sopenharmony_ci if (ret == -ENOSPC) 294662306a36Sopenharmony_ci ret = 0; 294762306a36Sopenharmony_ci if (ret < 0) 294862306a36Sopenharmony_ci goto out; 294962306a36Sopenharmony_ci } 295062306a36Sopenharmony_ci } 295162306a36Sopenharmony_ci 295262306a36Sopenharmony_ci ret = inc_block_group_ro(cache, 0); 295362306a36Sopenharmony_ci if (!ret) 295462306a36Sopenharmony_ci goto out; 295562306a36Sopenharmony_ci if (ret == -ETXTBSY) 295662306a36Sopenharmony_ci goto unlock_out; 295762306a36Sopenharmony_ci 295862306a36Sopenharmony_ci /* 295962306a36Sopenharmony_ci * Skip chunk alloction if the bg is SYSTEM, this is to avoid system 296062306a36Sopenharmony_ci * chunk allocation storm to exhaust the system chunk array. Otherwise 296162306a36Sopenharmony_ci * we still want to try our best to mark the block group read-only. 296262306a36Sopenharmony_ci */ 296362306a36Sopenharmony_ci if (!do_chunk_alloc && ret == -ENOSPC && 296462306a36Sopenharmony_ci (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) 296562306a36Sopenharmony_ci goto unlock_out; 296662306a36Sopenharmony_ci 296762306a36Sopenharmony_ci alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 296862306a36Sopenharmony_ci ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 296962306a36Sopenharmony_ci if (ret < 0) 297062306a36Sopenharmony_ci goto out; 297162306a36Sopenharmony_ci /* 297262306a36Sopenharmony_ci * We have allocated a new chunk. 
We also need to activate that chunk to 297362306a36Sopenharmony_ci * grant metadata tickets for zoned filesystem. 297462306a36Sopenharmony_ci */ 297562306a36Sopenharmony_ci ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); 297662306a36Sopenharmony_ci if (ret < 0) 297762306a36Sopenharmony_ci goto out; 297862306a36Sopenharmony_ci 297962306a36Sopenharmony_ci ret = inc_block_group_ro(cache, 0); 298062306a36Sopenharmony_ci if (ret == -ETXTBSY) 298162306a36Sopenharmony_ci goto unlock_out; 298262306a36Sopenharmony_ciout: 298362306a36Sopenharmony_ci if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 298462306a36Sopenharmony_ci alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 298562306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 298662306a36Sopenharmony_ci check_system_chunk(trans, alloc_flags); 298762306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 298862306a36Sopenharmony_ci } 298962306a36Sopenharmony_ciunlock_out: 299062306a36Sopenharmony_ci mutex_unlock(&fs_info->ro_block_group_mutex); 299162306a36Sopenharmony_ci 299262306a36Sopenharmony_ci btrfs_end_transaction(trans); 299362306a36Sopenharmony_ci return ret; 299462306a36Sopenharmony_ci} 299562306a36Sopenharmony_ci 299662306a36Sopenharmony_civoid btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 299762306a36Sopenharmony_ci{ 299862306a36Sopenharmony_ci struct btrfs_space_info *sinfo = cache->space_info; 299962306a36Sopenharmony_ci u64 num_bytes; 300062306a36Sopenharmony_ci 300162306a36Sopenharmony_ci BUG_ON(!cache->ro); 300262306a36Sopenharmony_ci 300362306a36Sopenharmony_ci spin_lock(&sinfo->lock); 300462306a36Sopenharmony_ci spin_lock(&cache->lock); 300562306a36Sopenharmony_ci if (!--cache->ro) { 300662306a36Sopenharmony_ci if (btrfs_is_zoned(cache->fs_info)) { 300762306a36Sopenharmony_ci /* Migrate zone_unusable bytes back */ 300862306a36Sopenharmony_ci cache->zone_unusable = 300962306a36Sopenharmony_ci (cache->alloc_offset - cache->used) + 
301062306a36Sopenharmony_ci (cache->length - cache->zone_capacity); 301162306a36Sopenharmony_ci sinfo->bytes_zone_unusable += cache->zone_unusable; 301262306a36Sopenharmony_ci sinfo->bytes_readonly -= cache->zone_unusable; 301362306a36Sopenharmony_ci } 301462306a36Sopenharmony_ci num_bytes = cache->length - cache->reserved - 301562306a36Sopenharmony_ci cache->pinned - cache->bytes_super - 301662306a36Sopenharmony_ci cache->zone_unusable - cache->used; 301762306a36Sopenharmony_ci sinfo->bytes_readonly -= num_bytes; 301862306a36Sopenharmony_ci list_del_init(&cache->ro_list); 301962306a36Sopenharmony_ci } 302062306a36Sopenharmony_ci spin_unlock(&cache->lock); 302162306a36Sopenharmony_ci spin_unlock(&sinfo->lock); 302262306a36Sopenharmony_ci} 302362306a36Sopenharmony_ci 302462306a36Sopenharmony_cistatic int update_block_group_item(struct btrfs_trans_handle *trans, 302562306a36Sopenharmony_ci struct btrfs_path *path, 302662306a36Sopenharmony_ci struct btrfs_block_group *cache) 302762306a36Sopenharmony_ci{ 302862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 302962306a36Sopenharmony_ci int ret; 303062306a36Sopenharmony_ci struct btrfs_root *root = btrfs_block_group_root(fs_info); 303162306a36Sopenharmony_ci unsigned long bi; 303262306a36Sopenharmony_ci struct extent_buffer *leaf; 303362306a36Sopenharmony_ci struct btrfs_block_group_item bgi; 303462306a36Sopenharmony_ci struct btrfs_key key; 303562306a36Sopenharmony_ci u64 old_commit_used; 303662306a36Sopenharmony_ci u64 used; 303762306a36Sopenharmony_ci 303862306a36Sopenharmony_ci /* 303962306a36Sopenharmony_ci * Block group items update can be triggered out of commit transaction 304062306a36Sopenharmony_ci * critical section, thus we need a consistent view of used bytes. 304162306a36Sopenharmony_ci * We cannot use cache->used directly outside of the spin lock, as it 304262306a36Sopenharmony_ci * may be changed. 
304362306a36Sopenharmony_ci */ 304462306a36Sopenharmony_ci spin_lock(&cache->lock); 304562306a36Sopenharmony_ci old_commit_used = cache->commit_used; 304662306a36Sopenharmony_ci used = cache->used; 304762306a36Sopenharmony_ci /* No change in used bytes, can safely skip it. */ 304862306a36Sopenharmony_ci if (cache->commit_used == used) { 304962306a36Sopenharmony_ci spin_unlock(&cache->lock); 305062306a36Sopenharmony_ci return 0; 305162306a36Sopenharmony_ci } 305262306a36Sopenharmony_ci cache->commit_used = used; 305362306a36Sopenharmony_ci spin_unlock(&cache->lock); 305462306a36Sopenharmony_ci 305562306a36Sopenharmony_ci key.objectid = cache->start; 305662306a36Sopenharmony_ci key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 305762306a36Sopenharmony_ci key.offset = cache->length; 305862306a36Sopenharmony_ci 305962306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 306062306a36Sopenharmony_ci if (ret) { 306162306a36Sopenharmony_ci if (ret > 0) 306262306a36Sopenharmony_ci ret = -ENOENT; 306362306a36Sopenharmony_ci goto fail; 306462306a36Sopenharmony_ci } 306562306a36Sopenharmony_ci 306662306a36Sopenharmony_ci leaf = path->nodes[0]; 306762306a36Sopenharmony_ci bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 306862306a36Sopenharmony_ci btrfs_set_stack_block_group_used(&bgi, used); 306962306a36Sopenharmony_ci btrfs_set_stack_block_group_chunk_objectid(&bgi, 307062306a36Sopenharmony_ci cache->global_root_id); 307162306a36Sopenharmony_ci btrfs_set_stack_block_group_flags(&bgi, cache->flags); 307262306a36Sopenharmony_ci write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 307362306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 307462306a36Sopenharmony_cifail: 307562306a36Sopenharmony_ci btrfs_release_path(path); 307662306a36Sopenharmony_ci /* 307762306a36Sopenharmony_ci * We didn't update the block group item, need to revert commit_used 307862306a36Sopenharmony_ci * unless the block group item didn't exist yet - this is to prevent a 
307962306a36Sopenharmony_ci * race with a concurrent insertion of the block group item, with 308062306a36Sopenharmony_ci * insert_block_group_item(), that happened just after we attempted to 308162306a36Sopenharmony_ci * update. In that case we would reset commit_used to 0 just after the 308262306a36Sopenharmony_ci * insertion set it to a value greater than 0 - if the block group later 308362306a36Sopenharmony_ci * becomes with 0 used bytes, we would incorrectly skip its update. 308462306a36Sopenharmony_ci */ 308562306a36Sopenharmony_ci if (ret < 0 && ret != -ENOENT) { 308662306a36Sopenharmony_ci spin_lock(&cache->lock); 308762306a36Sopenharmony_ci cache->commit_used = old_commit_used; 308862306a36Sopenharmony_ci spin_unlock(&cache->lock); 308962306a36Sopenharmony_ci } 309062306a36Sopenharmony_ci return ret; 309162306a36Sopenharmony_ci 309262306a36Sopenharmony_ci} 309362306a36Sopenharmony_ci 309462306a36Sopenharmony_cistatic int cache_save_setup(struct btrfs_block_group *block_group, 309562306a36Sopenharmony_ci struct btrfs_trans_handle *trans, 309662306a36Sopenharmony_ci struct btrfs_path *path) 309762306a36Sopenharmony_ci{ 309862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 309962306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 310062306a36Sopenharmony_ci struct inode *inode = NULL; 310162306a36Sopenharmony_ci struct extent_changeset *data_reserved = NULL; 310262306a36Sopenharmony_ci u64 alloc_hint = 0; 310362306a36Sopenharmony_ci int dcs = BTRFS_DC_ERROR; 310462306a36Sopenharmony_ci u64 cache_size = 0; 310562306a36Sopenharmony_ci int retries = 0; 310662306a36Sopenharmony_ci int ret = 0; 310762306a36Sopenharmony_ci 310862306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 310962306a36Sopenharmony_ci return 0; 311062306a36Sopenharmony_ci 311162306a36Sopenharmony_ci /* 311262306a36Sopenharmony_ci * If this block group is smaller than 100 megs don't bother caching the 311362306a36Sopenharmony_ci * block 
group. 311462306a36Sopenharmony_ci */ 311562306a36Sopenharmony_ci if (block_group->length < (100 * SZ_1M)) { 311662306a36Sopenharmony_ci spin_lock(&block_group->lock); 311762306a36Sopenharmony_ci block_group->disk_cache_state = BTRFS_DC_WRITTEN; 311862306a36Sopenharmony_ci spin_unlock(&block_group->lock); 311962306a36Sopenharmony_ci return 0; 312062306a36Sopenharmony_ci } 312162306a36Sopenharmony_ci 312262306a36Sopenharmony_ci if (TRANS_ABORTED(trans)) 312362306a36Sopenharmony_ci return 0; 312462306a36Sopenharmony_ciagain: 312562306a36Sopenharmony_ci inode = lookup_free_space_inode(block_group, path); 312662306a36Sopenharmony_ci if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 312762306a36Sopenharmony_ci ret = PTR_ERR(inode); 312862306a36Sopenharmony_ci btrfs_release_path(path); 312962306a36Sopenharmony_ci goto out; 313062306a36Sopenharmony_ci } 313162306a36Sopenharmony_ci 313262306a36Sopenharmony_ci if (IS_ERR(inode)) { 313362306a36Sopenharmony_ci BUG_ON(retries); 313462306a36Sopenharmony_ci retries++; 313562306a36Sopenharmony_ci 313662306a36Sopenharmony_ci if (block_group->ro) 313762306a36Sopenharmony_ci goto out_free; 313862306a36Sopenharmony_ci 313962306a36Sopenharmony_ci ret = create_free_space_inode(trans, block_group, path); 314062306a36Sopenharmony_ci if (ret) 314162306a36Sopenharmony_ci goto out_free; 314262306a36Sopenharmony_ci goto again; 314362306a36Sopenharmony_ci } 314462306a36Sopenharmony_ci 314562306a36Sopenharmony_ci /* 314662306a36Sopenharmony_ci * We want to set the generation to 0, that way if anything goes wrong 314762306a36Sopenharmony_ci * from here on out we know not to trust this cache when we load up next 314862306a36Sopenharmony_ci * time. 
314962306a36Sopenharmony_ci */ 315062306a36Sopenharmony_ci BTRFS_I(inode)->generation = 0; 315162306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 315262306a36Sopenharmony_ci if (ret) { 315362306a36Sopenharmony_ci /* 315462306a36Sopenharmony_ci * So theoretically we could recover from this, simply set the 315562306a36Sopenharmony_ci * super cache generation to 0 so we know to invalidate the 315662306a36Sopenharmony_ci * cache, but then we'd have to keep track of the block groups 315762306a36Sopenharmony_ci * that fail this way so we know we _have_ to reset this cache 315862306a36Sopenharmony_ci * before the next commit or risk reading stale cache. So to 315962306a36Sopenharmony_ci * limit our exposure to horrible edge cases lets just abort the 316062306a36Sopenharmony_ci * transaction, this only happens in really bad situations 316162306a36Sopenharmony_ci * anyway. 316262306a36Sopenharmony_ci */ 316362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 316462306a36Sopenharmony_ci goto out_put; 316562306a36Sopenharmony_ci } 316662306a36Sopenharmony_ci WARN_ON(ret); 316762306a36Sopenharmony_ci 316862306a36Sopenharmony_ci /* We've already setup this transaction, go ahead and exit */ 316962306a36Sopenharmony_ci if (block_group->cache_generation == trans->transid && 317062306a36Sopenharmony_ci i_size_read(inode)) { 317162306a36Sopenharmony_ci dcs = BTRFS_DC_SETUP; 317262306a36Sopenharmony_ci goto out_put; 317362306a36Sopenharmony_ci } 317462306a36Sopenharmony_ci 317562306a36Sopenharmony_ci if (i_size_read(inode) > 0) { 317662306a36Sopenharmony_ci ret = btrfs_check_trunc_cache_free_space(fs_info, 317762306a36Sopenharmony_ci &fs_info->global_block_rsv); 317862306a36Sopenharmony_ci if (ret) 317962306a36Sopenharmony_ci goto out_put; 318062306a36Sopenharmony_ci 318162306a36Sopenharmony_ci ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 318262306a36Sopenharmony_ci if (ret) 318362306a36Sopenharmony_ci goto out_put; 
318462306a36Sopenharmony_ci } 318562306a36Sopenharmony_ci 318662306a36Sopenharmony_ci spin_lock(&block_group->lock); 318762306a36Sopenharmony_ci if (block_group->cached != BTRFS_CACHE_FINISHED || 318862306a36Sopenharmony_ci !btrfs_test_opt(fs_info, SPACE_CACHE)) { 318962306a36Sopenharmony_ci /* 319062306a36Sopenharmony_ci * don't bother trying to write stuff out _if_ 319162306a36Sopenharmony_ci * a) we're not cached, 319262306a36Sopenharmony_ci * b) we're with nospace_cache mount option, 319362306a36Sopenharmony_ci * c) we're with v2 space_cache (FREE_SPACE_TREE). 319462306a36Sopenharmony_ci */ 319562306a36Sopenharmony_ci dcs = BTRFS_DC_WRITTEN; 319662306a36Sopenharmony_ci spin_unlock(&block_group->lock); 319762306a36Sopenharmony_ci goto out_put; 319862306a36Sopenharmony_ci } 319962306a36Sopenharmony_ci spin_unlock(&block_group->lock); 320062306a36Sopenharmony_ci 320162306a36Sopenharmony_ci /* 320262306a36Sopenharmony_ci * We hit an ENOSPC when setting up the cache in this transaction, just 320362306a36Sopenharmony_ci * skip doing the setup, we've already cleared the cache so we're safe. 320462306a36Sopenharmony_ci */ 320562306a36Sopenharmony_ci if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 320662306a36Sopenharmony_ci ret = -ENOSPC; 320762306a36Sopenharmony_ci goto out_put; 320862306a36Sopenharmony_ci } 320962306a36Sopenharmony_ci 321062306a36Sopenharmony_ci /* 321162306a36Sopenharmony_ci * Try to preallocate enough space based on how big the block group is. 321262306a36Sopenharmony_ci * Keep in mind this has to include any pinned space which could end up 321362306a36Sopenharmony_ci * taking up quite a bit since it's not folded into the other space 321462306a36Sopenharmony_ci * cache. 
321562306a36Sopenharmony_ci */ 321662306a36Sopenharmony_ci cache_size = div_u64(block_group->length, SZ_256M); 321762306a36Sopenharmony_ci if (!cache_size) 321862306a36Sopenharmony_ci cache_size = 1; 321962306a36Sopenharmony_ci 322062306a36Sopenharmony_ci cache_size *= 16; 322162306a36Sopenharmony_ci cache_size *= fs_info->sectorsize; 322262306a36Sopenharmony_ci 322362306a36Sopenharmony_ci ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 322462306a36Sopenharmony_ci cache_size, false); 322562306a36Sopenharmony_ci if (ret) 322662306a36Sopenharmony_ci goto out_put; 322762306a36Sopenharmony_ci 322862306a36Sopenharmony_ci ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, 322962306a36Sopenharmony_ci cache_size, cache_size, 323062306a36Sopenharmony_ci &alloc_hint); 323162306a36Sopenharmony_ci /* 323262306a36Sopenharmony_ci * Our cache requires contiguous chunks so that we don't modify a bunch 323362306a36Sopenharmony_ci * of metadata or split extents when writing the cache out, which means 323462306a36Sopenharmony_ci * we can enospc if we are heavily fragmented in addition to just normal 323562306a36Sopenharmony_ci * out of space conditions. So if we hit this just skip setting up any 323662306a36Sopenharmony_ci * other block groups for this transaction, maybe we'll unpin enough 323762306a36Sopenharmony_ci * space the next time around. 
323862306a36Sopenharmony_ci */ 323962306a36Sopenharmony_ci if (!ret) 324062306a36Sopenharmony_ci dcs = BTRFS_DC_SETUP; 324162306a36Sopenharmony_ci else if (ret == -ENOSPC) 324262306a36Sopenharmony_ci set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 324362306a36Sopenharmony_ci 324462306a36Sopenharmony_ciout_put: 324562306a36Sopenharmony_ci iput(inode); 324662306a36Sopenharmony_ciout_free: 324762306a36Sopenharmony_ci btrfs_release_path(path); 324862306a36Sopenharmony_ciout: 324962306a36Sopenharmony_ci spin_lock(&block_group->lock); 325062306a36Sopenharmony_ci if (!ret && dcs == BTRFS_DC_SETUP) 325162306a36Sopenharmony_ci block_group->cache_generation = trans->transid; 325262306a36Sopenharmony_ci block_group->disk_cache_state = dcs; 325362306a36Sopenharmony_ci spin_unlock(&block_group->lock); 325462306a36Sopenharmony_ci 325562306a36Sopenharmony_ci extent_changeset_free(data_reserved); 325662306a36Sopenharmony_ci return ret; 325762306a36Sopenharmony_ci} 325862306a36Sopenharmony_ci 325962306a36Sopenharmony_ciint btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 326062306a36Sopenharmony_ci{ 326162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 326262306a36Sopenharmony_ci struct btrfs_block_group *cache, *tmp; 326362306a36Sopenharmony_ci struct btrfs_transaction *cur_trans = trans->transaction; 326462306a36Sopenharmony_ci struct btrfs_path *path; 326562306a36Sopenharmony_ci 326662306a36Sopenharmony_ci if (list_empty(&cur_trans->dirty_bgs) || 326762306a36Sopenharmony_ci !btrfs_test_opt(fs_info, SPACE_CACHE)) 326862306a36Sopenharmony_ci return 0; 326962306a36Sopenharmony_ci 327062306a36Sopenharmony_ci path = btrfs_alloc_path(); 327162306a36Sopenharmony_ci if (!path) 327262306a36Sopenharmony_ci return -ENOMEM; 327362306a36Sopenharmony_ci 327462306a36Sopenharmony_ci /* Could add new block groups, use _safe just in case */ 327562306a36Sopenharmony_ci list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 
327662306a36Sopenharmony_ci dirty_list) { 327762306a36Sopenharmony_ci if (cache->disk_cache_state == BTRFS_DC_CLEAR) 327862306a36Sopenharmony_ci cache_save_setup(cache, trans, path); 327962306a36Sopenharmony_ci } 328062306a36Sopenharmony_ci 328162306a36Sopenharmony_ci btrfs_free_path(path); 328262306a36Sopenharmony_ci return 0; 328362306a36Sopenharmony_ci} 328462306a36Sopenharmony_ci 328562306a36Sopenharmony_ci/* 328662306a36Sopenharmony_ci * Transaction commit does final block group cache writeback during a critical 328762306a36Sopenharmony_ci * section where nothing is allowed to change the FS. This is required in 328862306a36Sopenharmony_ci * order for the cache to actually match the block group, but can introduce a 328962306a36Sopenharmony_ci * lot of latency into the commit. 329062306a36Sopenharmony_ci * 329162306a36Sopenharmony_ci * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 329262306a36Sopenharmony_ci * There's a chance we'll have to redo some of it if the block group changes 329362306a36Sopenharmony_ci * again during the commit, but it greatly reduces the commit latency by 329462306a36Sopenharmony_ci * getting rid of the easy block groups while we're still allowing others to 329562306a36Sopenharmony_ci * join the commit. 
329662306a36Sopenharmony_ci */ 329762306a36Sopenharmony_ciint btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 329862306a36Sopenharmony_ci{ 329962306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 330062306a36Sopenharmony_ci struct btrfs_block_group *cache; 330162306a36Sopenharmony_ci struct btrfs_transaction *cur_trans = trans->transaction; 330262306a36Sopenharmony_ci int ret = 0; 330362306a36Sopenharmony_ci int should_put; 330462306a36Sopenharmony_ci struct btrfs_path *path = NULL; 330562306a36Sopenharmony_ci LIST_HEAD(dirty); 330662306a36Sopenharmony_ci struct list_head *io = &cur_trans->io_bgs; 330762306a36Sopenharmony_ci int loops = 0; 330862306a36Sopenharmony_ci 330962306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 331062306a36Sopenharmony_ci if (list_empty(&cur_trans->dirty_bgs)) { 331162306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 331262306a36Sopenharmony_ci return 0; 331362306a36Sopenharmony_ci } 331462306a36Sopenharmony_ci list_splice_init(&cur_trans->dirty_bgs, &dirty); 331562306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 331662306a36Sopenharmony_ci 331762306a36Sopenharmony_ciagain: 331862306a36Sopenharmony_ci /* Make sure all the block groups on our dirty list actually exist */ 331962306a36Sopenharmony_ci btrfs_create_pending_block_groups(trans); 332062306a36Sopenharmony_ci 332162306a36Sopenharmony_ci if (!path) { 332262306a36Sopenharmony_ci path = btrfs_alloc_path(); 332362306a36Sopenharmony_ci if (!path) { 332462306a36Sopenharmony_ci ret = -ENOMEM; 332562306a36Sopenharmony_ci goto out; 332662306a36Sopenharmony_ci } 332762306a36Sopenharmony_ci } 332862306a36Sopenharmony_ci 332962306a36Sopenharmony_ci /* 333062306a36Sopenharmony_ci * cache_write_mutex is here only to save us from balance or automatic 333162306a36Sopenharmony_ci * removal of empty block groups deleting this block group while we are 333262306a36Sopenharmony_ci * writing out the cache 
333362306a36Sopenharmony_ci */ 333462306a36Sopenharmony_ci mutex_lock(&trans->transaction->cache_write_mutex); 333562306a36Sopenharmony_ci while (!list_empty(&dirty)) { 333662306a36Sopenharmony_ci bool drop_reserve = true; 333762306a36Sopenharmony_ci 333862306a36Sopenharmony_ci cache = list_first_entry(&dirty, struct btrfs_block_group, 333962306a36Sopenharmony_ci dirty_list); 334062306a36Sopenharmony_ci /* 334162306a36Sopenharmony_ci * This can happen if something re-dirties a block group that 334262306a36Sopenharmony_ci * is already under IO. Just wait for it to finish and then do 334362306a36Sopenharmony_ci * it all again 334462306a36Sopenharmony_ci */ 334562306a36Sopenharmony_ci if (!list_empty(&cache->io_list)) { 334662306a36Sopenharmony_ci list_del_init(&cache->io_list); 334762306a36Sopenharmony_ci btrfs_wait_cache_io(trans, cache, path); 334862306a36Sopenharmony_ci btrfs_put_block_group(cache); 334962306a36Sopenharmony_ci } 335062306a36Sopenharmony_ci 335162306a36Sopenharmony_ci 335262306a36Sopenharmony_ci /* 335362306a36Sopenharmony_ci * btrfs_wait_cache_io uses the cache->dirty_list to decide if 335462306a36Sopenharmony_ci * it should update the cache_state. Don't delete until after 335562306a36Sopenharmony_ci * we wait. 
335662306a36Sopenharmony_ci * 335762306a36Sopenharmony_ci * Since we're not running in the commit critical section 335862306a36Sopenharmony_ci * we need the dirty_bgs_lock to protect from update_block_group 335962306a36Sopenharmony_ci */ 336062306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 336162306a36Sopenharmony_ci list_del_init(&cache->dirty_list); 336262306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 336362306a36Sopenharmony_ci 336462306a36Sopenharmony_ci should_put = 1; 336562306a36Sopenharmony_ci 336662306a36Sopenharmony_ci cache_save_setup(cache, trans, path); 336762306a36Sopenharmony_ci 336862306a36Sopenharmony_ci if (cache->disk_cache_state == BTRFS_DC_SETUP) { 336962306a36Sopenharmony_ci cache->io_ctl.inode = NULL; 337062306a36Sopenharmony_ci ret = btrfs_write_out_cache(trans, cache, path); 337162306a36Sopenharmony_ci if (ret == 0 && cache->io_ctl.inode) { 337262306a36Sopenharmony_ci should_put = 0; 337362306a36Sopenharmony_ci 337462306a36Sopenharmony_ci /* 337562306a36Sopenharmony_ci * The cache_write_mutex is protecting the 337662306a36Sopenharmony_ci * io_list, also refer to the definition of 337762306a36Sopenharmony_ci * btrfs_transaction::io_bgs for more details 337862306a36Sopenharmony_ci */ 337962306a36Sopenharmony_ci list_add_tail(&cache->io_list, io); 338062306a36Sopenharmony_ci } else { 338162306a36Sopenharmony_ci /* 338262306a36Sopenharmony_ci * If we failed to write the cache, the 338362306a36Sopenharmony_ci * generation will be bad and life goes on 338462306a36Sopenharmony_ci */ 338562306a36Sopenharmony_ci ret = 0; 338662306a36Sopenharmony_ci } 338762306a36Sopenharmony_ci } 338862306a36Sopenharmony_ci if (!ret) { 338962306a36Sopenharmony_ci ret = update_block_group_item(trans, path, cache); 339062306a36Sopenharmony_ci /* 339162306a36Sopenharmony_ci * Our block group might still be attached to the list 339262306a36Sopenharmony_ci * of new block groups in the transaction handle of some 339362306a36Sopenharmony_ci * 
other task (struct btrfs_trans_handle->new_bgs). This 339462306a36Sopenharmony_ci * means its block group item isn't yet in the extent 339562306a36Sopenharmony_ci * tree. If this happens ignore the error, as we will 339662306a36Sopenharmony_ci * try again later in the critical section of the 339762306a36Sopenharmony_ci * transaction commit. 339862306a36Sopenharmony_ci */ 339962306a36Sopenharmony_ci if (ret == -ENOENT) { 340062306a36Sopenharmony_ci ret = 0; 340162306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 340262306a36Sopenharmony_ci if (list_empty(&cache->dirty_list)) { 340362306a36Sopenharmony_ci list_add_tail(&cache->dirty_list, 340462306a36Sopenharmony_ci &cur_trans->dirty_bgs); 340562306a36Sopenharmony_ci btrfs_get_block_group(cache); 340662306a36Sopenharmony_ci drop_reserve = false; 340762306a36Sopenharmony_ci } 340862306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 340962306a36Sopenharmony_ci } else if (ret) { 341062306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 341162306a36Sopenharmony_ci } 341262306a36Sopenharmony_ci } 341362306a36Sopenharmony_ci 341462306a36Sopenharmony_ci /* If it's not on the io list, we need to put the block group */ 341562306a36Sopenharmony_ci if (should_put) 341662306a36Sopenharmony_ci btrfs_put_block_group(cache); 341762306a36Sopenharmony_ci if (drop_reserve) 341862306a36Sopenharmony_ci btrfs_delayed_refs_rsv_release(fs_info, 1); 341962306a36Sopenharmony_ci /* 342062306a36Sopenharmony_ci * Avoid blocking other tasks for too long. It might even save 342162306a36Sopenharmony_ci * us from writing caches for block groups that are going to be 342262306a36Sopenharmony_ci * removed. 
342362306a36Sopenharmony_ci */ 342462306a36Sopenharmony_ci mutex_unlock(&trans->transaction->cache_write_mutex); 342562306a36Sopenharmony_ci if (ret) 342662306a36Sopenharmony_ci goto out; 342762306a36Sopenharmony_ci mutex_lock(&trans->transaction->cache_write_mutex); 342862306a36Sopenharmony_ci } 342962306a36Sopenharmony_ci mutex_unlock(&trans->transaction->cache_write_mutex); 343062306a36Sopenharmony_ci 343162306a36Sopenharmony_ci /* 343262306a36Sopenharmony_ci * Go through delayed refs for all the stuff we've just kicked off 343362306a36Sopenharmony_ci * and then loop back (just once) 343462306a36Sopenharmony_ci */ 343562306a36Sopenharmony_ci if (!ret) 343662306a36Sopenharmony_ci ret = btrfs_run_delayed_refs(trans, 0); 343762306a36Sopenharmony_ci if (!ret && loops == 0) { 343862306a36Sopenharmony_ci loops++; 343962306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 344062306a36Sopenharmony_ci list_splice_init(&cur_trans->dirty_bgs, &dirty); 344162306a36Sopenharmony_ci /* 344262306a36Sopenharmony_ci * dirty_bgs_lock protects us from concurrent block group 344362306a36Sopenharmony_ci * deletes too (not just cache_write_mutex). 
344462306a36Sopenharmony_ci */ 344562306a36Sopenharmony_ci if (!list_empty(&dirty)) { 344662306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 344762306a36Sopenharmony_ci goto again; 344862306a36Sopenharmony_ci } 344962306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 345062306a36Sopenharmony_ci } 345162306a36Sopenharmony_ciout: 345262306a36Sopenharmony_ci if (ret < 0) { 345362306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 345462306a36Sopenharmony_ci list_splice_init(&dirty, &cur_trans->dirty_bgs); 345562306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 345662306a36Sopenharmony_ci btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 345762306a36Sopenharmony_ci } 345862306a36Sopenharmony_ci 345962306a36Sopenharmony_ci btrfs_free_path(path); 346062306a36Sopenharmony_ci return ret; 346162306a36Sopenharmony_ci} 346262306a36Sopenharmony_ci 346362306a36Sopenharmony_ciint btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 346462306a36Sopenharmony_ci{ 346562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 346662306a36Sopenharmony_ci struct btrfs_block_group *cache; 346762306a36Sopenharmony_ci struct btrfs_transaction *cur_trans = trans->transaction; 346862306a36Sopenharmony_ci int ret = 0; 346962306a36Sopenharmony_ci int should_put; 347062306a36Sopenharmony_ci struct btrfs_path *path; 347162306a36Sopenharmony_ci struct list_head *io = &cur_trans->io_bgs; 347262306a36Sopenharmony_ci 347362306a36Sopenharmony_ci path = btrfs_alloc_path(); 347462306a36Sopenharmony_ci if (!path) 347562306a36Sopenharmony_ci return -ENOMEM; 347662306a36Sopenharmony_ci 347762306a36Sopenharmony_ci /* 347862306a36Sopenharmony_ci * Even though we are in the critical section of the transaction commit, 347962306a36Sopenharmony_ci * we can still have concurrent tasks adding elements to this 348062306a36Sopenharmony_ci * transaction's list of dirty block groups. 
These tasks correspond to 348162306a36Sopenharmony_ci * endio free space workers started when writeback finishes for a 348262306a36Sopenharmony_ci * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 348362306a36Sopenharmony_ci * allocate new block groups as a result of COWing nodes of the root 348462306a36Sopenharmony_ci * tree when updating the free space inode. The writeback for the space 348562306a36Sopenharmony_ci * caches is triggered by an earlier call to 348662306a36Sopenharmony_ci * btrfs_start_dirty_block_groups() and iterations of the following 348762306a36Sopenharmony_ci * loop. 348862306a36Sopenharmony_ci * Also we want to do the cache_save_setup first and then run the 348962306a36Sopenharmony_ci * delayed refs to make sure we have the best chance at doing this all 349062306a36Sopenharmony_ci * in one shot. 349162306a36Sopenharmony_ci */ 349262306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 349362306a36Sopenharmony_ci while (!list_empty(&cur_trans->dirty_bgs)) { 349462306a36Sopenharmony_ci cache = list_first_entry(&cur_trans->dirty_bgs, 349562306a36Sopenharmony_ci struct btrfs_block_group, 349662306a36Sopenharmony_ci dirty_list); 349762306a36Sopenharmony_ci 349862306a36Sopenharmony_ci /* 349962306a36Sopenharmony_ci * This can happen if cache_save_setup re-dirties a block group 350062306a36Sopenharmony_ci * that is already under IO. 
Just wait for it to finish and 350162306a36Sopenharmony_ci * then do it all again 350262306a36Sopenharmony_ci */ 350362306a36Sopenharmony_ci if (!list_empty(&cache->io_list)) { 350462306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 350562306a36Sopenharmony_ci list_del_init(&cache->io_list); 350662306a36Sopenharmony_ci btrfs_wait_cache_io(trans, cache, path); 350762306a36Sopenharmony_ci btrfs_put_block_group(cache); 350862306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 350962306a36Sopenharmony_ci } 351062306a36Sopenharmony_ci 351162306a36Sopenharmony_ci /* 351262306a36Sopenharmony_ci * Don't remove from the dirty list until after we've waited on 351362306a36Sopenharmony_ci * any pending IO 351462306a36Sopenharmony_ci */ 351562306a36Sopenharmony_ci list_del_init(&cache->dirty_list); 351662306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 351762306a36Sopenharmony_ci should_put = 1; 351862306a36Sopenharmony_ci 351962306a36Sopenharmony_ci cache_save_setup(cache, trans, path); 352062306a36Sopenharmony_ci 352162306a36Sopenharmony_ci if (!ret) 352262306a36Sopenharmony_ci ret = btrfs_run_delayed_refs(trans, 352362306a36Sopenharmony_ci (unsigned long) -1); 352462306a36Sopenharmony_ci 352562306a36Sopenharmony_ci if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 352662306a36Sopenharmony_ci cache->io_ctl.inode = NULL; 352762306a36Sopenharmony_ci ret = btrfs_write_out_cache(trans, cache, path); 352862306a36Sopenharmony_ci if (ret == 0 && cache->io_ctl.inode) { 352962306a36Sopenharmony_ci should_put = 0; 353062306a36Sopenharmony_ci list_add_tail(&cache->io_list, io); 353162306a36Sopenharmony_ci } else { 353262306a36Sopenharmony_ci /* 353362306a36Sopenharmony_ci * If we failed to write the cache, the 353462306a36Sopenharmony_ci * generation will be bad and life goes on 353562306a36Sopenharmony_ci */ 353662306a36Sopenharmony_ci ret = 0; 353762306a36Sopenharmony_ci } 353862306a36Sopenharmony_ci } 353962306a36Sopenharmony_ci if (!ret) { 
354062306a36Sopenharmony_ci ret = update_block_group_item(trans, path, cache); 354162306a36Sopenharmony_ci /* 354262306a36Sopenharmony_ci * One of the free space endio workers might have 354362306a36Sopenharmony_ci * created a new block group while updating a free space 354462306a36Sopenharmony_ci * cache's inode (at inode.c:btrfs_finish_ordered_io()) 354562306a36Sopenharmony_ci * and hasn't released its transaction handle yet, in 354662306a36Sopenharmony_ci * which case the new block group is still attached to 354762306a36Sopenharmony_ci * its transaction handle and its creation has not 354862306a36Sopenharmony_ci * finished yet (no block group item in the extent tree 354962306a36Sopenharmony_ci * yet, etc). If this is the case, wait for all free 355062306a36Sopenharmony_ci * space endio workers to finish and retry. This is a 355162306a36Sopenharmony_ci * very rare case so no need for a more efficient and 355262306a36Sopenharmony_ci * complex approach. 355362306a36Sopenharmony_ci */ 355462306a36Sopenharmony_ci if (ret == -ENOENT) { 355562306a36Sopenharmony_ci wait_event(cur_trans->writer_wait, 355662306a36Sopenharmony_ci atomic_read(&cur_trans->num_writers) == 1); 355762306a36Sopenharmony_ci ret = update_block_group_item(trans, path, cache); 355862306a36Sopenharmony_ci } 355962306a36Sopenharmony_ci if (ret) 356062306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 356162306a36Sopenharmony_ci } 356262306a36Sopenharmony_ci 356362306a36Sopenharmony_ci /* If its not on the io list, we need to put the block group */ 356462306a36Sopenharmony_ci if (should_put) 356562306a36Sopenharmony_ci btrfs_put_block_group(cache); 356662306a36Sopenharmony_ci btrfs_delayed_refs_rsv_release(fs_info, 1); 356762306a36Sopenharmony_ci spin_lock(&cur_trans->dirty_bgs_lock); 356862306a36Sopenharmony_ci } 356962306a36Sopenharmony_ci spin_unlock(&cur_trans->dirty_bgs_lock); 357062306a36Sopenharmony_ci 357162306a36Sopenharmony_ci /* 357262306a36Sopenharmony_ci * Refer to the definition 
of io_bgs member for details why it's safe 357362306a36Sopenharmony_ci * to use it without any locking 357462306a36Sopenharmony_ci */ 357562306a36Sopenharmony_ci while (!list_empty(io)) { 357662306a36Sopenharmony_ci cache = list_first_entry(io, struct btrfs_block_group, 357762306a36Sopenharmony_ci io_list); 357862306a36Sopenharmony_ci list_del_init(&cache->io_list); 357962306a36Sopenharmony_ci btrfs_wait_cache_io(trans, cache, path); 358062306a36Sopenharmony_ci btrfs_put_block_group(cache); 358162306a36Sopenharmony_ci } 358262306a36Sopenharmony_ci 358362306a36Sopenharmony_ci btrfs_free_path(path); 358462306a36Sopenharmony_ci return ret; 358562306a36Sopenharmony_ci} 358662306a36Sopenharmony_ci 358762306a36Sopenharmony_ciint btrfs_update_block_group(struct btrfs_trans_handle *trans, 358862306a36Sopenharmony_ci u64 bytenr, u64 num_bytes, bool alloc) 358962306a36Sopenharmony_ci{ 359062306a36Sopenharmony_ci struct btrfs_fs_info *info = trans->fs_info; 359162306a36Sopenharmony_ci struct btrfs_block_group *cache = NULL; 359262306a36Sopenharmony_ci u64 total = num_bytes; 359362306a36Sopenharmony_ci u64 old_val; 359462306a36Sopenharmony_ci u64 byte_in_group; 359562306a36Sopenharmony_ci int factor; 359662306a36Sopenharmony_ci int ret = 0; 359762306a36Sopenharmony_ci 359862306a36Sopenharmony_ci /* Block accounting for super block */ 359962306a36Sopenharmony_ci spin_lock(&info->delalloc_root_lock); 360062306a36Sopenharmony_ci old_val = btrfs_super_bytes_used(info->super_copy); 360162306a36Sopenharmony_ci if (alloc) 360262306a36Sopenharmony_ci old_val += num_bytes; 360362306a36Sopenharmony_ci else 360462306a36Sopenharmony_ci old_val -= num_bytes; 360562306a36Sopenharmony_ci btrfs_set_super_bytes_used(info->super_copy, old_val); 360662306a36Sopenharmony_ci spin_unlock(&info->delalloc_root_lock); 360762306a36Sopenharmony_ci 360862306a36Sopenharmony_ci while (total) { 360962306a36Sopenharmony_ci struct btrfs_space_info *space_info; 361062306a36Sopenharmony_ci bool reclaim = false; 
361162306a36Sopenharmony_ci 361262306a36Sopenharmony_ci cache = btrfs_lookup_block_group(info, bytenr); 361362306a36Sopenharmony_ci if (!cache) { 361462306a36Sopenharmony_ci ret = -ENOENT; 361562306a36Sopenharmony_ci break; 361662306a36Sopenharmony_ci } 361762306a36Sopenharmony_ci space_info = cache->space_info; 361862306a36Sopenharmony_ci factor = btrfs_bg_type_to_factor(cache->flags); 361962306a36Sopenharmony_ci 362062306a36Sopenharmony_ci /* 362162306a36Sopenharmony_ci * If this block group has free space cache written out, we 362262306a36Sopenharmony_ci * need to make sure to load it if we are removing space. This 362362306a36Sopenharmony_ci * is because we need the unpinning stage to actually add the 362462306a36Sopenharmony_ci * space back to the block group, otherwise we will leak space. 362562306a36Sopenharmony_ci */ 362662306a36Sopenharmony_ci if (!alloc && !btrfs_block_group_done(cache)) 362762306a36Sopenharmony_ci btrfs_cache_block_group(cache, true); 362862306a36Sopenharmony_ci 362962306a36Sopenharmony_ci byte_in_group = bytenr - cache->start; 363062306a36Sopenharmony_ci WARN_ON(byte_in_group > cache->length); 363162306a36Sopenharmony_ci 363262306a36Sopenharmony_ci spin_lock(&space_info->lock); 363362306a36Sopenharmony_ci spin_lock(&cache->lock); 363462306a36Sopenharmony_ci 363562306a36Sopenharmony_ci if (btrfs_test_opt(info, SPACE_CACHE) && 363662306a36Sopenharmony_ci cache->disk_cache_state < BTRFS_DC_CLEAR) 363762306a36Sopenharmony_ci cache->disk_cache_state = BTRFS_DC_CLEAR; 363862306a36Sopenharmony_ci 363962306a36Sopenharmony_ci old_val = cache->used; 364062306a36Sopenharmony_ci num_bytes = min(total, cache->length - byte_in_group); 364162306a36Sopenharmony_ci if (alloc) { 364262306a36Sopenharmony_ci old_val += num_bytes; 364362306a36Sopenharmony_ci cache->used = old_val; 364462306a36Sopenharmony_ci cache->reserved -= num_bytes; 364562306a36Sopenharmony_ci space_info->bytes_reserved -= num_bytes; 364662306a36Sopenharmony_ci space_info->bytes_used 
+= num_bytes; 364762306a36Sopenharmony_ci space_info->disk_used += num_bytes * factor; 364862306a36Sopenharmony_ci spin_unlock(&cache->lock); 364962306a36Sopenharmony_ci spin_unlock(&space_info->lock); 365062306a36Sopenharmony_ci } else { 365162306a36Sopenharmony_ci old_val -= num_bytes; 365262306a36Sopenharmony_ci cache->used = old_val; 365362306a36Sopenharmony_ci cache->pinned += num_bytes; 365462306a36Sopenharmony_ci btrfs_space_info_update_bytes_pinned(info, space_info, 365562306a36Sopenharmony_ci num_bytes); 365662306a36Sopenharmony_ci space_info->bytes_used -= num_bytes; 365762306a36Sopenharmony_ci space_info->disk_used -= num_bytes * factor; 365862306a36Sopenharmony_ci 365962306a36Sopenharmony_ci reclaim = should_reclaim_block_group(cache, num_bytes); 366062306a36Sopenharmony_ci 366162306a36Sopenharmony_ci spin_unlock(&cache->lock); 366262306a36Sopenharmony_ci spin_unlock(&space_info->lock); 366362306a36Sopenharmony_ci 366462306a36Sopenharmony_ci set_extent_bit(&trans->transaction->pinned_extents, 366562306a36Sopenharmony_ci bytenr, bytenr + num_bytes - 1, 366662306a36Sopenharmony_ci EXTENT_DIRTY, NULL); 366762306a36Sopenharmony_ci } 366862306a36Sopenharmony_ci 366962306a36Sopenharmony_ci spin_lock(&trans->transaction->dirty_bgs_lock); 367062306a36Sopenharmony_ci if (list_empty(&cache->dirty_list)) { 367162306a36Sopenharmony_ci list_add_tail(&cache->dirty_list, 367262306a36Sopenharmony_ci &trans->transaction->dirty_bgs); 367362306a36Sopenharmony_ci trans->delayed_ref_updates++; 367462306a36Sopenharmony_ci btrfs_get_block_group(cache); 367562306a36Sopenharmony_ci } 367662306a36Sopenharmony_ci spin_unlock(&trans->transaction->dirty_bgs_lock); 367762306a36Sopenharmony_ci 367862306a36Sopenharmony_ci /* 367962306a36Sopenharmony_ci * No longer have used bytes in this block group, queue it for 368062306a36Sopenharmony_ci * deletion. 
We do this after adding the block group to the 368162306a36Sopenharmony_ci * dirty list to avoid races between cleaner kthread and space 368262306a36Sopenharmony_ci * cache writeout. 368362306a36Sopenharmony_ci */ 368462306a36Sopenharmony_ci if (!alloc && old_val == 0) { 368562306a36Sopenharmony_ci if (!btrfs_test_opt(info, DISCARD_ASYNC)) 368662306a36Sopenharmony_ci btrfs_mark_bg_unused(cache); 368762306a36Sopenharmony_ci } else if (!alloc && reclaim) { 368862306a36Sopenharmony_ci btrfs_mark_bg_to_reclaim(cache); 368962306a36Sopenharmony_ci } 369062306a36Sopenharmony_ci 369162306a36Sopenharmony_ci btrfs_put_block_group(cache); 369262306a36Sopenharmony_ci total -= num_bytes; 369362306a36Sopenharmony_ci bytenr += num_bytes; 369462306a36Sopenharmony_ci } 369562306a36Sopenharmony_ci 369662306a36Sopenharmony_ci /* Modified block groups are accounted for in the delayed_refs_rsv. */ 369762306a36Sopenharmony_ci btrfs_update_delayed_refs_rsv(trans); 369862306a36Sopenharmony_ci return ret; 369962306a36Sopenharmony_ci} 370062306a36Sopenharmony_ci 370162306a36Sopenharmony_ci/* 370262306a36Sopenharmony_ci * Update the block_group and space info counters. 370362306a36Sopenharmony_ci * 370462306a36Sopenharmony_ci * @cache: The cache we are manipulating 370562306a36Sopenharmony_ci * @ram_bytes: The number of bytes of file content, and will be same to 370662306a36Sopenharmony_ci * @num_bytes except for the compress path. 370762306a36Sopenharmony_ci * @num_bytes: The number of bytes in question 370862306a36Sopenharmony_ci * @delalloc: The blocks are allocated for the delalloc write 370962306a36Sopenharmony_ci * 371062306a36Sopenharmony_ci * This is called by the allocator when it reserves space. If this is a 371162306a36Sopenharmony_ci * reservation and the block group has become read only we cannot make the 371262306a36Sopenharmony_ci * reservation and return -EAGAIN, otherwise this function always succeeds. 
371362306a36Sopenharmony_ci */ 371462306a36Sopenharmony_ciint btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 371562306a36Sopenharmony_ci u64 ram_bytes, u64 num_bytes, int delalloc, 371662306a36Sopenharmony_ci bool force_wrong_size_class) 371762306a36Sopenharmony_ci{ 371862306a36Sopenharmony_ci struct btrfs_space_info *space_info = cache->space_info; 371962306a36Sopenharmony_ci enum btrfs_block_group_size_class size_class; 372062306a36Sopenharmony_ci int ret = 0; 372162306a36Sopenharmony_ci 372262306a36Sopenharmony_ci spin_lock(&space_info->lock); 372362306a36Sopenharmony_ci spin_lock(&cache->lock); 372462306a36Sopenharmony_ci if (cache->ro) { 372562306a36Sopenharmony_ci ret = -EAGAIN; 372662306a36Sopenharmony_ci goto out; 372762306a36Sopenharmony_ci } 372862306a36Sopenharmony_ci 372962306a36Sopenharmony_ci if (btrfs_block_group_should_use_size_class(cache)) { 373062306a36Sopenharmony_ci size_class = btrfs_calc_block_group_size_class(num_bytes); 373162306a36Sopenharmony_ci ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); 373262306a36Sopenharmony_ci if (ret) 373362306a36Sopenharmony_ci goto out; 373462306a36Sopenharmony_ci } 373562306a36Sopenharmony_ci cache->reserved += num_bytes; 373662306a36Sopenharmony_ci space_info->bytes_reserved += num_bytes; 373762306a36Sopenharmony_ci trace_btrfs_space_reservation(cache->fs_info, "space_info", 373862306a36Sopenharmony_ci space_info->flags, num_bytes, 1); 373962306a36Sopenharmony_ci btrfs_space_info_update_bytes_may_use(cache->fs_info, 374062306a36Sopenharmony_ci space_info, -ram_bytes); 374162306a36Sopenharmony_ci if (delalloc) 374262306a36Sopenharmony_ci cache->delalloc_bytes += num_bytes; 374362306a36Sopenharmony_ci 374462306a36Sopenharmony_ci /* 374562306a36Sopenharmony_ci * Compression can use less space than we reserved, so wake tickets if 374662306a36Sopenharmony_ci * that happens. 
374762306a36Sopenharmony_ci */ 374862306a36Sopenharmony_ci if (num_bytes < ram_bytes) 374962306a36Sopenharmony_ci btrfs_try_granting_tickets(cache->fs_info, space_info); 375062306a36Sopenharmony_ciout: 375162306a36Sopenharmony_ci spin_unlock(&cache->lock); 375262306a36Sopenharmony_ci spin_unlock(&space_info->lock); 375362306a36Sopenharmony_ci return ret; 375462306a36Sopenharmony_ci} 375562306a36Sopenharmony_ci 375662306a36Sopenharmony_ci/* 375762306a36Sopenharmony_ci * Update the block_group and space info counters. 375862306a36Sopenharmony_ci * 375962306a36Sopenharmony_ci * @cache: The cache we are manipulating 376062306a36Sopenharmony_ci * @num_bytes: The number of bytes in question 376162306a36Sopenharmony_ci * @delalloc: The blocks are allocated for the delalloc write 376262306a36Sopenharmony_ci * 376362306a36Sopenharmony_ci * This is called by somebody who is freeing space that was never actually used 376462306a36Sopenharmony_ci * on disk. For example if you reserve some space for a new leaf in transaction 376562306a36Sopenharmony_ci * A and before transaction A commits you free that leaf, you call this with 376662306a36Sopenharmony_ci * reserve set to 0 in order to clear the reservation. 
376762306a36Sopenharmony_ci */ 376862306a36Sopenharmony_civoid btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 376962306a36Sopenharmony_ci u64 num_bytes, int delalloc) 377062306a36Sopenharmony_ci{ 377162306a36Sopenharmony_ci struct btrfs_space_info *space_info = cache->space_info; 377262306a36Sopenharmony_ci 377362306a36Sopenharmony_ci spin_lock(&space_info->lock); 377462306a36Sopenharmony_ci spin_lock(&cache->lock); 377562306a36Sopenharmony_ci if (cache->ro) 377662306a36Sopenharmony_ci space_info->bytes_readonly += num_bytes; 377762306a36Sopenharmony_ci cache->reserved -= num_bytes; 377862306a36Sopenharmony_ci space_info->bytes_reserved -= num_bytes; 377962306a36Sopenharmony_ci space_info->max_extent_size = 0; 378062306a36Sopenharmony_ci 378162306a36Sopenharmony_ci if (delalloc) 378262306a36Sopenharmony_ci cache->delalloc_bytes -= num_bytes; 378362306a36Sopenharmony_ci spin_unlock(&cache->lock); 378462306a36Sopenharmony_ci 378562306a36Sopenharmony_ci btrfs_try_granting_tickets(cache->fs_info, space_info); 378662306a36Sopenharmony_ci spin_unlock(&space_info->lock); 378762306a36Sopenharmony_ci} 378862306a36Sopenharmony_ci 378962306a36Sopenharmony_cistatic void force_metadata_allocation(struct btrfs_fs_info *info) 379062306a36Sopenharmony_ci{ 379162306a36Sopenharmony_ci struct list_head *head = &info->space_info; 379262306a36Sopenharmony_ci struct btrfs_space_info *found; 379362306a36Sopenharmony_ci 379462306a36Sopenharmony_ci list_for_each_entry(found, head, list) { 379562306a36Sopenharmony_ci if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 379662306a36Sopenharmony_ci found->force_alloc = CHUNK_ALLOC_FORCE; 379762306a36Sopenharmony_ci } 379862306a36Sopenharmony_ci} 379962306a36Sopenharmony_ci 380062306a36Sopenharmony_cistatic int should_alloc_chunk(struct btrfs_fs_info *fs_info, 380162306a36Sopenharmony_ci struct btrfs_space_info *sinfo, int force) 380262306a36Sopenharmony_ci{ 380362306a36Sopenharmony_ci u64 bytes_used = btrfs_space_info_used(sinfo, 
false); 380462306a36Sopenharmony_ci u64 thresh; 380562306a36Sopenharmony_ci 380662306a36Sopenharmony_ci if (force == CHUNK_ALLOC_FORCE) 380762306a36Sopenharmony_ci return 1; 380862306a36Sopenharmony_ci 380962306a36Sopenharmony_ci /* 381062306a36Sopenharmony_ci * in limited mode, we want to have some free space up to 381162306a36Sopenharmony_ci * about 1% of the FS size. 381262306a36Sopenharmony_ci */ 381362306a36Sopenharmony_ci if (force == CHUNK_ALLOC_LIMITED) { 381462306a36Sopenharmony_ci thresh = btrfs_super_total_bytes(fs_info->super_copy); 381562306a36Sopenharmony_ci thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); 381662306a36Sopenharmony_ci 381762306a36Sopenharmony_ci if (sinfo->total_bytes - bytes_used < thresh) 381862306a36Sopenharmony_ci return 1; 381962306a36Sopenharmony_ci } 382062306a36Sopenharmony_ci 382162306a36Sopenharmony_ci if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) 382262306a36Sopenharmony_ci return 0; 382362306a36Sopenharmony_ci return 1; 382462306a36Sopenharmony_ci} 382562306a36Sopenharmony_ci 382662306a36Sopenharmony_ciint btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 382762306a36Sopenharmony_ci{ 382862306a36Sopenharmony_ci u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); 382962306a36Sopenharmony_ci 383062306a36Sopenharmony_ci return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 383162306a36Sopenharmony_ci} 383262306a36Sopenharmony_ci 383362306a36Sopenharmony_cistatic struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) 383462306a36Sopenharmony_ci{ 383562306a36Sopenharmony_ci struct btrfs_block_group *bg; 383662306a36Sopenharmony_ci int ret; 383762306a36Sopenharmony_ci 383862306a36Sopenharmony_ci /* 383962306a36Sopenharmony_ci * Check if we have enough space in the system space info because we 384062306a36Sopenharmony_ci * will need to update device items in the chunk btree and insert a new 384162306a36Sopenharmony_ci * chunk item in the chunk 
btree as well. This will allocate a new 384262306a36Sopenharmony_ci * system block group if needed. 384362306a36Sopenharmony_ci */ 384462306a36Sopenharmony_ci check_system_chunk(trans, flags); 384562306a36Sopenharmony_ci 384662306a36Sopenharmony_ci bg = btrfs_create_chunk(trans, flags); 384762306a36Sopenharmony_ci if (IS_ERR(bg)) { 384862306a36Sopenharmony_ci ret = PTR_ERR(bg); 384962306a36Sopenharmony_ci goto out; 385062306a36Sopenharmony_ci } 385162306a36Sopenharmony_ci 385262306a36Sopenharmony_ci ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 385362306a36Sopenharmony_ci /* 385462306a36Sopenharmony_ci * Normally we are not expected to fail with -ENOSPC here, since we have 385562306a36Sopenharmony_ci * previously reserved space in the system space_info and allocated one 385662306a36Sopenharmony_ci * new system chunk if necessary. However there are three exceptions: 385762306a36Sopenharmony_ci * 385862306a36Sopenharmony_ci * 1) We may have enough free space in the system space_info but all the 385962306a36Sopenharmony_ci * existing system block groups have a profile which can not be used 386062306a36Sopenharmony_ci * for extent allocation. 386162306a36Sopenharmony_ci * 386262306a36Sopenharmony_ci * This happens when mounting in degraded mode. For example we have a 386362306a36Sopenharmony_ci * RAID1 filesystem with 2 devices, lose one device and mount the fs 386462306a36Sopenharmony_ci * using the other device in degraded mode. If we then allocate a chunk, 386562306a36Sopenharmony_ci * we may have enough free space in the existing system space_info, but 386662306a36Sopenharmony_ci * none of the block groups can be used for extent allocation since they 386762306a36Sopenharmony_ci * have a RAID1 profile, and because we are in degraded mode with a 386862306a36Sopenharmony_ci * single device, we are forced to allocate a new system chunk with a 386962306a36Sopenharmony_ci * SINGLE profile. 
Making check_system_chunk() iterate over all system 387062306a36Sopenharmony_ci * block groups and check if they have a usable profile and enough space 387162306a36Sopenharmony_ci * can be slow on very large filesystems, so we tolerate the -ENOSPC and 387262306a36Sopenharmony_ci * try again after forcing allocation of a new system chunk. Like this 387362306a36Sopenharmony_ci * we avoid paying the cost of that search in normal circumstances, when 387462306a36Sopenharmony_ci * we were not mounted in degraded mode; 387562306a36Sopenharmony_ci * 387662306a36Sopenharmony_ci * 2) We had enough free space info the system space_info, and one suitable 387762306a36Sopenharmony_ci * block group to allocate from when we called check_system_chunk() 387862306a36Sopenharmony_ci * above. However right after we called it, the only system block group 387962306a36Sopenharmony_ci * with enough free space got turned into RO mode by a running scrub, 388062306a36Sopenharmony_ci * and in this case we have to allocate a new one and retry. We only 388162306a36Sopenharmony_ci * need do this allocate and retry once, since we have a transaction 388262306a36Sopenharmony_ci * handle and scrub uses the commit root to search for block groups; 388362306a36Sopenharmony_ci * 388462306a36Sopenharmony_ci * 3) We had one system block group with enough free space when we called 388562306a36Sopenharmony_ci * check_system_chunk(), but after that, right before we tried to 388662306a36Sopenharmony_ci * allocate the last extent buffer we needed, a discard operation came 388762306a36Sopenharmony_ci * in and it temporarily removed the last free space entry from the 388862306a36Sopenharmony_ci * block group (discard removes a free space entry, discards it, and 388962306a36Sopenharmony_ci * then adds back the entry to the block group cache). 
389062306a36Sopenharmony_ci */ 389162306a36Sopenharmony_ci if (ret == -ENOSPC) { 389262306a36Sopenharmony_ci const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); 389362306a36Sopenharmony_ci struct btrfs_block_group *sys_bg; 389462306a36Sopenharmony_ci 389562306a36Sopenharmony_ci sys_bg = btrfs_create_chunk(trans, sys_flags); 389662306a36Sopenharmony_ci if (IS_ERR(sys_bg)) { 389762306a36Sopenharmony_ci ret = PTR_ERR(sys_bg); 389862306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 389962306a36Sopenharmony_ci goto out; 390062306a36Sopenharmony_ci } 390162306a36Sopenharmony_ci 390262306a36Sopenharmony_ci ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 390362306a36Sopenharmony_ci if (ret) { 390462306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 390562306a36Sopenharmony_ci goto out; 390662306a36Sopenharmony_ci } 390762306a36Sopenharmony_ci 390862306a36Sopenharmony_ci ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 390962306a36Sopenharmony_ci if (ret) { 391062306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 391162306a36Sopenharmony_ci goto out; 391262306a36Sopenharmony_ci } 391362306a36Sopenharmony_ci } else if (ret) { 391462306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 391562306a36Sopenharmony_ci goto out; 391662306a36Sopenharmony_ci } 391762306a36Sopenharmony_ciout: 391862306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 391962306a36Sopenharmony_ci 392062306a36Sopenharmony_ci if (ret) 392162306a36Sopenharmony_ci return ERR_PTR(ret); 392262306a36Sopenharmony_ci 392362306a36Sopenharmony_ci btrfs_get_block_group(bg); 392462306a36Sopenharmony_ci return bg; 392562306a36Sopenharmony_ci} 392662306a36Sopenharmony_ci 392762306a36Sopenharmony_ci/* 392862306a36Sopenharmony_ci * Chunk allocation is done in 2 phases: 392962306a36Sopenharmony_ci * 393062306a36Sopenharmony_ci * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for 393162306a36Sopenharmony_ci * the chunk, the chunk 
mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempt to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
395062306a36Sopenharmony_ci * However later when the task actually tries to COW an extent buffer from 395162306a36Sopenharmony_ci * the extent btree or from the device btree for example, it is forced to 395262306a36Sopenharmony_ci * allocate a new block group (chunk) because the only one that had enough 395362306a36Sopenharmony_ci * free space was just turned to RO mode by a running scrub for example (or 395462306a36Sopenharmony_ci * device replace, block group reclaim thread, etc), so we can not use it 395562306a36Sopenharmony_ci * for allocating an extent and end up being forced to allocate a new one; 395662306a36Sopenharmony_ci * 395762306a36Sopenharmony_ci * 2) Because we only check that the metadata space_info has enough free bytes, 395862306a36Sopenharmony_ci * we end up not allocating a new metadata chunk in that case. However if 395962306a36Sopenharmony_ci * the filesystem was mounted in degraded mode, none of the existing block 396062306a36Sopenharmony_ci * groups might be suitable for extent allocation due to their incompatible 396162306a36Sopenharmony_ci * profile (for e.g. mounting a 2 devices filesystem, where all block groups 396262306a36Sopenharmony_ci * use a RAID1 profile, in degraded mode using a single device). 
In this case 396362306a36Sopenharmony_ci * when the task attempts to COW some extent buffer of the extent btree for 396462306a36Sopenharmony_ci * example, it will trigger allocation of a new metadata block group with a 396562306a36Sopenharmony_ci * suitable profile (SINGLE profile in the example of the degraded mount of 396662306a36Sopenharmony_ci * the RAID1 filesystem); 396762306a36Sopenharmony_ci * 396862306a36Sopenharmony_ci * 3) The task has reserved enough transaction units / metadata space, but when 396962306a36Sopenharmony_ci * it attempts to COW an extent buffer from the extent or device btree for 397062306a36Sopenharmony_ci * example, it does not find any free extent in any metadata block group, 397162306a36Sopenharmony_ci * therefore forced to try to allocate a new metadata block group. 397262306a36Sopenharmony_ci * This is because some other task allocated all available extents in the 397362306a36Sopenharmony_ci * meanwhile - this typically happens with tasks that don't reserve space 397462306a36Sopenharmony_ci * properly, either intentionally or as a bug. 
One example where this is 397562306a36Sopenharmony_ci * done intentionally is fsync, as it does not reserve any transaction units 397662306a36Sopenharmony_ci * and ends up allocating a variable number of metadata extents for log 397762306a36Sopenharmony_ci * tree extent buffers; 397862306a36Sopenharmony_ci * 397962306a36Sopenharmony_ci * 4) The task has reserved enough transaction units / metadata space, but right 398062306a36Sopenharmony_ci * before it tries to allocate the last extent buffer it needs, a discard 398162306a36Sopenharmony_ci * operation comes in and, temporarily, removes the last free space entry from 398262306a36Sopenharmony_ci * the only metadata block group that had free space (discard starts by 398362306a36Sopenharmony_ci * removing a free space entry from a block group, then does the discard 398462306a36Sopenharmony_ci * operation and, once it's done, it adds back the free space entry to the 398562306a36Sopenharmony_ci * block group). 398662306a36Sopenharmony_ci * 398762306a36Sopenharmony_ci * We also need this 2 phases setup when adding a device to a filesystem with 398862306a36Sopenharmony_ci * a seed device - we must create new metadata and system chunks without adding 398962306a36Sopenharmony_ci * any of the block group items to the chunk, extent and device btrees. If we 399062306a36Sopenharmony_ci * did not do it this way, we would get ENOSPC when attempting to update those 399162306a36Sopenharmony_ci * btrees, since all the chunks from the seed device are read-only. 399262306a36Sopenharmony_ci * 399362306a36Sopenharmony_ci * Phase 1 does the updates and insertions to the chunk btree because if we had 399462306a36Sopenharmony_ci * it done in phase 2 and have a thundering herd of tasks allocating chunks in 399562306a36Sopenharmony_ci * parallel, we risk having too many system chunks allocated by many tasks if 399662306a36Sopenharmony_ci * many tasks reach phase 1 without the previous ones completing phase 2. 
In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task that
 * needs to update the chunk btree (the only btree that uses system chunks), must
 * preallocate chunk space by calling either check_system_chunk() or
 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the latter is used before doing
 * a modification to the chunk btree - use cases for the latter are adding,
 * removing and resizing a device as well as relocation of a system chunk.
 * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex.
This is important to guarantee that while COWing 401562306a36Sopenharmony_ci * an extent buffer from the chunks btree we never trigger allocation of a new 401662306a36Sopenharmony_ci * system chunk, which would result in a deadlock (trying to lock twice an 401762306a36Sopenharmony_ci * extent buffer of the chunk btree, first time before triggering the chunk 401862306a36Sopenharmony_ci * allocation and the second time during chunk allocation while attempting to 401962306a36Sopenharmony_ci * update the chunks btree). The system chunk array is also updated while holding 402062306a36Sopenharmony_ci * that mutex. The same logic applies to removing chunks - we must reserve system 402162306a36Sopenharmony_ci * space, update the chunk btree and the system chunk array in the superblock 402262306a36Sopenharmony_ci * while holding fs_info->chunk_mutex. 402362306a36Sopenharmony_ci * 402462306a36Sopenharmony_ci * This function, btrfs_chunk_alloc(), belongs to phase 1. 402562306a36Sopenharmony_ci * 402662306a36Sopenharmony_ci * If @force is CHUNK_ALLOC_FORCE: 402762306a36Sopenharmony_ci * - return 1 if it successfully allocates a chunk, 402862306a36Sopenharmony_ci * - return errors including -ENOSPC otherwise. 402962306a36Sopenharmony_ci * If @force is NOT CHUNK_ALLOC_FORCE: 403062306a36Sopenharmony_ci * - return 0 if it doesn't need to allocate a new chunk, 403162306a36Sopenharmony_ci * - return 1 if it successfully allocates a chunk, 403262306a36Sopenharmony_ci * - return errors including -ENOSPC otherwise. 
403362306a36Sopenharmony_ci */ 403462306a36Sopenharmony_ciint btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 403562306a36Sopenharmony_ci enum btrfs_chunk_alloc_enum force) 403662306a36Sopenharmony_ci{ 403762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 403862306a36Sopenharmony_ci struct btrfs_space_info *space_info; 403962306a36Sopenharmony_ci struct btrfs_block_group *ret_bg; 404062306a36Sopenharmony_ci bool wait_for_alloc = false; 404162306a36Sopenharmony_ci bool should_alloc = false; 404262306a36Sopenharmony_ci bool from_extent_allocation = false; 404362306a36Sopenharmony_ci int ret = 0; 404462306a36Sopenharmony_ci 404562306a36Sopenharmony_ci if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { 404662306a36Sopenharmony_ci from_extent_allocation = true; 404762306a36Sopenharmony_ci force = CHUNK_ALLOC_FORCE; 404862306a36Sopenharmony_ci } 404962306a36Sopenharmony_ci 405062306a36Sopenharmony_ci /* Don't re-enter if we're already allocating a chunk */ 405162306a36Sopenharmony_ci if (trans->allocating_chunk) 405262306a36Sopenharmony_ci return -ENOSPC; 405362306a36Sopenharmony_ci /* 405462306a36Sopenharmony_ci * Allocation of system chunks can not happen through this path, as we 405562306a36Sopenharmony_ci * could end up in a deadlock if we are allocating a data or metadata 405662306a36Sopenharmony_ci * chunk and there is another task modifying the chunk btree. 
405762306a36Sopenharmony_ci * 405862306a36Sopenharmony_ci * This is because while we are holding the chunk mutex, we will attempt 405962306a36Sopenharmony_ci * to add the new chunk item to the chunk btree or update an existing 406062306a36Sopenharmony_ci * device item in the chunk btree, while the other task that is modifying 406162306a36Sopenharmony_ci * the chunk btree is attempting to COW an extent buffer while holding a 406262306a36Sopenharmony_ci * lock on it and on its parent - if the COW operation triggers a system 406362306a36Sopenharmony_ci * chunk allocation, then we can deadlock because we are holding the 406462306a36Sopenharmony_ci * chunk mutex and we may need to access that extent buffer or its parent 406562306a36Sopenharmony_ci * in order to add the chunk item or update a device item. 406662306a36Sopenharmony_ci * 406762306a36Sopenharmony_ci * Tasks that want to modify the chunk tree should reserve system space 406862306a36Sopenharmony_ci * before updating the chunk btree, by calling either 406962306a36Sopenharmony_ci * btrfs_reserve_chunk_metadata() or check_system_chunk(). 407062306a36Sopenharmony_ci * It's possible that after a task reserves the space, it still ends up 407162306a36Sopenharmony_ci * here - this happens in the cases described above at do_chunk_alloc(). 407262306a36Sopenharmony_ci * The task will have to either retry or fail. 
407362306a36Sopenharmony_ci */ 407462306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 407562306a36Sopenharmony_ci return -ENOSPC; 407662306a36Sopenharmony_ci 407762306a36Sopenharmony_ci space_info = btrfs_find_space_info(fs_info, flags); 407862306a36Sopenharmony_ci ASSERT(space_info); 407962306a36Sopenharmony_ci 408062306a36Sopenharmony_ci do { 408162306a36Sopenharmony_ci spin_lock(&space_info->lock); 408262306a36Sopenharmony_ci if (force < space_info->force_alloc) 408362306a36Sopenharmony_ci force = space_info->force_alloc; 408462306a36Sopenharmony_ci should_alloc = should_alloc_chunk(fs_info, space_info, force); 408562306a36Sopenharmony_ci if (space_info->full) { 408662306a36Sopenharmony_ci /* No more free physical space */ 408762306a36Sopenharmony_ci if (should_alloc) 408862306a36Sopenharmony_ci ret = -ENOSPC; 408962306a36Sopenharmony_ci else 409062306a36Sopenharmony_ci ret = 0; 409162306a36Sopenharmony_ci spin_unlock(&space_info->lock); 409262306a36Sopenharmony_ci return ret; 409362306a36Sopenharmony_ci } else if (!should_alloc) { 409462306a36Sopenharmony_ci spin_unlock(&space_info->lock); 409562306a36Sopenharmony_ci return 0; 409662306a36Sopenharmony_ci } else if (space_info->chunk_alloc) { 409762306a36Sopenharmony_ci /* 409862306a36Sopenharmony_ci * Someone is already allocating, so we need to block 409962306a36Sopenharmony_ci * until this someone is finished and then loop to 410062306a36Sopenharmony_ci * recheck if we should continue with our allocation 410162306a36Sopenharmony_ci * attempt. 
410262306a36Sopenharmony_ci */ 410362306a36Sopenharmony_ci wait_for_alloc = true; 410462306a36Sopenharmony_ci force = CHUNK_ALLOC_NO_FORCE; 410562306a36Sopenharmony_ci spin_unlock(&space_info->lock); 410662306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 410762306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 410862306a36Sopenharmony_ci } else { 410962306a36Sopenharmony_ci /* Proceed with allocation */ 411062306a36Sopenharmony_ci space_info->chunk_alloc = 1; 411162306a36Sopenharmony_ci wait_for_alloc = false; 411262306a36Sopenharmony_ci spin_unlock(&space_info->lock); 411362306a36Sopenharmony_ci } 411462306a36Sopenharmony_ci 411562306a36Sopenharmony_ci cond_resched(); 411662306a36Sopenharmony_ci } while (wait_for_alloc); 411762306a36Sopenharmony_ci 411862306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 411962306a36Sopenharmony_ci trans->allocating_chunk = true; 412062306a36Sopenharmony_ci 412162306a36Sopenharmony_ci /* 412262306a36Sopenharmony_ci * If we have mixed data/metadata chunks we want to make sure we keep 412362306a36Sopenharmony_ci * allocating mixed chunks instead of individual chunks. 412462306a36Sopenharmony_ci */ 412562306a36Sopenharmony_ci if (btrfs_mixed_space_info(space_info)) 412662306a36Sopenharmony_ci flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 412762306a36Sopenharmony_ci 412862306a36Sopenharmony_ci /* 412962306a36Sopenharmony_ci * if we're doing a data chunk, go ahead and make sure that 413062306a36Sopenharmony_ci * we keep a reasonable number of metadata chunks allocated in the 413162306a36Sopenharmony_ci * FS as well. 
413262306a36Sopenharmony_ci */ 413362306a36Sopenharmony_ci if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 413462306a36Sopenharmony_ci fs_info->data_chunk_allocations++; 413562306a36Sopenharmony_ci if (!(fs_info->data_chunk_allocations % 413662306a36Sopenharmony_ci fs_info->metadata_ratio)) 413762306a36Sopenharmony_ci force_metadata_allocation(fs_info); 413862306a36Sopenharmony_ci } 413962306a36Sopenharmony_ci 414062306a36Sopenharmony_ci ret_bg = do_chunk_alloc(trans, flags); 414162306a36Sopenharmony_ci trans->allocating_chunk = false; 414262306a36Sopenharmony_ci 414362306a36Sopenharmony_ci if (IS_ERR(ret_bg)) { 414462306a36Sopenharmony_ci ret = PTR_ERR(ret_bg); 414562306a36Sopenharmony_ci } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { 414662306a36Sopenharmony_ci /* 414762306a36Sopenharmony_ci * New block group is likely to be used soon. Try to activate 414862306a36Sopenharmony_ci * it now. Failure is OK for now. 414962306a36Sopenharmony_ci */ 415062306a36Sopenharmony_ci btrfs_zone_activate(ret_bg); 415162306a36Sopenharmony_ci } 415262306a36Sopenharmony_ci 415362306a36Sopenharmony_ci if (!ret) 415462306a36Sopenharmony_ci btrfs_put_block_group(ret_bg); 415562306a36Sopenharmony_ci 415662306a36Sopenharmony_ci spin_lock(&space_info->lock); 415762306a36Sopenharmony_ci if (ret < 0) { 415862306a36Sopenharmony_ci if (ret == -ENOSPC) 415962306a36Sopenharmony_ci space_info->full = 1; 416062306a36Sopenharmony_ci else 416162306a36Sopenharmony_ci goto out; 416262306a36Sopenharmony_ci } else { 416362306a36Sopenharmony_ci ret = 1; 416462306a36Sopenharmony_ci space_info->max_extent_size = 0; 416562306a36Sopenharmony_ci } 416662306a36Sopenharmony_ci 416762306a36Sopenharmony_ci space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 416862306a36Sopenharmony_ciout: 416962306a36Sopenharmony_ci space_info->chunk_alloc = 0; 417062306a36Sopenharmony_ci spin_unlock(&space_info->lock); 417162306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 
417262306a36Sopenharmony_ci 417362306a36Sopenharmony_ci return ret; 417462306a36Sopenharmony_ci} 417562306a36Sopenharmony_ci 417662306a36Sopenharmony_cistatic u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 417762306a36Sopenharmony_ci{ 417862306a36Sopenharmony_ci u64 num_dev; 417962306a36Sopenharmony_ci 418062306a36Sopenharmony_ci num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 418162306a36Sopenharmony_ci if (!num_dev) 418262306a36Sopenharmony_ci num_dev = fs_info->fs_devices->rw_devices; 418362306a36Sopenharmony_ci 418462306a36Sopenharmony_ci return num_dev; 418562306a36Sopenharmony_ci} 418662306a36Sopenharmony_ci 418762306a36Sopenharmony_cistatic void reserve_chunk_space(struct btrfs_trans_handle *trans, 418862306a36Sopenharmony_ci u64 bytes, 418962306a36Sopenharmony_ci u64 type) 419062306a36Sopenharmony_ci{ 419162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 419262306a36Sopenharmony_ci struct btrfs_space_info *info; 419362306a36Sopenharmony_ci u64 left; 419462306a36Sopenharmony_ci int ret = 0; 419562306a36Sopenharmony_ci 419662306a36Sopenharmony_ci /* 419762306a36Sopenharmony_ci * Needed because we can end up allocating a system chunk and for an 419862306a36Sopenharmony_ci * atomic and race free space reservation in the chunk block reserve. 
419962306a36Sopenharmony_ci */ 420062306a36Sopenharmony_ci lockdep_assert_held(&fs_info->chunk_mutex); 420162306a36Sopenharmony_ci 420262306a36Sopenharmony_ci info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 420362306a36Sopenharmony_ci spin_lock(&info->lock); 420462306a36Sopenharmony_ci left = info->total_bytes - btrfs_space_info_used(info, true); 420562306a36Sopenharmony_ci spin_unlock(&info->lock); 420662306a36Sopenharmony_ci 420762306a36Sopenharmony_ci if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 420862306a36Sopenharmony_ci btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 420962306a36Sopenharmony_ci left, bytes, type); 421062306a36Sopenharmony_ci btrfs_dump_space_info(fs_info, info, 0, 0); 421162306a36Sopenharmony_ci } 421262306a36Sopenharmony_ci 421362306a36Sopenharmony_ci if (left < bytes) { 421462306a36Sopenharmony_ci u64 flags = btrfs_system_alloc_profile(fs_info); 421562306a36Sopenharmony_ci struct btrfs_block_group *bg; 421662306a36Sopenharmony_ci 421762306a36Sopenharmony_ci /* 421862306a36Sopenharmony_ci * Ignore failure to create system chunk. We might end up not 421962306a36Sopenharmony_ci * needing it, as we might not need to COW all nodes/leafs from 422062306a36Sopenharmony_ci * the paths we visit in the chunk tree (they were already COWed 422162306a36Sopenharmony_ci * or created in the current transaction for example). 422262306a36Sopenharmony_ci */ 422362306a36Sopenharmony_ci bg = btrfs_create_chunk(trans, flags); 422462306a36Sopenharmony_ci if (IS_ERR(bg)) { 422562306a36Sopenharmony_ci ret = PTR_ERR(bg); 422662306a36Sopenharmony_ci } else { 422762306a36Sopenharmony_ci /* 422862306a36Sopenharmony_ci * We have a new chunk. We also need to activate it for 422962306a36Sopenharmony_ci * zoned filesystem. 
423062306a36Sopenharmony_ci */ 423162306a36Sopenharmony_ci ret = btrfs_zoned_activate_one_bg(fs_info, info, true); 423262306a36Sopenharmony_ci if (ret < 0) 423362306a36Sopenharmony_ci return; 423462306a36Sopenharmony_ci 423562306a36Sopenharmony_ci /* 423662306a36Sopenharmony_ci * If we fail to add the chunk item here, we end up 423762306a36Sopenharmony_ci * trying again at phase 2 of chunk allocation, at 423862306a36Sopenharmony_ci * btrfs_create_pending_block_groups(). So ignore 423962306a36Sopenharmony_ci * any error here. An ENOSPC here could happen, due to 424062306a36Sopenharmony_ci * the cases described at do_chunk_alloc() - the system 424162306a36Sopenharmony_ci * block group we just created was just turned into RO 424262306a36Sopenharmony_ci * mode by a scrub for example, or a running discard 424362306a36Sopenharmony_ci * temporarily removed its free space entries, etc. 424462306a36Sopenharmony_ci */ 424562306a36Sopenharmony_ci btrfs_chunk_alloc_add_chunk_item(trans, bg); 424662306a36Sopenharmony_ci } 424762306a36Sopenharmony_ci } 424862306a36Sopenharmony_ci 424962306a36Sopenharmony_ci if (!ret) { 425062306a36Sopenharmony_ci ret = btrfs_block_rsv_add(fs_info, 425162306a36Sopenharmony_ci &fs_info->chunk_block_rsv, 425262306a36Sopenharmony_ci bytes, BTRFS_RESERVE_NO_FLUSH); 425362306a36Sopenharmony_ci if (!ret) 425462306a36Sopenharmony_ci trans->chunk_bytes_reserved += bytes; 425562306a36Sopenharmony_ci } 425662306a36Sopenharmony_ci} 425762306a36Sopenharmony_ci 425862306a36Sopenharmony_ci/* 425962306a36Sopenharmony_ci * Reserve space in the system space for allocating or removing a chunk. 426062306a36Sopenharmony_ci * The caller must be holding fs_info->chunk_mutex. 
426162306a36Sopenharmony_ci */ 426262306a36Sopenharmony_civoid check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 426362306a36Sopenharmony_ci{ 426462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 426562306a36Sopenharmony_ci const u64 num_devs = get_profile_num_devs(fs_info, type); 426662306a36Sopenharmony_ci u64 bytes; 426762306a36Sopenharmony_ci 426862306a36Sopenharmony_ci /* num_devs device items to update and 1 chunk item to add or remove. */ 426962306a36Sopenharmony_ci bytes = btrfs_calc_metadata_size(fs_info, num_devs) + 427062306a36Sopenharmony_ci btrfs_calc_insert_metadata_size(fs_info, 1); 427162306a36Sopenharmony_ci 427262306a36Sopenharmony_ci reserve_chunk_space(trans, bytes, type); 427362306a36Sopenharmony_ci} 427462306a36Sopenharmony_ci 427562306a36Sopenharmony_ci/* 427662306a36Sopenharmony_ci * Reserve space in the system space, if needed, for doing a modification to the 427762306a36Sopenharmony_ci * chunk btree. 427862306a36Sopenharmony_ci * 427962306a36Sopenharmony_ci * @trans: A transaction handle. 428062306a36Sopenharmony_ci * @is_item_insertion: Indicate if the modification is for inserting a new item 428162306a36Sopenharmony_ci * in the chunk btree or if it's for the deletion or update 428262306a36Sopenharmony_ci * of an existing item. 428362306a36Sopenharmony_ci * 428462306a36Sopenharmony_ci * This is used in a context where we need to update the chunk btree outside 428562306a36Sopenharmony_ci * block group allocation and removal, to avoid a deadlock with a concurrent 428662306a36Sopenharmony_ci * task that is allocating a metadata or data block group and therefore needs to 428762306a36Sopenharmony_ci * update the chunk btree while holding the chunk mutex. After the update to the 428862306a36Sopenharmony_ci * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. 
428962306a36Sopenharmony_ci * 429062306a36Sopenharmony_ci */ 429162306a36Sopenharmony_civoid btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, 429262306a36Sopenharmony_ci bool is_item_insertion) 429362306a36Sopenharmony_ci{ 429462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 429562306a36Sopenharmony_ci u64 bytes; 429662306a36Sopenharmony_ci 429762306a36Sopenharmony_ci if (is_item_insertion) 429862306a36Sopenharmony_ci bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 429962306a36Sopenharmony_ci else 430062306a36Sopenharmony_ci bytes = btrfs_calc_metadata_size(fs_info, 1); 430162306a36Sopenharmony_ci 430262306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 430362306a36Sopenharmony_ci reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); 430462306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 430562306a36Sopenharmony_ci} 430662306a36Sopenharmony_ci 430762306a36Sopenharmony_civoid btrfs_put_block_group_cache(struct btrfs_fs_info *info) 430862306a36Sopenharmony_ci{ 430962306a36Sopenharmony_ci struct btrfs_block_group *block_group; 431062306a36Sopenharmony_ci 431162306a36Sopenharmony_ci block_group = btrfs_lookup_first_block_group(info, 0); 431262306a36Sopenharmony_ci while (block_group) { 431362306a36Sopenharmony_ci btrfs_wait_block_group_cache_done(block_group); 431462306a36Sopenharmony_ci spin_lock(&block_group->lock); 431562306a36Sopenharmony_ci if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, 431662306a36Sopenharmony_ci &block_group->runtime_flags)) { 431762306a36Sopenharmony_ci struct inode *inode = block_group->inode; 431862306a36Sopenharmony_ci 431962306a36Sopenharmony_ci block_group->inode = NULL; 432062306a36Sopenharmony_ci spin_unlock(&block_group->lock); 432162306a36Sopenharmony_ci 432262306a36Sopenharmony_ci ASSERT(block_group->io_ctl.inode == NULL); 432362306a36Sopenharmony_ci iput(inode); 432462306a36Sopenharmony_ci } else { 432562306a36Sopenharmony_ci spin_unlock(&block_group->lock); 
432662306a36Sopenharmony_ci } 432762306a36Sopenharmony_ci block_group = btrfs_next_block_group(block_group); 432862306a36Sopenharmony_ci } 432962306a36Sopenharmony_ci} 433062306a36Sopenharmony_ci 433162306a36Sopenharmony_ci/* 433262306a36Sopenharmony_ci * Must be called only after stopping all workers, since we could have block 433362306a36Sopenharmony_ci * group caching kthreads running, and therefore they could race with us if we 433462306a36Sopenharmony_ci * freed the block groups before stopping them. 433562306a36Sopenharmony_ci */ 433662306a36Sopenharmony_ciint btrfs_free_block_groups(struct btrfs_fs_info *info) 433762306a36Sopenharmony_ci{ 433862306a36Sopenharmony_ci struct btrfs_block_group *block_group; 433962306a36Sopenharmony_ci struct btrfs_space_info *space_info; 434062306a36Sopenharmony_ci struct btrfs_caching_control *caching_ctl; 434162306a36Sopenharmony_ci struct rb_node *n; 434262306a36Sopenharmony_ci 434362306a36Sopenharmony_ci if (btrfs_is_zoned(info)) { 434462306a36Sopenharmony_ci if (info->active_meta_bg) { 434562306a36Sopenharmony_ci btrfs_put_block_group(info->active_meta_bg); 434662306a36Sopenharmony_ci info->active_meta_bg = NULL; 434762306a36Sopenharmony_ci } 434862306a36Sopenharmony_ci if (info->active_system_bg) { 434962306a36Sopenharmony_ci btrfs_put_block_group(info->active_system_bg); 435062306a36Sopenharmony_ci info->active_system_bg = NULL; 435162306a36Sopenharmony_ci } 435262306a36Sopenharmony_ci } 435362306a36Sopenharmony_ci 435462306a36Sopenharmony_ci write_lock(&info->block_group_cache_lock); 435562306a36Sopenharmony_ci while (!list_empty(&info->caching_block_groups)) { 435662306a36Sopenharmony_ci caching_ctl = list_entry(info->caching_block_groups.next, 435762306a36Sopenharmony_ci struct btrfs_caching_control, list); 435862306a36Sopenharmony_ci list_del(&caching_ctl->list); 435962306a36Sopenharmony_ci btrfs_put_caching_control(caching_ctl); 436062306a36Sopenharmony_ci } 436162306a36Sopenharmony_ci 
write_unlock(&info->block_group_cache_lock); 436262306a36Sopenharmony_ci 436362306a36Sopenharmony_ci spin_lock(&info->unused_bgs_lock); 436462306a36Sopenharmony_ci while (!list_empty(&info->unused_bgs)) { 436562306a36Sopenharmony_ci block_group = list_first_entry(&info->unused_bgs, 436662306a36Sopenharmony_ci struct btrfs_block_group, 436762306a36Sopenharmony_ci bg_list); 436862306a36Sopenharmony_ci list_del_init(&block_group->bg_list); 436962306a36Sopenharmony_ci btrfs_put_block_group(block_group); 437062306a36Sopenharmony_ci } 437162306a36Sopenharmony_ci 437262306a36Sopenharmony_ci while (!list_empty(&info->reclaim_bgs)) { 437362306a36Sopenharmony_ci block_group = list_first_entry(&info->reclaim_bgs, 437462306a36Sopenharmony_ci struct btrfs_block_group, 437562306a36Sopenharmony_ci bg_list); 437662306a36Sopenharmony_ci list_del_init(&block_group->bg_list); 437762306a36Sopenharmony_ci btrfs_put_block_group(block_group); 437862306a36Sopenharmony_ci } 437962306a36Sopenharmony_ci spin_unlock(&info->unused_bgs_lock); 438062306a36Sopenharmony_ci 438162306a36Sopenharmony_ci spin_lock(&info->zone_active_bgs_lock); 438262306a36Sopenharmony_ci while (!list_empty(&info->zone_active_bgs)) { 438362306a36Sopenharmony_ci block_group = list_first_entry(&info->zone_active_bgs, 438462306a36Sopenharmony_ci struct btrfs_block_group, 438562306a36Sopenharmony_ci active_bg_list); 438662306a36Sopenharmony_ci list_del_init(&block_group->active_bg_list); 438762306a36Sopenharmony_ci btrfs_put_block_group(block_group); 438862306a36Sopenharmony_ci } 438962306a36Sopenharmony_ci spin_unlock(&info->zone_active_bgs_lock); 439062306a36Sopenharmony_ci 439162306a36Sopenharmony_ci write_lock(&info->block_group_cache_lock); 439262306a36Sopenharmony_ci while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { 439362306a36Sopenharmony_ci block_group = rb_entry(n, struct btrfs_block_group, 439462306a36Sopenharmony_ci cache_node); 439562306a36Sopenharmony_ci 
rb_erase_cached(&block_group->cache_node, 439662306a36Sopenharmony_ci &info->block_group_cache_tree); 439762306a36Sopenharmony_ci RB_CLEAR_NODE(&block_group->cache_node); 439862306a36Sopenharmony_ci write_unlock(&info->block_group_cache_lock); 439962306a36Sopenharmony_ci 440062306a36Sopenharmony_ci down_write(&block_group->space_info->groups_sem); 440162306a36Sopenharmony_ci list_del(&block_group->list); 440262306a36Sopenharmony_ci up_write(&block_group->space_info->groups_sem); 440362306a36Sopenharmony_ci 440462306a36Sopenharmony_ci /* 440562306a36Sopenharmony_ci * We haven't cached this block group, which means we could 440662306a36Sopenharmony_ci * possibly have excluded extents on this block group. 440762306a36Sopenharmony_ci */ 440862306a36Sopenharmony_ci if (block_group->cached == BTRFS_CACHE_NO || 440962306a36Sopenharmony_ci block_group->cached == BTRFS_CACHE_ERROR) 441062306a36Sopenharmony_ci btrfs_free_excluded_extents(block_group); 441162306a36Sopenharmony_ci 441262306a36Sopenharmony_ci btrfs_remove_free_space_cache(block_group); 441362306a36Sopenharmony_ci ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 441462306a36Sopenharmony_ci ASSERT(list_empty(&block_group->dirty_list)); 441562306a36Sopenharmony_ci ASSERT(list_empty(&block_group->io_list)); 441662306a36Sopenharmony_ci ASSERT(list_empty(&block_group->bg_list)); 441762306a36Sopenharmony_ci ASSERT(refcount_read(&block_group->refs) == 1); 441862306a36Sopenharmony_ci ASSERT(block_group->swap_extents == 0); 441962306a36Sopenharmony_ci btrfs_put_block_group(block_group); 442062306a36Sopenharmony_ci 442162306a36Sopenharmony_ci write_lock(&info->block_group_cache_lock); 442262306a36Sopenharmony_ci } 442362306a36Sopenharmony_ci write_unlock(&info->block_group_cache_lock); 442462306a36Sopenharmony_ci 442562306a36Sopenharmony_ci btrfs_release_global_block_rsv(info); 442662306a36Sopenharmony_ci 442762306a36Sopenharmony_ci while (!list_empty(&info->space_info)) { 442862306a36Sopenharmony_ci space_info = 
list_entry(info->space_info.next, 442962306a36Sopenharmony_ci struct btrfs_space_info, 443062306a36Sopenharmony_ci list); 443162306a36Sopenharmony_ci 443262306a36Sopenharmony_ci /* 443362306a36Sopenharmony_ci * Do not hide this behind enospc_debug, this is actually 443462306a36Sopenharmony_ci * important and indicates a real bug if this happens. 443562306a36Sopenharmony_ci */ 443662306a36Sopenharmony_ci if (WARN_ON(space_info->bytes_pinned > 0 || 443762306a36Sopenharmony_ci space_info->bytes_may_use > 0)) 443862306a36Sopenharmony_ci btrfs_dump_space_info(info, space_info, 0, 0); 443962306a36Sopenharmony_ci 444062306a36Sopenharmony_ci /* 444162306a36Sopenharmony_ci * If there was a failure to cleanup a log tree, very likely due 444262306a36Sopenharmony_ci * to an IO failure on a writeback attempt of one or more of its 444362306a36Sopenharmony_ci * extent buffers, we could not do proper (and cheap) unaccounting 444462306a36Sopenharmony_ci * of their reserved space, so don't warn on bytes_reserved > 0 in 444562306a36Sopenharmony_ci * that case. 
444662306a36Sopenharmony_ci */ 444762306a36Sopenharmony_ci if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || 444862306a36Sopenharmony_ci !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { 444962306a36Sopenharmony_ci if (WARN_ON(space_info->bytes_reserved > 0)) 445062306a36Sopenharmony_ci btrfs_dump_space_info(info, space_info, 0, 0); 445162306a36Sopenharmony_ci } 445262306a36Sopenharmony_ci 445362306a36Sopenharmony_ci WARN_ON(space_info->reclaim_size > 0); 445462306a36Sopenharmony_ci list_del(&space_info->list); 445562306a36Sopenharmony_ci btrfs_sysfs_remove_space_info(space_info); 445662306a36Sopenharmony_ci } 445762306a36Sopenharmony_ci return 0; 445862306a36Sopenharmony_ci} 445962306a36Sopenharmony_ci 446062306a36Sopenharmony_civoid btrfs_freeze_block_group(struct btrfs_block_group *cache) 446162306a36Sopenharmony_ci{ 446262306a36Sopenharmony_ci atomic_inc(&cache->frozen); 446362306a36Sopenharmony_ci} 446462306a36Sopenharmony_ci 446562306a36Sopenharmony_civoid btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) 446662306a36Sopenharmony_ci{ 446762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = block_group->fs_info; 446862306a36Sopenharmony_ci struct extent_map_tree *em_tree; 446962306a36Sopenharmony_ci struct extent_map *em; 447062306a36Sopenharmony_ci bool cleanup; 447162306a36Sopenharmony_ci 447262306a36Sopenharmony_ci spin_lock(&block_group->lock); 447362306a36Sopenharmony_ci cleanup = (atomic_dec_and_test(&block_group->frozen) && 447462306a36Sopenharmony_ci test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); 447562306a36Sopenharmony_ci spin_unlock(&block_group->lock); 447662306a36Sopenharmony_ci 447762306a36Sopenharmony_ci if (cleanup) { 447862306a36Sopenharmony_ci em_tree = &fs_info->mapping_tree; 447962306a36Sopenharmony_ci write_lock(&em_tree->lock); 448062306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, block_group->start, 448162306a36Sopenharmony_ci 1); 448262306a36Sopenharmony_ci BUG_ON(!em); /* logic error, can't 
happen */ 448362306a36Sopenharmony_ci remove_extent_mapping(em_tree, em); 448462306a36Sopenharmony_ci write_unlock(&em_tree->lock); 448562306a36Sopenharmony_ci 448662306a36Sopenharmony_ci /* once for us and once for the tree */ 448762306a36Sopenharmony_ci free_extent_map(em); 448862306a36Sopenharmony_ci free_extent_map(em); 448962306a36Sopenharmony_ci 449062306a36Sopenharmony_ci /* 449162306a36Sopenharmony_ci * We may have left one free space entry and other possible 449262306a36Sopenharmony_ci * tasks trimming this block group have left 1 entry each one. 449362306a36Sopenharmony_ci * Free them if any. 449462306a36Sopenharmony_ci */ 449562306a36Sopenharmony_ci btrfs_remove_free_space_cache(block_group); 449662306a36Sopenharmony_ci } 449762306a36Sopenharmony_ci} 449862306a36Sopenharmony_ci 449962306a36Sopenharmony_cibool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) 450062306a36Sopenharmony_ci{ 450162306a36Sopenharmony_ci bool ret = true; 450262306a36Sopenharmony_ci 450362306a36Sopenharmony_ci spin_lock(&bg->lock); 450462306a36Sopenharmony_ci if (bg->ro) 450562306a36Sopenharmony_ci ret = false; 450662306a36Sopenharmony_ci else 450762306a36Sopenharmony_ci bg->swap_extents++; 450862306a36Sopenharmony_ci spin_unlock(&bg->lock); 450962306a36Sopenharmony_ci 451062306a36Sopenharmony_ci return ret; 451162306a36Sopenharmony_ci} 451262306a36Sopenharmony_ci 451362306a36Sopenharmony_civoid btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) 451462306a36Sopenharmony_ci{ 451562306a36Sopenharmony_ci spin_lock(&bg->lock); 451662306a36Sopenharmony_ci ASSERT(!bg->ro); 451762306a36Sopenharmony_ci ASSERT(bg->swap_extents >= amount); 451862306a36Sopenharmony_ci bg->swap_extents -= amount; 451962306a36Sopenharmony_ci spin_unlock(&bg->lock); 452062306a36Sopenharmony_ci} 452162306a36Sopenharmony_ci 452262306a36Sopenharmony_cienum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) 452362306a36Sopenharmony_ci{ 
452462306a36Sopenharmony_ci if (size <= SZ_128K) 452562306a36Sopenharmony_ci return BTRFS_BG_SZ_SMALL; 452662306a36Sopenharmony_ci if (size <= SZ_8M) 452762306a36Sopenharmony_ci return BTRFS_BG_SZ_MEDIUM; 452862306a36Sopenharmony_ci return BTRFS_BG_SZ_LARGE; 452962306a36Sopenharmony_ci} 453062306a36Sopenharmony_ci 453162306a36Sopenharmony_ci/* 453262306a36Sopenharmony_ci * Handle a block group allocating an extent in a size class 453362306a36Sopenharmony_ci * 453462306a36Sopenharmony_ci * @bg: The block group we allocated in. 453562306a36Sopenharmony_ci * @size_class: The size class of the allocation. 453662306a36Sopenharmony_ci * @force_wrong_size_class: Whether we are desperate enough to allow 453762306a36Sopenharmony_ci * mismatched size classes. 453862306a36Sopenharmony_ci * 453962306a36Sopenharmony_ci * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the 454062306a36Sopenharmony_ci * case of a race that leads to the wrong size class without 454162306a36Sopenharmony_ci * force_wrong_size_class set. 454262306a36Sopenharmony_ci * 454362306a36Sopenharmony_ci * find_free_extent will skip block groups with a mismatched size class until 454462306a36Sopenharmony_ci * it really needs to avoid ENOSPC. In that case it will set 454562306a36Sopenharmony_ci * force_wrong_size_class. However, if a block group is newly allocated and 454662306a36Sopenharmony_ci * doesn't yet have a size class, then it is possible for two allocations of 454762306a36Sopenharmony_ci * different sizes to race and both try to use it. The loser is caught here and 454862306a36Sopenharmony_ci * has to retry. 
454962306a36Sopenharmony_ci */ 455062306a36Sopenharmony_ciint btrfs_use_block_group_size_class(struct btrfs_block_group *bg, 455162306a36Sopenharmony_ci enum btrfs_block_group_size_class size_class, 455262306a36Sopenharmony_ci bool force_wrong_size_class) 455362306a36Sopenharmony_ci{ 455462306a36Sopenharmony_ci ASSERT(size_class != BTRFS_BG_SZ_NONE); 455562306a36Sopenharmony_ci 455662306a36Sopenharmony_ci /* The new allocation is in the right size class, do nothing */ 455762306a36Sopenharmony_ci if (bg->size_class == size_class) 455862306a36Sopenharmony_ci return 0; 455962306a36Sopenharmony_ci /* 456062306a36Sopenharmony_ci * The new allocation is in a mismatched size class. 456162306a36Sopenharmony_ci * This means one of two things: 456262306a36Sopenharmony_ci * 456362306a36Sopenharmony_ci * 1. Two tasks in find_free_extent for different size_classes raced 456462306a36Sopenharmony_ci * and hit the same empty block_group. Make the loser try again. 456562306a36Sopenharmony_ci * 2. A call to find_free_extent got desperate enough to set 456662306a36Sopenharmony_ci * 'force_wrong_slab'. Don't change the size_class, but allow the 456762306a36Sopenharmony_ci * allocation. 456862306a36Sopenharmony_ci */ 456962306a36Sopenharmony_ci if (bg->size_class != BTRFS_BG_SZ_NONE) { 457062306a36Sopenharmony_ci if (force_wrong_size_class) 457162306a36Sopenharmony_ci return 0; 457262306a36Sopenharmony_ci return -EAGAIN; 457362306a36Sopenharmony_ci } 457462306a36Sopenharmony_ci /* 457562306a36Sopenharmony_ci * The happy new block group case: the new allocation is the first 457662306a36Sopenharmony_ci * one in the block_group so we set size_class. 
457762306a36Sopenharmony_ci */ 457862306a36Sopenharmony_ci bg->size_class = size_class; 457962306a36Sopenharmony_ci 458062306a36Sopenharmony_ci return 0; 458162306a36Sopenharmony_ci} 458262306a36Sopenharmony_ci 458362306a36Sopenharmony_cibool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) 458462306a36Sopenharmony_ci{ 458562306a36Sopenharmony_ci if (btrfs_is_zoned(bg->fs_info)) 458662306a36Sopenharmony_ci return false; 458762306a36Sopenharmony_ci if (!btrfs_is_block_group_data_only(bg)) 458862306a36Sopenharmony_ci return false; 458962306a36Sopenharmony_ci return true; 459062306a36Sopenharmony_ci} 4591