162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2007 Oracle. All rights reserved. 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci#include <linux/sched.h> 762306a36Sopenharmony_ci#include <linux/sched/mm.h> 862306a36Sopenharmony_ci#include <linux/slab.h> 962306a36Sopenharmony_ci#include <linux/ratelimit.h> 1062306a36Sopenharmony_ci#include <linux/kthread.h> 1162306a36Sopenharmony_ci#include <linux/semaphore.h> 1262306a36Sopenharmony_ci#include <linux/uuid.h> 1362306a36Sopenharmony_ci#include <linux/list_sort.h> 1462306a36Sopenharmony_ci#include <linux/namei.h> 1562306a36Sopenharmony_ci#include "misc.h" 1662306a36Sopenharmony_ci#include "ctree.h" 1762306a36Sopenharmony_ci#include "extent_map.h" 1862306a36Sopenharmony_ci#include "disk-io.h" 1962306a36Sopenharmony_ci#include "transaction.h" 2062306a36Sopenharmony_ci#include "print-tree.h" 2162306a36Sopenharmony_ci#include "volumes.h" 2262306a36Sopenharmony_ci#include "raid56.h" 2362306a36Sopenharmony_ci#include "rcu-string.h" 2462306a36Sopenharmony_ci#include "dev-replace.h" 2562306a36Sopenharmony_ci#include "sysfs.h" 2662306a36Sopenharmony_ci#include "tree-checker.h" 2762306a36Sopenharmony_ci#include "space-info.h" 2862306a36Sopenharmony_ci#include "block-group.h" 2962306a36Sopenharmony_ci#include "discard.h" 3062306a36Sopenharmony_ci#include "zoned.h" 3162306a36Sopenharmony_ci#include "fs.h" 3262306a36Sopenharmony_ci#include "accessors.h" 3362306a36Sopenharmony_ci#include "uuid-tree.h" 3462306a36Sopenharmony_ci#include "ioctl.h" 3562306a36Sopenharmony_ci#include "relocation.h" 3662306a36Sopenharmony_ci#include "scrub.h" 3762306a36Sopenharmony_ci#include "super.h" 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 4062306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID10 | \ 4162306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID56_MASK) 
4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ciconst struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 4462306a36Sopenharmony_ci [BTRFS_RAID_RAID10] = { 4562306a36Sopenharmony_ci .sub_stripes = 2, 4662306a36Sopenharmony_ci .dev_stripes = 1, 4762306a36Sopenharmony_ci .devs_max = 0, /* 0 == as many as possible */ 4862306a36Sopenharmony_ci .devs_min = 2, 4962306a36Sopenharmony_ci .tolerated_failures = 1, 5062306a36Sopenharmony_ci .devs_increment = 2, 5162306a36Sopenharmony_ci .ncopies = 2, 5262306a36Sopenharmony_ci .nparity = 0, 5362306a36Sopenharmony_ci .raid_name = "raid10", 5462306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 5562306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 5662306a36Sopenharmony_ci }, 5762306a36Sopenharmony_ci [BTRFS_RAID_RAID1] = { 5862306a36Sopenharmony_ci .sub_stripes = 1, 5962306a36Sopenharmony_ci .dev_stripes = 1, 6062306a36Sopenharmony_ci .devs_max = 2, 6162306a36Sopenharmony_ci .devs_min = 2, 6262306a36Sopenharmony_ci .tolerated_failures = 1, 6362306a36Sopenharmony_ci .devs_increment = 2, 6462306a36Sopenharmony_ci .ncopies = 2, 6562306a36Sopenharmony_ci .nparity = 0, 6662306a36Sopenharmony_ci .raid_name = "raid1", 6762306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 6862306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 6962306a36Sopenharmony_ci }, 7062306a36Sopenharmony_ci [BTRFS_RAID_RAID1C3] = { 7162306a36Sopenharmony_ci .sub_stripes = 1, 7262306a36Sopenharmony_ci .dev_stripes = 1, 7362306a36Sopenharmony_ci .devs_max = 3, 7462306a36Sopenharmony_ci .devs_min = 3, 7562306a36Sopenharmony_ci .tolerated_failures = 2, 7662306a36Sopenharmony_ci .devs_increment = 3, 7762306a36Sopenharmony_ci .ncopies = 3, 7862306a36Sopenharmony_ci .nparity = 0, 7962306a36Sopenharmony_ci .raid_name = "raid1c3", 8062306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 8162306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 
8262306a36Sopenharmony_ci }, 8362306a36Sopenharmony_ci [BTRFS_RAID_RAID1C4] = { 8462306a36Sopenharmony_ci .sub_stripes = 1, 8562306a36Sopenharmony_ci .dev_stripes = 1, 8662306a36Sopenharmony_ci .devs_max = 4, 8762306a36Sopenharmony_ci .devs_min = 4, 8862306a36Sopenharmony_ci .tolerated_failures = 3, 8962306a36Sopenharmony_ci .devs_increment = 4, 9062306a36Sopenharmony_ci .ncopies = 4, 9162306a36Sopenharmony_ci .nparity = 0, 9262306a36Sopenharmony_ci .raid_name = "raid1c4", 9362306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 9462306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 9562306a36Sopenharmony_ci }, 9662306a36Sopenharmony_ci [BTRFS_RAID_DUP] = { 9762306a36Sopenharmony_ci .sub_stripes = 1, 9862306a36Sopenharmony_ci .dev_stripes = 2, 9962306a36Sopenharmony_ci .devs_max = 1, 10062306a36Sopenharmony_ci .devs_min = 1, 10162306a36Sopenharmony_ci .tolerated_failures = 0, 10262306a36Sopenharmony_ci .devs_increment = 1, 10362306a36Sopenharmony_ci .ncopies = 2, 10462306a36Sopenharmony_ci .nparity = 0, 10562306a36Sopenharmony_ci .raid_name = "dup", 10662306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_DUP, 10762306a36Sopenharmony_ci .mindev_error = 0, 10862306a36Sopenharmony_ci }, 10962306a36Sopenharmony_ci [BTRFS_RAID_RAID0] = { 11062306a36Sopenharmony_ci .sub_stripes = 1, 11162306a36Sopenharmony_ci .dev_stripes = 1, 11262306a36Sopenharmony_ci .devs_max = 0, 11362306a36Sopenharmony_ci .devs_min = 1, 11462306a36Sopenharmony_ci .tolerated_failures = 0, 11562306a36Sopenharmony_ci .devs_increment = 1, 11662306a36Sopenharmony_ci .ncopies = 1, 11762306a36Sopenharmony_ci .nparity = 0, 11862306a36Sopenharmony_ci .raid_name = "raid0", 11962306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 12062306a36Sopenharmony_ci .mindev_error = 0, 12162306a36Sopenharmony_ci }, 12262306a36Sopenharmony_ci [BTRFS_RAID_SINGLE] = { 12362306a36Sopenharmony_ci .sub_stripes = 1, 12462306a36Sopenharmony_ci .dev_stripes = 1, 12562306a36Sopenharmony_ci 
.devs_max = 1, 12662306a36Sopenharmony_ci .devs_min = 1, 12762306a36Sopenharmony_ci .tolerated_failures = 0, 12862306a36Sopenharmony_ci .devs_increment = 1, 12962306a36Sopenharmony_ci .ncopies = 1, 13062306a36Sopenharmony_ci .nparity = 0, 13162306a36Sopenharmony_ci .raid_name = "single", 13262306a36Sopenharmony_ci .bg_flag = 0, 13362306a36Sopenharmony_ci .mindev_error = 0, 13462306a36Sopenharmony_ci }, 13562306a36Sopenharmony_ci [BTRFS_RAID_RAID5] = { 13662306a36Sopenharmony_ci .sub_stripes = 1, 13762306a36Sopenharmony_ci .dev_stripes = 1, 13862306a36Sopenharmony_ci .devs_max = 0, 13962306a36Sopenharmony_ci .devs_min = 2, 14062306a36Sopenharmony_ci .tolerated_failures = 1, 14162306a36Sopenharmony_ci .devs_increment = 1, 14262306a36Sopenharmony_ci .ncopies = 1, 14362306a36Sopenharmony_ci .nparity = 1, 14462306a36Sopenharmony_ci .raid_name = "raid5", 14562306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 14662306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 14762306a36Sopenharmony_ci }, 14862306a36Sopenharmony_ci [BTRFS_RAID_RAID6] = { 14962306a36Sopenharmony_ci .sub_stripes = 1, 15062306a36Sopenharmony_ci .dev_stripes = 1, 15162306a36Sopenharmony_ci .devs_max = 0, 15262306a36Sopenharmony_ci .devs_min = 3, 15362306a36Sopenharmony_ci .tolerated_failures = 2, 15462306a36Sopenharmony_ci .devs_increment = 1, 15562306a36Sopenharmony_ci .ncopies = 1, 15662306a36Sopenharmony_ci .nparity = 2, 15762306a36Sopenharmony_ci .raid_name = "raid6", 15862306a36Sopenharmony_ci .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 15962306a36Sopenharmony_ci .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 16062306a36Sopenharmony_ci }, 16162306a36Sopenharmony_ci}; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci/* 16462306a36Sopenharmony_ci * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 16562306a36Sopenharmony_ci * can be used as index to access btrfs_raid_array[]. 
16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_cienum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 16862306a36Sopenharmony_ci{ 16962306a36Sopenharmony_ci const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci if (!profile) 17262306a36Sopenharmony_ci return BTRFS_RAID_SINGLE; 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci return BTRFS_BG_FLAG_TO_INDEX(profile); 17562306a36Sopenharmony_ci} 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ciconst char *btrfs_bg_type_to_raid_name(u64 flags) 17862306a36Sopenharmony_ci{ 17962306a36Sopenharmony_ci const int index = btrfs_bg_flags_to_raid_index(flags); 18062306a36Sopenharmony_ci 18162306a36Sopenharmony_ci if (index >= BTRFS_NR_RAID_TYPES) 18262306a36Sopenharmony_ci return NULL; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci return btrfs_raid_array[index].raid_name; 18562306a36Sopenharmony_ci} 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ciint btrfs_nr_parity_stripes(u64 type) 18862306a36Sopenharmony_ci{ 18962306a36Sopenharmony_ci enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci return btrfs_raid_array[index].nparity; 19262306a36Sopenharmony_ci} 19362306a36Sopenharmony_ci 19462306a36Sopenharmony_ci/* 19562306a36Sopenharmony_ci * Fill @buf with textual description of @bg_flags, no more than @size_buf 19662306a36Sopenharmony_ci * bytes including terminating null byte. 
19762306a36Sopenharmony_ci */ 19862306a36Sopenharmony_civoid btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 19962306a36Sopenharmony_ci{ 20062306a36Sopenharmony_ci int i; 20162306a36Sopenharmony_ci int ret; 20262306a36Sopenharmony_ci char *bp = buf; 20362306a36Sopenharmony_ci u64 flags = bg_flags; 20462306a36Sopenharmony_ci u32 size_bp = size_buf; 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci if (!flags) { 20762306a36Sopenharmony_ci strcpy(bp, "NONE"); 20862306a36Sopenharmony_ci return; 20962306a36Sopenharmony_ci } 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci#define DESCRIBE_FLAG(flag, desc) \ 21262306a36Sopenharmony_ci do { \ 21362306a36Sopenharmony_ci if (flags & (flag)) { \ 21462306a36Sopenharmony_ci ret = snprintf(bp, size_bp, "%s|", (desc)); \ 21562306a36Sopenharmony_ci if (ret < 0 || ret >= size_bp) \ 21662306a36Sopenharmony_ci goto out_overflow; \ 21762306a36Sopenharmony_ci size_bp -= ret; \ 21862306a36Sopenharmony_ci bp += ret; \ 21962306a36Sopenharmony_ci flags &= ~(flag); \ 22062306a36Sopenharmony_ci } \ 22162306a36Sopenharmony_ci } while (0) 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 22462306a36Sopenharmony_ci DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 22562306a36Sopenharmony_ci DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 22862306a36Sopenharmony_ci for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 22962306a36Sopenharmony_ci DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 23062306a36Sopenharmony_ci btrfs_raid_array[i].raid_name); 23162306a36Sopenharmony_ci#undef DESCRIBE_FLAG 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_ci if (flags) { 23462306a36Sopenharmony_ci ret = snprintf(bp, size_bp, "0x%llx|", flags); 23562306a36Sopenharmony_ci size_bp -= ret; 23662306a36Sopenharmony_ci } 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci if (size_bp 
< size_buf) 23962306a36Sopenharmony_ci buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci /* 24262306a36Sopenharmony_ci * The text is trimmed, it's up to the caller to provide sufficiently 24362306a36Sopenharmony_ci * large buffer 24462306a36Sopenharmony_ci */ 24562306a36Sopenharmony_ciout_overflow:; 24662306a36Sopenharmony_ci} 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_cistatic int init_first_rw_device(struct btrfs_trans_handle *trans); 24962306a36Sopenharmony_cistatic int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 25062306a36Sopenharmony_cistatic void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci/* 25362306a36Sopenharmony_ci * Device locking 25462306a36Sopenharmony_ci * ============== 25562306a36Sopenharmony_ci * 25662306a36Sopenharmony_ci * There are several mutexes that protect manipulation of devices and low-level 25762306a36Sopenharmony_ci * structures like chunks but not block groups, extents or files 25862306a36Sopenharmony_ci * 25962306a36Sopenharmony_ci * uuid_mutex (global lock) 26062306a36Sopenharmony_ci * ------------------------ 26162306a36Sopenharmony_ci * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 26262306a36Sopenharmony_ci * the SCAN_DEV ioctl registration or from mount either implicitly (the first 26362306a36Sopenharmony_ci * device) or requested by the device= mount option 26462306a36Sopenharmony_ci * 26562306a36Sopenharmony_ci * the mutex can be very coarse and can cover long-running operations 26662306a36Sopenharmony_ci * 26762306a36Sopenharmony_ci * protects: updates to fs_devices counters like missing devices, rw devices, 26862306a36Sopenharmony_ci * seeding, structure cloning, opening/closing devices at mount/umount time 26962306a36Sopenharmony_ci * 27062306a36Sopenharmony_ci * global::fs_devs - add, remove, updates to the global list 27162306a36Sopenharmony_ci 
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by e.g.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 * device_list_mutex
 * chunk_mutex
 * balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
35062306a36Sopenharmony_ci */ 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ciDEFINE_MUTEX(uuid_mutex); 35362306a36Sopenharmony_cistatic LIST_HEAD(fs_uuids); 35462306a36Sopenharmony_cistruct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 35562306a36Sopenharmony_ci{ 35662306a36Sopenharmony_ci return &fs_uuids; 35762306a36Sopenharmony_ci} 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci/* 36062306a36Sopenharmony_ci * alloc_fs_devices - allocate struct btrfs_fs_devices 36162306a36Sopenharmony_ci * @fsid: if not NULL, copy the UUID to fs_devices::fsid 36262306a36Sopenharmony_ci * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 36362306a36Sopenharmony_ci * 36462306a36Sopenharmony_ci * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 36562306a36Sopenharmony_ci * The returned struct is not linked onto any lists and can be destroyed with 36662306a36Sopenharmony_ci * kfree() right away. 36762306a36Sopenharmony_ci */ 36862306a36Sopenharmony_cistatic struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 36962306a36Sopenharmony_ci const u8 *metadata_fsid) 37062306a36Sopenharmony_ci{ 37162306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devs; 37262306a36Sopenharmony_ci 37362306a36Sopenharmony_ci ASSERT(fsid || !metadata_fsid); 37462306a36Sopenharmony_ci 37562306a36Sopenharmony_ci fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 37662306a36Sopenharmony_ci if (!fs_devs) 37762306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_ci mutex_init(&fs_devs->device_list_mutex); 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci INIT_LIST_HEAD(&fs_devs->devices); 38262306a36Sopenharmony_ci INIT_LIST_HEAD(&fs_devs->alloc_list); 38362306a36Sopenharmony_ci INIT_LIST_HEAD(&fs_devs->fs_list); 38462306a36Sopenharmony_ci INIT_LIST_HEAD(&fs_devs->seed_list); 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci if (fsid) { 38762306a36Sopenharmony_ci memcpy(fs_devs->fsid, 
fsid, BTRFS_FSID_SIZE); 38862306a36Sopenharmony_ci memcpy(fs_devs->metadata_uuid, 38962306a36Sopenharmony_ci metadata_fsid ?: fsid, BTRFS_FSID_SIZE); 39062306a36Sopenharmony_ci } 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci return fs_devs; 39362306a36Sopenharmony_ci} 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_cistatic void btrfs_free_device(struct btrfs_device *device) 39662306a36Sopenharmony_ci{ 39762306a36Sopenharmony_ci WARN_ON(!list_empty(&device->post_commit_list)); 39862306a36Sopenharmony_ci rcu_string_free(device->name); 39962306a36Sopenharmony_ci extent_io_tree_release(&device->alloc_state); 40062306a36Sopenharmony_ci btrfs_destroy_dev_zone_info(device); 40162306a36Sopenharmony_ci kfree(device); 40262306a36Sopenharmony_ci} 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_cistatic void free_fs_devices(struct btrfs_fs_devices *fs_devices) 40562306a36Sopenharmony_ci{ 40662306a36Sopenharmony_ci struct btrfs_device *device; 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci WARN_ON(fs_devices->opened); 40962306a36Sopenharmony_ci while (!list_empty(&fs_devices->devices)) { 41062306a36Sopenharmony_ci device = list_entry(fs_devices->devices.next, 41162306a36Sopenharmony_ci struct btrfs_device, dev_list); 41262306a36Sopenharmony_ci list_del(&device->dev_list); 41362306a36Sopenharmony_ci btrfs_free_device(device); 41462306a36Sopenharmony_ci } 41562306a36Sopenharmony_ci kfree(fs_devices); 41662306a36Sopenharmony_ci} 41762306a36Sopenharmony_ci 41862306a36Sopenharmony_civoid __exit btrfs_cleanup_fs_uuids(void) 41962306a36Sopenharmony_ci{ 42062306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 42162306a36Sopenharmony_ci 42262306a36Sopenharmony_ci while (!list_empty(&fs_uuids)) { 42362306a36Sopenharmony_ci fs_devices = list_entry(fs_uuids.next, 42462306a36Sopenharmony_ci struct btrfs_fs_devices, fs_list); 42562306a36Sopenharmony_ci list_del(&fs_devices->fs_list); 42662306a36Sopenharmony_ci free_fs_devices(fs_devices); 
42762306a36Sopenharmony_ci } 42862306a36Sopenharmony_ci} 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_cistatic bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, 43162306a36Sopenharmony_ci const u8 *fsid, const u8 *metadata_fsid) 43262306a36Sopenharmony_ci{ 43362306a36Sopenharmony_ci if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) 43462306a36Sopenharmony_ci return false; 43562306a36Sopenharmony_ci 43662306a36Sopenharmony_ci if (!metadata_fsid) 43762306a36Sopenharmony_ci return true; 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ci if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) 44062306a36Sopenharmony_ci return false; 44162306a36Sopenharmony_ci 44262306a36Sopenharmony_ci return true; 44362306a36Sopenharmony_ci} 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_cistatic noinline struct btrfs_fs_devices *find_fsid( 44662306a36Sopenharmony_ci const u8 *fsid, const u8 *metadata_fsid) 44762306a36Sopenharmony_ci{ 44862306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci ASSERT(fsid); 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci /* Handle non-split brain cases */ 45362306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 45462306a36Sopenharmony_ci if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) 45562306a36Sopenharmony_ci return fs_devices; 45662306a36Sopenharmony_ci } 45762306a36Sopenharmony_ci return NULL; 45862306a36Sopenharmony_ci} 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci/* 46162306a36Sopenharmony_ci * First check if the metadata_uuid is different from the fsid in the given 46262306a36Sopenharmony_ci * fs_devices. Then check if the given fsid is the same as the metadata_uuid 46362306a36Sopenharmony_ci * in the fs_devices. If it is, return true; otherwise, return false. 
46462306a36Sopenharmony_ci */ 46562306a36Sopenharmony_cistatic inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, 46662306a36Sopenharmony_ci const u8 *fsid) 46762306a36Sopenharmony_ci{ 46862306a36Sopenharmony_ci return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 46962306a36Sopenharmony_ci BTRFS_FSID_SIZE) != 0 && 47062306a36Sopenharmony_ci memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; 47162306a36Sopenharmony_ci} 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_with_metadata_uuid( 47462306a36Sopenharmony_ci struct btrfs_super_block *disk_super) 47562306a36Sopenharmony_ci{ 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 47862306a36Sopenharmony_ci 47962306a36Sopenharmony_ci /* 48062306a36Sopenharmony_ci * Handle scanned device having completed its fsid change but 48162306a36Sopenharmony_ci * belonging to a fs_devices that was created by first scanning 48262306a36Sopenharmony_ci * a device which didn't have its fsid/metadata_uuid changed 48362306a36Sopenharmony_ci * at all and the CHANGING_FSID_V2 flag set. 48462306a36Sopenharmony_ci */ 48562306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 48662306a36Sopenharmony_ci if (!fs_devices->fsid_change) 48762306a36Sopenharmony_ci continue; 48862306a36Sopenharmony_ci 48962306a36Sopenharmony_ci if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, 49062306a36Sopenharmony_ci fs_devices->fsid)) 49162306a36Sopenharmony_ci return fs_devices; 49262306a36Sopenharmony_ci } 49362306a36Sopenharmony_ci 49462306a36Sopenharmony_ci /* 49562306a36Sopenharmony_ci * Handle scanned device having completed its fsid change but 49662306a36Sopenharmony_ci * belonging to a fs_devices that was created by a device that 49762306a36Sopenharmony_ci * has an outdated pair of fsid/metadata_uuid and 49862306a36Sopenharmony_ci * CHANGING_FSID_V2 flag set. 
49962306a36Sopenharmony_ci */ 50062306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 50162306a36Sopenharmony_ci if (!fs_devices->fsid_change) 50262306a36Sopenharmony_ci continue; 50362306a36Sopenharmony_ci 50462306a36Sopenharmony_ci if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) 50562306a36Sopenharmony_ci return fs_devices; 50662306a36Sopenharmony_ci } 50762306a36Sopenharmony_ci 50862306a36Sopenharmony_ci return find_fsid(disk_super->fsid, disk_super->metadata_uuid); 50962306a36Sopenharmony_ci} 51062306a36Sopenharmony_ci 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_cistatic int 51362306a36Sopenharmony_cibtrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, 51462306a36Sopenharmony_ci int flush, struct block_device **bdev, 51562306a36Sopenharmony_ci struct btrfs_super_block **disk_super) 51662306a36Sopenharmony_ci{ 51762306a36Sopenharmony_ci int ret; 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); 52062306a36Sopenharmony_ci 52162306a36Sopenharmony_ci if (IS_ERR(*bdev)) { 52262306a36Sopenharmony_ci ret = PTR_ERR(*bdev); 52362306a36Sopenharmony_ci goto error; 52462306a36Sopenharmony_ci } 52562306a36Sopenharmony_ci 52662306a36Sopenharmony_ci if (flush) 52762306a36Sopenharmony_ci sync_blockdev(*bdev); 52862306a36Sopenharmony_ci ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 52962306a36Sopenharmony_ci if (ret) { 53062306a36Sopenharmony_ci blkdev_put(*bdev, holder); 53162306a36Sopenharmony_ci goto error; 53262306a36Sopenharmony_ci } 53362306a36Sopenharmony_ci invalidate_bdev(*bdev); 53462306a36Sopenharmony_ci *disk_super = btrfs_read_dev_super(*bdev); 53562306a36Sopenharmony_ci if (IS_ERR(*disk_super)) { 53662306a36Sopenharmony_ci ret = PTR_ERR(*disk_super); 53762306a36Sopenharmony_ci blkdev_put(*bdev, holder); 53862306a36Sopenharmony_ci goto error; 53962306a36Sopenharmony_ci } 54062306a36Sopenharmony_ci 54162306a36Sopenharmony_ci 
return 0; 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_cierror: 54462306a36Sopenharmony_ci *bdev = NULL; 54562306a36Sopenharmony_ci return ret; 54662306a36Sopenharmony_ci} 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_ci/* 54962306a36Sopenharmony_ci * Search and remove all stale devices (which are not mounted). When both 55062306a36Sopenharmony_ci * inputs are NULL, it will search and release all stale devices. 55162306a36Sopenharmony_ci * 55262306a36Sopenharmony_ci * @devt: Optional. When provided will it release all unmounted devices 55362306a36Sopenharmony_ci * matching this devt only. 55462306a36Sopenharmony_ci * @skip_device: Optional. Will skip this device when searching for the stale 55562306a36Sopenharmony_ci * devices. 55662306a36Sopenharmony_ci * 55762306a36Sopenharmony_ci * Return: 0 for success or if @devt is 0. 55862306a36Sopenharmony_ci * -EBUSY if @devt is a mounted device. 55962306a36Sopenharmony_ci * -ENOENT if @devt does not match any device in the list. 56062306a36Sopenharmony_ci */ 56162306a36Sopenharmony_cistatic int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) 56262306a36Sopenharmony_ci{ 56362306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 56462306a36Sopenharmony_ci struct btrfs_device *device, *tmp_device; 56562306a36Sopenharmony_ci int ret = 0; 56662306a36Sopenharmony_ci 56762306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 56862306a36Sopenharmony_ci 56962306a36Sopenharmony_ci if (devt) 57062306a36Sopenharmony_ci ret = -ENOENT; 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 57562306a36Sopenharmony_ci list_for_each_entry_safe(device, tmp_device, 57662306a36Sopenharmony_ci &fs_devices->devices, dev_list) { 57762306a36Sopenharmony_ci if (skip_device && skip_device == device) 
57862306a36Sopenharmony_ci continue; 57962306a36Sopenharmony_ci if (devt && devt != device->devt) 58062306a36Sopenharmony_ci continue; 58162306a36Sopenharmony_ci if (fs_devices->opened) { 58262306a36Sopenharmony_ci /* for an already deleted device return 0 */ 58362306a36Sopenharmony_ci if (devt && ret != 0) 58462306a36Sopenharmony_ci ret = -EBUSY; 58562306a36Sopenharmony_ci break; 58662306a36Sopenharmony_ci } 58762306a36Sopenharmony_ci 58862306a36Sopenharmony_ci /* delete the stale device */ 58962306a36Sopenharmony_ci fs_devices->num_devices--; 59062306a36Sopenharmony_ci list_del(&device->dev_list); 59162306a36Sopenharmony_ci btrfs_free_device(device); 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_ci ret = 0; 59462306a36Sopenharmony_ci } 59562306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 59662306a36Sopenharmony_ci 59762306a36Sopenharmony_ci if (fs_devices->num_devices == 0) { 59862306a36Sopenharmony_ci btrfs_sysfs_remove_fsid(fs_devices); 59962306a36Sopenharmony_ci list_del(&fs_devices->fs_list); 60062306a36Sopenharmony_ci free_fs_devices(fs_devices); 60162306a36Sopenharmony_ci } 60262306a36Sopenharmony_ci } 60362306a36Sopenharmony_ci 60462306a36Sopenharmony_ci return ret; 60562306a36Sopenharmony_ci} 60662306a36Sopenharmony_ci 60762306a36Sopenharmony_ci/* 60862306a36Sopenharmony_ci * This is only used on mount, and we are protected from competing things 60962306a36Sopenharmony_ci * messing with our fs_devices by the uuid_mutex, thus we do not need the 61062306a36Sopenharmony_ci * fs_devices->device_list_mutex here. 
61162306a36Sopenharmony_ci */ 61262306a36Sopenharmony_cistatic int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 61362306a36Sopenharmony_ci struct btrfs_device *device, blk_mode_t flags, 61462306a36Sopenharmony_ci void *holder) 61562306a36Sopenharmony_ci{ 61662306a36Sopenharmony_ci struct block_device *bdev; 61762306a36Sopenharmony_ci struct btrfs_super_block *disk_super; 61862306a36Sopenharmony_ci u64 devid; 61962306a36Sopenharmony_ci int ret; 62062306a36Sopenharmony_ci 62162306a36Sopenharmony_ci if (device->bdev) 62262306a36Sopenharmony_ci return -EINVAL; 62362306a36Sopenharmony_ci if (!device->name) 62462306a36Sopenharmony_ci return -EINVAL; 62562306a36Sopenharmony_ci 62662306a36Sopenharmony_ci ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 62762306a36Sopenharmony_ci &bdev, &disk_super); 62862306a36Sopenharmony_ci if (ret) 62962306a36Sopenharmony_ci return ret; 63062306a36Sopenharmony_ci 63162306a36Sopenharmony_ci devid = btrfs_stack_device_id(&disk_super->dev_item); 63262306a36Sopenharmony_ci if (devid != device->devid) 63362306a36Sopenharmony_ci goto error_free_page; 63462306a36Sopenharmony_ci 63562306a36Sopenharmony_ci if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 63662306a36Sopenharmony_ci goto error_free_page; 63762306a36Sopenharmony_ci 63862306a36Sopenharmony_ci device->generation = btrfs_super_generation(disk_super); 63962306a36Sopenharmony_ci 64062306a36Sopenharmony_ci if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 64162306a36Sopenharmony_ci if (btrfs_super_incompat_flags(disk_super) & 64262306a36Sopenharmony_ci BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 64362306a36Sopenharmony_ci pr_err( 64462306a36Sopenharmony_ci "BTRFS: Invalid seeding and uuid-changed device detected\n"); 64562306a36Sopenharmony_ci goto error_free_page; 64662306a36Sopenharmony_ci } 64762306a36Sopenharmony_ci 64862306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 
64962306a36Sopenharmony_ci fs_devices->seeding = true; 65062306a36Sopenharmony_ci } else { 65162306a36Sopenharmony_ci if (bdev_read_only(bdev)) 65262306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 65362306a36Sopenharmony_ci else 65462306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 65562306a36Sopenharmony_ci } 65662306a36Sopenharmony_ci 65762306a36Sopenharmony_ci if (!bdev_nonrot(bdev)) 65862306a36Sopenharmony_ci fs_devices->rotating = true; 65962306a36Sopenharmony_ci 66062306a36Sopenharmony_ci if (bdev_max_discard_sectors(bdev)) 66162306a36Sopenharmony_ci fs_devices->discardable = true; 66262306a36Sopenharmony_ci 66362306a36Sopenharmony_ci device->bdev = bdev; 66462306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 66562306a36Sopenharmony_ci device->holder = holder; 66662306a36Sopenharmony_ci 66762306a36Sopenharmony_ci fs_devices->open_devices++; 66862306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 66962306a36Sopenharmony_ci device->devid != BTRFS_DEV_REPLACE_DEVID) { 67062306a36Sopenharmony_ci fs_devices->rw_devices++; 67162306a36Sopenharmony_ci list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 67262306a36Sopenharmony_ci } 67362306a36Sopenharmony_ci btrfs_release_disk_super(disk_super); 67462306a36Sopenharmony_ci 67562306a36Sopenharmony_ci return 0; 67662306a36Sopenharmony_ci 67762306a36Sopenharmony_cierror_free_page: 67862306a36Sopenharmony_ci btrfs_release_disk_super(disk_super); 67962306a36Sopenharmony_ci blkdev_put(bdev, holder); 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci return -EINVAL; 68262306a36Sopenharmony_ci} 68362306a36Sopenharmony_ci 68462306a36Sopenharmony_ciu8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) 68562306a36Sopenharmony_ci{ 68662306a36Sopenharmony_ci bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & 68762306a36Sopenharmony_ci BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 
68862306a36Sopenharmony_ci 68962306a36Sopenharmony_ci return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; 69062306a36Sopenharmony_ci} 69162306a36Sopenharmony_ci 69262306a36Sopenharmony_ci/* 69362306a36Sopenharmony_ci * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 69462306a36Sopenharmony_ci * being created with a disk that has already completed its fsid change. Such 69562306a36Sopenharmony_ci * disk can belong to an fs which has its FSID changed or to one which doesn't. 69662306a36Sopenharmony_ci * Handle both cases here. 69762306a36Sopenharmony_ci */ 69862306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_inprogress( 69962306a36Sopenharmony_ci struct btrfs_super_block *disk_super) 70062306a36Sopenharmony_ci{ 70162306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 70462306a36Sopenharmony_ci if (fs_devices->fsid_change) 70562306a36Sopenharmony_ci continue; 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_ci if (check_fsid_changed(fs_devices, disk_super->fsid)) 70862306a36Sopenharmony_ci return fs_devices; 70962306a36Sopenharmony_ci } 71062306a36Sopenharmony_ci 71162306a36Sopenharmony_ci return find_fsid(disk_super->fsid, NULL); 71262306a36Sopenharmony_ci} 71362306a36Sopenharmony_ci 71462306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_changed( 71562306a36Sopenharmony_ci struct btrfs_super_block *disk_super) 71662306a36Sopenharmony_ci{ 71762306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_ci /* 72062306a36Sopenharmony_ci * Handles the case where scanned device is part of an fs that had 72162306a36Sopenharmony_ci * multiple successful changes of FSID but currently device didn't 72262306a36Sopenharmony_ci * observe it. Meaning our fsid will be different than theirs. 
We need 72362306a36Sopenharmony_ci * to handle two subcases : 72462306a36Sopenharmony_ci * 1 - The fs still continues to have different METADATA/FSID uuids. 72562306a36Sopenharmony_ci * 2 - The fs is switched back to its original FSID (METADATA/FSID 72662306a36Sopenharmony_ci * are equal). 72762306a36Sopenharmony_ci */ 72862306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 72962306a36Sopenharmony_ci /* Changed UUIDs */ 73062306a36Sopenharmony_ci if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && 73162306a36Sopenharmony_ci memcmp(fs_devices->fsid, disk_super->fsid, 73262306a36Sopenharmony_ci BTRFS_FSID_SIZE) != 0) 73362306a36Sopenharmony_ci return fs_devices; 73462306a36Sopenharmony_ci 73562306a36Sopenharmony_ci /* Unchanged UUIDs */ 73662306a36Sopenharmony_ci if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 73762306a36Sopenharmony_ci BTRFS_FSID_SIZE) == 0 && 73862306a36Sopenharmony_ci memcmp(fs_devices->fsid, disk_super->metadata_uuid, 73962306a36Sopenharmony_ci BTRFS_FSID_SIZE) == 0) 74062306a36Sopenharmony_ci return fs_devices; 74162306a36Sopenharmony_ci } 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci return NULL; 74462306a36Sopenharmony_ci} 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_reverted_metadata( 74762306a36Sopenharmony_ci struct btrfs_super_block *disk_super) 74862306a36Sopenharmony_ci{ 74962306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 75062306a36Sopenharmony_ci 75162306a36Sopenharmony_ci /* 75262306a36Sopenharmony_ci * Handle the case where the scanned device is part of an fs whose last 75362306a36Sopenharmony_ci * metadata UUID change reverted it to the original FSID. At the same 75462306a36Sopenharmony_ci * time fs_devices was first created by another constituent device 75562306a36Sopenharmony_ci * which didn't fully observe the operation. 
This results in an 75662306a36Sopenharmony_ci * btrfs_fs_devices created with metadata/fsid different AND 75762306a36Sopenharmony_ci * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the 75862306a36Sopenharmony_ci * fs_devices equal to the FSID of the disk. 75962306a36Sopenharmony_ci */ 76062306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 76162306a36Sopenharmony_ci if (!fs_devices->fsid_change) 76262306a36Sopenharmony_ci continue; 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_ci if (check_fsid_changed(fs_devices, disk_super->fsid)) 76562306a36Sopenharmony_ci return fs_devices; 76662306a36Sopenharmony_ci } 76762306a36Sopenharmony_ci 76862306a36Sopenharmony_ci return NULL; 76962306a36Sopenharmony_ci} 77062306a36Sopenharmony_ci/* 77162306a36Sopenharmony_ci * Add new device to list of registered devices 77262306a36Sopenharmony_ci * 77362306a36Sopenharmony_ci * Returns: 77462306a36Sopenharmony_ci * device pointer which was just added or updated when successful 77562306a36Sopenharmony_ci * error pointer when failed 77662306a36Sopenharmony_ci */ 77762306a36Sopenharmony_cistatic noinline struct btrfs_device *device_list_add(const char *path, 77862306a36Sopenharmony_ci struct btrfs_super_block *disk_super, 77962306a36Sopenharmony_ci bool *new_device_added) 78062306a36Sopenharmony_ci{ 78162306a36Sopenharmony_ci struct btrfs_device *device; 78262306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = NULL; 78362306a36Sopenharmony_ci struct rcu_string *name; 78462306a36Sopenharmony_ci u64 found_transid = btrfs_super_generation(disk_super); 78562306a36Sopenharmony_ci u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 78662306a36Sopenharmony_ci dev_t path_devt; 78762306a36Sopenharmony_ci int error; 78862306a36Sopenharmony_ci bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 78962306a36Sopenharmony_ci BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 79062306a36Sopenharmony_ci bool fsid_change_in_progress = 
(btrfs_super_flags(disk_super) & 79162306a36Sopenharmony_ci BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 79262306a36Sopenharmony_ci 79362306a36Sopenharmony_ci error = lookup_bdev(path, &path_devt); 79462306a36Sopenharmony_ci if (error) { 79562306a36Sopenharmony_ci btrfs_err(NULL, "failed to lookup block device for path %s: %d", 79662306a36Sopenharmony_ci path, error); 79762306a36Sopenharmony_ci return ERR_PTR(error); 79862306a36Sopenharmony_ci } 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci if (fsid_change_in_progress) { 80162306a36Sopenharmony_ci if (!has_metadata_uuid) 80262306a36Sopenharmony_ci fs_devices = find_fsid_inprogress(disk_super); 80362306a36Sopenharmony_ci else 80462306a36Sopenharmony_ci fs_devices = find_fsid_changed(disk_super); 80562306a36Sopenharmony_ci } else if (has_metadata_uuid) { 80662306a36Sopenharmony_ci fs_devices = find_fsid_with_metadata_uuid(disk_super); 80762306a36Sopenharmony_ci } else { 80862306a36Sopenharmony_ci fs_devices = find_fsid_reverted_metadata(disk_super); 80962306a36Sopenharmony_ci if (!fs_devices) 81062306a36Sopenharmony_ci fs_devices = find_fsid(disk_super->fsid, NULL); 81162306a36Sopenharmony_ci } 81262306a36Sopenharmony_ci 81362306a36Sopenharmony_ci 81462306a36Sopenharmony_ci if (!fs_devices) { 81562306a36Sopenharmony_ci fs_devices = alloc_fs_devices(disk_super->fsid, 81662306a36Sopenharmony_ci has_metadata_uuid ? 
disk_super->metadata_uuid : NULL); 81762306a36Sopenharmony_ci if (IS_ERR(fs_devices)) 81862306a36Sopenharmony_ci return ERR_CAST(fs_devices); 81962306a36Sopenharmony_ci 82062306a36Sopenharmony_ci fs_devices->fsid_change = fsid_change_in_progress; 82162306a36Sopenharmony_ci 82262306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 82362306a36Sopenharmony_ci list_add(&fs_devices->fs_list, &fs_uuids); 82462306a36Sopenharmony_ci 82562306a36Sopenharmony_ci device = NULL; 82662306a36Sopenharmony_ci } else { 82762306a36Sopenharmony_ci struct btrfs_dev_lookup_args args = { 82862306a36Sopenharmony_ci .devid = devid, 82962306a36Sopenharmony_ci .uuid = disk_super->dev_item.uuid, 83062306a36Sopenharmony_ci }; 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 83362306a36Sopenharmony_ci device = btrfs_find_device(fs_devices, &args); 83462306a36Sopenharmony_ci 83562306a36Sopenharmony_ci /* 83662306a36Sopenharmony_ci * If this disk has been pulled into an fs devices created by 83762306a36Sopenharmony_ci * a device which had the CHANGING_FSID_V2 flag then replace the 83862306a36Sopenharmony_ci * metadata_uuid/fsid values of the fs_devices. 
83962306a36Sopenharmony_ci */ 84062306a36Sopenharmony_ci if (fs_devices->fsid_change && 84162306a36Sopenharmony_ci found_transid > fs_devices->latest_generation) { 84262306a36Sopenharmony_ci memcpy(fs_devices->fsid, disk_super->fsid, 84362306a36Sopenharmony_ci BTRFS_FSID_SIZE); 84462306a36Sopenharmony_ci memcpy(fs_devices->metadata_uuid, 84562306a36Sopenharmony_ci btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); 84662306a36Sopenharmony_ci fs_devices->fsid_change = false; 84762306a36Sopenharmony_ci } 84862306a36Sopenharmony_ci } 84962306a36Sopenharmony_ci 85062306a36Sopenharmony_ci if (!device) { 85162306a36Sopenharmony_ci unsigned int nofs_flag; 85262306a36Sopenharmony_ci 85362306a36Sopenharmony_ci if (fs_devices->opened) { 85462306a36Sopenharmony_ci btrfs_err(NULL, 85562306a36Sopenharmony_ci"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", 85662306a36Sopenharmony_ci path, fs_devices->fsid, current->comm, 85762306a36Sopenharmony_ci task_pid_nr(current)); 85862306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 85962306a36Sopenharmony_ci return ERR_PTR(-EBUSY); 86062306a36Sopenharmony_ci } 86162306a36Sopenharmony_ci 86262306a36Sopenharmony_ci nofs_flag = memalloc_nofs_save(); 86362306a36Sopenharmony_ci device = btrfs_alloc_device(NULL, &devid, 86462306a36Sopenharmony_ci disk_super->dev_item.uuid, path); 86562306a36Sopenharmony_ci memalloc_nofs_restore(nofs_flag); 86662306a36Sopenharmony_ci if (IS_ERR(device)) { 86762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 86862306a36Sopenharmony_ci /* we can safely leave the fs_devices entry around */ 86962306a36Sopenharmony_ci return device; 87062306a36Sopenharmony_ci } 87162306a36Sopenharmony_ci 87262306a36Sopenharmony_ci device->devt = path_devt; 87362306a36Sopenharmony_ci 87462306a36Sopenharmony_ci list_add_rcu(&device->dev_list, &fs_devices->devices); 87562306a36Sopenharmony_ci fs_devices->num_devices++; 87662306a36Sopenharmony_ci 
87762306a36Sopenharmony_ci device->fs_devices = fs_devices; 87862306a36Sopenharmony_ci *new_device_added = true; 87962306a36Sopenharmony_ci 88062306a36Sopenharmony_ci if (disk_super->label[0]) 88162306a36Sopenharmony_ci pr_info( 88262306a36Sopenharmony_ci "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", 88362306a36Sopenharmony_ci disk_super->label, devid, found_transid, path, 88462306a36Sopenharmony_ci current->comm, task_pid_nr(current)); 88562306a36Sopenharmony_ci else 88662306a36Sopenharmony_ci pr_info( 88762306a36Sopenharmony_ci "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", 88862306a36Sopenharmony_ci disk_super->fsid, devid, found_transid, path, 88962306a36Sopenharmony_ci current->comm, task_pid_nr(current)); 89062306a36Sopenharmony_ci 89162306a36Sopenharmony_ci } else if (!device->name || strcmp(device->name->str, path)) { 89262306a36Sopenharmony_ci /* 89362306a36Sopenharmony_ci * When FS is already mounted. 89462306a36Sopenharmony_ci * 1. If you are here and if the device->name is NULL that 89562306a36Sopenharmony_ci * means this device was missing at time of FS mount. 89662306a36Sopenharmony_ci * 2. If you are here and if the device->name is different 89762306a36Sopenharmony_ci * from 'path' that means either 89862306a36Sopenharmony_ci * a. The same device disappeared and reappeared with 89962306a36Sopenharmony_ci * different name. or 90062306a36Sopenharmony_ci * b. The missing-disk-which-was-replaced, has 90162306a36Sopenharmony_ci * reappeared now. 90262306a36Sopenharmony_ci * 90362306a36Sopenharmony_ci * We must allow 1 and 2a above. But 2b would be a spurious 90462306a36Sopenharmony_ci * and unintentional. 90562306a36Sopenharmony_ci * 90662306a36Sopenharmony_ci * Further in case of 1 and 2a above, the disk at 'path' 90762306a36Sopenharmony_ci * would have missed some transaction when it was away and 90862306a36Sopenharmony_ci * in case of 2a the stale bdev has to be updated as well. 
90962306a36Sopenharmony_ci * 2b must not be allowed at all time. 91062306a36Sopenharmony_ci */ 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ci /* 91362306a36Sopenharmony_ci * For now, we do allow update to btrfs_fs_device through the 91462306a36Sopenharmony_ci * btrfs dev scan cli after FS has been mounted. We're still 91562306a36Sopenharmony_ci * tracking a problem where systems fail mount by subvolume id 91662306a36Sopenharmony_ci * when we reject replacement on a mounted FS. 91762306a36Sopenharmony_ci */ 91862306a36Sopenharmony_ci if (!fs_devices->opened && found_transid < device->generation) { 91962306a36Sopenharmony_ci /* 92062306a36Sopenharmony_ci * That is if the FS is _not_ mounted and if you 92162306a36Sopenharmony_ci * are here, that means there is more than one 92262306a36Sopenharmony_ci * disk with same uuid and devid.We keep the one 92362306a36Sopenharmony_ci * with larger generation number or the last-in if 92462306a36Sopenharmony_ci * generation are equal. 92562306a36Sopenharmony_ci */ 92662306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 92762306a36Sopenharmony_ci btrfs_err(NULL, 92862306a36Sopenharmony_ci"device %s already registered with a higher generation, found %llu expect %llu", 92962306a36Sopenharmony_ci path, found_transid, device->generation); 93062306a36Sopenharmony_ci return ERR_PTR(-EEXIST); 93162306a36Sopenharmony_ci } 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci /* 93462306a36Sopenharmony_ci * We are going to replace the device path for a given devid, 93562306a36Sopenharmony_ci * make sure it's the same device if the device is mounted 93662306a36Sopenharmony_ci * 93762306a36Sopenharmony_ci * NOTE: the device->fs_info may not be reliable here so pass 93862306a36Sopenharmony_ci * in a NULL to message helpers instead. This avoids a possible 93962306a36Sopenharmony_ci * use-after-free when the fs_info and fs_info->sb are already 94062306a36Sopenharmony_ci * torn down. 
94162306a36Sopenharmony_ci */ 94262306a36Sopenharmony_ci if (device->bdev) { 94362306a36Sopenharmony_ci if (device->devt != path_devt) { 94462306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 94562306a36Sopenharmony_ci btrfs_warn_in_rcu(NULL, 94662306a36Sopenharmony_ci "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 94762306a36Sopenharmony_ci path, devid, found_transid, 94862306a36Sopenharmony_ci current->comm, 94962306a36Sopenharmony_ci task_pid_nr(current)); 95062306a36Sopenharmony_ci return ERR_PTR(-EEXIST); 95162306a36Sopenharmony_ci } 95262306a36Sopenharmony_ci btrfs_info_in_rcu(NULL, 95362306a36Sopenharmony_ci "devid %llu device path %s changed to %s scanned by %s (%d)", 95462306a36Sopenharmony_ci devid, btrfs_dev_name(device), 95562306a36Sopenharmony_ci path, current->comm, 95662306a36Sopenharmony_ci task_pid_nr(current)); 95762306a36Sopenharmony_ci } 95862306a36Sopenharmony_ci 95962306a36Sopenharmony_ci name = rcu_string_strdup(path, GFP_NOFS); 96062306a36Sopenharmony_ci if (!name) { 96162306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 96262306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 96362306a36Sopenharmony_ci } 96462306a36Sopenharmony_ci rcu_string_free(device->name); 96562306a36Sopenharmony_ci rcu_assign_pointer(device->name, name); 96662306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 96762306a36Sopenharmony_ci fs_devices->missing_devices--; 96862306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 96962306a36Sopenharmony_ci } 97062306a36Sopenharmony_ci device->devt = path_devt; 97162306a36Sopenharmony_ci } 97262306a36Sopenharmony_ci 97362306a36Sopenharmony_ci /* 97462306a36Sopenharmony_ci * Unmount does not free the btrfs_device struct but would zero 97562306a36Sopenharmony_ci * generation along with most of the other members. So just update 97662306a36Sopenharmony_ci * it back. 
We need it to pick the disk with largest generation 97762306a36Sopenharmony_ci * (as above). 97862306a36Sopenharmony_ci */ 97962306a36Sopenharmony_ci if (!fs_devices->opened) { 98062306a36Sopenharmony_ci device->generation = found_transid; 98162306a36Sopenharmony_ci fs_devices->latest_generation = max_t(u64, found_transid, 98262306a36Sopenharmony_ci fs_devices->latest_generation); 98362306a36Sopenharmony_ci } 98462306a36Sopenharmony_ci 98562306a36Sopenharmony_ci fs_devices->total_devices = btrfs_super_num_devices(disk_super); 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 98862306a36Sopenharmony_ci return device; 98962306a36Sopenharmony_ci} 99062306a36Sopenharmony_ci 99162306a36Sopenharmony_cistatic struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 99262306a36Sopenharmony_ci{ 99362306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 99462306a36Sopenharmony_ci struct btrfs_device *device; 99562306a36Sopenharmony_ci struct btrfs_device *orig_dev; 99662306a36Sopenharmony_ci int ret = 0; 99762306a36Sopenharmony_ci 99862306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 99962306a36Sopenharmony_ci 100062306a36Sopenharmony_ci fs_devices = alloc_fs_devices(orig->fsid, NULL); 100162306a36Sopenharmony_ci if (IS_ERR(fs_devices)) 100262306a36Sopenharmony_ci return fs_devices; 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci fs_devices->total_devices = orig->total_devices; 100562306a36Sopenharmony_ci 100662306a36Sopenharmony_ci list_for_each_entry(orig_dev, &orig->devices, dev_list) { 100762306a36Sopenharmony_ci const char *dev_path = NULL; 100862306a36Sopenharmony_ci 100962306a36Sopenharmony_ci /* 101062306a36Sopenharmony_ci * This is ok to do without RCU read locked because we hold the 101162306a36Sopenharmony_ci * uuid mutex so nothing we touch in here is going to disappear. 
101262306a36Sopenharmony_ci */ 101362306a36Sopenharmony_ci if (orig_dev->name) 101462306a36Sopenharmony_ci dev_path = orig_dev->name->str; 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci device = btrfs_alloc_device(NULL, &orig_dev->devid, 101762306a36Sopenharmony_ci orig_dev->uuid, dev_path); 101862306a36Sopenharmony_ci if (IS_ERR(device)) { 101962306a36Sopenharmony_ci ret = PTR_ERR(device); 102062306a36Sopenharmony_ci goto error; 102162306a36Sopenharmony_ci } 102262306a36Sopenharmony_ci 102362306a36Sopenharmony_ci if (orig_dev->zone_info) { 102462306a36Sopenharmony_ci struct btrfs_zoned_device_info *zone_info; 102562306a36Sopenharmony_ci 102662306a36Sopenharmony_ci zone_info = btrfs_clone_dev_zone_info(orig_dev); 102762306a36Sopenharmony_ci if (!zone_info) { 102862306a36Sopenharmony_ci btrfs_free_device(device); 102962306a36Sopenharmony_ci ret = -ENOMEM; 103062306a36Sopenharmony_ci goto error; 103162306a36Sopenharmony_ci } 103262306a36Sopenharmony_ci device->zone_info = zone_info; 103362306a36Sopenharmony_ci } 103462306a36Sopenharmony_ci 103562306a36Sopenharmony_ci list_add(&device->dev_list, &fs_devices->devices); 103662306a36Sopenharmony_ci device->fs_devices = fs_devices; 103762306a36Sopenharmony_ci fs_devices->num_devices++; 103862306a36Sopenharmony_ci } 103962306a36Sopenharmony_ci return fs_devices; 104062306a36Sopenharmony_cierror: 104162306a36Sopenharmony_ci free_fs_devices(fs_devices); 104262306a36Sopenharmony_ci return ERR_PTR(ret); 104362306a36Sopenharmony_ci} 104462306a36Sopenharmony_ci 104562306a36Sopenharmony_cistatic void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 104662306a36Sopenharmony_ci struct btrfs_device **latest_dev) 104762306a36Sopenharmony_ci{ 104862306a36Sopenharmony_ci struct btrfs_device *device, *next; 104962306a36Sopenharmony_ci 105062306a36Sopenharmony_ci /* This is the initialized path, it is safe to release the devices. 
*/ 105162306a36Sopenharmony_ci list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 105262306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 105362306a36Sopenharmony_ci if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 105462306a36Sopenharmony_ci &device->dev_state) && 105562306a36Sopenharmony_ci !test_bit(BTRFS_DEV_STATE_MISSING, 105662306a36Sopenharmony_ci &device->dev_state) && 105762306a36Sopenharmony_ci (!*latest_dev || 105862306a36Sopenharmony_ci device->generation > (*latest_dev)->generation)) { 105962306a36Sopenharmony_ci *latest_dev = device; 106062306a36Sopenharmony_ci } 106162306a36Sopenharmony_ci continue; 106262306a36Sopenharmony_ci } 106362306a36Sopenharmony_ci 106462306a36Sopenharmony_ci /* 106562306a36Sopenharmony_ci * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 106662306a36Sopenharmony_ci * in btrfs_init_dev_replace() so just continue. 106762306a36Sopenharmony_ci */ 106862306a36Sopenharmony_ci if (device->devid == BTRFS_DEV_REPLACE_DEVID) 106962306a36Sopenharmony_ci continue; 107062306a36Sopenharmony_ci 107162306a36Sopenharmony_ci if (device->bdev) { 107262306a36Sopenharmony_ci blkdev_put(device->bdev, device->holder); 107362306a36Sopenharmony_ci device->bdev = NULL; 107462306a36Sopenharmony_ci fs_devices->open_devices--; 107562306a36Sopenharmony_ci } 107662306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 107762306a36Sopenharmony_ci list_del_init(&device->dev_alloc_list); 107862306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 107962306a36Sopenharmony_ci fs_devices->rw_devices--; 108062306a36Sopenharmony_ci } 108162306a36Sopenharmony_ci list_del_init(&device->dev_list); 108262306a36Sopenharmony_ci fs_devices->num_devices--; 108362306a36Sopenharmony_ci btrfs_free_device(device); 108462306a36Sopenharmony_ci } 108562306a36Sopenharmony_ci 108662306a36Sopenharmony_ci} 108762306a36Sopenharmony_ci 
108862306a36Sopenharmony_ci/* 108962306a36Sopenharmony_ci * After we have read the system tree and know devids belonging to this 109062306a36Sopenharmony_ci * filesystem, remove the device which does not belong there. 109162306a36Sopenharmony_ci */ 109262306a36Sopenharmony_civoid btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 109362306a36Sopenharmony_ci{ 109462306a36Sopenharmony_ci struct btrfs_device *latest_dev = NULL; 109562306a36Sopenharmony_ci struct btrfs_fs_devices *seed_dev; 109662306a36Sopenharmony_ci 109762306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 109862306a36Sopenharmony_ci __btrfs_free_extra_devids(fs_devices, &latest_dev); 109962306a36Sopenharmony_ci 110062306a36Sopenharmony_ci list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 110162306a36Sopenharmony_ci __btrfs_free_extra_devids(seed_dev, &latest_dev); 110262306a36Sopenharmony_ci 110362306a36Sopenharmony_ci fs_devices->latest_dev = latest_dev; 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 110662306a36Sopenharmony_ci} 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_cistatic void btrfs_close_bdev(struct btrfs_device *device) 110962306a36Sopenharmony_ci{ 111062306a36Sopenharmony_ci if (!device->bdev) 111162306a36Sopenharmony_ci return; 111262306a36Sopenharmony_ci 111362306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 111462306a36Sopenharmony_ci sync_blockdev(device->bdev); 111562306a36Sopenharmony_ci invalidate_bdev(device->bdev); 111662306a36Sopenharmony_ci } 111762306a36Sopenharmony_ci 111862306a36Sopenharmony_ci blkdev_put(device->bdev, device->holder); 111962306a36Sopenharmony_ci} 112062306a36Sopenharmony_ci 112162306a36Sopenharmony_cistatic void btrfs_close_one_device(struct btrfs_device *device) 112262306a36Sopenharmony_ci{ 112362306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = device->fs_devices; 112462306a36Sopenharmony_ci 112562306a36Sopenharmony_ci if 
(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 112662306a36Sopenharmony_ci device->devid != BTRFS_DEV_REPLACE_DEVID) { 112762306a36Sopenharmony_ci list_del_init(&device->dev_alloc_list); 112862306a36Sopenharmony_ci fs_devices->rw_devices--; 112962306a36Sopenharmony_ci } 113062306a36Sopenharmony_ci 113162306a36Sopenharmony_ci if (device->devid == BTRFS_DEV_REPLACE_DEVID) 113262306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 113362306a36Sopenharmony_ci 113462306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 113562306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 113662306a36Sopenharmony_ci fs_devices->missing_devices--; 113762306a36Sopenharmony_ci } 113862306a36Sopenharmony_ci 113962306a36Sopenharmony_ci btrfs_close_bdev(device); 114062306a36Sopenharmony_ci if (device->bdev) { 114162306a36Sopenharmony_ci fs_devices->open_devices--; 114262306a36Sopenharmony_ci device->bdev = NULL; 114362306a36Sopenharmony_ci } 114462306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 114562306a36Sopenharmony_ci btrfs_destroy_dev_zone_info(device); 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci device->fs_info = NULL; 114862306a36Sopenharmony_ci atomic_set(&device->dev_stats_ccnt, 0); 114962306a36Sopenharmony_ci extent_io_tree_release(&device->alloc_state); 115062306a36Sopenharmony_ci 115162306a36Sopenharmony_ci /* 115262306a36Sopenharmony_ci * Reset the flush error record. We might have a transient flush error 115362306a36Sopenharmony_ci * in this mount, and if so we aborted the current transaction and set 115462306a36Sopenharmony_ci * the fs to an error state, guaranteeing no super blocks can be further 115562306a36Sopenharmony_ci * committed. 
However that error might be transient and if we unmount the 115662306a36Sopenharmony_ci * filesystem and mount it again, we should allow the mount to succeed 115762306a36Sopenharmony_ci * (btrfs_check_rw_degradable() should not fail) - if after mounting the 115862306a36Sopenharmony_ci * filesystem again we still get flush errors, then we will again abort 115962306a36Sopenharmony_ci * any transaction and set the error state, guaranteeing no commits of 116062306a36Sopenharmony_ci * unsafe super blocks. 116162306a36Sopenharmony_ci */ 116262306a36Sopenharmony_ci device->last_flush_error = 0; 116362306a36Sopenharmony_ci 116462306a36Sopenharmony_ci /* Verify the device is back in a pristine state */ 116562306a36Sopenharmony_ci WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 116662306a36Sopenharmony_ci WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 116762306a36Sopenharmony_ci WARN_ON(!list_empty(&device->dev_alloc_list)); 116862306a36Sopenharmony_ci WARN_ON(!list_empty(&device->post_commit_list)); 116962306a36Sopenharmony_ci} 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_cistatic void close_fs_devices(struct btrfs_fs_devices *fs_devices) 117262306a36Sopenharmony_ci{ 117362306a36Sopenharmony_ci struct btrfs_device *device, *tmp; 117462306a36Sopenharmony_ci 117562306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 117662306a36Sopenharmony_ci 117762306a36Sopenharmony_ci if (--fs_devices->opened > 0) 117862306a36Sopenharmony_ci return; 117962306a36Sopenharmony_ci 118062306a36Sopenharmony_ci list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 118162306a36Sopenharmony_ci btrfs_close_one_device(device); 118262306a36Sopenharmony_ci 118362306a36Sopenharmony_ci WARN_ON(fs_devices->open_devices); 118462306a36Sopenharmony_ci WARN_ON(fs_devices->rw_devices); 118562306a36Sopenharmony_ci fs_devices->opened = 0; 118662306a36Sopenharmony_ci fs_devices->seeding = false; 118762306a36Sopenharmony_ci fs_devices->fs_info = 
NULL; 118862306a36Sopenharmony_ci} 118962306a36Sopenharmony_ci 119062306a36Sopenharmony_civoid btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 119162306a36Sopenharmony_ci{ 119262306a36Sopenharmony_ci LIST_HEAD(list); 119362306a36Sopenharmony_ci struct btrfs_fs_devices *tmp; 119462306a36Sopenharmony_ci 119562306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 119662306a36Sopenharmony_ci close_fs_devices(fs_devices); 119762306a36Sopenharmony_ci if (!fs_devices->opened) { 119862306a36Sopenharmony_ci list_splice_init(&fs_devices->seed_list, &list); 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci /* 120162306a36Sopenharmony_ci * If the struct btrfs_fs_devices is not assembled with any 120262306a36Sopenharmony_ci * other device, it can be re-initialized during the next mount 120362306a36Sopenharmony_ci * without the needing device-scan step. Therefore, it can be 120462306a36Sopenharmony_ci * fully freed. 120562306a36Sopenharmony_ci */ 120662306a36Sopenharmony_ci if (fs_devices->num_devices == 1) { 120762306a36Sopenharmony_ci list_del(&fs_devices->fs_list); 120862306a36Sopenharmony_ci free_fs_devices(fs_devices); 120962306a36Sopenharmony_ci } 121062306a36Sopenharmony_ci } 121162306a36Sopenharmony_ci 121262306a36Sopenharmony_ci 121362306a36Sopenharmony_ci list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 121462306a36Sopenharmony_ci close_fs_devices(fs_devices); 121562306a36Sopenharmony_ci list_del(&fs_devices->seed_list); 121662306a36Sopenharmony_ci free_fs_devices(fs_devices); 121762306a36Sopenharmony_ci } 121862306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 121962306a36Sopenharmony_ci} 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_cistatic int open_fs_devices(struct btrfs_fs_devices *fs_devices, 122262306a36Sopenharmony_ci blk_mode_t flags, void *holder) 122362306a36Sopenharmony_ci{ 122462306a36Sopenharmony_ci struct btrfs_device *device; 122562306a36Sopenharmony_ci struct btrfs_device *latest_dev = NULL; 122662306a36Sopenharmony_ci 
struct btrfs_device *tmp_device; 122762306a36Sopenharmony_ci 122862306a36Sopenharmony_ci list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 122962306a36Sopenharmony_ci dev_list) { 123062306a36Sopenharmony_ci int ret; 123162306a36Sopenharmony_ci 123262306a36Sopenharmony_ci ret = btrfs_open_one_device(fs_devices, device, flags, holder); 123362306a36Sopenharmony_ci if (ret == 0 && 123462306a36Sopenharmony_ci (!latest_dev || device->generation > latest_dev->generation)) { 123562306a36Sopenharmony_ci latest_dev = device; 123662306a36Sopenharmony_ci } else if (ret == -ENODATA) { 123762306a36Sopenharmony_ci fs_devices->num_devices--; 123862306a36Sopenharmony_ci list_del(&device->dev_list); 123962306a36Sopenharmony_ci btrfs_free_device(device); 124062306a36Sopenharmony_ci } 124162306a36Sopenharmony_ci } 124262306a36Sopenharmony_ci if (fs_devices->open_devices == 0) 124362306a36Sopenharmony_ci return -EINVAL; 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci fs_devices->opened = 1; 124662306a36Sopenharmony_ci fs_devices->latest_dev = latest_dev; 124762306a36Sopenharmony_ci fs_devices->total_rw_bytes = 0; 124862306a36Sopenharmony_ci fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 124962306a36Sopenharmony_ci fs_devices->read_policy = BTRFS_READ_POLICY_PID; 125062306a36Sopenharmony_ci 125162306a36Sopenharmony_ci return 0; 125262306a36Sopenharmony_ci} 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_cistatic int devid_cmp(void *priv, const struct list_head *a, 125562306a36Sopenharmony_ci const struct list_head *b) 125662306a36Sopenharmony_ci{ 125762306a36Sopenharmony_ci const struct btrfs_device *dev1, *dev2; 125862306a36Sopenharmony_ci 125962306a36Sopenharmony_ci dev1 = list_entry(a, struct btrfs_device, dev_list); 126062306a36Sopenharmony_ci dev2 = list_entry(b, struct btrfs_device, dev_list); 126162306a36Sopenharmony_ci 126262306a36Sopenharmony_ci if (dev1->devid < dev2->devid) 126362306a36Sopenharmony_ci return -1; 
126462306a36Sopenharmony_ci else if (dev1->devid > dev2->devid) 126562306a36Sopenharmony_ci return 1; 126662306a36Sopenharmony_ci return 0; 126762306a36Sopenharmony_ci} 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ciint btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 127062306a36Sopenharmony_ci blk_mode_t flags, void *holder) 127162306a36Sopenharmony_ci{ 127262306a36Sopenharmony_ci int ret; 127362306a36Sopenharmony_ci 127462306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 127562306a36Sopenharmony_ci /* 127662306a36Sopenharmony_ci * The device_list_mutex cannot be taken here in case opening the 127762306a36Sopenharmony_ci * underlying device takes further locks like open_mutex. 127862306a36Sopenharmony_ci * 127962306a36Sopenharmony_ci * We also don't need the lock here as this is called during mount and 128062306a36Sopenharmony_ci * exclusion is provided by uuid_mutex 128162306a36Sopenharmony_ci */ 128262306a36Sopenharmony_ci 128362306a36Sopenharmony_ci if (fs_devices->opened) { 128462306a36Sopenharmony_ci fs_devices->opened++; 128562306a36Sopenharmony_ci ret = 0; 128662306a36Sopenharmony_ci } else { 128762306a36Sopenharmony_ci list_sort(NULL, &fs_devices->devices, devid_cmp); 128862306a36Sopenharmony_ci ret = open_fs_devices(fs_devices, flags, holder); 128962306a36Sopenharmony_ci } 129062306a36Sopenharmony_ci 129162306a36Sopenharmony_ci return ret; 129262306a36Sopenharmony_ci} 129362306a36Sopenharmony_ci 129462306a36Sopenharmony_civoid btrfs_release_disk_super(struct btrfs_super_block *super) 129562306a36Sopenharmony_ci{ 129662306a36Sopenharmony_ci struct page *page = virt_to_page(super); 129762306a36Sopenharmony_ci 129862306a36Sopenharmony_ci put_page(page); 129962306a36Sopenharmony_ci} 130062306a36Sopenharmony_ci 130162306a36Sopenharmony_cistatic struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 130262306a36Sopenharmony_ci u64 bytenr, u64 bytenr_orig) 130362306a36Sopenharmony_ci{ 130462306a36Sopenharmony_ci struct 
btrfs_super_block *disk_super; 130562306a36Sopenharmony_ci struct page *page; 130662306a36Sopenharmony_ci void *p; 130762306a36Sopenharmony_ci pgoff_t index; 130862306a36Sopenharmony_ci 130962306a36Sopenharmony_ci /* make sure our super fits in the device */ 131062306a36Sopenharmony_ci if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) 131162306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 131262306a36Sopenharmony_ci 131362306a36Sopenharmony_ci /* make sure our super fits in the page */ 131462306a36Sopenharmony_ci if (sizeof(*disk_super) > PAGE_SIZE) 131562306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 131662306a36Sopenharmony_ci 131762306a36Sopenharmony_ci /* make sure our super doesn't straddle pages on disk */ 131862306a36Sopenharmony_ci index = bytenr >> PAGE_SHIFT; 131962306a36Sopenharmony_ci if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 132062306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 132162306a36Sopenharmony_ci 132262306a36Sopenharmony_ci /* pull in the page with our super */ 132362306a36Sopenharmony_ci page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 132462306a36Sopenharmony_ci 132562306a36Sopenharmony_ci if (IS_ERR(page)) 132662306a36Sopenharmony_ci return ERR_CAST(page); 132762306a36Sopenharmony_ci 132862306a36Sopenharmony_ci p = page_address(page); 132962306a36Sopenharmony_ci 133062306a36Sopenharmony_ci /* align our pointer to the offset of the super block */ 133162306a36Sopenharmony_ci disk_super = p + offset_in_page(bytenr); 133262306a36Sopenharmony_ci 133362306a36Sopenharmony_ci if (btrfs_super_bytenr(disk_super) != bytenr_orig || 133462306a36Sopenharmony_ci btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 133562306a36Sopenharmony_ci btrfs_release_disk_super(p); 133662306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 133762306a36Sopenharmony_ci } 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_ci if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 134062306a36Sopenharmony_ci 
disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 134162306a36Sopenharmony_ci 134262306a36Sopenharmony_ci return disk_super; 134362306a36Sopenharmony_ci} 134462306a36Sopenharmony_ci 134562306a36Sopenharmony_ciint btrfs_forget_devices(dev_t devt) 134662306a36Sopenharmony_ci{ 134762306a36Sopenharmony_ci int ret; 134862306a36Sopenharmony_ci 134962306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 135062306a36Sopenharmony_ci ret = btrfs_free_stale_devices(devt, NULL); 135162306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 135262306a36Sopenharmony_ci 135362306a36Sopenharmony_ci return ret; 135462306a36Sopenharmony_ci} 135562306a36Sopenharmony_ci 135662306a36Sopenharmony_ci/* 135762306a36Sopenharmony_ci * Look for a btrfs signature on a device. This may be called out of the mount path 135862306a36Sopenharmony_ci * and we are not allowed to call set_blocksize during the scan. The superblock 135962306a36Sopenharmony_ci * is read via pagecache 136062306a36Sopenharmony_ci */ 136162306a36Sopenharmony_cistruct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) 136262306a36Sopenharmony_ci{ 136362306a36Sopenharmony_ci struct btrfs_super_block *disk_super; 136462306a36Sopenharmony_ci bool new_device_added = false; 136562306a36Sopenharmony_ci struct btrfs_device *device = NULL; 136662306a36Sopenharmony_ci struct block_device *bdev; 136762306a36Sopenharmony_ci u64 bytenr, bytenr_orig; 136862306a36Sopenharmony_ci int ret; 136962306a36Sopenharmony_ci 137062306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 137162306a36Sopenharmony_ci 137262306a36Sopenharmony_ci /* 137362306a36Sopenharmony_ci * we would like to check all the supers, but that would make 137462306a36Sopenharmony_ci * a btrfs mount succeed after a mkfs from a different FS. 
137562306a36Sopenharmony_ci * So, we need to add a special mount option to scan for 137662306a36Sopenharmony_ci * later supers, using BTRFS_SUPER_MIRROR_MAX instead 137762306a36Sopenharmony_ci */ 137862306a36Sopenharmony_ci 137962306a36Sopenharmony_ci /* 138062306a36Sopenharmony_ci * Avoid an exclusive open here, as the systemd-udev may initiate the 138162306a36Sopenharmony_ci * device scan which may race with the user's mount or mkfs command, 138262306a36Sopenharmony_ci * resulting in failure. 138362306a36Sopenharmony_ci * Since the device scan is solely for reading purposes, there is no 138462306a36Sopenharmony_ci * need for an exclusive open. Additionally, the devices are read again 138562306a36Sopenharmony_ci * during the mount process. It is ok to get some inconsistent 138662306a36Sopenharmony_ci * values temporarily, as the device paths of the fsid are the only 138762306a36Sopenharmony_ci * required information for assembling the volume. 138862306a36Sopenharmony_ci */ 138962306a36Sopenharmony_ci bdev = blkdev_get_by_path(path, flags, NULL, NULL); 139062306a36Sopenharmony_ci if (IS_ERR(bdev)) 139162306a36Sopenharmony_ci return ERR_CAST(bdev); 139262306a36Sopenharmony_ci 139362306a36Sopenharmony_ci bytenr_orig = btrfs_sb_offset(0); 139462306a36Sopenharmony_ci ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 139562306a36Sopenharmony_ci if (ret) { 139662306a36Sopenharmony_ci device = ERR_PTR(ret); 139762306a36Sopenharmony_ci goto error_bdev_put; 139862306a36Sopenharmony_ci } 139962306a36Sopenharmony_ci 140062306a36Sopenharmony_ci disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 140162306a36Sopenharmony_ci if (IS_ERR(disk_super)) { 140262306a36Sopenharmony_ci device = ERR_CAST(disk_super); 140362306a36Sopenharmony_ci goto error_bdev_put; 140462306a36Sopenharmony_ci } 140562306a36Sopenharmony_ci 140662306a36Sopenharmony_ci device = device_list_add(path, disk_super, &new_device_added); 140762306a36Sopenharmony_ci if (!IS_ERR(device) && 
new_device_added) 140862306a36Sopenharmony_ci btrfs_free_stale_devices(device->devt, device); 140962306a36Sopenharmony_ci 141062306a36Sopenharmony_ci btrfs_release_disk_super(disk_super); 141162306a36Sopenharmony_ci 141262306a36Sopenharmony_cierror_bdev_put: 141362306a36Sopenharmony_ci blkdev_put(bdev, NULL); 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci return device; 141662306a36Sopenharmony_ci} 141762306a36Sopenharmony_ci 141862306a36Sopenharmony_ci/* 141962306a36Sopenharmony_ci * Try to find a chunk that intersects [start, start + len] range and when one 142062306a36Sopenharmony_ci * such is found, record the end of it in *start 142162306a36Sopenharmony_ci */ 142262306a36Sopenharmony_cistatic bool contains_pending_extent(struct btrfs_device *device, u64 *start, 142362306a36Sopenharmony_ci u64 len) 142462306a36Sopenharmony_ci{ 142562306a36Sopenharmony_ci u64 physical_start, physical_end; 142662306a36Sopenharmony_ci 142762306a36Sopenharmony_ci lockdep_assert_held(&device->fs_info->chunk_mutex); 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci if (find_first_extent_bit(&device->alloc_state, *start, 143062306a36Sopenharmony_ci &physical_start, &physical_end, 143162306a36Sopenharmony_ci CHUNK_ALLOCATED, NULL)) { 143262306a36Sopenharmony_ci 143362306a36Sopenharmony_ci if (in_range(physical_start, *start, len) || 143462306a36Sopenharmony_ci in_range(*start, physical_start, 143562306a36Sopenharmony_ci physical_end + 1 - physical_start)) { 143662306a36Sopenharmony_ci *start = physical_end + 1; 143762306a36Sopenharmony_ci return true; 143862306a36Sopenharmony_ci } 143962306a36Sopenharmony_ci } 144062306a36Sopenharmony_ci return false; 144162306a36Sopenharmony_ci} 144262306a36Sopenharmony_ci 144362306a36Sopenharmony_cistatic u64 dev_extent_search_start(struct btrfs_device *device) 144462306a36Sopenharmony_ci{ 144562306a36Sopenharmony_ci switch (device->fs_devices->chunk_alloc_policy) { 144662306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_REGULAR: 
144762306a36Sopenharmony_ci return BTRFS_DEVICE_RANGE_RESERVED; 144862306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_ZONED: 144962306a36Sopenharmony_ci /* 145062306a36Sopenharmony_ci * We don't care about the starting region like regular 145162306a36Sopenharmony_ci * allocator, because we anyway use/reserve the first two zones 145262306a36Sopenharmony_ci * for superblock logging. 145362306a36Sopenharmony_ci */ 145462306a36Sopenharmony_ci return 0; 145562306a36Sopenharmony_ci default: 145662306a36Sopenharmony_ci BUG(); 145762306a36Sopenharmony_ci } 145862306a36Sopenharmony_ci} 145962306a36Sopenharmony_ci 146062306a36Sopenharmony_cistatic bool dev_extent_hole_check_zoned(struct btrfs_device *device, 146162306a36Sopenharmony_ci u64 *hole_start, u64 *hole_size, 146262306a36Sopenharmony_ci u64 num_bytes) 146362306a36Sopenharmony_ci{ 146462306a36Sopenharmony_ci u64 zone_size = device->zone_info->zone_size; 146562306a36Sopenharmony_ci u64 pos; 146662306a36Sopenharmony_ci int ret; 146762306a36Sopenharmony_ci bool changed = false; 146862306a36Sopenharmony_ci 146962306a36Sopenharmony_ci ASSERT(IS_ALIGNED(*hole_start, zone_size)); 147062306a36Sopenharmony_ci 147162306a36Sopenharmony_ci while (*hole_size > 0) { 147262306a36Sopenharmony_ci pos = btrfs_find_allocatable_zones(device, *hole_start, 147362306a36Sopenharmony_ci *hole_start + *hole_size, 147462306a36Sopenharmony_ci num_bytes); 147562306a36Sopenharmony_ci if (pos != *hole_start) { 147662306a36Sopenharmony_ci *hole_size = *hole_start + *hole_size - pos; 147762306a36Sopenharmony_ci *hole_start = pos; 147862306a36Sopenharmony_ci changed = true; 147962306a36Sopenharmony_ci if (*hole_size < num_bytes) 148062306a36Sopenharmony_ci break; 148162306a36Sopenharmony_ci } 148262306a36Sopenharmony_ci 148362306a36Sopenharmony_ci ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_ci /* Range is ensured to be empty */ 148662306a36Sopenharmony_ci if (!ret) 
148762306a36Sopenharmony_ci return changed; 148862306a36Sopenharmony_ci 148962306a36Sopenharmony_ci /* Given hole range was invalid (outside of device) */ 149062306a36Sopenharmony_ci if (ret == -ERANGE) { 149162306a36Sopenharmony_ci *hole_start += *hole_size; 149262306a36Sopenharmony_ci *hole_size = 0; 149362306a36Sopenharmony_ci return true; 149462306a36Sopenharmony_ci } 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci *hole_start += zone_size; 149762306a36Sopenharmony_ci *hole_size -= zone_size; 149862306a36Sopenharmony_ci changed = true; 149962306a36Sopenharmony_ci } 150062306a36Sopenharmony_ci 150162306a36Sopenharmony_ci return changed; 150262306a36Sopenharmony_ci} 150362306a36Sopenharmony_ci 150462306a36Sopenharmony_ci/* 150562306a36Sopenharmony_ci * Check if specified hole is suitable for allocation. 150662306a36Sopenharmony_ci * 150762306a36Sopenharmony_ci * @device: the device which we have the hole 150862306a36Sopenharmony_ci * @hole_start: starting position of the hole 150962306a36Sopenharmony_ci * @hole_size: the size of the hole 151062306a36Sopenharmony_ci * @num_bytes: the size of the free space that we need 151162306a36Sopenharmony_ci * 151262306a36Sopenharmony_ci * This function may modify @hole_start and @hole_size to reflect the suitable 151362306a36Sopenharmony_ci * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 151462306a36Sopenharmony_ci */ 151562306a36Sopenharmony_cistatic bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 151662306a36Sopenharmony_ci u64 *hole_size, u64 num_bytes) 151762306a36Sopenharmony_ci{ 151862306a36Sopenharmony_ci bool changed = false; 151962306a36Sopenharmony_ci u64 hole_end = *hole_start + *hole_size; 152062306a36Sopenharmony_ci 152162306a36Sopenharmony_ci for (;;) { 152262306a36Sopenharmony_ci /* 152362306a36Sopenharmony_ci * Check before we set max_hole_start, otherwise we could end up 152462306a36Sopenharmony_ci * sending back this offset anyway. 
152562306a36Sopenharmony_ci */ 152662306a36Sopenharmony_ci if (contains_pending_extent(device, hole_start, *hole_size)) { 152762306a36Sopenharmony_ci if (hole_end >= *hole_start) 152862306a36Sopenharmony_ci *hole_size = hole_end - *hole_start; 152962306a36Sopenharmony_ci else 153062306a36Sopenharmony_ci *hole_size = 0; 153162306a36Sopenharmony_ci changed = true; 153262306a36Sopenharmony_ci } 153362306a36Sopenharmony_ci 153462306a36Sopenharmony_ci switch (device->fs_devices->chunk_alloc_policy) { 153562306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_REGULAR: 153662306a36Sopenharmony_ci /* No extra check */ 153762306a36Sopenharmony_ci break; 153862306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_ZONED: 153962306a36Sopenharmony_ci if (dev_extent_hole_check_zoned(device, hole_start, 154062306a36Sopenharmony_ci hole_size, num_bytes)) { 154162306a36Sopenharmony_ci changed = true; 154262306a36Sopenharmony_ci /* 154362306a36Sopenharmony_ci * The changed hole can contain pending extent. 154462306a36Sopenharmony_ci * Loop again to check that. 154562306a36Sopenharmony_ci */ 154662306a36Sopenharmony_ci continue; 154762306a36Sopenharmony_ci } 154862306a36Sopenharmony_ci break; 154962306a36Sopenharmony_ci default: 155062306a36Sopenharmony_ci BUG(); 155162306a36Sopenharmony_ci } 155262306a36Sopenharmony_ci 155362306a36Sopenharmony_ci break; 155462306a36Sopenharmony_ci } 155562306a36Sopenharmony_ci 155662306a36Sopenharmony_ci return changed; 155762306a36Sopenharmony_ci} 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_ci/* 156062306a36Sopenharmony_ci * Find free space in the specified device. 156162306a36Sopenharmony_ci * 156262306a36Sopenharmony_ci * @device: the device which we search the free space in 156362306a36Sopenharmony_ci * @num_bytes: the size of the free space that we need 156462306a36Sopenharmony_ci * @search_start: the position from which to begin the search 156562306a36Sopenharmony_ci * @start: store the start of the free space. 
156662306a36Sopenharmony_ci * @len: the size of the free space. that we find, or the size 156762306a36Sopenharmony_ci * of the max free space if we don't find suitable free space 156862306a36Sopenharmony_ci * 156962306a36Sopenharmony_ci * This does a pretty simple search, the expectation is that it is called very 157062306a36Sopenharmony_ci * infrequently and that a given device has a small number of extents. 157162306a36Sopenharmony_ci * 157262306a36Sopenharmony_ci * @start is used to store the start of the free space if we find. But if we 157362306a36Sopenharmony_ci * don't find suitable free space, it will be used to store the start position 157462306a36Sopenharmony_ci * of the max free space. 157562306a36Sopenharmony_ci * 157662306a36Sopenharmony_ci * @len is used to store the size of the free space that we find. 157762306a36Sopenharmony_ci * But if we don't find suitable free space, it is used to store the size of 157862306a36Sopenharmony_ci * the max free space. 157962306a36Sopenharmony_ci * 158062306a36Sopenharmony_ci * NOTE: This function will search *commit* root of device tree, and does extra 158162306a36Sopenharmony_ci * check to ensure dev extents are not double allocated. 158262306a36Sopenharmony_ci * This makes the function safe to allocate dev extents but may not report 158362306a36Sopenharmony_ci * correct usable device space, as device extent freed in current transaction 158462306a36Sopenharmony_ci * is not reported as available. 
158562306a36Sopenharmony_ci */ 158662306a36Sopenharmony_cistatic int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 158762306a36Sopenharmony_ci u64 *start, u64 *len) 158862306a36Sopenharmony_ci{ 158962306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 159062306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 159162306a36Sopenharmony_ci struct btrfs_key key; 159262306a36Sopenharmony_ci struct btrfs_dev_extent *dev_extent; 159362306a36Sopenharmony_ci struct btrfs_path *path; 159462306a36Sopenharmony_ci u64 search_start; 159562306a36Sopenharmony_ci u64 hole_size; 159662306a36Sopenharmony_ci u64 max_hole_start; 159762306a36Sopenharmony_ci u64 max_hole_size = 0; 159862306a36Sopenharmony_ci u64 extent_end; 159962306a36Sopenharmony_ci u64 search_end = device->total_bytes; 160062306a36Sopenharmony_ci int ret; 160162306a36Sopenharmony_ci int slot; 160262306a36Sopenharmony_ci struct extent_buffer *l; 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_ci search_start = dev_extent_search_start(device); 160562306a36Sopenharmony_ci max_hole_start = search_start; 160662306a36Sopenharmony_ci 160762306a36Sopenharmony_ci WARN_ON(device->zone_info && 160862306a36Sopenharmony_ci !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 160962306a36Sopenharmony_ci 161062306a36Sopenharmony_ci path = btrfs_alloc_path(); 161162306a36Sopenharmony_ci if (!path) { 161262306a36Sopenharmony_ci ret = -ENOMEM; 161362306a36Sopenharmony_ci goto out; 161462306a36Sopenharmony_ci } 161562306a36Sopenharmony_ciagain: 161662306a36Sopenharmony_ci if (search_start >= search_end || 161762306a36Sopenharmony_ci test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 161862306a36Sopenharmony_ci ret = -ENOSPC; 161962306a36Sopenharmony_ci goto out; 162062306a36Sopenharmony_ci } 162162306a36Sopenharmony_ci 162262306a36Sopenharmony_ci path->reada = READA_FORWARD; 162362306a36Sopenharmony_ci path->search_commit_root = 1; 162462306a36Sopenharmony_ci 
path->skip_locking = 1; 162562306a36Sopenharmony_ci 162662306a36Sopenharmony_ci key.objectid = device->devid; 162762306a36Sopenharmony_ci key.offset = search_start; 162862306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 162962306a36Sopenharmony_ci 163062306a36Sopenharmony_ci ret = btrfs_search_backwards(root, &key, path); 163162306a36Sopenharmony_ci if (ret < 0) 163262306a36Sopenharmony_ci goto out; 163362306a36Sopenharmony_ci 163462306a36Sopenharmony_ci while (search_start < search_end) { 163562306a36Sopenharmony_ci l = path->nodes[0]; 163662306a36Sopenharmony_ci slot = path->slots[0]; 163762306a36Sopenharmony_ci if (slot >= btrfs_header_nritems(l)) { 163862306a36Sopenharmony_ci ret = btrfs_next_leaf(root, path); 163962306a36Sopenharmony_ci if (ret == 0) 164062306a36Sopenharmony_ci continue; 164162306a36Sopenharmony_ci if (ret < 0) 164262306a36Sopenharmony_ci goto out; 164362306a36Sopenharmony_ci 164462306a36Sopenharmony_ci break; 164562306a36Sopenharmony_ci } 164662306a36Sopenharmony_ci btrfs_item_key_to_cpu(l, &key, slot); 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_ci if (key.objectid < device->devid) 164962306a36Sopenharmony_ci goto next; 165062306a36Sopenharmony_ci 165162306a36Sopenharmony_ci if (key.objectid > device->devid) 165262306a36Sopenharmony_ci break; 165362306a36Sopenharmony_ci 165462306a36Sopenharmony_ci if (key.type != BTRFS_DEV_EXTENT_KEY) 165562306a36Sopenharmony_ci goto next; 165662306a36Sopenharmony_ci 165762306a36Sopenharmony_ci if (key.offset > search_end) 165862306a36Sopenharmony_ci break; 165962306a36Sopenharmony_ci 166062306a36Sopenharmony_ci if (key.offset > search_start) { 166162306a36Sopenharmony_ci hole_size = key.offset - search_start; 166262306a36Sopenharmony_ci dev_extent_hole_check(device, &search_start, &hole_size, 166362306a36Sopenharmony_ci num_bytes); 166462306a36Sopenharmony_ci 166562306a36Sopenharmony_ci if (hole_size > max_hole_size) { 166662306a36Sopenharmony_ci max_hole_start = search_start; 
166762306a36Sopenharmony_ci max_hole_size = hole_size; 166862306a36Sopenharmony_ci } 166962306a36Sopenharmony_ci 167062306a36Sopenharmony_ci /* 167162306a36Sopenharmony_ci * If this free space is greater than which we need, 167262306a36Sopenharmony_ci * it must be the max free space that we have found 167362306a36Sopenharmony_ci * until now, so max_hole_start must point to the start 167462306a36Sopenharmony_ci * of this free space and the length of this free space 167562306a36Sopenharmony_ci * is stored in max_hole_size. Thus, we return 167662306a36Sopenharmony_ci * max_hole_start and max_hole_size and go back to the 167762306a36Sopenharmony_ci * caller. 167862306a36Sopenharmony_ci */ 167962306a36Sopenharmony_ci if (hole_size >= num_bytes) { 168062306a36Sopenharmony_ci ret = 0; 168162306a36Sopenharmony_ci goto out; 168262306a36Sopenharmony_ci } 168362306a36Sopenharmony_ci } 168462306a36Sopenharmony_ci 168562306a36Sopenharmony_ci dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 168662306a36Sopenharmony_ci extent_end = key.offset + btrfs_dev_extent_length(l, 168762306a36Sopenharmony_ci dev_extent); 168862306a36Sopenharmony_ci if (extent_end > search_start) 168962306a36Sopenharmony_ci search_start = extent_end; 169062306a36Sopenharmony_cinext: 169162306a36Sopenharmony_ci path->slots[0]++; 169262306a36Sopenharmony_ci cond_resched(); 169362306a36Sopenharmony_ci } 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_ci /* 169662306a36Sopenharmony_ci * At this point, search_start should be the end of 169762306a36Sopenharmony_ci * allocated dev extents, and when shrinking the device, 169862306a36Sopenharmony_ci * search_end may be smaller than search_start. 
169962306a36Sopenharmony_ci */ 170062306a36Sopenharmony_ci if (search_end > search_start) { 170162306a36Sopenharmony_ci hole_size = search_end - search_start; 170262306a36Sopenharmony_ci if (dev_extent_hole_check(device, &search_start, &hole_size, 170362306a36Sopenharmony_ci num_bytes)) { 170462306a36Sopenharmony_ci btrfs_release_path(path); 170562306a36Sopenharmony_ci goto again; 170662306a36Sopenharmony_ci } 170762306a36Sopenharmony_ci 170862306a36Sopenharmony_ci if (hole_size > max_hole_size) { 170962306a36Sopenharmony_ci max_hole_start = search_start; 171062306a36Sopenharmony_ci max_hole_size = hole_size; 171162306a36Sopenharmony_ci } 171262306a36Sopenharmony_ci } 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci /* See above. */ 171562306a36Sopenharmony_ci if (max_hole_size < num_bytes) 171662306a36Sopenharmony_ci ret = -ENOSPC; 171762306a36Sopenharmony_ci else 171862306a36Sopenharmony_ci ret = 0; 171962306a36Sopenharmony_ci 172062306a36Sopenharmony_ci ASSERT(max_hole_start + max_hole_size <= search_end); 172162306a36Sopenharmony_ciout: 172262306a36Sopenharmony_ci btrfs_free_path(path); 172362306a36Sopenharmony_ci *start = max_hole_start; 172462306a36Sopenharmony_ci if (len) 172562306a36Sopenharmony_ci *len = max_hole_size; 172662306a36Sopenharmony_ci return ret; 172762306a36Sopenharmony_ci} 172862306a36Sopenharmony_ci 172962306a36Sopenharmony_cistatic int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 173062306a36Sopenharmony_ci struct btrfs_device *device, 173162306a36Sopenharmony_ci u64 start, u64 *dev_extent_len) 173262306a36Sopenharmony_ci{ 173362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 173462306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 173562306a36Sopenharmony_ci int ret; 173662306a36Sopenharmony_ci struct btrfs_path *path; 173762306a36Sopenharmony_ci struct btrfs_key key; 173862306a36Sopenharmony_ci struct btrfs_key found_key; 173962306a36Sopenharmony_ci struct extent_buffer *leaf = 
NULL; 174062306a36Sopenharmony_ci struct btrfs_dev_extent *extent = NULL; 174162306a36Sopenharmony_ci 174262306a36Sopenharmony_ci path = btrfs_alloc_path(); 174362306a36Sopenharmony_ci if (!path) 174462306a36Sopenharmony_ci return -ENOMEM; 174562306a36Sopenharmony_ci 174662306a36Sopenharmony_ci key.objectid = device->devid; 174762306a36Sopenharmony_ci key.offset = start; 174862306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 174962306a36Sopenharmony_ciagain: 175062306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 175162306a36Sopenharmony_ci if (ret > 0) { 175262306a36Sopenharmony_ci ret = btrfs_previous_item(root, path, key.objectid, 175362306a36Sopenharmony_ci BTRFS_DEV_EXTENT_KEY); 175462306a36Sopenharmony_ci if (ret) 175562306a36Sopenharmony_ci goto out; 175662306a36Sopenharmony_ci leaf = path->nodes[0]; 175762306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 175862306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, path->slots[0], 175962306a36Sopenharmony_ci struct btrfs_dev_extent); 176062306a36Sopenharmony_ci BUG_ON(found_key.offset > start || found_key.offset + 176162306a36Sopenharmony_ci btrfs_dev_extent_length(leaf, extent) < start); 176262306a36Sopenharmony_ci key = found_key; 176362306a36Sopenharmony_ci btrfs_release_path(path); 176462306a36Sopenharmony_ci goto again; 176562306a36Sopenharmony_ci } else if (ret == 0) { 176662306a36Sopenharmony_ci leaf = path->nodes[0]; 176762306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, path->slots[0], 176862306a36Sopenharmony_ci struct btrfs_dev_extent); 176962306a36Sopenharmony_ci } else { 177062306a36Sopenharmony_ci goto out; 177162306a36Sopenharmony_ci } 177262306a36Sopenharmony_ci 177362306a36Sopenharmony_ci *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 177462306a36Sopenharmony_ci 177562306a36Sopenharmony_ci ret = btrfs_del_item(trans, root, path); 177662306a36Sopenharmony_ci if (ret == 0) 177762306a36Sopenharmony_ci 
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 177862306a36Sopenharmony_ciout: 177962306a36Sopenharmony_ci btrfs_free_path(path); 178062306a36Sopenharmony_ci return ret; 178162306a36Sopenharmony_ci} 178262306a36Sopenharmony_ci 178362306a36Sopenharmony_cistatic u64 find_next_chunk(struct btrfs_fs_info *fs_info) 178462306a36Sopenharmony_ci{ 178562306a36Sopenharmony_ci struct extent_map_tree *em_tree; 178662306a36Sopenharmony_ci struct extent_map *em; 178762306a36Sopenharmony_ci struct rb_node *n; 178862306a36Sopenharmony_ci u64 ret = 0; 178962306a36Sopenharmony_ci 179062306a36Sopenharmony_ci em_tree = &fs_info->mapping_tree; 179162306a36Sopenharmony_ci read_lock(&em_tree->lock); 179262306a36Sopenharmony_ci n = rb_last(&em_tree->map.rb_root); 179362306a36Sopenharmony_ci if (n) { 179462306a36Sopenharmony_ci em = rb_entry(n, struct extent_map, rb_node); 179562306a36Sopenharmony_ci ret = em->start + em->len; 179662306a36Sopenharmony_ci } 179762306a36Sopenharmony_ci read_unlock(&em_tree->lock); 179862306a36Sopenharmony_ci 179962306a36Sopenharmony_ci return ret; 180062306a36Sopenharmony_ci} 180162306a36Sopenharmony_ci 180262306a36Sopenharmony_cistatic noinline int find_next_devid(struct btrfs_fs_info *fs_info, 180362306a36Sopenharmony_ci u64 *devid_ret) 180462306a36Sopenharmony_ci{ 180562306a36Sopenharmony_ci int ret; 180662306a36Sopenharmony_ci struct btrfs_key key; 180762306a36Sopenharmony_ci struct btrfs_key found_key; 180862306a36Sopenharmony_ci struct btrfs_path *path; 180962306a36Sopenharmony_ci 181062306a36Sopenharmony_ci path = btrfs_alloc_path(); 181162306a36Sopenharmony_ci if (!path) 181262306a36Sopenharmony_ci return -ENOMEM; 181362306a36Sopenharmony_ci 181462306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 181562306a36Sopenharmony_ci key.type = BTRFS_DEV_ITEM_KEY; 181662306a36Sopenharmony_ci key.offset = (u64)-1; 181762306a36Sopenharmony_ci 181862306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 
0, 0); 181962306a36Sopenharmony_ci if (ret < 0) 182062306a36Sopenharmony_ci goto error; 182162306a36Sopenharmony_ci 182262306a36Sopenharmony_ci if (ret == 0) { 182362306a36Sopenharmony_ci /* Corruption */ 182462306a36Sopenharmony_ci btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 182562306a36Sopenharmony_ci ret = -EUCLEAN; 182662306a36Sopenharmony_ci goto error; 182762306a36Sopenharmony_ci } 182862306a36Sopenharmony_ci 182962306a36Sopenharmony_ci ret = btrfs_previous_item(fs_info->chunk_root, path, 183062306a36Sopenharmony_ci BTRFS_DEV_ITEMS_OBJECTID, 183162306a36Sopenharmony_ci BTRFS_DEV_ITEM_KEY); 183262306a36Sopenharmony_ci if (ret) { 183362306a36Sopenharmony_ci *devid_ret = 1; 183462306a36Sopenharmony_ci } else { 183562306a36Sopenharmony_ci btrfs_item_key_to_cpu(path->nodes[0], &found_key, 183662306a36Sopenharmony_ci path->slots[0]); 183762306a36Sopenharmony_ci *devid_ret = found_key.offset + 1; 183862306a36Sopenharmony_ci } 183962306a36Sopenharmony_ci ret = 0; 184062306a36Sopenharmony_cierror: 184162306a36Sopenharmony_ci btrfs_free_path(path); 184262306a36Sopenharmony_ci return ret; 184362306a36Sopenharmony_ci} 184462306a36Sopenharmony_ci 184562306a36Sopenharmony_ci/* 184662306a36Sopenharmony_ci * the device information is stored in the chunk root 184762306a36Sopenharmony_ci * the btrfs_device struct should be fully filled in 184862306a36Sopenharmony_ci */ 184962306a36Sopenharmony_cistatic int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 185062306a36Sopenharmony_ci struct btrfs_device *device) 185162306a36Sopenharmony_ci{ 185262306a36Sopenharmony_ci int ret; 185362306a36Sopenharmony_ci struct btrfs_path *path; 185462306a36Sopenharmony_ci struct btrfs_dev_item *dev_item; 185562306a36Sopenharmony_ci struct extent_buffer *leaf; 185662306a36Sopenharmony_ci struct btrfs_key key; 185762306a36Sopenharmony_ci unsigned long ptr; 185862306a36Sopenharmony_ci 185962306a36Sopenharmony_ci path = btrfs_alloc_path(); 186062306a36Sopenharmony_ci if (!path) 
186162306a36Sopenharmony_ci return -ENOMEM; 186262306a36Sopenharmony_ci 186362306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 186462306a36Sopenharmony_ci key.type = BTRFS_DEV_ITEM_KEY; 186562306a36Sopenharmony_ci key.offset = device->devid; 186662306a36Sopenharmony_ci 186762306a36Sopenharmony_ci btrfs_reserve_chunk_metadata(trans, true); 186862306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 186962306a36Sopenharmony_ci &key, sizeof(*dev_item)); 187062306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 187162306a36Sopenharmony_ci if (ret) 187262306a36Sopenharmony_ci goto out; 187362306a36Sopenharmony_ci 187462306a36Sopenharmony_ci leaf = path->nodes[0]; 187562306a36Sopenharmony_ci dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 187662306a36Sopenharmony_ci 187762306a36Sopenharmony_ci btrfs_set_device_id(leaf, dev_item, device->devid); 187862306a36Sopenharmony_ci btrfs_set_device_generation(leaf, dev_item, 0); 187962306a36Sopenharmony_ci btrfs_set_device_type(leaf, dev_item, device->type); 188062306a36Sopenharmony_ci btrfs_set_device_io_align(leaf, dev_item, device->io_align); 188162306a36Sopenharmony_ci btrfs_set_device_io_width(leaf, dev_item, device->io_width); 188262306a36Sopenharmony_ci btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 188362306a36Sopenharmony_ci btrfs_set_device_total_bytes(leaf, dev_item, 188462306a36Sopenharmony_ci btrfs_device_get_disk_total_bytes(device)); 188562306a36Sopenharmony_ci btrfs_set_device_bytes_used(leaf, dev_item, 188662306a36Sopenharmony_ci btrfs_device_get_bytes_used(device)); 188762306a36Sopenharmony_ci btrfs_set_device_group(leaf, dev_item, 0); 188862306a36Sopenharmony_ci btrfs_set_device_seek_speed(leaf, dev_item, 0); 188962306a36Sopenharmony_ci btrfs_set_device_bandwidth(leaf, dev_item, 0); 189062306a36Sopenharmony_ci btrfs_set_device_start_offset(leaf, dev_item, 0); 189162306a36Sopenharmony_ci 
189262306a36Sopenharmony_ci ptr = btrfs_device_uuid(dev_item); 189362306a36Sopenharmony_ci write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 189462306a36Sopenharmony_ci ptr = btrfs_device_fsid(dev_item); 189562306a36Sopenharmony_ci write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 189662306a36Sopenharmony_ci ptr, BTRFS_FSID_SIZE); 189762306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 189862306a36Sopenharmony_ci 189962306a36Sopenharmony_ci ret = 0; 190062306a36Sopenharmony_ciout: 190162306a36Sopenharmony_ci btrfs_free_path(path); 190262306a36Sopenharmony_ci return ret; 190362306a36Sopenharmony_ci} 190462306a36Sopenharmony_ci 190562306a36Sopenharmony_ci/* 190662306a36Sopenharmony_ci * Function to update ctime/mtime for a given device path. 190762306a36Sopenharmony_ci * Mainly used for ctime/mtime based probe like libblkid. 190862306a36Sopenharmony_ci * 190962306a36Sopenharmony_ci * We don't care about errors here, this is just to be kind to userspace. 
191062306a36Sopenharmony_ci */ 191162306a36Sopenharmony_cistatic void update_dev_time(const char *device_path) 191262306a36Sopenharmony_ci{ 191362306a36Sopenharmony_ci struct path path; 191462306a36Sopenharmony_ci int ret; 191562306a36Sopenharmony_ci 191662306a36Sopenharmony_ci ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 191762306a36Sopenharmony_ci if (ret) 191862306a36Sopenharmony_ci return; 191962306a36Sopenharmony_ci 192062306a36Sopenharmony_ci inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); 192162306a36Sopenharmony_ci path_put(&path); 192262306a36Sopenharmony_ci} 192362306a36Sopenharmony_ci 192462306a36Sopenharmony_cistatic int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, 192562306a36Sopenharmony_ci struct btrfs_device *device) 192662306a36Sopenharmony_ci{ 192762306a36Sopenharmony_ci struct btrfs_root *root = device->fs_info->chunk_root; 192862306a36Sopenharmony_ci int ret; 192962306a36Sopenharmony_ci struct btrfs_path *path; 193062306a36Sopenharmony_ci struct btrfs_key key; 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci path = btrfs_alloc_path(); 193362306a36Sopenharmony_ci if (!path) 193462306a36Sopenharmony_ci return -ENOMEM; 193562306a36Sopenharmony_ci 193662306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 193762306a36Sopenharmony_ci key.type = BTRFS_DEV_ITEM_KEY; 193862306a36Sopenharmony_ci key.offset = device->devid; 193962306a36Sopenharmony_ci 194062306a36Sopenharmony_ci btrfs_reserve_chunk_metadata(trans, false); 194162306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 194262306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 194362306a36Sopenharmony_ci if (ret) { 194462306a36Sopenharmony_ci if (ret > 0) 194562306a36Sopenharmony_ci ret = -ENOENT; 194662306a36Sopenharmony_ci goto out; 194762306a36Sopenharmony_ci } 194862306a36Sopenharmony_ci 194962306a36Sopenharmony_ci ret = btrfs_del_item(trans, root, path); 195062306a36Sopenharmony_ciout: 
195162306a36Sopenharmony_ci btrfs_free_path(path); 195262306a36Sopenharmony_ci return ret; 195362306a36Sopenharmony_ci} 195462306a36Sopenharmony_ci 195562306a36Sopenharmony_ci/* 195662306a36Sopenharmony_ci * Verify that @num_devices satisfies the RAID profile constraints in the whole 195762306a36Sopenharmony_ci * filesystem. It's up to the caller to adjust that number regarding eg. device 195862306a36Sopenharmony_ci * replace. 195962306a36Sopenharmony_ci */ 196062306a36Sopenharmony_cistatic int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 196162306a36Sopenharmony_ci u64 num_devices) 196262306a36Sopenharmony_ci{ 196362306a36Sopenharmony_ci u64 all_avail; 196462306a36Sopenharmony_ci unsigned seq; 196562306a36Sopenharmony_ci int i; 196662306a36Sopenharmony_ci 196762306a36Sopenharmony_ci do { 196862306a36Sopenharmony_ci seq = read_seqbegin(&fs_info->profiles_lock); 196962306a36Sopenharmony_ci 197062306a36Sopenharmony_ci all_avail = fs_info->avail_data_alloc_bits | 197162306a36Sopenharmony_ci fs_info->avail_system_alloc_bits | 197262306a36Sopenharmony_ci fs_info->avail_metadata_alloc_bits; 197362306a36Sopenharmony_ci } while (read_seqretry(&fs_info->profiles_lock, seq)); 197462306a36Sopenharmony_ci 197562306a36Sopenharmony_ci for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 197662306a36Sopenharmony_ci if (!(all_avail & btrfs_raid_array[i].bg_flag)) 197762306a36Sopenharmony_ci continue; 197862306a36Sopenharmony_ci 197962306a36Sopenharmony_ci if (num_devices < btrfs_raid_array[i].devs_min) 198062306a36Sopenharmony_ci return btrfs_raid_array[i].mindev_error; 198162306a36Sopenharmony_ci } 198262306a36Sopenharmony_ci 198362306a36Sopenharmony_ci return 0; 198462306a36Sopenharmony_ci} 198562306a36Sopenharmony_ci 198662306a36Sopenharmony_cistatic struct btrfs_device * btrfs_find_next_active_device( 198762306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 198862306a36Sopenharmony_ci{ 198962306a36Sopenharmony_ci struct btrfs_device 
*next_device; 199062306a36Sopenharmony_ci 199162306a36Sopenharmony_ci list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 199262306a36Sopenharmony_ci if (next_device != device && 199362306a36Sopenharmony_ci !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 199462306a36Sopenharmony_ci && next_device->bdev) 199562306a36Sopenharmony_ci return next_device; 199662306a36Sopenharmony_ci } 199762306a36Sopenharmony_ci 199862306a36Sopenharmony_ci return NULL; 199962306a36Sopenharmony_ci} 200062306a36Sopenharmony_ci 200162306a36Sopenharmony_ci/* 200262306a36Sopenharmony_ci * Helper function to check if the given device is part of s_bdev / latest_dev 200362306a36Sopenharmony_ci * and replace it with the provided or the next active device, in the context 200462306a36Sopenharmony_ci * where this function called, there should be always be another device (or 200562306a36Sopenharmony_ci * this_dev) which is active. 200662306a36Sopenharmony_ci */ 200762306a36Sopenharmony_civoid __cold btrfs_assign_next_active_device(struct btrfs_device *device, 200862306a36Sopenharmony_ci struct btrfs_device *next_device) 200962306a36Sopenharmony_ci{ 201062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 201162306a36Sopenharmony_ci 201262306a36Sopenharmony_ci if (!next_device) 201362306a36Sopenharmony_ci next_device = btrfs_find_next_active_device(fs_info->fs_devices, 201462306a36Sopenharmony_ci device); 201562306a36Sopenharmony_ci ASSERT(next_device); 201662306a36Sopenharmony_ci 201762306a36Sopenharmony_ci if (fs_info->sb->s_bdev && 201862306a36Sopenharmony_ci (fs_info->sb->s_bdev == device->bdev)) 201962306a36Sopenharmony_ci fs_info->sb->s_bdev = next_device->bdev; 202062306a36Sopenharmony_ci 202162306a36Sopenharmony_ci if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 202262306a36Sopenharmony_ci fs_info->fs_devices->latest_dev = next_device; 202362306a36Sopenharmony_ci} 202462306a36Sopenharmony_ci 202562306a36Sopenharmony_ci/* 
202662306a36Sopenharmony_ci * Return btrfs_fs_devices::num_devices excluding the device that's being 202762306a36Sopenharmony_ci * currently replaced. 202862306a36Sopenharmony_ci */ 202962306a36Sopenharmony_cistatic u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 203062306a36Sopenharmony_ci{ 203162306a36Sopenharmony_ci u64 num_devices = fs_info->fs_devices->num_devices; 203262306a36Sopenharmony_ci 203362306a36Sopenharmony_ci down_read(&fs_info->dev_replace.rwsem); 203462306a36Sopenharmony_ci if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 203562306a36Sopenharmony_ci ASSERT(num_devices > 1); 203662306a36Sopenharmony_ci num_devices--; 203762306a36Sopenharmony_ci } 203862306a36Sopenharmony_ci up_read(&fs_info->dev_replace.rwsem); 203962306a36Sopenharmony_ci 204062306a36Sopenharmony_ci return num_devices; 204162306a36Sopenharmony_ci} 204262306a36Sopenharmony_ci 204362306a36Sopenharmony_cistatic void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, 204462306a36Sopenharmony_ci struct block_device *bdev, int copy_num) 204562306a36Sopenharmony_ci{ 204662306a36Sopenharmony_ci struct btrfs_super_block *disk_super; 204762306a36Sopenharmony_ci const size_t len = sizeof(disk_super->magic); 204862306a36Sopenharmony_ci const u64 bytenr = btrfs_sb_offset(copy_num); 204962306a36Sopenharmony_ci int ret; 205062306a36Sopenharmony_ci 205162306a36Sopenharmony_ci disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); 205262306a36Sopenharmony_ci if (IS_ERR(disk_super)) 205362306a36Sopenharmony_ci return; 205462306a36Sopenharmony_ci 205562306a36Sopenharmony_ci memset(&disk_super->magic, 0, len); 205662306a36Sopenharmony_ci folio_mark_dirty(virt_to_folio(disk_super)); 205762306a36Sopenharmony_ci btrfs_release_disk_super(disk_super); 205862306a36Sopenharmony_ci 205962306a36Sopenharmony_ci ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); 206062306a36Sopenharmony_ci if (ret) 206162306a36Sopenharmony_ci btrfs_warn(fs_info, "error clearing superblock number %d 
(%d)", 206262306a36Sopenharmony_ci copy_num, ret); 206362306a36Sopenharmony_ci} 206462306a36Sopenharmony_ci 206562306a36Sopenharmony_civoid btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 206662306a36Sopenharmony_ci struct block_device *bdev, 206762306a36Sopenharmony_ci const char *device_path) 206862306a36Sopenharmony_ci{ 206962306a36Sopenharmony_ci int copy_num; 207062306a36Sopenharmony_ci 207162306a36Sopenharmony_ci if (!bdev) 207262306a36Sopenharmony_ci return; 207362306a36Sopenharmony_ci 207462306a36Sopenharmony_ci for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 207562306a36Sopenharmony_ci if (bdev_is_zoned(bdev)) 207662306a36Sopenharmony_ci btrfs_reset_sb_log_zones(bdev, copy_num); 207762306a36Sopenharmony_ci else 207862306a36Sopenharmony_ci btrfs_scratch_superblock(fs_info, bdev, copy_num); 207962306a36Sopenharmony_ci } 208062306a36Sopenharmony_ci 208162306a36Sopenharmony_ci /* Notify udev that device has changed */ 208262306a36Sopenharmony_ci btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 208362306a36Sopenharmony_ci 208462306a36Sopenharmony_ci /* Update ctime/mtime for device path for libblkid */ 208562306a36Sopenharmony_ci update_dev_time(device_path); 208662306a36Sopenharmony_ci} 208762306a36Sopenharmony_ci 208862306a36Sopenharmony_ciint btrfs_rm_device(struct btrfs_fs_info *fs_info, 208962306a36Sopenharmony_ci struct btrfs_dev_lookup_args *args, 209062306a36Sopenharmony_ci struct block_device **bdev, void **holder) 209162306a36Sopenharmony_ci{ 209262306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 209362306a36Sopenharmony_ci struct btrfs_device *device; 209462306a36Sopenharmony_ci struct btrfs_fs_devices *cur_devices; 209562306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 209662306a36Sopenharmony_ci u64 num_devices; 209762306a36Sopenharmony_ci int ret = 0; 209862306a36Sopenharmony_ci 209962306a36Sopenharmony_ci if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 210062306a36Sopenharmony_ci 
btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); 210162306a36Sopenharmony_ci return -EINVAL; 210262306a36Sopenharmony_ci } 210362306a36Sopenharmony_ci 210462306a36Sopenharmony_ci /* 210562306a36Sopenharmony_ci * The device list in fs_devices is accessed without locks (neither 210662306a36Sopenharmony_ci * uuid_mutex nor device_list_mutex) as it won't change on a mounted 210762306a36Sopenharmony_ci * filesystem and another device rm cannot run. 210862306a36Sopenharmony_ci */ 210962306a36Sopenharmony_ci num_devices = btrfs_num_devices(fs_info); 211062306a36Sopenharmony_ci 211162306a36Sopenharmony_ci ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 211262306a36Sopenharmony_ci if (ret) 211362306a36Sopenharmony_ci return ret; 211462306a36Sopenharmony_ci 211562306a36Sopenharmony_ci device = btrfs_find_device(fs_info->fs_devices, args); 211662306a36Sopenharmony_ci if (!device) { 211762306a36Sopenharmony_ci if (args->missing) 211862306a36Sopenharmony_ci ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 211962306a36Sopenharmony_ci else 212062306a36Sopenharmony_ci ret = -ENOENT; 212162306a36Sopenharmony_ci return ret; 212262306a36Sopenharmony_ci } 212362306a36Sopenharmony_ci 212462306a36Sopenharmony_ci if (btrfs_pinned_by_swapfile(fs_info, device)) { 212562306a36Sopenharmony_ci btrfs_warn_in_rcu(fs_info, 212662306a36Sopenharmony_ci "cannot remove device %s (devid %llu) due to active swapfile", 212762306a36Sopenharmony_ci btrfs_dev_name(device), device->devid); 212862306a36Sopenharmony_ci return -ETXTBSY; 212962306a36Sopenharmony_ci } 213062306a36Sopenharmony_ci 213162306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 213262306a36Sopenharmony_ci return BTRFS_ERROR_DEV_TGT_REPLACE; 213362306a36Sopenharmony_ci 213462306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 213562306a36Sopenharmony_ci fs_info->fs_devices->rw_devices == 1) 213662306a36Sopenharmony_ci return 
BTRFS_ERROR_DEV_ONLY_WRITABLE; 213762306a36Sopenharmony_ci 213862306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 213962306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 214062306a36Sopenharmony_ci list_del_init(&device->dev_alloc_list); 214162306a36Sopenharmony_ci device->fs_devices->rw_devices--; 214262306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 214362306a36Sopenharmony_ci } 214462306a36Sopenharmony_ci 214562306a36Sopenharmony_ci ret = btrfs_shrink_device(device, 0); 214662306a36Sopenharmony_ci if (ret) 214762306a36Sopenharmony_ci goto error_undo; 214862306a36Sopenharmony_ci 214962306a36Sopenharmony_ci trans = btrfs_start_transaction(fs_info->chunk_root, 0); 215062306a36Sopenharmony_ci if (IS_ERR(trans)) { 215162306a36Sopenharmony_ci ret = PTR_ERR(trans); 215262306a36Sopenharmony_ci goto error_undo; 215362306a36Sopenharmony_ci } 215462306a36Sopenharmony_ci 215562306a36Sopenharmony_ci ret = btrfs_rm_dev_item(trans, device); 215662306a36Sopenharmony_ci if (ret) { 215762306a36Sopenharmony_ci /* Any error in dev item removal is critical */ 215862306a36Sopenharmony_ci btrfs_crit(fs_info, 215962306a36Sopenharmony_ci "failed to remove device item for devid %llu: %d", 216062306a36Sopenharmony_ci device->devid, ret); 216162306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 216262306a36Sopenharmony_ci btrfs_end_transaction(trans); 216362306a36Sopenharmony_ci return ret; 216462306a36Sopenharmony_ci } 216562306a36Sopenharmony_ci 216662306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 216762306a36Sopenharmony_ci btrfs_scrub_cancel_dev(device); 216862306a36Sopenharmony_ci 216962306a36Sopenharmony_ci /* 217062306a36Sopenharmony_ci * the device list mutex makes sure that we don't change 217162306a36Sopenharmony_ci * the device list while someone else is writing out all 217262306a36Sopenharmony_ci * the device supers. 
Whoever is writing all supers, should 217362306a36Sopenharmony_ci * lock the device list mutex before getting the number of 217462306a36Sopenharmony_ci * devices in the super block (super_copy). Conversely, 217562306a36Sopenharmony_ci * whoever updates the number of devices in the super block 217662306a36Sopenharmony_ci * (super_copy) should hold the device list mutex. 217762306a36Sopenharmony_ci */ 217862306a36Sopenharmony_ci 217962306a36Sopenharmony_ci /* 218062306a36Sopenharmony_ci * In normal cases the cur_devices == fs_devices. But in case 218162306a36Sopenharmony_ci * of deleting a seed device, the cur_devices should point to 218262306a36Sopenharmony_ci * its own fs_devices listed under the fs_devices->seed_list. 218362306a36Sopenharmony_ci */ 218462306a36Sopenharmony_ci cur_devices = device->fs_devices; 218562306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 218662306a36Sopenharmony_ci list_del_rcu(&device->dev_list); 218762306a36Sopenharmony_ci 218862306a36Sopenharmony_ci cur_devices->num_devices--; 218962306a36Sopenharmony_ci cur_devices->total_devices--; 219062306a36Sopenharmony_ci /* Update total_devices of the parent fs_devices if it's seed */ 219162306a36Sopenharmony_ci if (cur_devices != fs_devices) 219262306a36Sopenharmony_ci fs_devices->total_devices--; 219362306a36Sopenharmony_ci 219462306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 219562306a36Sopenharmony_ci cur_devices->missing_devices--; 219662306a36Sopenharmony_ci 219762306a36Sopenharmony_ci btrfs_assign_next_active_device(device, NULL); 219862306a36Sopenharmony_ci 219962306a36Sopenharmony_ci if (device->bdev) { 220062306a36Sopenharmony_ci cur_devices->open_devices--; 220162306a36Sopenharmony_ci /* remove sysfs entry */ 220262306a36Sopenharmony_ci btrfs_sysfs_remove_device(device); 220362306a36Sopenharmony_ci } 220462306a36Sopenharmony_ci 220562306a36Sopenharmony_ci num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 
220662306a36Sopenharmony_ci btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 220762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 220862306a36Sopenharmony_ci 220962306a36Sopenharmony_ci /* 221062306a36Sopenharmony_ci * At this point, the device is zero sized and detached from the 221162306a36Sopenharmony_ci * devices list. All that's left is to zero out the old supers and 221262306a36Sopenharmony_ci * free the device. 221362306a36Sopenharmony_ci * 221462306a36Sopenharmony_ci * We cannot call btrfs_close_bdev() here because we're holding the sb 221562306a36Sopenharmony_ci * write lock, and blkdev_put() will pull in the ->open_mutex on the 221662306a36Sopenharmony_ci * block device and it's dependencies. Instead just flush the device 221762306a36Sopenharmony_ci * and let the caller do the final blkdev_put. 221862306a36Sopenharmony_ci */ 221962306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 222062306a36Sopenharmony_ci btrfs_scratch_superblocks(fs_info, device->bdev, 222162306a36Sopenharmony_ci device->name->str); 222262306a36Sopenharmony_ci if (device->bdev) { 222362306a36Sopenharmony_ci sync_blockdev(device->bdev); 222462306a36Sopenharmony_ci invalidate_bdev(device->bdev); 222562306a36Sopenharmony_ci } 222662306a36Sopenharmony_ci } 222762306a36Sopenharmony_ci 222862306a36Sopenharmony_ci *bdev = device->bdev; 222962306a36Sopenharmony_ci *holder = device->holder; 223062306a36Sopenharmony_ci synchronize_rcu(); 223162306a36Sopenharmony_ci btrfs_free_device(device); 223262306a36Sopenharmony_ci 223362306a36Sopenharmony_ci /* 223462306a36Sopenharmony_ci * This can happen if cur_devices is the private seed devices list. 
We 223562306a36Sopenharmony_ci * cannot call close_fs_devices() here because it expects the uuid_mutex 223662306a36Sopenharmony_ci * to be held, but in fact we don't need that for the private 223762306a36Sopenharmony_ci * seed_devices, we can simply decrement cur_devices->opened and then 223862306a36Sopenharmony_ci * remove it from our list and free the fs_devices. 223962306a36Sopenharmony_ci */ 224062306a36Sopenharmony_ci if (cur_devices->num_devices == 0) { 224162306a36Sopenharmony_ci list_del_init(&cur_devices->seed_list); 224262306a36Sopenharmony_ci ASSERT(cur_devices->opened == 1); 224362306a36Sopenharmony_ci cur_devices->opened--; 224462306a36Sopenharmony_ci free_fs_devices(cur_devices); 224562306a36Sopenharmony_ci } 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 224862306a36Sopenharmony_ci 224962306a36Sopenharmony_ci return ret; 225062306a36Sopenharmony_ci 225162306a36Sopenharmony_cierror_undo: 225262306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 225362306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 225462306a36Sopenharmony_ci list_add(&device->dev_alloc_list, 225562306a36Sopenharmony_ci &fs_devices->alloc_list); 225662306a36Sopenharmony_ci device->fs_devices->rw_devices++; 225762306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 225862306a36Sopenharmony_ci } 225962306a36Sopenharmony_ci return ret; 226062306a36Sopenharmony_ci} 226162306a36Sopenharmony_ci 226262306a36Sopenharmony_civoid btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 226362306a36Sopenharmony_ci{ 226462306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 226562306a36Sopenharmony_ci 226662306a36Sopenharmony_ci lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 226762306a36Sopenharmony_ci 226862306a36Sopenharmony_ci /* 226962306a36Sopenharmony_ci * in case of fs with no seed, srcdev->fs_devices will point 227062306a36Sopenharmony_ci * to fs_devices of 
fs_info. However when the dev being replaced is 227162306a36Sopenharmony_ci * a seed dev it will point to the seed's local fs_devices. In short 227262306a36Sopenharmony_ci * srcdev will have its correct fs_devices in both the cases. 227362306a36Sopenharmony_ci */ 227462306a36Sopenharmony_ci fs_devices = srcdev->fs_devices; 227562306a36Sopenharmony_ci 227662306a36Sopenharmony_ci list_del_rcu(&srcdev->dev_list); 227762306a36Sopenharmony_ci list_del(&srcdev->dev_alloc_list); 227862306a36Sopenharmony_ci fs_devices->num_devices--; 227962306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 228062306a36Sopenharmony_ci fs_devices->missing_devices--; 228162306a36Sopenharmony_ci 228262306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 228362306a36Sopenharmony_ci fs_devices->rw_devices--; 228462306a36Sopenharmony_ci 228562306a36Sopenharmony_ci if (srcdev->bdev) 228662306a36Sopenharmony_ci fs_devices->open_devices--; 228762306a36Sopenharmony_ci} 228862306a36Sopenharmony_ci 228962306a36Sopenharmony_civoid btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 229062306a36Sopenharmony_ci{ 229162306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 229262306a36Sopenharmony_ci 229362306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 229462306a36Sopenharmony_ci 229562306a36Sopenharmony_ci btrfs_close_bdev(srcdev); 229662306a36Sopenharmony_ci synchronize_rcu(); 229762306a36Sopenharmony_ci btrfs_free_device(srcdev); 229862306a36Sopenharmony_ci 229962306a36Sopenharmony_ci /* if this is no devs we rather delete the fs_devices */ 230062306a36Sopenharmony_ci if (!fs_devices->num_devices) { 230162306a36Sopenharmony_ci /* 230262306a36Sopenharmony_ci * On a mounted FS, num_devices can't be zero unless it's a 230362306a36Sopenharmony_ci * seed. 
In case of a seed device being replaced, the replace 230462306a36Sopenharmony_ci * target added to the sprout FS, so there will be no more 230562306a36Sopenharmony_ci * device left under the seed FS. 230662306a36Sopenharmony_ci */ 230762306a36Sopenharmony_ci ASSERT(fs_devices->seeding); 230862306a36Sopenharmony_ci 230962306a36Sopenharmony_ci list_del_init(&fs_devices->seed_list); 231062306a36Sopenharmony_ci close_fs_devices(fs_devices); 231162306a36Sopenharmony_ci free_fs_devices(fs_devices); 231262306a36Sopenharmony_ci } 231362306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 231462306a36Sopenharmony_ci} 231562306a36Sopenharmony_ci 231662306a36Sopenharmony_civoid btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 231762306a36Sopenharmony_ci{ 231862306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 231962306a36Sopenharmony_ci 232062306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 232162306a36Sopenharmony_ci 232262306a36Sopenharmony_ci btrfs_sysfs_remove_device(tgtdev); 232362306a36Sopenharmony_ci 232462306a36Sopenharmony_ci if (tgtdev->bdev) 232562306a36Sopenharmony_ci fs_devices->open_devices--; 232662306a36Sopenharmony_ci 232762306a36Sopenharmony_ci fs_devices->num_devices--; 232862306a36Sopenharmony_ci 232962306a36Sopenharmony_ci btrfs_assign_next_active_device(tgtdev, NULL); 233062306a36Sopenharmony_ci 233162306a36Sopenharmony_ci list_del_rcu(&tgtdev->dev_list); 233262306a36Sopenharmony_ci 233362306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 233462306a36Sopenharmony_ci 233562306a36Sopenharmony_ci btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 233662306a36Sopenharmony_ci tgtdev->name->str); 233762306a36Sopenharmony_ci 233862306a36Sopenharmony_ci btrfs_close_bdev(tgtdev); 233962306a36Sopenharmony_ci synchronize_rcu(); 234062306a36Sopenharmony_ci btrfs_free_device(tgtdev); 234162306a36Sopenharmony_ci} 234262306a36Sopenharmony_ci 234362306a36Sopenharmony_ci/* 
234462306a36Sopenharmony_ci * Populate args from device at path. 234562306a36Sopenharmony_ci * 234662306a36Sopenharmony_ci * @fs_info: the filesystem 234762306a36Sopenharmony_ci * @args: the args to populate 234862306a36Sopenharmony_ci * @path: the path to the device 234962306a36Sopenharmony_ci * 235062306a36Sopenharmony_ci * This will read the super block of the device at @path and populate @args with 235162306a36Sopenharmony_ci * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 235262306a36Sopenharmony_ci * lookup a device to operate on, but need to do it before we take any locks. 235362306a36Sopenharmony_ci * This properly handles the special case of "missing" that a user may pass in, 235462306a36Sopenharmony_ci * and does some basic sanity checks. The caller must make sure that @path is 235562306a36Sopenharmony_ci * properly NUL terminated before calling in, and must call 235662306a36Sopenharmony_ci * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 235762306a36Sopenharmony_ci * uuid buffers. 
235862306a36Sopenharmony_ci * 235962306a36Sopenharmony_ci * Return: 0 for success, -errno for failure 236062306a36Sopenharmony_ci */ 236162306a36Sopenharmony_ciint btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 236262306a36Sopenharmony_ci struct btrfs_dev_lookup_args *args, 236362306a36Sopenharmony_ci const char *path) 236462306a36Sopenharmony_ci{ 236562306a36Sopenharmony_ci struct btrfs_super_block *disk_super; 236662306a36Sopenharmony_ci struct block_device *bdev; 236762306a36Sopenharmony_ci int ret; 236862306a36Sopenharmony_ci 236962306a36Sopenharmony_ci if (!path || !path[0]) 237062306a36Sopenharmony_ci return -EINVAL; 237162306a36Sopenharmony_ci if (!strcmp(path, "missing")) { 237262306a36Sopenharmony_ci args->missing = true; 237362306a36Sopenharmony_ci return 0; 237462306a36Sopenharmony_ci } 237562306a36Sopenharmony_ci 237662306a36Sopenharmony_ci args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 237762306a36Sopenharmony_ci args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 237862306a36Sopenharmony_ci if (!args->uuid || !args->fsid) { 237962306a36Sopenharmony_ci btrfs_put_dev_args_from_path(args); 238062306a36Sopenharmony_ci return -ENOMEM; 238162306a36Sopenharmony_ci } 238262306a36Sopenharmony_ci 238362306a36Sopenharmony_ci ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, 238462306a36Sopenharmony_ci &bdev, &disk_super); 238562306a36Sopenharmony_ci if (ret) { 238662306a36Sopenharmony_ci btrfs_put_dev_args_from_path(args); 238762306a36Sopenharmony_ci return ret; 238862306a36Sopenharmony_ci } 238962306a36Sopenharmony_ci 239062306a36Sopenharmony_ci args->devid = btrfs_stack_device_id(&disk_super->dev_item); 239162306a36Sopenharmony_ci memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 239262306a36Sopenharmony_ci if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 239362306a36Sopenharmony_ci memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 239462306a36Sopenharmony_ci else 239562306a36Sopenharmony_ci memcpy(args->fsid, 
disk_super->fsid, BTRFS_FSID_SIZE); 239662306a36Sopenharmony_ci btrfs_release_disk_super(disk_super); 239762306a36Sopenharmony_ci blkdev_put(bdev, NULL); 239862306a36Sopenharmony_ci return 0; 239962306a36Sopenharmony_ci} 240062306a36Sopenharmony_ci 240162306a36Sopenharmony_ci/* 240262306a36Sopenharmony_ci * Only use this jointly with btrfs_get_dev_args_from_path() because we will 240362306a36Sopenharmony_ci * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 240462306a36Sopenharmony_ci * that don't need to be freed. 240562306a36Sopenharmony_ci */ 240662306a36Sopenharmony_civoid btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 240762306a36Sopenharmony_ci{ 240862306a36Sopenharmony_ci kfree(args->uuid); 240962306a36Sopenharmony_ci kfree(args->fsid); 241062306a36Sopenharmony_ci args->uuid = NULL; 241162306a36Sopenharmony_ci args->fsid = NULL; 241262306a36Sopenharmony_ci} 241362306a36Sopenharmony_ci 241462306a36Sopenharmony_cistruct btrfs_device *btrfs_find_device_by_devspec( 241562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info, u64 devid, 241662306a36Sopenharmony_ci const char *device_path) 241762306a36Sopenharmony_ci{ 241862306a36Sopenharmony_ci BTRFS_DEV_LOOKUP_ARGS(args); 241962306a36Sopenharmony_ci struct btrfs_device *device; 242062306a36Sopenharmony_ci int ret; 242162306a36Sopenharmony_ci 242262306a36Sopenharmony_ci if (devid) { 242362306a36Sopenharmony_ci args.devid = devid; 242462306a36Sopenharmony_ci device = btrfs_find_device(fs_info->fs_devices, &args); 242562306a36Sopenharmony_ci if (!device) 242662306a36Sopenharmony_ci return ERR_PTR(-ENOENT); 242762306a36Sopenharmony_ci return device; 242862306a36Sopenharmony_ci } 242962306a36Sopenharmony_ci 243062306a36Sopenharmony_ci ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 243162306a36Sopenharmony_ci if (ret) 243262306a36Sopenharmony_ci return ERR_PTR(ret); 243362306a36Sopenharmony_ci device = btrfs_find_device(fs_info->fs_devices, &args); 
243462306a36Sopenharmony_ci btrfs_put_dev_args_from_path(&args); 243562306a36Sopenharmony_ci if (!device) 243662306a36Sopenharmony_ci return ERR_PTR(-ENOENT); 243762306a36Sopenharmony_ci return device; 243862306a36Sopenharmony_ci} 243962306a36Sopenharmony_ci 244062306a36Sopenharmony_cistatic struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 244162306a36Sopenharmony_ci{ 244262306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 244362306a36Sopenharmony_ci struct btrfs_fs_devices *old_devices; 244462306a36Sopenharmony_ci struct btrfs_fs_devices *seed_devices; 244562306a36Sopenharmony_ci 244662306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 244762306a36Sopenharmony_ci if (!fs_devices->seeding) 244862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 244962306a36Sopenharmony_ci 245062306a36Sopenharmony_ci /* 245162306a36Sopenharmony_ci * Private copy of the seed devices, anchored at 245262306a36Sopenharmony_ci * fs_info->fs_devices->seed_list 245362306a36Sopenharmony_ci */ 245462306a36Sopenharmony_ci seed_devices = alloc_fs_devices(NULL, NULL); 245562306a36Sopenharmony_ci if (IS_ERR(seed_devices)) 245662306a36Sopenharmony_ci return seed_devices; 245762306a36Sopenharmony_ci 245862306a36Sopenharmony_ci /* 245962306a36Sopenharmony_ci * It's necessary to retain a copy of the original seed fs_devices in 246062306a36Sopenharmony_ci * fs_uuids so that filesystems which have been seeded can successfully 246162306a36Sopenharmony_ci * reference the seed device from open_seed_devices. This also supports 246262306a36Sopenharmony_ci * multiple fs seed. 
246362306a36Sopenharmony_ci */ 246462306a36Sopenharmony_ci old_devices = clone_fs_devices(fs_devices); 246562306a36Sopenharmony_ci if (IS_ERR(old_devices)) { 246662306a36Sopenharmony_ci kfree(seed_devices); 246762306a36Sopenharmony_ci return old_devices; 246862306a36Sopenharmony_ci } 246962306a36Sopenharmony_ci 247062306a36Sopenharmony_ci list_add(&old_devices->fs_list, &fs_uuids); 247162306a36Sopenharmony_ci 247262306a36Sopenharmony_ci memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 247362306a36Sopenharmony_ci seed_devices->opened = 1; 247462306a36Sopenharmony_ci INIT_LIST_HEAD(&seed_devices->devices); 247562306a36Sopenharmony_ci INIT_LIST_HEAD(&seed_devices->alloc_list); 247662306a36Sopenharmony_ci mutex_init(&seed_devices->device_list_mutex); 247762306a36Sopenharmony_ci 247862306a36Sopenharmony_ci return seed_devices; 247962306a36Sopenharmony_ci} 248062306a36Sopenharmony_ci 248162306a36Sopenharmony_ci/* 248262306a36Sopenharmony_ci * Splice seed devices into the sprout fs_devices. 248362306a36Sopenharmony_ci * Generate a new fsid for the sprouted read-write filesystem. 248462306a36Sopenharmony_ci */ 248562306a36Sopenharmony_cistatic void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 248662306a36Sopenharmony_ci struct btrfs_fs_devices *seed_devices) 248762306a36Sopenharmony_ci{ 248862306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 248962306a36Sopenharmony_ci struct btrfs_super_block *disk_super = fs_info->super_copy; 249062306a36Sopenharmony_ci struct btrfs_device *device; 249162306a36Sopenharmony_ci u64 super_flags; 249262306a36Sopenharmony_ci 249362306a36Sopenharmony_ci /* 249462306a36Sopenharmony_ci * We are updating the fsid, the thread leading to device_list_add() 249562306a36Sopenharmony_ci * could race, so uuid_mutex is needed. 
249662306a36Sopenharmony_ci */ 249762306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 249862306a36Sopenharmony_ci 249962306a36Sopenharmony_ci /* 250062306a36Sopenharmony_ci * The threads listed below may traverse dev_list but can do that without 250162306a36Sopenharmony_ci * device_list_mutex: 250262306a36Sopenharmony_ci * - All device ops and balance - as we are in btrfs_exclop_start. 250362306a36Sopenharmony_ci * - Various dev_list readers - are using RCU. 250462306a36Sopenharmony_ci * - btrfs_ioctl_fitrim() - is using RCU. 250562306a36Sopenharmony_ci * 250662306a36Sopenharmony_ci * For-read threads as below are using device_list_mutex: 250762306a36Sopenharmony_ci * - Readonly scrub btrfs_scrub_dev() 250862306a36Sopenharmony_ci * - Readonly scrub btrfs_scrub_progress() 250962306a36Sopenharmony_ci * - btrfs_get_dev_stats() 251062306a36Sopenharmony_ci */ 251162306a36Sopenharmony_ci lockdep_assert_held(&fs_devices->device_list_mutex); 251262306a36Sopenharmony_ci 251362306a36Sopenharmony_ci list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 251462306a36Sopenharmony_ci synchronize_rcu); 251562306a36Sopenharmony_ci list_for_each_entry(device, &seed_devices->devices, dev_list) 251662306a36Sopenharmony_ci device->fs_devices = seed_devices; 251762306a36Sopenharmony_ci 251862306a36Sopenharmony_ci fs_devices->seeding = false; 251962306a36Sopenharmony_ci fs_devices->num_devices = 0; 252062306a36Sopenharmony_ci fs_devices->open_devices = 0; 252162306a36Sopenharmony_ci fs_devices->missing_devices = 0; 252262306a36Sopenharmony_ci fs_devices->rotating = false; 252362306a36Sopenharmony_ci list_add(&seed_devices->seed_list, &fs_devices->seed_list); 252462306a36Sopenharmony_ci 252562306a36Sopenharmony_ci generate_random_uuid(fs_devices->fsid); 252662306a36Sopenharmony_ci memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 252762306a36Sopenharmony_ci memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 252862306a36Sopenharmony_ci 
252962306a36Sopenharmony_ci super_flags = btrfs_super_flags(disk_super) & 253062306a36Sopenharmony_ci ~BTRFS_SUPER_FLAG_SEEDING; 253162306a36Sopenharmony_ci btrfs_set_super_flags(disk_super, super_flags); 253262306a36Sopenharmony_ci} 253362306a36Sopenharmony_ci 253462306a36Sopenharmony_ci/* 253562306a36Sopenharmony_ci * Store the expected generation for seed devices in device items. 253662306a36Sopenharmony_ci */ 253762306a36Sopenharmony_cistatic int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 253862306a36Sopenharmony_ci{ 253962306a36Sopenharmony_ci BTRFS_DEV_LOOKUP_ARGS(args); 254062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 254162306a36Sopenharmony_ci struct btrfs_root *root = fs_info->chunk_root; 254262306a36Sopenharmony_ci struct btrfs_path *path; 254362306a36Sopenharmony_ci struct extent_buffer *leaf; 254462306a36Sopenharmony_ci struct btrfs_dev_item *dev_item; 254562306a36Sopenharmony_ci struct btrfs_device *device; 254662306a36Sopenharmony_ci struct btrfs_key key; 254762306a36Sopenharmony_ci u8 fs_uuid[BTRFS_FSID_SIZE]; 254862306a36Sopenharmony_ci u8 dev_uuid[BTRFS_UUID_SIZE]; 254962306a36Sopenharmony_ci int ret; 255062306a36Sopenharmony_ci 255162306a36Sopenharmony_ci path = btrfs_alloc_path(); 255262306a36Sopenharmony_ci if (!path) 255362306a36Sopenharmony_ci return -ENOMEM; 255462306a36Sopenharmony_ci 255562306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 255662306a36Sopenharmony_ci key.offset = 0; 255762306a36Sopenharmony_ci key.type = BTRFS_DEV_ITEM_KEY; 255862306a36Sopenharmony_ci 255962306a36Sopenharmony_ci while (1) { 256062306a36Sopenharmony_ci btrfs_reserve_chunk_metadata(trans, false); 256162306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 256262306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 256362306a36Sopenharmony_ci if (ret < 0) 256462306a36Sopenharmony_ci goto error; 256562306a36Sopenharmony_ci 256662306a36Sopenharmony_ci leaf = path->nodes[0]; 
256762306a36Sopenharmony_cinext_slot: 256862306a36Sopenharmony_ci if (path->slots[0] >= btrfs_header_nritems(leaf)) { 256962306a36Sopenharmony_ci ret = btrfs_next_leaf(root, path); 257062306a36Sopenharmony_ci if (ret > 0) 257162306a36Sopenharmony_ci break; 257262306a36Sopenharmony_ci if (ret < 0) 257362306a36Sopenharmony_ci goto error; 257462306a36Sopenharmony_ci leaf = path->nodes[0]; 257562306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 257662306a36Sopenharmony_ci btrfs_release_path(path); 257762306a36Sopenharmony_ci continue; 257862306a36Sopenharmony_ci } 257962306a36Sopenharmony_ci 258062306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 258162306a36Sopenharmony_ci if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 258262306a36Sopenharmony_ci key.type != BTRFS_DEV_ITEM_KEY) 258362306a36Sopenharmony_ci break; 258462306a36Sopenharmony_ci 258562306a36Sopenharmony_ci dev_item = btrfs_item_ptr(leaf, path->slots[0], 258662306a36Sopenharmony_ci struct btrfs_dev_item); 258762306a36Sopenharmony_ci args.devid = btrfs_device_id(leaf, dev_item); 258862306a36Sopenharmony_ci read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 258962306a36Sopenharmony_ci BTRFS_UUID_SIZE); 259062306a36Sopenharmony_ci read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 259162306a36Sopenharmony_ci BTRFS_FSID_SIZE); 259262306a36Sopenharmony_ci args.uuid = dev_uuid; 259362306a36Sopenharmony_ci args.fsid = fs_uuid; 259462306a36Sopenharmony_ci device = btrfs_find_device(fs_info->fs_devices, &args); 259562306a36Sopenharmony_ci BUG_ON(!device); /* Logic error */ 259662306a36Sopenharmony_ci 259762306a36Sopenharmony_ci if (device->fs_devices->seeding) { 259862306a36Sopenharmony_ci btrfs_set_device_generation(leaf, dev_item, 259962306a36Sopenharmony_ci device->generation); 260062306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 260162306a36Sopenharmony_ci } 260262306a36Sopenharmony_ci 260362306a36Sopenharmony_ci 
path->slots[0]++; 260462306a36Sopenharmony_ci goto next_slot; 260562306a36Sopenharmony_ci } 260662306a36Sopenharmony_ci ret = 0; 260762306a36Sopenharmony_cierror: 260862306a36Sopenharmony_ci btrfs_free_path(path); 260962306a36Sopenharmony_ci return ret; 261062306a36Sopenharmony_ci} 261162306a36Sopenharmony_ci 261262306a36Sopenharmony_ciint btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 261362306a36Sopenharmony_ci{ 261462306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 261562306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 261662306a36Sopenharmony_ci struct btrfs_device *device; 261762306a36Sopenharmony_ci struct block_device *bdev; 261862306a36Sopenharmony_ci struct super_block *sb = fs_info->sb; 261962306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 262062306a36Sopenharmony_ci struct btrfs_fs_devices *seed_devices = NULL; 262162306a36Sopenharmony_ci u64 orig_super_total_bytes; 262262306a36Sopenharmony_ci u64 orig_super_num_devices; 262362306a36Sopenharmony_ci int ret = 0; 262462306a36Sopenharmony_ci bool seeding_dev = false; 262562306a36Sopenharmony_ci bool locked = false; 262662306a36Sopenharmony_ci 262762306a36Sopenharmony_ci if (sb_rdonly(sb) && !fs_devices->seeding) 262862306a36Sopenharmony_ci return -EROFS; 262962306a36Sopenharmony_ci 263062306a36Sopenharmony_ci bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, 263162306a36Sopenharmony_ci fs_info->bdev_holder, NULL); 263262306a36Sopenharmony_ci if (IS_ERR(bdev)) 263362306a36Sopenharmony_ci return PTR_ERR(bdev); 263462306a36Sopenharmony_ci 263562306a36Sopenharmony_ci if (!btrfs_check_device_zone_type(fs_info, bdev)) { 263662306a36Sopenharmony_ci ret = -EINVAL; 263762306a36Sopenharmony_ci goto error; 263862306a36Sopenharmony_ci } 263962306a36Sopenharmony_ci 264062306a36Sopenharmony_ci if (fs_devices->seeding) { 264162306a36Sopenharmony_ci seeding_dev = true; 264262306a36Sopenharmony_ci down_write(&sb->s_umount); 
264362306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 264462306a36Sopenharmony_ci locked = true; 264562306a36Sopenharmony_ci } 264662306a36Sopenharmony_ci 264762306a36Sopenharmony_ci sync_blockdev(bdev); 264862306a36Sopenharmony_ci 264962306a36Sopenharmony_ci rcu_read_lock(); 265062306a36Sopenharmony_ci list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 265162306a36Sopenharmony_ci if (device->bdev == bdev) { 265262306a36Sopenharmony_ci ret = -EEXIST; 265362306a36Sopenharmony_ci rcu_read_unlock(); 265462306a36Sopenharmony_ci goto error; 265562306a36Sopenharmony_ci } 265662306a36Sopenharmony_ci } 265762306a36Sopenharmony_ci rcu_read_unlock(); 265862306a36Sopenharmony_ci 265962306a36Sopenharmony_ci device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); 266062306a36Sopenharmony_ci if (IS_ERR(device)) { 266162306a36Sopenharmony_ci /* we can safely leave the fs_devices entry around */ 266262306a36Sopenharmony_ci ret = PTR_ERR(device); 266362306a36Sopenharmony_ci goto error; 266462306a36Sopenharmony_ci } 266562306a36Sopenharmony_ci 266662306a36Sopenharmony_ci device->fs_info = fs_info; 266762306a36Sopenharmony_ci device->bdev = bdev; 266862306a36Sopenharmony_ci ret = lookup_bdev(device_path, &device->devt); 266962306a36Sopenharmony_ci if (ret) 267062306a36Sopenharmony_ci goto error_free_device; 267162306a36Sopenharmony_ci 267262306a36Sopenharmony_ci ret = btrfs_get_dev_zone_info(device, false); 267362306a36Sopenharmony_ci if (ret) 267462306a36Sopenharmony_ci goto error_free_device; 267562306a36Sopenharmony_ci 267662306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 267762306a36Sopenharmony_ci if (IS_ERR(trans)) { 267862306a36Sopenharmony_ci ret = PTR_ERR(trans); 267962306a36Sopenharmony_ci goto error_free_zone; 268062306a36Sopenharmony_ci } 268162306a36Sopenharmony_ci 268262306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 268362306a36Sopenharmony_ci device->generation = trans->transid; 
268462306a36Sopenharmony_ci device->io_width = fs_info->sectorsize; 268562306a36Sopenharmony_ci device->io_align = fs_info->sectorsize; 268662306a36Sopenharmony_ci device->sector_size = fs_info->sectorsize; 268762306a36Sopenharmony_ci device->total_bytes = 268862306a36Sopenharmony_ci round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 268962306a36Sopenharmony_ci device->disk_total_bytes = device->total_bytes; 269062306a36Sopenharmony_ci device->commit_total_bytes = device->total_bytes; 269162306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 269262306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 269362306a36Sopenharmony_ci device->holder = fs_info->bdev_holder; 269462306a36Sopenharmony_ci device->dev_stats_valid = 1; 269562306a36Sopenharmony_ci set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 269662306a36Sopenharmony_ci 269762306a36Sopenharmony_ci if (seeding_dev) { 269862306a36Sopenharmony_ci btrfs_clear_sb_rdonly(sb); 269962306a36Sopenharmony_ci 270062306a36Sopenharmony_ci /* GFP_KERNEL allocation must not be under device_list_mutex */ 270162306a36Sopenharmony_ci seed_devices = btrfs_init_sprout(fs_info); 270262306a36Sopenharmony_ci if (IS_ERR(seed_devices)) { 270362306a36Sopenharmony_ci ret = PTR_ERR(seed_devices); 270462306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 270562306a36Sopenharmony_ci goto error_trans; 270662306a36Sopenharmony_ci } 270762306a36Sopenharmony_ci } 270862306a36Sopenharmony_ci 270962306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 271062306a36Sopenharmony_ci if (seeding_dev) { 271162306a36Sopenharmony_ci btrfs_setup_sprout(fs_info, seed_devices); 271262306a36Sopenharmony_ci btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 271362306a36Sopenharmony_ci device); 271462306a36Sopenharmony_ci } 271562306a36Sopenharmony_ci 271662306a36Sopenharmony_ci device->fs_devices = fs_devices; 271762306a36Sopenharmony_ci 
271862306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 271962306a36Sopenharmony_ci list_add_rcu(&device->dev_list, &fs_devices->devices); 272062306a36Sopenharmony_ci list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 272162306a36Sopenharmony_ci fs_devices->num_devices++; 272262306a36Sopenharmony_ci fs_devices->open_devices++; 272362306a36Sopenharmony_ci fs_devices->rw_devices++; 272462306a36Sopenharmony_ci fs_devices->total_devices++; 272562306a36Sopenharmony_ci fs_devices->total_rw_bytes += device->total_bytes; 272662306a36Sopenharmony_ci 272762306a36Sopenharmony_ci atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 272862306a36Sopenharmony_ci 272962306a36Sopenharmony_ci if (!bdev_nonrot(bdev)) 273062306a36Sopenharmony_ci fs_devices->rotating = true; 273162306a36Sopenharmony_ci 273262306a36Sopenharmony_ci orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 273362306a36Sopenharmony_ci btrfs_set_super_total_bytes(fs_info->super_copy, 273462306a36Sopenharmony_ci round_down(orig_super_total_bytes + device->total_bytes, 273562306a36Sopenharmony_ci fs_info->sectorsize)); 273662306a36Sopenharmony_ci 273762306a36Sopenharmony_ci orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 273862306a36Sopenharmony_ci btrfs_set_super_num_devices(fs_info->super_copy, 273962306a36Sopenharmony_ci orig_super_num_devices + 1); 274062306a36Sopenharmony_ci 274162306a36Sopenharmony_ci /* 274262306a36Sopenharmony_ci * we've got more storage, clear any full flags on the space 274362306a36Sopenharmony_ci * infos 274462306a36Sopenharmony_ci */ 274562306a36Sopenharmony_ci btrfs_clear_space_info_full(fs_info); 274662306a36Sopenharmony_ci 274762306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 274862306a36Sopenharmony_ci 274962306a36Sopenharmony_ci /* Add sysfs device entry */ 275062306a36Sopenharmony_ci btrfs_sysfs_add_device(device); 275162306a36Sopenharmony_ci 275262306a36Sopenharmony_ci 
mutex_unlock(&fs_devices->device_list_mutex); 275362306a36Sopenharmony_ci 275462306a36Sopenharmony_ci if (seeding_dev) { 275562306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 275662306a36Sopenharmony_ci ret = init_first_rw_device(trans); 275762306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 275862306a36Sopenharmony_ci if (ret) { 275962306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 276062306a36Sopenharmony_ci goto error_sysfs; 276162306a36Sopenharmony_ci } 276262306a36Sopenharmony_ci } 276362306a36Sopenharmony_ci 276462306a36Sopenharmony_ci ret = btrfs_add_dev_item(trans, device); 276562306a36Sopenharmony_ci if (ret) { 276662306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 276762306a36Sopenharmony_ci goto error_sysfs; 276862306a36Sopenharmony_ci } 276962306a36Sopenharmony_ci 277062306a36Sopenharmony_ci if (seeding_dev) { 277162306a36Sopenharmony_ci ret = btrfs_finish_sprout(trans); 277262306a36Sopenharmony_ci if (ret) { 277362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 277462306a36Sopenharmony_ci goto error_sysfs; 277562306a36Sopenharmony_ci } 277662306a36Sopenharmony_ci 277762306a36Sopenharmony_ci /* 277862306a36Sopenharmony_ci * fs_devices now represents the newly sprouted filesystem and 277962306a36Sopenharmony_ci * its fsid has been changed by btrfs_sprout_splice(). 
278062306a36Sopenharmony_ci */ 278162306a36Sopenharmony_ci btrfs_sysfs_update_sprout_fsid(fs_devices); 278262306a36Sopenharmony_ci } 278362306a36Sopenharmony_ci 278462306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 278562306a36Sopenharmony_ci 278662306a36Sopenharmony_ci if (seeding_dev) { 278762306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 278862306a36Sopenharmony_ci up_write(&sb->s_umount); 278962306a36Sopenharmony_ci locked = false; 279062306a36Sopenharmony_ci 279162306a36Sopenharmony_ci if (ret) /* transaction commit */ 279262306a36Sopenharmony_ci return ret; 279362306a36Sopenharmony_ci 279462306a36Sopenharmony_ci ret = btrfs_relocate_sys_chunks(fs_info); 279562306a36Sopenharmony_ci if (ret < 0) 279662306a36Sopenharmony_ci btrfs_handle_fs_error(fs_info, ret, 279762306a36Sopenharmony_ci "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 279862306a36Sopenharmony_ci trans = btrfs_attach_transaction(root); 279962306a36Sopenharmony_ci if (IS_ERR(trans)) { 280062306a36Sopenharmony_ci if (PTR_ERR(trans) == -ENOENT) 280162306a36Sopenharmony_ci return 0; 280262306a36Sopenharmony_ci ret = PTR_ERR(trans); 280362306a36Sopenharmony_ci trans = NULL; 280462306a36Sopenharmony_ci goto error_sysfs; 280562306a36Sopenharmony_ci } 280662306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 280762306a36Sopenharmony_ci } 280862306a36Sopenharmony_ci 280962306a36Sopenharmony_ci /* 281062306a36Sopenharmony_ci * Now that we have written a new super block to this device, check all 281162306a36Sopenharmony_ci * other fs_devices list if device_path alienates any other scanned 281262306a36Sopenharmony_ci * device. 281362306a36Sopenharmony_ci * We can ignore the return value as it typically returns -EINVAL and 281462306a36Sopenharmony_ci * only succeeds if the device was an alien. 
281562306a36Sopenharmony_ci */ 281662306a36Sopenharmony_ci btrfs_forget_devices(device->devt); 281762306a36Sopenharmony_ci 281862306a36Sopenharmony_ci /* Update ctime/mtime for blkid or udev */ 281962306a36Sopenharmony_ci update_dev_time(device_path); 282062306a36Sopenharmony_ci 282162306a36Sopenharmony_ci return ret; 282262306a36Sopenharmony_ci 282362306a36Sopenharmony_cierror_sysfs: 282462306a36Sopenharmony_ci btrfs_sysfs_remove_device(device); 282562306a36Sopenharmony_ci mutex_lock(&fs_info->fs_devices->device_list_mutex); 282662306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 282762306a36Sopenharmony_ci list_del_rcu(&device->dev_list); 282862306a36Sopenharmony_ci list_del(&device->dev_alloc_list); 282962306a36Sopenharmony_ci fs_info->fs_devices->num_devices--; 283062306a36Sopenharmony_ci fs_info->fs_devices->open_devices--; 283162306a36Sopenharmony_ci fs_info->fs_devices->rw_devices--; 283262306a36Sopenharmony_ci fs_info->fs_devices->total_devices--; 283362306a36Sopenharmony_ci fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 283462306a36Sopenharmony_ci atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 283562306a36Sopenharmony_ci btrfs_set_super_total_bytes(fs_info->super_copy, 283662306a36Sopenharmony_ci orig_super_total_bytes); 283762306a36Sopenharmony_ci btrfs_set_super_num_devices(fs_info->super_copy, 283862306a36Sopenharmony_ci orig_super_num_devices); 283962306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 284062306a36Sopenharmony_ci mutex_unlock(&fs_info->fs_devices->device_list_mutex); 284162306a36Sopenharmony_cierror_trans: 284262306a36Sopenharmony_ci if (seeding_dev) 284362306a36Sopenharmony_ci btrfs_set_sb_rdonly(sb); 284462306a36Sopenharmony_ci if (trans) 284562306a36Sopenharmony_ci btrfs_end_transaction(trans); 284662306a36Sopenharmony_cierror_free_zone: 284762306a36Sopenharmony_ci btrfs_destroy_dev_zone_info(device); 284862306a36Sopenharmony_cierror_free_device: 284962306a36Sopenharmony_ci 
btrfs_free_device(device); 285062306a36Sopenharmony_cierror: 285162306a36Sopenharmony_ci blkdev_put(bdev, fs_info->bdev_holder); 285262306a36Sopenharmony_ci if (locked) { 285362306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 285462306a36Sopenharmony_ci up_write(&sb->s_umount); 285562306a36Sopenharmony_ci } 285662306a36Sopenharmony_ci return ret; 285762306a36Sopenharmony_ci} 285862306a36Sopenharmony_ci 285962306a36Sopenharmony_cistatic noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 286062306a36Sopenharmony_ci struct btrfs_device *device) 286162306a36Sopenharmony_ci{ 286262306a36Sopenharmony_ci int ret; 286362306a36Sopenharmony_ci struct btrfs_path *path; 286462306a36Sopenharmony_ci struct btrfs_root *root = device->fs_info->chunk_root; 286562306a36Sopenharmony_ci struct btrfs_dev_item *dev_item; 286662306a36Sopenharmony_ci struct extent_buffer *leaf; 286762306a36Sopenharmony_ci struct btrfs_key key; 286862306a36Sopenharmony_ci 286962306a36Sopenharmony_ci path = btrfs_alloc_path(); 287062306a36Sopenharmony_ci if (!path) 287162306a36Sopenharmony_ci return -ENOMEM; 287262306a36Sopenharmony_ci 287362306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 287462306a36Sopenharmony_ci key.type = BTRFS_DEV_ITEM_KEY; 287562306a36Sopenharmony_ci key.offset = device->devid; 287662306a36Sopenharmony_ci 287762306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 287862306a36Sopenharmony_ci if (ret < 0) 287962306a36Sopenharmony_ci goto out; 288062306a36Sopenharmony_ci 288162306a36Sopenharmony_ci if (ret > 0) { 288262306a36Sopenharmony_ci ret = -ENOENT; 288362306a36Sopenharmony_ci goto out; 288462306a36Sopenharmony_ci } 288562306a36Sopenharmony_ci 288662306a36Sopenharmony_ci leaf = path->nodes[0]; 288762306a36Sopenharmony_ci dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 288862306a36Sopenharmony_ci 288962306a36Sopenharmony_ci btrfs_set_device_id(leaf, dev_item, device->devid); 289062306a36Sopenharmony_ci 
btrfs_set_device_type(leaf, dev_item, device->type); 289162306a36Sopenharmony_ci btrfs_set_device_io_align(leaf, dev_item, device->io_align); 289262306a36Sopenharmony_ci btrfs_set_device_io_width(leaf, dev_item, device->io_width); 289362306a36Sopenharmony_ci btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 289462306a36Sopenharmony_ci btrfs_set_device_total_bytes(leaf, dev_item, 289562306a36Sopenharmony_ci btrfs_device_get_disk_total_bytes(device)); 289662306a36Sopenharmony_ci btrfs_set_device_bytes_used(leaf, dev_item, 289762306a36Sopenharmony_ci btrfs_device_get_bytes_used(device)); 289862306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 289962306a36Sopenharmony_ci 290062306a36Sopenharmony_ciout: 290162306a36Sopenharmony_ci btrfs_free_path(path); 290262306a36Sopenharmony_ci return ret; 290362306a36Sopenharmony_ci} 290462306a36Sopenharmony_ci 290562306a36Sopenharmony_ciint btrfs_grow_device(struct btrfs_trans_handle *trans, 290662306a36Sopenharmony_ci struct btrfs_device *device, u64 new_size) 290762306a36Sopenharmony_ci{ 290862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 290962306a36Sopenharmony_ci struct btrfs_super_block *super_copy = fs_info->super_copy; 291062306a36Sopenharmony_ci u64 old_total; 291162306a36Sopenharmony_ci u64 diff; 291262306a36Sopenharmony_ci int ret; 291362306a36Sopenharmony_ci 291462306a36Sopenharmony_ci if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 291562306a36Sopenharmony_ci return -EACCES; 291662306a36Sopenharmony_ci 291762306a36Sopenharmony_ci new_size = round_down(new_size, fs_info->sectorsize); 291862306a36Sopenharmony_ci 291962306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 292062306a36Sopenharmony_ci old_total = btrfs_super_total_bytes(super_copy); 292162306a36Sopenharmony_ci diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 292262306a36Sopenharmony_ci 292362306a36Sopenharmony_ci if (new_size <= device->total_bytes || 
292462306a36Sopenharmony_ci test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 292562306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 292662306a36Sopenharmony_ci return -EINVAL; 292762306a36Sopenharmony_ci } 292862306a36Sopenharmony_ci 292962306a36Sopenharmony_ci btrfs_set_super_total_bytes(super_copy, 293062306a36Sopenharmony_ci round_down(old_total + diff, fs_info->sectorsize)); 293162306a36Sopenharmony_ci device->fs_devices->total_rw_bytes += diff; 293262306a36Sopenharmony_ci 293362306a36Sopenharmony_ci btrfs_device_set_total_bytes(device, new_size); 293462306a36Sopenharmony_ci btrfs_device_set_disk_total_bytes(device, new_size); 293562306a36Sopenharmony_ci btrfs_clear_space_info_full(device->fs_info); 293662306a36Sopenharmony_ci if (list_empty(&device->post_commit_list)) 293762306a36Sopenharmony_ci list_add_tail(&device->post_commit_list, 293862306a36Sopenharmony_ci &trans->transaction->dev_update_list); 293962306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 294062306a36Sopenharmony_ci 294162306a36Sopenharmony_ci btrfs_reserve_chunk_metadata(trans, false); 294262306a36Sopenharmony_ci ret = btrfs_update_device(trans, device); 294362306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 294462306a36Sopenharmony_ci 294562306a36Sopenharmony_ci return ret; 294662306a36Sopenharmony_ci} 294762306a36Sopenharmony_ci 294862306a36Sopenharmony_cistatic int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 294962306a36Sopenharmony_ci{ 295062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 295162306a36Sopenharmony_ci struct btrfs_root *root = fs_info->chunk_root; 295262306a36Sopenharmony_ci int ret; 295362306a36Sopenharmony_ci struct btrfs_path *path; 295462306a36Sopenharmony_ci struct btrfs_key key; 295562306a36Sopenharmony_ci 295662306a36Sopenharmony_ci path = btrfs_alloc_path(); 295762306a36Sopenharmony_ci if (!path) 295862306a36Sopenharmony_ci return -ENOMEM; 295962306a36Sopenharmony_ci 
296062306a36Sopenharmony_ci key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 296162306a36Sopenharmony_ci key.offset = chunk_offset; 296262306a36Sopenharmony_ci key.type = BTRFS_CHUNK_ITEM_KEY; 296362306a36Sopenharmony_ci 296462306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 296562306a36Sopenharmony_ci if (ret < 0) 296662306a36Sopenharmony_ci goto out; 296762306a36Sopenharmony_ci else if (ret > 0) { /* Logic error or corruption */ 296862306a36Sopenharmony_ci btrfs_handle_fs_error(fs_info, -ENOENT, 296962306a36Sopenharmony_ci "Failed lookup while freeing chunk."); 297062306a36Sopenharmony_ci ret = -ENOENT; 297162306a36Sopenharmony_ci goto out; 297262306a36Sopenharmony_ci } 297362306a36Sopenharmony_ci 297462306a36Sopenharmony_ci ret = btrfs_del_item(trans, root, path); 297562306a36Sopenharmony_ci if (ret < 0) 297662306a36Sopenharmony_ci btrfs_handle_fs_error(fs_info, ret, 297762306a36Sopenharmony_ci "Failed to delete chunk item."); 297862306a36Sopenharmony_ciout: 297962306a36Sopenharmony_ci btrfs_free_path(path); 298062306a36Sopenharmony_ci return ret; 298162306a36Sopenharmony_ci} 298262306a36Sopenharmony_ci 298362306a36Sopenharmony_cistatic int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 298462306a36Sopenharmony_ci{ 298562306a36Sopenharmony_ci struct btrfs_super_block *super_copy = fs_info->super_copy; 298662306a36Sopenharmony_ci struct btrfs_disk_key *disk_key; 298762306a36Sopenharmony_ci struct btrfs_chunk *chunk; 298862306a36Sopenharmony_ci u8 *ptr; 298962306a36Sopenharmony_ci int ret = 0; 299062306a36Sopenharmony_ci u32 num_stripes; 299162306a36Sopenharmony_ci u32 array_size; 299262306a36Sopenharmony_ci u32 len = 0; 299362306a36Sopenharmony_ci u32 cur; 299462306a36Sopenharmony_ci struct btrfs_key key; 299562306a36Sopenharmony_ci 299662306a36Sopenharmony_ci lockdep_assert_held(&fs_info->chunk_mutex); 299762306a36Sopenharmony_ci array_size = btrfs_super_sys_array_size(super_copy); 299862306a36Sopenharmony_ci 
299962306a36Sopenharmony_ci ptr = super_copy->sys_chunk_array; 300062306a36Sopenharmony_ci cur = 0; 300162306a36Sopenharmony_ci 300262306a36Sopenharmony_ci while (cur < array_size) { 300362306a36Sopenharmony_ci disk_key = (struct btrfs_disk_key *)ptr; 300462306a36Sopenharmony_ci btrfs_disk_key_to_cpu(&key, disk_key); 300562306a36Sopenharmony_ci 300662306a36Sopenharmony_ci len = sizeof(*disk_key); 300762306a36Sopenharmony_ci 300862306a36Sopenharmony_ci if (key.type == BTRFS_CHUNK_ITEM_KEY) { 300962306a36Sopenharmony_ci chunk = (struct btrfs_chunk *)(ptr + len); 301062306a36Sopenharmony_ci num_stripes = btrfs_stack_chunk_num_stripes(chunk); 301162306a36Sopenharmony_ci len += btrfs_chunk_item_size(num_stripes); 301262306a36Sopenharmony_ci } else { 301362306a36Sopenharmony_ci ret = -EIO; 301462306a36Sopenharmony_ci break; 301562306a36Sopenharmony_ci } 301662306a36Sopenharmony_ci if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 301762306a36Sopenharmony_ci key.offset == chunk_offset) { 301862306a36Sopenharmony_ci memmove(ptr, ptr + len, array_size - (cur + len)); 301962306a36Sopenharmony_ci array_size -= len; 302062306a36Sopenharmony_ci btrfs_set_super_sys_array_size(super_copy, array_size); 302162306a36Sopenharmony_ci } else { 302262306a36Sopenharmony_ci ptr += len; 302362306a36Sopenharmony_ci cur += len; 302462306a36Sopenharmony_ci } 302562306a36Sopenharmony_ci } 302662306a36Sopenharmony_ci return ret; 302762306a36Sopenharmony_ci} 302862306a36Sopenharmony_ci 302962306a36Sopenharmony_ci/* 303062306a36Sopenharmony_ci * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 303162306a36Sopenharmony_ci * @logical: Logical block offset in bytes. 303262306a36Sopenharmony_ci * @length: Length of extent in bytes. 303362306a36Sopenharmony_ci * 303462306a36Sopenharmony_ci * Return: Chunk mapping or ERR_PTR. 
303562306a36Sopenharmony_ci */ 303662306a36Sopenharmony_cistruct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 303762306a36Sopenharmony_ci u64 logical, u64 length) 303862306a36Sopenharmony_ci{ 303962306a36Sopenharmony_ci struct extent_map_tree *em_tree; 304062306a36Sopenharmony_ci struct extent_map *em; 304162306a36Sopenharmony_ci 304262306a36Sopenharmony_ci em_tree = &fs_info->mapping_tree; 304362306a36Sopenharmony_ci read_lock(&em_tree->lock); 304462306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, logical, length); 304562306a36Sopenharmony_ci read_unlock(&em_tree->lock); 304662306a36Sopenharmony_ci 304762306a36Sopenharmony_ci if (!em) { 304862306a36Sopenharmony_ci btrfs_crit(fs_info, 304962306a36Sopenharmony_ci "unable to find chunk map for logical %llu length %llu", 305062306a36Sopenharmony_ci logical, length); 305162306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 305262306a36Sopenharmony_ci } 305362306a36Sopenharmony_ci 305462306a36Sopenharmony_ci if (em->start > logical || em->start + em->len <= logical) { 305562306a36Sopenharmony_ci btrfs_crit(fs_info, 305662306a36Sopenharmony_ci "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", 305762306a36Sopenharmony_ci logical, logical + length, em->start, em->start + em->len); 305862306a36Sopenharmony_ci free_extent_map(em); 305962306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 306062306a36Sopenharmony_ci } 306162306a36Sopenharmony_ci 306262306a36Sopenharmony_ci /* callers are responsible for dropping em's ref. 
*/ 306362306a36Sopenharmony_ci return em; 306462306a36Sopenharmony_ci} 306562306a36Sopenharmony_ci 306662306a36Sopenharmony_cistatic int remove_chunk_item(struct btrfs_trans_handle *trans, 306762306a36Sopenharmony_ci struct map_lookup *map, u64 chunk_offset) 306862306a36Sopenharmony_ci{ 306962306a36Sopenharmony_ci int i; 307062306a36Sopenharmony_ci 307162306a36Sopenharmony_ci /* 307262306a36Sopenharmony_ci * Removing chunk items and updating the device items in the chunks btree 307362306a36Sopenharmony_ci * requires holding the chunk_mutex. 307462306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for the details. 307562306a36Sopenharmony_ci */ 307662306a36Sopenharmony_ci lockdep_assert_held(&trans->fs_info->chunk_mutex); 307762306a36Sopenharmony_ci 307862306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 307962306a36Sopenharmony_ci int ret; 308062306a36Sopenharmony_ci 308162306a36Sopenharmony_ci ret = btrfs_update_device(trans, map->stripes[i].dev); 308262306a36Sopenharmony_ci if (ret) 308362306a36Sopenharmony_ci return ret; 308462306a36Sopenharmony_ci } 308562306a36Sopenharmony_ci 308662306a36Sopenharmony_ci return btrfs_free_chunk(trans, chunk_offset); 308762306a36Sopenharmony_ci} 308862306a36Sopenharmony_ci 308962306a36Sopenharmony_ciint btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 309062306a36Sopenharmony_ci{ 309162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 309262306a36Sopenharmony_ci struct extent_map *em; 309362306a36Sopenharmony_ci struct map_lookup *map; 309462306a36Sopenharmony_ci u64 dev_extent_len = 0; 309562306a36Sopenharmony_ci int i, ret = 0; 309662306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 309762306a36Sopenharmony_ci 309862306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 309962306a36Sopenharmony_ci if (IS_ERR(em)) { 310062306a36Sopenharmony_ci /* 310162306a36Sopenharmony_ci * This is a logic error, but we don't 
want to just rely on the 310262306a36Sopenharmony_ci * user having built with ASSERT enabled, so if ASSERT doesn't 310362306a36Sopenharmony_ci * do anything we still error out. 310462306a36Sopenharmony_ci */ 310562306a36Sopenharmony_ci ASSERT(0); 310662306a36Sopenharmony_ci return PTR_ERR(em); 310762306a36Sopenharmony_ci } 310862306a36Sopenharmony_ci map = em->map_lookup; 310962306a36Sopenharmony_ci 311062306a36Sopenharmony_ci /* 311162306a36Sopenharmony_ci * First delete the device extent items from the devices btree. 311262306a36Sopenharmony_ci * We take the device_list_mutex to avoid racing with the finishing phase 311362306a36Sopenharmony_ci * of a device replace operation. See the comment below before acquiring 311462306a36Sopenharmony_ci * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 311562306a36Sopenharmony_ci * because that can result in a deadlock when deleting the device extent 311662306a36Sopenharmony_ci * items from the devices btree - COWing an extent buffer from the btree 311762306a36Sopenharmony_ci * may result in allocating a new metadata chunk, which would attempt to 311862306a36Sopenharmony_ci * lock again fs_info->chunk_mutex. 
311962306a36Sopenharmony_ci */ 312062306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 312162306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 312262306a36Sopenharmony_ci struct btrfs_device *device = map->stripes[i].dev; 312362306a36Sopenharmony_ci ret = btrfs_free_dev_extent(trans, device, 312462306a36Sopenharmony_ci map->stripes[i].physical, 312562306a36Sopenharmony_ci &dev_extent_len); 312662306a36Sopenharmony_ci if (ret) { 312762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 312862306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 312962306a36Sopenharmony_ci goto out; 313062306a36Sopenharmony_ci } 313162306a36Sopenharmony_ci 313262306a36Sopenharmony_ci if (device->bytes_used > 0) { 313362306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 313462306a36Sopenharmony_ci btrfs_device_set_bytes_used(device, 313562306a36Sopenharmony_ci device->bytes_used - dev_extent_len); 313662306a36Sopenharmony_ci atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 313762306a36Sopenharmony_ci btrfs_clear_space_info_full(fs_info); 313862306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 313962306a36Sopenharmony_ci } 314062306a36Sopenharmony_ci } 314162306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 314262306a36Sopenharmony_ci 314362306a36Sopenharmony_ci /* 314462306a36Sopenharmony_ci * We acquire fs_info->chunk_mutex for 2 reasons: 314562306a36Sopenharmony_ci * 314662306a36Sopenharmony_ci * 1) Just like with the first phase of the chunk allocation, we must 314762306a36Sopenharmony_ci * reserve system space, do all chunk btree updates and deletions, and 314862306a36Sopenharmony_ci * update the system chunk array in the superblock while holding this 314962306a36Sopenharmony_ci * mutex. 
This is for similar reasons as explained on the comment at 315062306a36Sopenharmony_ci * the top of btrfs_chunk_alloc(); 315162306a36Sopenharmony_ci * 315262306a36Sopenharmony_ci * 2) Prevent races with the final phase of a device replace operation 315362306a36Sopenharmony_ci * that replaces the device object associated with the map's stripes, 315462306a36Sopenharmony_ci * because the device object's id can change at any time during that 315562306a36Sopenharmony_ci * final phase of the device replace operation 315662306a36Sopenharmony_ci * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 315762306a36Sopenharmony_ci * replaced device and then see it with an ID of 315862306a36Sopenharmony_ci * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 315962306a36Sopenharmony_ci * the device item, which does not exists on the chunk btree. 316062306a36Sopenharmony_ci * The finishing phase of device replace acquires both the 316162306a36Sopenharmony_ci * device_list_mutex and the chunk_mutex, in that order, so we are 316262306a36Sopenharmony_ci * safe by just acquiring the chunk_mutex. 316362306a36Sopenharmony_ci */ 316462306a36Sopenharmony_ci trans->removing_chunk = true; 316562306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 316662306a36Sopenharmony_ci 316762306a36Sopenharmony_ci check_system_chunk(trans, map->type); 316862306a36Sopenharmony_ci 316962306a36Sopenharmony_ci ret = remove_chunk_item(trans, map, chunk_offset); 317062306a36Sopenharmony_ci /* 317162306a36Sopenharmony_ci * Normally we should not get -ENOSPC since we reserved space before 317262306a36Sopenharmony_ci * through the call to check_system_chunk(). 
317362306a36Sopenharmony_ci * 317462306a36Sopenharmony_ci * Despite our system space_info having enough free space, we may not 317562306a36Sopenharmony_ci * be able to allocate extents from its block groups, because all have 317662306a36Sopenharmony_ci * an incompatible profile, which will force us to allocate a new system 317762306a36Sopenharmony_ci * block group with the right profile, or right after we called 317862306a36Sopenharmony_ci * check_system_space() above, a scrub turned the only system block group 317962306a36Sopenharmony_ci * with enough free space into RO mode. 318062306a36Sopenharmony_ci * This is explained with more detail at do_chunk_alloc(). 318162306a36Sopenharmony_ci * 318262306a36Sopenharmony_ci * So if we get -ENOSPC, allocate a new system chunk and retry once. 318362306a36Sopenharmony_ci */ 318462306a36Sopenharmony_ci if (ret == -ENOSPC) { 318562306a36Sopenharmony_ci const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 318662306a36Sopenharmony_ci struct btrfs_block_group *sys_bg; 318762306a36Sopenharmony_ci 318862306a36Sopenharmony_ci sys_bg = btrfs_create_chunk(trans, sys_flags); 318962306a36Sopenharmony_ci if (IS_ERR(sys_bg)) { 319062306a36Sopenharmony_ci ret = PTR_ERR(sys_bg); 319162306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 319262306a36Sopenharmony_ci goto out; 319362306a36Sopenharmony_ci } 319462306a36Sopenharmony_ci 319562306a36Sopenharmony_ci ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 319662306a36Sopenharmony_ci if (ret) { 319762306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 319862306a36Sopenharmony_ci goto out; 319962306a36Sopenharmony_ci } 320062306a36Sopenharmony_ci 320162306a36Sopenharmony_ci ret = remove_chunk_item(trans, map, chunk_offset); 320262306a36Sopenharmony_ci if (ret) { 320362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 320462306a36Sopenharmony_ci goto out; 320562306a36Sopenharmony_ci } 320662306a36Sopenharmony_ci } else if (ret) { 320762306a36Sopenharmony_ci 
btrfs_abort_transaction(trans, ret); 320862306a36Sopenharmony_ci goto out; 320962306a36Sopenharmony_ci } 321062306a36Sopenharmony_ci 321162306a36Sopenharmony_ci trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 321262306a36Sopenharmony_ci 321362306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 321462306a36Sopenharmony_ci ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 321562306a36Sopenharmony_ci if (ret) { 321662306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 321762306a36Sopenharmony_ci goto out; 321862306a36Sopenharmony_ci } 321962306a36Sopenharmony_ci } 322062306a36Sopenharmony_ci 322162306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 322262306a36Sopenharmony_ci trans->removing_chunk = false; 322362306a36Sopenharmony_ci 322462306a36Sopenharmony_ci /* 322562306a36Sopenharmony_ci * We are done with chunk btree updates and deletions, so release the 322662306a36Sopenharmony_ci * system space we previously reserved (with check_system_chunk()). 
322762306a36Sopenharmony_ci */ 322862306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 322962306a36Sopenharmony_ci 323062306a36Sopenharmony_ci ret = btrfs_remove_block_group(trans, chunk_offset, em); 323162306a36Sopenharmony_ci if (ret) { 323262306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 323362306a36Sopenharmony_ci goto out; 323462306a36Sopenharmony_ci } 323562306a36Sopenharmony_ci 323662306a36Sopenharmony_ciout: 323762306a36Sopenharmony_ci if (trans->removing_chunk) { 323862306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 323962306a36Sopenharmony_ci trans->removing_chunk = false; 324062306a36Sopenharmony_ci } 324162306a36Sopenharmony_ci /* once for us */ 324262306a36Sopenharmony_ci free_extent_map(em); 324362306a36Sopenharmony_ci return ret; 324462306a36Sopenharmony_ci} 324562306a36Sopenharmony_ci 324662306a36Sopenharmony_ciint btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 324762306a36Sopenharmony_ci{ 324862306a36Sopenharmony_ci struct btrfs_root *root = fs_info->chunk_root; 324962306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 325062306a36Sopenharmony_ci struct btrfs_block_group *block_group; 325162306a36Sopenharmony_ci u64 length; 325262306a36Sopenharmony_ci int ret; 325362306a36Sopenharmony_ci 325462306a36Sopenharmony_ci if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 325562306a36Sopenharmony_ci btrfs_err(fs_info, 325662306a36Sopenharmony_ci "relocate: not supported on extent tree v2 yet"); 325762306a36Sopenharmony_ci return -EINVAL; 325862306a36Sopenharmony_ci } 325962306a36Sopenharmony_ci 326062306a36Sopenharmony_ci /* 326162306a36Sopenharmony_ci * Prevent races with automatic removal of unused block groups. 326262306a36Sopenharmony_ci * After we relocate and before we remove the chunk with offset 326362306a36Sopenharmony_ci * chunk_offset, automatic removal of the block group can kick in, 326462306a36Sopenharmony_ci * resulting in a failure when calling btrfs_remove_chunk() below. 
326562306a36Sopenharmony_ci * 326662306a36Sopenharmony_ci * Make sure to acquire this mutex before doing a tree search (dev 326762306a36Sopenharmony_ci * or chunk trees) to find chunks. Otherwise the cleaner kthread might 326862306a36Sopenharmony_ci * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 326962306a36Sopenharmony_ci * we release the path used to search the chunk/dev tree and before 327062306a36Sopenharmony_ci * the current task acquires this mutex and calls us. 327162306a36Sopenharmony_ci */ 327262306a36Sopenharmony_ci lockdep_assert_held(&fs_info->reclaim_bgs_lock); 327362306a36Sopenharmony_ci 327462306a36Sopenharmony_ci /* step one, relocate all the extents inside this chunk */ 327562306a36Sopenharmony_ci btrfs_scrub_pause(fs_info); 327662306a36Sopenharmony_ci ret = btrfs_relocate_block_group(fs_info, chunk_offset); 327762306a36Sopenharmony_ci btrfs_scrub_continue(fs_info); 327862306a36Sopenharmony_ci if (ret) { 327962306a36Sopenharmony_ci /* 328062306a36Sopenharmony_ci * If we had a transaction abort, stop all running scrubs. 328162306a36Sopenharmony_ci * See transaction.c:cleanup_transaction() why we do it here. 
328262306a36Sopenharmony_ci */ 328362306a36Sopenharmony_ci if (BTRFS_FS_ERROR(fs_info)) 328462306a36Sopenharmony_ci btrfs_scrub_cancel(fs_info); 328562306a36Sopenharmony_ci return ret; 328662306a36Sopenharmony_ci } 328762306a36Sopenharmony_ci 328862306a36Sopenharmony_ci block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 328962306a36Sopenharmony_ci if (!block_group) 329062306a36Sopenharmony_ci return -ENOENT; 329162306a36Sopenharmony_ci btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 329262306a36Sopenharmony_ci length = block_group->length; 329362306a36Sopenharmony_ci btrfs_put_block_group(block_group); 329462306a36Sopenharmony_ci 329562306a36Sopenharmony_ci /* 329662306a36Sopenharmony_ci * On a zoned file system, discard the whole block group, this will 329762306a36Sopenharmony_ci * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 329862306a36Sopenharmony_ci * resetting the zone fails, don't treat it as a fatal problem from the 329962306a36Sopenharmony_ci * filesystem's point of view. 
330062306a36Sopenharmony_ci */ 330162306a36Sopenharmony_ci if (btrfs_is_zoned(fs_info)) { 330262306a36Sopenharmony_ci ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 330362306a36Sopenharmony_ci if (ret) 330462306a36Sopenharmony_ci btrfs_info(fs_info, 330562306a36Sopenharmony_ci "failed to reset zone %llu after relocation", 330662306a36Sopenharmony_ci chunk_offset); 330762306a36Sopenharmony_ci } 330862306a36Sopenharmony_ci 330962306a36Sopenharmony_ci trans = btrfs_start_trans_remove_block_group(root->fs_info, 331062306a36Sopenharmony_ci chunk_offset); 331162306a36Sopenharmony_ci if (IS_ERR(trans)) { 331262306a36Sopenharmony_ci ret = PTR_ERR(trans); 331362306a36Sopenharmony_ci btrfs_handle_fs_error(root->fs_info, ret, NULL); 331462306a36Sopenharmony_ci return ret; 331562306a36Sopenharmony_ci } 331662306a36Sopenharmony_ci 331762306a36Sopenharmony_ci /* 331862306a36Sopenharmony_ci * step two, delete the device extents and the 331962306a36Sopenharmony_ci * chunk tree entries 332062306a36Sopenharmony_ci */ 332162306a36Sopenharmony_ci ret = btrfs_remove_chunk(trans, chunk_offset); 332262306a36Sopenharmony_ci btrfs_end_transaction(trans); 332362306a36Sopenharmony_ci return ret; 332462306a36Sopenharmony_ci} 332562306a36Sopenharmony_ci 332662306a36Sopenharmony_cistatic int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 332762306a36Sopenharmony_ci{ 332862306a36Sopenharmony_ci struct btrfs_root *chunk_root = fs_info->chunk_root; 332962306a36Sopenharmony_ci struct btrfs_path *path; 333062306a36Sopenharmony_ci struct extent_buffer *leaf; 333162306a36Sopenharmony_ci struct btrfs_chunk *chunk; 333262306a36Sopenharmony_ci struct btrfs_key key; 333362306a36Sopenharmony_ci struct btrfs_key found_key; 333462306a36Sopenharmony_ci u64 chunk_type; 333562306a36Sopenharmony_ci bool retried = false; 333662306a36Sopenharmony_ci int failed = 0; 333762306a36Sopenharmony_ci int ret; 333862306a36Sopenharmony_ci 333962306a36Sopenharmony_ci path = btrfs_alloc_path(); 
334062306a36Sopenharmony_ci if (!path) 334162306a36Sopenharmony_ci return -ENOMEM; 334262306a36Sopenharmony_ci 334362306a36Sopenharmony_ciagain: 334462306a36Sopenharmony_ci key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 334562306a36Sopenharmony_ci key.offset = (u64)-1; 334662306a36Sopenharmony_ci key.type = BTRFS_CHUNK_ITEM_KEY; 334762306a36Sopenharmony_ci 334862306a36Sopenharmony_ci while (1) { 334962306a36Sopenharmony_ci mutex_lock(&fs_info->reclaim_bgs_lock); 335062306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 335162306a36Sopenharmony_ci if (ret < 0) { 335262306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 335362306a36Sopenharmony_ci goto error; 335462306a36Sopenharmony_ci } 335562306a36Sopenharmony_ci BUG_ON(ret == 0); /* Corruption */ 335662306a36Sopenharmony_ci 335762306a36Sopenharmony_ci ret = btrfs_previous_item(chunk_root, path, key.objectid, 335862306a36Sopenharmony_ci key.type); 335962306a36Sopenharmony_ci if (ret) 336062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 336162306a36Sopenharmony_ci if (ret < 0) 336262306a36Sopenharmony_ci goto error; 336362306a36Sopenharmony_ci if (ret > 0) 336462306a36Sopenharmony_ci break; 336562306a36Sopenharmony_ci 336662306a36Sopenharmony_ci leaf = path->nodes[0]; 336762306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 336862306a36Sopenharmony_ci 336962306a36Sopenharmony_ci chunk = btrfs_item_ptr(leaf, path->slots[0], 337062306a36Sopenharmony_ci struct btrfs_chunk); 337162306a36Sopenharmony_ci chunk_type = btrfs_chunk_type(leaf, chunk); 337262306a36Sopenharmony_ci btrfs_release_path(path); 337362306a36Sopenharmony_ci 337462306a36Sopenharmony_ci if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 337562306a36Sopenharmony_ci ret = btrfs_relocate_chunk(fs_info, found_key.offset); 337662306a36Sopenharmony_ci if (ret == -ENOSPC) 337762306a36Sopenharmony_ci failed++; 337862306a36Sopenharmony_ci else 337962306a36Sopenharmony_ci 
BUG_ON(ret); 338062306a36Sopenharmony_ci } 338162306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 338262306a36Sopenharmony_ci 338362306a36Sopenharmony_ci if (found_key.offset == 0) 338462306a36Sopenharmony_ci break; 338562306a36Sopenharmony_ci key.offset = found_key.offset - 1; 338662306a36Sopenharmony_ci } 338762306a36Sopenharmony_ci ret = 0; 338862306a36Sopenharmony_ci if (failed && !retried) { 338962306a36Sopenharmony_ci failed = 0; 339062306a36Sopenharmony_ci retried = true; 339162306a36Sopenharmony_ci goto again; 339262306a36Sopenharmony_ci } else if (WARN_ON(failed && retried)) { 339362306a36Sopenharmony_ci ret = -ENOSPC; 339462306a36Sopenharmony_ci } 339562306a36Sopenharmony_cierror: 339662306a36Sopenharmony_ci btrfs_free_path(path); 339762306a36Sopenharmony_ci return ret; 339862306a36Sopenharmony_ci} 339962306a36Sopenharmony_ci 340062306a36Sopenharmony_ci/* 340162306a36Sopenharmony_ci * return 1 : allocate a data chunk successfully, 340262306a36Sopenharmony_ci * return <0: errors during allocating a data chunk, 340362306a36Sopenharmony_ci * return 0 : no need to allocate a data chunk. 
340462306a36Sopenharmony_ci */ 340562306a36Sopenharmony_cistatic int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 340662306a36Sopenharmony_ci u64 chunk_offset) 340762306a36Sopenharmony_ci{ 340862306a36Sopenharmony_ci struct btrfs_block_group *cache; 340962306a36Sopenharmony_ci u64 bytes_used; 341062306a36Sopenharmony_ci u64 chunk_type; 341162306a36Sopenharmony_ci 341262306a36Sopenharmony_ci cache = btrfs_lookup_block_group(fs_info, chunk_offset); 341362306a36Sopenharmony_ci ASSERT(cache); 341462306a36Sopenharmony_ci chunk_type = cache->flags; 341562306a36Sopenharmony_ci btrfs_put_block_group(cache); 341662306a36Sopenharmony_ci 341762306a36Sopenharmony_ci if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 341862306a36Sopenharmony_ci return 0; 341962306a36Sopenharmony_ci 342062306a36Sopenharmony_ci spin_lock(&fs_info->data_sinfo->lock); 342162306a36Sopenharmony_ci bytes_used = fs_info->data_sinfo->bytes_used; 342262306a36Sopenharmony_ci spin_unlock(&fs_info->data_sinfo->lock); 342362306a36Sopenharmony_ci 342462306a36Sopenharmony_ci if (!bytes_used) { 342562306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 342662306a36Sopenharmony_ci int ret; 342762306a36Sopenharmony_ci 342862306a36Sopenharmony_ci trans = btrfs_join_transaction(fs_info->tree_root); 342962306a36Sopenharmony_ci if (IS_ERR(trans)) 343062306a36Sopenharmony_ci return PTR_ERR(trans); 343162306a36Sopenharmony_ci 343262306a36Sopenharmony_ci ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 343362306a36Sopenharmony_ci btrfs_end_transaction(trans); 343462306a36Sopenharmony_ci if (ret < 0) 343562306a36Sopenharmony_ci return ret; 343662306a36Sopenharmony_ci return 1; 343762306a36Sopenharmony_ci } 343862306a36Sopenharmony_ci 343962306a36Sopenharmony_ci return 0; 344062306a36Sopenharmony_ci} 344162306a36Sopenharmony_ci 344262306a36Sopenharmony_cistatic int insert_balance_item(struct btrfs_fs_info *fs_info, 344362306a36Sopenharmony_ci struct btrfs_balance_control *bctl) 
344462306a36Sopenharmony_ci{ 344562306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 344662306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 344762306a36Sopenharmony_ci struct btrfs_balance_item *item; 344862306a36Sopenharmony_ci struct btrfs_disk_balance_args disk_bargs; 344962306a36Sopenharmony_ci struct btrfs_path *path; 345062306a36Sopenharmony_ci struct extent_buffer *leaf; 345162306a36Sopenharmony_ci struct btrfs_key key; 345262306a36Sopenharmony_ci int ret, err; 345362306a36Sopenharmony_ci 345462306a36Sopenharmony_ci path = btrfs_alloc_path(); 345562306a36Sopenharmony_ci if (!path) 345662306a36Sopenharmony_ci return -ENOMEM; 345762306a36Sopenharmony_ci 345862306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 345962306a36Sopenharmony_ci if (IS_ERR(trans)) { 346062306a36Sopenharmony_ci btrfs_free_path(path); 346162306a36Sopenharmony_ci return PTR_ERR(trans); 346262306a36Sopenharmony_ci } 346362306a36Sopenharmony_ci 346462306a36Sopenharmony_ci key.objectid = BTRFS_BALANCE_OBJECTID; 346562306a36Sopenharmony_ci key.type = BTRFS_TEMPORARY_ITEM_KEY; 346662306a36Sopenharmony_ci key.offset = 0; 346762306a36Sopenharmony_ci 346862306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, root, path, &key, 346962306a36Sopenharmony_ci sizeof(*item)); 347062306a36Sopenharmony_ci if (ret) 347162306a36Sopenharmony_ci goto out; 347262306a36Sopenharmony_ci 347362306a36Sopenharmony_ci leaf = path->nodes[0]; 347462306a36Sopenharmony_ci item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 347562306a36Sopenharmony_ci 347662306a36Sopenharmony_ci memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 347762306a36Sopenharmony_ci 347862306a36Sopenharmony_ci btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 347962306a36Sopenharmony_ci btrfs_set_balance_data(leaf, item, &disk_bargs); 348062306a36Sopenharmony_ci btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 348162306a36Sopenharmony_ci 
btrfs_set_balance_meta(leaf, item, &disk_bargs); 348262306a36Sopenharmony_ci btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 348362306a36Sopenharmony_ci btrfs_set_balance_sys(leaf, item, &disk_bargs); 348462306a36Sopenharmony_ci 348562306a36Sopenharmony_ci btrfs_set_balance_flags(leaf, item, bctl->flags); 348662306a36Sopenharmony_ci 348762306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 348862306a36Sopenharmony_ciout: 348962306a36Sopenharmony_ci btrfs_free_path(path); 349062306a36Sopenharmony_ci err = btrfs_commit_transaction(trans); 349162306a36Sopenharmony_ci if (err && !ret) 349262306a36Sopenharmony_ci ret = err; 349362306a36Sopenharmony_ci return ret; 349462306a36Sopenharmony_ci} 349562306a36Sopenharmony_ci 349662306a36Sopenharmony_cistatic int del_balance_item(struct btrfs_fs_info *fs_info) 349762306a36Sopenharmony_ci{ 349862306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 349962306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 350062306a36Sopenharmony_ci struct btrfs_path *path; 350162306a36Sopenharmony_ci struct btrfs_key key; 350262306a36Sopenharmony_ci int ret, err; 350362306a36Sopenharmony_ci 350462306a36Sopenharmony_ci path = btrfs_alloc_path(); 350562306a36Sopenharmony_ci if (!path) 350662306a36Sopenharmony_ci return -ENOMEM; 350762306a36Sopenharmony_ci 350862306a36Sopenharmony_ci trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 350962306a36Sopenharmony_ci if (IS_ERR(trans)) { 351062306a36Sopenharmony_ci btrfs_free_path(path); 351162306a36Sopenharmony_ci return PTR_ERR(trans); 351262306a36Sopenharmony_ci } 351362306a36Sopenharmony_ci 351462306a36Sopenharmony_ci key.objectid = BTRFS_BALANCE_OBJECTID; 351562306a36Sopenharmony_ci key.type = BTRFS_TEMPORARY_ITEM_KEY; 351662306a36Sopenharmony_ci key.offset = 0; 351762306a36Sopenharmony_ci 351862306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 351962306a36Sopenharmony_ci if (ret < 0) 352062306a36Sopenharmony_ci goto out; 
352162306a36Sopenharmony_ci if (ret > 0) { 352262306a36Sopenharmony_ci ret = -ENOENT; 352362306a36Sopenharmony_ci goto out; 352462306a36Sopenharmony_ci } 352562306a36Sopenharmony_ci 352662306a36Sopenharmony_ci ret = btrfs_del_item(trans, root, path); 352762306a36Sopenharmony_ciout: 352862306a36Sopenharmony_ci btrfs_free_path(path); 352962306a36Sopenharmony_ci err = btrfs_commit_transaction(trans); 353062306a36Sopenharmony_ci if (err && !ret) 353162306a36Sopenharmony_ci ret = err; 353262306a36Sopenharmony_ci return ret; 353362306a36Sopenharmony_ci} 353462306a36Sopenharmony_ci 353562306a36Sopenharmony_ci/* 353662306a36Sopenharmony_ci * This is a heuristic used to reduce the number of chunks balanced on 353762306a36Sopenharmony_ci * resume after balance was interrupted. 353862306a36Sopenharmony_ci */ 353962306a36Sopenharmony_cistatic void update_balance_args(struct btrfs_balance_control *bctl) 354062306a36Sopenharmony_ci{ 354162306a36Sopenharmony_ci /* 354262306a36Sopenharmony_ci * Turn on soft mode for chunk types that were being converted. 354362306a36Sopenharmony_ci */ 354462306a36Sopenharmony_ci if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 354562306a36Sopenharmony_ci bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 354662306a36Sopenharmony_ci if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 354762306a36Sopenharmony_ci bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 354862306a36Sopenharmony_ci if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 354962306a36Sopenharmony_ci bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 355062306a36Sopenharmony_ci 355162306a36Sopenharmony_ci /* 355262306a36Sopenharmony_ci * Turn on usage filter if is not already used. The idea is 355362306a36Sopenharmony_ci * that chunks that we have already balanced should be 355462306a36Sopenharmony_ci * reasonably full. Don't do it for chunks that are being 355562306a36Sopenharmony_ci * converted - that will keep us from relocating unconverted 355662306a36Sopenharmony_ci * (albeit full) chunks. 
355762306a36Sopenharmony_ci */ 355862306a36Sopenharmony_ci if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 355962306a36Sopenharmony_ci !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 356062306a36Sopenharmony_ci !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 356162306a36Sopenharmony_ci bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 356262306a36Sopenharmony_ci bctl->data.usage = 90; 356362306a36Sopenharmony_ci } 356462306a36Sopenharmony_ci if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 356562306a36Sopenharmony_ci !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 356662306a36Sopenharmony_ci !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 356762306a36Sopenharmony_ci bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 356862306a36Sopenharmony_ci bctl->sys.usage = 90; 356962306a36Sopenharmony_ci } 357062306a36Sopenharmony_ci if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 357162306a36Sopenharmony_ci !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 357262306a36Sopenharmony_ci !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 357362306a36Sopenharmony_ci bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 357462306a36Sopenharmony_ci bctl->meta.usage = 90; 357562306a36Sopenharmony_ci } 357662306a36Sopenharmony_ci} 357762306a36Sopenharmony_ci 357862306a36Sopenharmony_ci/* 357962306a36Sopenharmony_ci * Clear the balance status in fs_info and delete the balance item from disk. 
358062306a36Sopenharmony_ci */ 358162306a36Sopenharmony_cistatic void reset_balance_state(struct btrfs_fs_info *fs_info) 358262306a36Sopenharmony_ci{ 358362306a36Sopenharmony_ci struct btrfs_balance_control *bctl = fs_info->balance_ctl; 358462306a36Sopenharmony_ci int ret; 358562306a36Sopenharmony_ci 358662306a36Sopenharmony_ci BUG_ON(!fs_info->balance_ctl); 358762306a36Sopenharmony_ci 358862306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 358962306a36Sopenharmony_ci fs_info->balance_ctl = NULL; 359062306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 359162306a36Sopenharmony_ci 359262306a36Sopenharmony_ci kfree(bctl); 359362306a36Sopenharmony_ci ret = del_balance_item(fs_info); 359462306a36Sopenharmony_ci if (ret) 359562306a36Sopenharmony_ci btrfs_handle_fs_error(fs_info, ret, NULL); 359662306a36Sopenharmony_ci} 359762306a36Sopenharmony_ci 359862306a36Sopenharmony_ci/* 359962306a36Sopenharmony_ci * Balance filters. Return 1 if chunk should be filtered out 360062306a36Sopenharmony_ci * (should not be balanced). 
360162306a36Sopenharmony_ci */ 360262306a36Sopenharmony_cistatic int chunk_profiles_filter(u64 chunk_type, 360362306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 360462306a36Sopenharmony_ci{ 360562306a36Sopenharmony_ci chunk_type = chunk_to_extended(chunk_type) & 360662306a36Sopenharmony_ci BTRFS_EXTENDED_PROFILE_MASK; 360762306a36Sopenharmony_ci 360862306a36Sopenharmony_ci if (bargs->profiles & chunk_type) 360962306a36Sopenharmony_ci return 0; 361062306a36Sopenharmony_ci 361162306a36Sopenharmony_ci return 1; 361262306a36Sopenharmony_ci} 361362306a36Sopenharmony_ci 361462306a36Sopenharmony_cistatic int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 361562306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 361662306a36Sopenharmony_ci{ 361762306a36Sopenharmony_ci struct btrfs_block_group *cache; 361862306a36Sopenharmony_ci u64 chunk_used; 361962306a36Sopenharmony_ci u64 user_thresh_min; 362062306a36Sopenharmony_ci u64 user_thresh_max; 362162306a36Sopenharmony_ci int ret = 1; 362262306a36Sopenharmony_ci 362362306a36Sopenharmony_ci cache = btrfs_lookup_block_group(fs_info, chunk_offset); 362462306a36Sopenharmony_ci chunk_used = cache->used; 362562306a36Sopenharmony_ci 362662306a36Sopenharmony_ci if (bargs->usage_min == 0) 362762306a36Sopenharmony_ci user_thresh_min = 0; 362862306a36Sopenharmony_ci else 362962306a36Sopenharmony_ci user_thresh_min = mult_perc(cache->length, bargs->usage_min); 363062306a36Sopenharmony_ci 363162306a36Sopenharmony_ci if (bargs->usage_max == 0) 363262306a36Sopenharmony_ci user_thresh_max = 1; 363362306a36Sopenharmony_ci else if (bargs->usage_max > 100) 363462306a36Sopenharmony_ci user_thresh_max = cache->length; 363562306a36Sopenharmony_ci else 363662306a36Sopenharmony_ci user_thresh_max = mult_perc(cache->length, bargs->usage_max); 363762306a36Sopenharmony_ci 363862306a36Sopenharmony_ci if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 363962306a36Sopenharmony_ci ret = 0; 
364062306a36Sopenharmony_ci 364162306a36Sopenharmony_ci btrfs_put_block_group(cache); 364262306a36Sopenharmony_ci return ret; 364362306a36Sopenharmony_ci} 364462306a36Sopenharmony_ci 364562306a36Sopenharmony_cistatic int chunk_usage_filter(struct btrfs_fs_info *fs_info, 364662306a36Sopenharmony_ci u64 chunk_offset, struct btrfs_balance_args *bargs) 364762306a36Sopenharmony_ci{ 364862306a36Sopenharmony_ci struct btrfs_block_group *cache; 364962306a36Sopenharmony_ci u64 chunk_used, user_thresh; 365062306a36Sopenharmony_ci int ret = 1; 365162306a36Sopenharmony_ci 365262306a36Sopenharmony_ci cache = btrfs_lookup_block_group(fs_info, chunk_offset); 365362306a36Sopenharmony_ci chunk_used = cache->used; 365462306a36Sopenharmony_ci 365562306a36Sopenharmony_ci if (bargs->usage_min == 0) 365662306a36Sopenharmony_ci user_thresh = 1; 365762306a36Sopenharmony_ci else if (bargs->usage > 100) 365862306a36Sopenharmony_ci user_thresh = cache->length; 365962306a36Sopenharmony_ci else 366062306a36Sopenharmony_ci user_thresh = mult_perc(cache->length, bargs->usage); 366162306a36Sopenharmony_ci 366262306a36Sopenharmony_ci if (chunk_used < user_thresh) 366362306a36Sopenharmony_ci ret = 0; 366462306a36Sopenharmony_ci 366562306a36Sopenharmony_ci btrfs_put_block_group(cache); 366662306a36Sopenharmony_ci return ret; 366762306a36Sopenharmony_ci} 366862306a36Sopenharmony_ci 366962306a36Sopenharmony_cistatic int chunk_devid_filter(struct extent_buffer *leaf, 367062306a36Sopenharmony_ci struct btrfs_chunk *chunk, 367162306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 367262306a36Sopenharmony_ci{ 367362306a36Sopenharmony_ci struct btrfs_stripe *stripe; 367462306a36Sopenharmony_ci int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 367562306a36Sopenharmony_ci int i; 367662306a36Sopenharmony_ci 367762306a36Sopenharmony_ci for (i = 0; i < num_stripes; i++) { 367862306a36Sopenharmony_ci stripe = btrfs_stripe_nr(chunk, i); 367962306a36Sopenharmony_ci if (btrfs_stripe_devid(leaf, stripe) == 
bargs->devid) 368062306a36Sopenharmony_ci return 0; 368162306a36Sopenharmony_ci } 368262306a36Sopenharmony_ci 368362306a36Sopenharmony_ci return 1; 368462306a36Sopenharmony_ci} 368562306a36Sopenharmony_ci 368662306a36Sopenharmony_cistatic u64 calc_data_stripes(u64 type, int num_stripes) 368762306a36Sopenharmony_ci{ 368862306a36Sopenharmony_ci const int index = btrfs_bg_flags_to_raid_index(type); 368962306a36Sopenharmony_ci const int ncopies = btrfs_raid_array[index].ncopies; 369062306a36Sopenharmony_ci const int nparity = btrfs_raid_array[index].nparity; 369162306a36Sopenharmony_ci 369262306a36Sopenharmony_ci return (num_stripes - nparity) / ncopies; 369362306a36Sopenharmony_ci} 369462306a36Sopenharmony_ci 369562306a36Sopenharmony_ci/* [pstart, pend) */ 369662306a36Sopenharmony_cistatic int chunk_drange_filter(struct extent_buffer *leaf, 369762306a36Sopenharmony_ci struct btrfs_chunk *chunk, 369862306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 369962306a36Sopenharmony_ci{ 370062306a36Sopenharmony_ci struct btrfs_stripe *stripe; 370162306a36Sopenharmony_ci int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 370262306a36Sopenharmony_ci u64 stripe_offset; 370362306a36Sopenharmony_ci u64 stripe_length; 370462306a36Sopenharmony_ci u64 type; 370562306a36Sopenharmony_ci int factor; 370662306a36Sopenharmony_ci int i; 370762306a36Sopenharmony_ci 370862306a36Sopenharmony_ci if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 370962306a36Sopenharmony_ci return 0; 371062306a36Sopenharmony_ci 371162306a36Sopenharmony_ci type = btrfs_chunk_type(leaf, chunk); 371262306a36Sopenharmony_ci factor = calc_data_stripes(type, num_stripes); 371362306a36Sopenharmony_ci 371462306a36Sopenharmony_ci for (i = 0; i < num_stripes; i++) { 371562306a36Sopenharmony_ci stripe = btrfs_stripe_nr(chunk, i); 371662306a36Sopenharmony_ci if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 371762306a36Sopenharmony_ci continue; 371862306a36Sopenharmony_ci 371962306a36Sopenharmony_ci 
stripe_offset = btrfs_stripe_offset(leaf, stripe); 372062306a36Sopenharmony_ci stripe_length = btrfs_chunk_length(leaf, chunk); 372162306a36Sopenharmony_ci stripe_length = div_u64(stripe_length, factor); 372262306a36Sopenharmony_ci 372362306a36Sopenharmony_ci if (stripe_offset < bargs->pend && 372462306a36Sopenharmony_ci stripe_offset + stripe_length > bargs->pstart) 372562306a36Sopenharmony_ci return 0; 372662306a36Sopenharmony_ci } 372762306a36Sopenharmony_ci 372862306a36Sopenharmony_ci return 1; 372962306a36Sopenharmony_ci} 373062306a36Sopenharmony_ci 373162306a36Sopenharmony_ci/* [vstart, vend) */ 373262306a36Sopenharmony_cistatic int chunk_vrange_filter(struct extent_buffer *leaf, 373362306a36Sopenharmony_ci struct btrfs_chunk *chunk, 373462306a36Sopenharmony_ci u64 chunk_offset, 373562306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 373662306a36Sopenharmony_ci{ 373762306a36Sopenharmony_ci if (chunk_offset < bargs->vend && 373862306a36Sopenharmony_ci chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 373962306a36Sopenharmony_ci /* at least part of the chunk is inside this vrange */ 374062306a36Sopenharmony_ci return 0; 374162306a36Sopenharmony_ci 374262306a36Sopenharmony_ci return 1; 374362306a36Sopenharmony_ci} 374462306a36Sopenharmony_ci 374562306a36Sopenharmony_cistatic int chunk_stripes_range_filter(struct extent_buffer *leaf, 374662306a36Sopenharmony_ci struct btrfs_chunk *chunk, 374762306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 374862306a36Sopenharmony_ci{ 374962306a36Sopenharmony_ci int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 375062306a36Sopenharmony_ci 375162306a36Sopenharmony_ci if (bargs->stripes_min <= num_stripes 375262306a36Sopenharmony_ci && num_stripes <= bargs->stripes_max) 375362306a36Sopenharmony_ci return 0; 375462306a36Sopenharmony_ci 375562306a36Sopenharmony_ci return 1; 375662306a36Sopenharmony_ci} 375762306a36Sopenharmony_ci 375862306a36Sopenharmony_cistatic int chunk_soft_convert_filter(u64 
chunk_type, 375962306a36Sopenharmony_ci struct btrfs_balance_args *bargs) 376062306a36Sopenharmony_ci{ 376162306a36Sopenharmony_ci if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 376262306a36Sopenharmony_ci return 0; 376362306a36Sopenharmony_ci 376462306a36Sopenharmony_ci chunk_type = chunk_to_extended(chunk_type) & 376562306a36Sopenharmony_ci BTRFS_EXTENDED_PROFILE_MASK; 376662306a36Sopenharmony_ci 376762306a36Sopenharmony_ci if (bargs->target == chunk_type) 376862306a36Sopenharmony_ci return 1; 376962306a36Sopenharmony_ci 377062306a36Sopenharmony_ci return 0; 377162306a36Sopenharmony_ci} 377262306a36Sopenharmony_ci 377362306a36Sopenharmony_cistatic int should_balance_chunk(struct extent_buffer *leaf, 377462306a36Sopenharmony_ci struct btrfs_chunk *chunk, u64 chunk_offset) 377562306a36Sopenharmony_ci{ 377662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = leaf->fs_info; 377762306a36Sopenharmony_ci struct btrfs_balance_control *bctl = fs_info->balance_ctl; 377862306a36Sopenharmony_ci struct btrfs_balance_args *bargs = NULL; 377962306a36Sopenharmony_ci u64 chunk_type = btrfs_chunk_type(leaf, chunk); 378062306a36Sopenharmony_ci 378162306a36Sopenharmony_ci /* type filter */ 378262306a36Sopenharmony_ci if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 378362306a36Sopenharmony_ci (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 378462306a36Sopenharmony_ci return 0; 378562306a36Sopenharmony_ci } 378662306a36Sopenharmony_ci 378762306a36Sopenharmony_ci if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 378862306a36Sopenharmony_ci bargs = &bctl->data; 378962306a36Sopenharmony_ci else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 379062306a36Sopenharmony_ci bargs = &bctl->sys; 379162306a36Sopenharmony_ci else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 379262306a36Sopenharmony_ci bargs = &bctl->meta; 379362306a36Sopenharmony_ci 379462306a36Sopenharmony_ci /* profiles filter */ 379562306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 379662306a36Sopenharmony_ci 
chunk_profiles_filter(chunk_type, bargs)) { 379762306a36Sopenharmony_ci return 0; 379862306a36Sopenharmony_ci } 379962306a36Sopenharmony_ci 380062306a36Sopenharmony_ci /* usage filter */ 380162306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 380262306a36Sopenharmony_ci chunk_usage_filter(fs_info, chunk_offset, bargs)) { 380362306a36Sopenharmony_ci return 0; 380462306a36Sopenharmony_ci } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 380562306a36Sopenharmony_ci chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 380662306a36Sopenharmony_ci return 0; 380762306a36Sopenharmony_ci } 380862306a36Sopenharmony_ci 380962306a36Sopenharmony_ci /* devid filter */ 381062306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 381162306a36Sopenharmony_ci chunk_devid_filter(leaf, chunk, bargs)) { 381262306a36Sopenharmony_ci return 0; 381362306a36Sopenharmony_ci } 381462306a36Sopenharmony_ci 381562306a36Sopenharmony_ci /* drange filter, makes sense only with devid filter */ 381662306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 381762306a36Sopenharmony_ci chunk_drange_filter(leaf, chunk, bargs)) { 381862306a36Sopenharmony_ci return 0; 381962306a36Sopenharmony_ci } 382062306a36Sopenharmony_ci 382162306a36Sopenharmony_ci /* vrange filter */ 382262306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 382362306a36Sopenharmony_ci chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 382462306a36Sopenharmony_ci return 0; 382562306a36Sopenharmony_ci } 382662306a36Sopenharmony_ci 382762306a36Sopenharmony_ci /* stripes filter */ 382862306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 382962306a36Sopenharmony_ci chunk_stripes_range_filter(leaf, chunk, bargs)) { 383062306a36Sopenharmony_ci return 0; 383162306a36Sopenharmony_ci } 383262306a36Sopenharmony_ci 383362306a36Sopenharmony_ci /* soft profile changing mode */ 383462306a36Sopenharmony_ci if ((bargs->flags & 
BTRFS_BALANCE_ARGS_SOFT) && 383562306a36Sopenharmony_ci chunk_soft_convert_filter(chunk_type, bargs)) { 383662306a36Sopenharmony_ci return 0; 383762306a36Sopenharmony_ci } 383862306a36Sopenharmony_ci 383962306a36Sopenharmony_ci /* 384062306a36Sopenharmony_ci * limited by count, must be the last filter 384162306a36Sopenharmony_ci */ 384262306a36Sopenharmony_ci if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 384362306a36Sopenharmony_ci if (bargs->limit == 0) 384462306a36Sopenharmony_ci return 0; 384562306a36Sopenharmony_ci else 384662306a36Sopenharmony_ci bargs->limit--; 384762306a36Sopenharmony_ci } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 384862306a36Sopenharmony_ci /* 384962306a36Sopenharmony_ci * Same logic as the 'limit' filter; the minimum cannot be 385062306a36Sopenharmony_ci * determined here because we do not have the global information 385162306a36Sopenharmony_ci * about the count of all chunks that satisfy the filters. 385262306a36Sopenharmony_ci */ 385362306a36Sopenharmony_ci if (bargs->limit_max == 0) 385462306a36Sopenharmony_ci return 0; 385562306a36Sopenharmony_ci else 385662306a36Sopenharmony_ci bargs->limit_max--; 385762306a36Sopenharmony_ci } 385862306a36Sopenharmony_ci 385962306a36Sopenharmony_ci return 1; 386062306a36Sopenharmony_ci} 386162306a36Sopenharmony_ci 386262306a36Sopenharmony_cistatic int __btrfs_balance(struct btrfs_fs_info *fs_info) 386362306a36Sopenharmony_ci{ 386462306a36Sopenharmony_ci struct btrfs_balance_control *bctl = fs_info->balance_ctl; 386562306a36Sopenharmony_ci struct btrfs_root *chunk_root = fs_info->chunk_root; 386662306a36Sopenharmony_ci u64 chunk_type; 386762306a36Sopenharmony_ci struct btrfs_chunk *chunk; 386862306a36Sopenharmony_ci struct btrfs_path *path = NULL; 386962306a36Sopenharmony_ci struct btrfs_key key; 387062306a36Sopenharmony_ci struct btrfs_key found_key; 387162306a36Sopenharmony_ci struct extent_buffer *leaf; 387262306a36Sopenharmony_ci int slot; 387362306a36Sopenharmony_ci int ret; 
387462306a36Sopenharmony_ci int enospc_errors = 0; 387562306a36Sopenharmony_ci bool counting = true; 387662306a36Sopenharmony_ci /* The single value limit and min/max limits use the same bytes in the */ 387762306a36Sopenharmony_ci u64 limit_data = bctl->data.limit; 387862306a36Sopenharmony_ci u64 limit_meta = bctl->meta.limit; 387962306a36Sopenharmony_ci u64 limit_sys = bctl->sys.limit; 388062306a36Sopenharmony_ci u32 count_data = 0; 388162306a36Sopenharmony_ci u32 count_meta = 0; 388262306a36Sopenharmony_ci u32 count_sys = 0; 388362306a36Sopenharmony_ci int chunk_reserved = 0; 388462306a36Sopenharmony_ci 388562306a36Sopenharmony_ci path = btrfs_alloc_path(); 388662306a36Sopenharmony_ci if (!path) { 388762306a36Sopenharmony_ci ret = -ENOMEM; 388862306a36Sopenharmony_ci goto error; 388962306a36Sopenharmony_ci } 389062306a36Sopenharmony_ci 389162306a36Sopenharmony_ci /* zero out stat counters */ 389262306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 389362306a36Sopenharmony_ci memset(&bctl->stat, 0, sizeof(bctl->stat)); 389462306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 389562306a36Sopenharmony_ciagain: 389662306a36Sopenharmony_ci if (!counting) { 389762306a36Sopenharmony_ci /* 389862306a36Sopenharmony_ci * The single value limit and min/max limits use the same bytes 389962306a36Sopenharmony_ci * in the 390062306a36Sopenharmony_ci */ 390162306a36Sopenharmony_ci bctl->data.limit = limit_data; 390262306a36Sopenharmony_ci bctl->meta.limit = limit_meta; 390362306a36Sopenharmony_ci bctl->sys.limit = limit_sys; 390462306a36Sopenharmony_ci } 390562306a36Sopenharmony_ci key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 390662306a36Sopenharmony_ci key.offset = (u64)-1; 390762306a36Sopenharmony_ci key.type = BTRFS_CHUNK_ITEM_KEY; 390862306a36Sopenharmony_ci 390962306a36Sopenharmony_ci while (1) { 391062306a36Sopenharmony_ci if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 391162306a36Sopenharmony_ci atomic_read(&fs_info->balance_cancel_req)) { 
391262306a36Sopenharmony_ci ret = -ECANCELED; 391362306a36Sopenharmony_ci goto error; 391462306a36Sopenharmony_ci } 391562306a36Sopenharmony_ci 391662306a36Sopenharmony_ci mutex_lock(&fs_info->reclaim_bgs_lock); 391762306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 391862306a36Sopenharmony_ci if (ret < 0) { 391962306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 392062306a36Sopenharmony_ci goto error; 392162306a36Sopenharmony_ci } 392262306a36Sopenharmony_ci 392362306a36Sopenharmony_ci /* 392462306a36Sopenharmony_ci * this shouldn't happen, it means the last relocate 392562306a36Sopenharmony_ci * failed 392662306a36Sopenharmony_ci */ 392762306a36Sopenharmony_ci if (ret == 0) 392862306a36Sopenharmony_ci BUG(); /* FIXME break ? */ 392962306a36Sopenharmony_ci 393062306a36Sopenharmony_ci ret = btrfs_previous_item(chunk_root, path, 0, 393162306a36Sopenharmony_ci BTRFS_CHUNK_ITEM_KEY); 393262306a36Sopenharmony_ci if (ret) { 393362306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 393462306a36Sopenharmony_ci ret = 0; 393562306a36Sopenharmony_ci break; 393662306a36Sopenharmony_ci } 393762306a36Sopenharmony_ci 393862306a36Sopenharmony_ci leaf = path->nodes[0]; 393962306a36Sopenharmony_ci slot = path->slots[0]; 394062306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &found_key, slot); 394162306a36Sopenharmony_ci 394262306a36Sopenharmony_ci if (found_key.objectid != key.objectid) { 394362306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 394462306a36Sopenharmony_ci break; 394562306a36Sopenharmony_ci } 394662306a36Sopenharmony_ci 394762306a36Sopenharmony_ci chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 394862306a36Sopenharmony_ci chunk_type = btrfs_chunk_type(leaf, chunk); 394962306a36Sopenharmony_ci 395062306a36Sopenharmony_ci if (!counting) { 395162306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 395262306a36Sopenharmony_ci bctl->stat.considered++; 395362306a36Sopenharmony_ci 
spin_unlock(&fs_info->balance_lock); 395462306a36Sopenharmony_ci } 395562306a36Sopenharmony_ci 395662306a36Sopenharmony_ci ret = should_balance_chunk(leaf, chunk, found_key.offset); 395762306a36Sopenharmony_ci 395862306a36Sopenharmony_ci btrfs_release_path(path); 395962306a36Sopenharmony_ci if (!ret) { 396062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 396162306a36Sopenharmony_ci goto loop; 396262306a36Sopenharmony_ci } 396362306a36Sopenharmony_ci 396462306a36Sopenharmony_ci if (counting) { 396562306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 396662306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 396762306a36Sopenharmony_ci bctl->stat.expected++; 396862306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 396962306a36Sopenharmony_ci 397062306a36Sopenharmony_ci if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 397162306a36Sopenharmony_ci count_data++; 397262306a36Sopenharmony_ci else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 397362306a36Sopenharmony_ci count_sys++; 397462306a36Sopenharmony_ci else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 397562306a36Sopenharmony_ci count_meta++; 397662306a36Sopenharmony_ci 397762306a36Sopenharmony_ci goto loop; 397862306a36Sopenharmony_ci } 397962306a36Sopenharmony_ci 398062306a36Sopenharmony_ci /* 398162306a36Sopenharmony_ci * Apply limit_min filter, no need to check if the LIMITS 398262306a36Sopenharmony_ci * filter is used, limit_min is 0 by default 398362306a36Sopenharmony_ci */ 398462306a36Sopenharmony_ci if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 398562306a36Sopenharmony_ci count_data < bctl->data.limit_min) 398662306a36Sopenharmony_ci || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 398762306a36Sopenharmony_ci count_meta < bctl->meta.limit_min) 398862306a36Sopenharmony_ci || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 398962306a36Sopenharmony_ci count_sys < bctl->sys.limit_min)) { 399062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 399162306a36Sopenharmony_ci 
goto loop; 399262306a36Sopenharmony_ci } 399362306a36Sopenharmony_ci 399462306a36Sopenharmony_ci if (!chunk_reserved) { 399562306a36Sopenharmony_ci /* 399662306a36Sopenharmony_ci * We may be relocating the only data chunk we have, 399762306a36Sopenharmony_ci * which could potentially end up with losing data's 399862306a36Sopenharmony_ci * raid profile, so lets allocate an empty one in 399962306a36Sopenharmony_ci * advance. 400062306a36Sopenharmony_ci */ 400162306a36Sopenharmony_ci ret = btrfs_may_alloc_data_chunk(fs_info, 400262306a36Sopenharmony_ci found_key.offset); 400362306a36Sopenharmony_ci if (ret < 0) { 400462306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 400562306a36Sopenharmony_ci goto error; 400662306a36Sopenharmony_ci } else if (ret == 1) { 400762306a36Sopenharmony_ci chunk_reserved = 1; 400862306a36Sopenharmony_ci } 400962306a36Sopenharmony_ci } 401062306a36Sopenharmony_ci 401162306a36Sopenharmony_ci ret = btrfs_relocate_chunk(fs_info, found_key.offset); 401262306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 401362306a36Sopenharmony_ci if (ret == -ENOSPC) { 401462306a36Sopenharmony_ci enospc_errors++; 401562306a36Sopenharmony_ci } else if (ret == -ETXTBSY) { 401662306a36Sopenharmony_ci btrfs_info(fs_info, 401762306a36Sopenharmony_ci "skipping relocation of block group %llu due to active swapfile", 401862306a36Sopenharmony_ci found_key.offset); 401962306a36Sopenharmony_ci ret = 0; 402062306a36Sopenharmony_ci } else if (ret) { 402162306a36Sopenharmony_ci goto error; 402262306a36Sopenharmony_ci } else { 402362306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 402462306a36Sopenharmony_ci bctl->stat.completed++; 402562306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 402662306a36Sopenharmony_ci } 402762306a36Sopenharmony_ciloop: 402862306a36Sopenharmony_ci if (found_key.offset == 0) 402962306a36Sopenharmony_ci break; 403062306a36Sopenharmony_ci key.offset = found_key.offset - 1; 403162306a36Sopenharmony_ci } 
403262306a36Sopenharmony_ci 403362306a36Sopenharmony_ci if (counting) { 403462306a36Sopenharmony_ci btrfs_release_path(path); 403562306a36Sopenharmony_ci counting = false; 403662306a36Sopenharmony_ci goto again; 403762306a36Sopenharmony_ci } 403862306a36Sopenharmony_cierror: 403962306a36Sopenharmony_ci btrfs_free_path(path); 404062306a36Sopenharmony_ci if (enospc_errors) { 404162306a36Sopenharmony_ci btrfs_info(fs_info, "%d enospc errors during balance", 404262306a36Sopenharmony_ci enospc_errors); 404362306a36Sopenharmony_ci if (!ret) 404462306a36Sopenharmony_ci ret = -ENOSPC; 404562306a36Sopenharmony_ci } 404662306a36Sopenharmony_ci 404762306a36Sopenharmony_ci return ret; 404862306a36Sopenharmony_ci} 404962306a36Sopenharmony_ci 405062306a36Sopenharmony_ci/* 405162306a36Sopenharmony_ci * See if a given profile is valid and reduced. 405262306a36Sopenharmony_ci * 405362306a36Sopenharmony_ci * @flags: profile to validate 405462306a36Sopenharmony_ci * @extended: if true @flags is treated as an extended profile 405562306a36Sopenharmony_ci */ 405662306a36Sopenharmony_cistatic int alloc_profile_is_valid(u64 flags, int extended) 405762306a36Sopenharmony_ci{ 405862306a36Sopenharmony_ci u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 405962306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_PROFILE_MASK); 406062306a36Sopenharmony_ci 406162306a36Sopenharmony_ci flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 406262306a36Sopenharmony_ci 406362306a36Sopenharmony_ci /* 1) check that all other bits are zeroed */ 406462306a36Sopenharmony_ci if (flags & ~mask) 406562306a36Sopenharmony_ci return 0; 406662306a36Sopenharmony_ci 406762306a36Sopenharmony_ci /* 2) see if profile is reduced */ 406862306a36Sopenharmony_ci if (flags == 0) 406962306a36Sopenharmony_ci return !extended; /* "0" is valid for usual profiles */ 407062306a36Sopenharmony_ci 407162306a36Sopenharmony_ci return has_single_bit_set(flags); 407262306a36Sopenharmony_ci} 407362306a36Sopenharmony_ci 407462306a36Sopenharmony_ci/* 407562306a36Sopenharmony_ci * Validate target profile against allowed profiles and return true if it's OK. 407662306a36Sopenharmony_ci * Otherwise print the error message and return false. 407762306a36Sopenharmony_ci */ 407862306a36Sopenharmony_cistatic inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 407962306a36Sopenharmony_ci const struct btrfs_balance_args *bargs, 408062306a36Sopenharmony_ci u64 allowed, const char *type) 408162306a36Sopenharmony_ci{ 408262306a36Sopenharmony_ci if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 408362306a36Sopenharmony_ci return true; 408462306a36Sopenharmony_ci 408562306a36Sopenharmony_ci /* Profile is valid and does not have bits outside of the allowed set */ 408662306a36Sopenharmony_ci if (alloc_profile_is_valid(bargs->target, 1) && 408762306a36Sopenharmony_ci (bargs->target & ~allowed) == 0) 408862306a36Sopenharmony_ci return true; 408962306a36Sopenharmony_ci 409062306a36Sopenharmony_ci btrfs_err(fs_info, "balance: invalid convert %s profile %s", 409162306a36Sopenharmony_ci type, btrfs_bg_type_to_raid_name(bargs->target)); 409262306a36Sopenharmony_ci return false; 409362306a36Sopenharmony_ci} 409462306a36Sopenharmony_ci 409562306a36Sopenharmony_ci/* 
409662306a36Sopenharmony_ci * Fill @buf with textual description of balance filter flags @bargs, up to 409762306a36Sopenharmony_ci * @size_buf including the terminating null. The output may be trimmed if it 409862306a36Sopenharmony_ci * does not fit into the provided buffer. 409962306a36Sopenharmony_ci */ 410062306a36Sopenharmony_cistatic void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 410162306a36Sopenharmony_ci u32 size_buf) 410262306a36Sopenharmony_ci{ 410362306a36Sopenharmony_ci int ret; 410462306a36Sopenharmony_ci u32 size_bp = size_buf; 410562306a36Sopenharmony_ci char *bp = buf; 410662306a36Sopenharmony_ci u64 flags = bargs->flags; 410762306a36Sopenharmony_ci char tmp_buf[128] = {'\0'}; 410862306a36Sopenharmony_ci 410962306a36Sopenharmony_ci if (!flags) 411062306a36Sopenharmony_ci return; 411162306a36Sopenharmony_ci 411262306a36Sopenharmony_ci#define CHECK_APPEND_NOARG(a) \ 411362306a36Sopenharmony_ci do { \ 411462306a36Sopenharmony_ci ret = snprintf(bp, size_bp, (a)); \ 411562306a36Sopenharmony_ci if (ret < 0 || ret >= size_bp) \ 411662306a36Sopenharmony_ci goto out_overflow; \ 411762306a36Sopenharmony_ci size_bp -= ret; \ 411862306a36Sopenharmony_ci bp += ret; \ 411962306a36Sopenharmony_ci } while (0) 412062306a36Sopenharmony_ci 412162306a36Sopenharmony_ci#define CHECK_APPEND_1ARG(a, v1) \ 412262306a36Sopenharmony_ci do { \ 412362306a36Sopenharmony_ci ret = snprintf(bp, size_bp, (a), (v1)); \ 412462306a36Sopenharmony_ci if (ret < 0 || ret >= size_bp) \ 412562306a36Sopenharmony_ci goto out_overflow; \ 412662306a36Sopenharmony_ci size_bp -= ret; \ 412762306a36Sopenharmony_ci bp += ret; \ 412862306a36Sopenharmony_ci } while (0) 412962306a36Sopenharmony_ci 413062306a36Sopenharmony_ci#define CHECK_APPEND_2ARG(a, v1, v2) \ 413162306a36Sopenharmony_ci do { \ 413262306a36Sopenharmony_ci ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 413362306a36Sopenharmony_ci if (ret < 0 || ret >= size_bp) \ 413462306a36Sopenharmony_ci goto out_overflow; 
\ 413562306a36Sopenharmony_ci size_bp -= ret; \ 413662306a36Sopenharmony_ci bp += ret; \ 413762306a36Sopenharmony_ci } while (0) 413862306a36Sopenharmony_ci 413962306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_CONVERT) 414062306a36Sopenharmony_ci CHECK_APPEND_1ARG("convert=%s,", 414162306a36Sopenharmony_ci btrfs_bg_type_to_raid_name(bargs->target)); 414262306a36Sopenharmony_ci 414362306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_SOFT) 414462306a36Sopenharmony_ci CHECK_APPEND_NOARG("soft,"); 414562306a36Sopenharmony_ci 414662306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 414762306a36Sopenharmony_ci btrfs_describe_block_groups(bargs->profiles, tmp_buf, 414862306a36Sopenharmony_ci sizeof(tmp_buf)); 414962306a36Sopenharmony_ci CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 415062306a36Sopenharmony_ci } 415162306a36Sopenharmony_ci 415262306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_USAGE) 415362306a36Sopenharmony_ci CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 415462306a36Sopenharmony_ci 415562306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 415662306a36Sopenharmony_ci CHECK_APPEND_2ARG("usage=%u..%u,", 415762306a36Sopenharmony_ci bargs->usage_min, bargs->usage_max); 415862306a36Sopenharmony_ci 415962306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_DEVID) 416062306a36Sopenharmony_ci CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 416162306a36Sopenharmony_ci 416262306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_DRANGE) 416362306a36Sopenharmony_ci CHECK_APPEND_2ARG("drange=%llu..%llu,", 416462306a36Sopenharmony_ci bargs->pstart, bargs->pend); 416562306a36Sopenharmony_ci 416662306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_VRANGE) 416762306a36Sopenharmony_ci CHECK_APPEND_2ARG("vrange=%llu..%llu,", 416862306a36Sopenharmony_ci bargs->vstart, bargs->vend); 416962306a36Sopenharmony_ci 417062306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_LIMIT) 417162306a36Sopenharmony_ci CHECK_APPEND_1ARG("limit=%llu,", 
bargs->limit); 417262306a36Sopenharmony_ci 417362306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 417462306a36Sopenharmony_ci CHECK_APPEND_2ARG("limit=%u..%u,", 417562306a36Sopenharmony_ci bargs->limit_min, bargs->limit_max); 417662306a36Sopenharmony_ci 417762306a36Sopenharmony_ci if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 417862306a36Sopenharmony_ci CHECK_APPEND_2ARG("stripes=%u..%u,", 417962306a36Sopenharmony_ci bargs->stripes_min, bargs->stripes_max); 418062306a36Sopenharmony_ci 418162306a36Sopenharmony_ci#undef CHECK_APPEND_2ARG 418262306a36Sopenharmony_ci#undef CHECK_APPEND_1ARG 418362306a36Sopenharmony_ci#undef CHECK_APPEND_NOARG 418462306a36Sopenharmony_ci 418562306a36Sopenharmony_ciout_overflow: 418662306a36Sopenharmony_ci 418762306a36Sopenharmony_ci if (size_bp < size_buf) 418862306a36Sopenharmony_ci buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 418962306a36Sopenharmony_ci else 419062306a36Sopenharmony_ci buf[0] = '\0'; 419162306a36Sopenharmony_ci} 419262306a36Sopenharmony_ci 419362306a36Sopenharmony_cistatic void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 419462306a36Sopenharmony_ci{ 419562306a36Sopenharmony_ci u32 size_buf = 1024; 419662306a36Sopenharmony_ci char tmp_buf[192] = {'\0'}; 419762306a36Sopenharmony_ci char *buf; 419862306a36Sopenharmony_ci char *bp; 419962306a36Sopenharmony_ci u32 size_bp = size_buf; 420062306a36Sopenharmony_ci int ret; 420162306a36Sopenharmony_ci struct btrfs_balance_control *bctl = fs_info->balance_ctl; 420262306a36Sopenharmony_ci 420362306a36Sopenharmony_ci buf = kzalloc(size_buf, GFP_KERNEL); 420462306a36Sopenharmony_ci if (!buf) 420562306a36Sopenharmony_ci return; 420662306a36Sopenharmony_ci 420762306a36Sopenharmony_ci bp = buf; 420862306a36Sopenharmony_ci 420962306a36Sopenharmony_ci#define CHECK_APPEND_1ARG(a, v1) \ 421062306a36Sopenharmony_ci do { \ 421162306a36Sopenharmony_ci ret = snprintf(bp, size_bp, (a), (v1)); \ 421262306a36Sopenharmony_ci if (ret < 0 || ret >= 
size_bp) \ 421362306a36Sopenharmony_ci goto out_overflow; \ 421462306a36Sopenharmony_ci size_bp -= ret; \ 421562306a36Sopenharmony_ci bp += ret; \ 421662306a36Sopenharmony_ci } while (0) 421762306a36Sopenharmony_ci 421862306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_FORCE) 421962306a36Sopenharmony_ci CHECK_APPEND_1ARG("%s", "-f "); 422062306a36Sopenharmony_ci 422162306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_DATA) { 422262306a36Sopenharmony_ci describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 422362306a36Sopenharmony_ci CHECK_APPEND_1ARG("-d%s ", tmp_buf); 422462306a36Sopenharmony_ci } 422562306a36Sopenharmony_ci 422662306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_METADATA) { 422762306a36Sopenharmony_ci describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 422862306a36Sopenharmony_ci CHECK_APPEND_1ARG("-m%s ", tmp_buf); 422962306a36Sopenharmony_ci } 423062306a36Sopenharmony_ci 423162306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 423262306a36Sopenharmony_ci describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 423362306a36Sopenharmony_ci CHECK_APPEND_1ARG("-s%s ", tmp_buf); 423462306a36Sopenharmony_ci } 423562306a36Sopenharmony_ci 423662306a36Sopenharmony_ci#undef CHECK_APPEND_1ARG 423762306a36Sopenharmony_ci 423862306a36Sopenharmony_ciout_overflow: 423962306a36Sopenharmony_ci 424062306a36Sopenharmony_ci if (size_bp < size_buf) 424162306a36Sopenharmony_ci buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 424262306a36Sopenharmony_ci btrfs_info(fs_info, "balance: %s %s", 424362306a36Sopenharmony_ci (bctl->flags & BTRFS_BALANCE_RESUME) ? 
424462306a36Sopenharmony_ci "resume" : "start", buf); 424562306a36Sopenharmony_ci 424662306a36Sopenharmony_ci kfree(buf); 424762306a36Sopenharmony_ci} 424862306a36Sopenharmony_ci 424962306a36Sopenharmony_ci/* 425062306a36Sopenharmony_ci * Should be called with balance mutexe held 425162306a36Sopenharmony_ci */ 425262306a36Sopenharmony_ciint btrfs_balance(struct btrfs_fs_info *fs_info, 425362306a36Sopenharmony_ci struct btrfs_balance_control *bctl, 425462306a36Sopenharmony_ci struct btrfs_ioctl_balance_args *bargs) 425562306a36Sopenharmony_ci{ 425662306a36Sopenharmony_ci u64 meta_target, data_target; 425762306a36Sopenharmony_ci u64 allowed; 425862306a36Sopenharmony_ci int mixed = 0; 425962306a36Sopenharmony_ci int ret; 426062306a36Sopenharmony_ci u64 num_devices; 426162306a36Sopenharmony_ci unsigned seq; 426262306a36Sopenharmony_ci bool reducing_redundancy; 426362306a36Sopenharmony_ci bool paused = false; 426462306a36Sopenharmony_ci int i; 426562306a36Sopenharmony_ci 426662306a36Sopenharmony_ci if (btrfs_fs_closing(fs_info) || 426762306a36Sopenharmony_ci atomic_read(&fs_info->balance_pause_req) || 426862306a36Sopenharmony_ci btrfs_should_cancel_balance(fs_info)) { 426962306a36Sopenharmony_ci ret = -EINVAL; 427062306a36Sopenharmony_ci goto out; 427162306a36Sopenharmony_ci } 427262306a36Sopenharmony_ci 427362306a36Sopenharmony_ci allowed = btrfs_super_incompat_flags(fs_info->super_copy); 427462306a36Sopenharmony_ci if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 427562306a36Sopenharmony_ci mixed = 1; 427662306a36Sopenharmony_ci 427762306a36Sopenharmony_ci /* 427862306a36Sopenharmony_ci * In case of mixed groups both data and meta should be picked, 427962306a36Sopenharmony_ci * and identical options should be given for both of them. 
428062306a36Sopenharmony_ci */ 428162306a36Sopenharmony_ci allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 428262306a36Sopenharmony_ci if (mixed && (bctl->flags & allowed)) { 428362306a36Sopenharmony_ci if (!(bctl->flags & BTRFS_BALANCE_DATA) || 428462306a36Sopenharmony_ci !(bctl->flags & BTRFS_BALANCE_METADATA) || 428562306a36Sopenharmony_ci memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 428662306a36Sopenharmony_ci btrfs_err(fs_info, 428762306a36Sopenharmony_ci "balance: mixed groups data and metadata options must be the same"); 428862306a36Sopenharmony_ci ret = -EINVAL; 428962306a36Sopenharmony_ci goto out; 429062306a36Sopenharmony_ci } 429162306a36Sopenharmony_ci } 429262306a36Sopenharmony_ci 429362306a36Sopenharmony_ci /* 429462306a36Sopenharmony_ci * rw_devices will not change at the moment, device add/delete/replace 429562306a36Sopenharmony_ci * are exclusive 429662306a36Sopenharmony_ci */ 429762306a36Sopenharmony_ci num_devices = fs_info->fs_devices->rw_devices; 429862306a36Sopenharmony_ci 429962306a36Sopenharmony_ci /* 430062306a36Sopenharmony_ci * SINGLE profile on-disk has no profile bit, but in-memory we have a 430162306a36Sopenharmony_ci * special bit for it, to make it easier to distinguish. Thus we need 430262306a36Sopenharmony_ci * to set it manually, or balance would refuse the profile. 
430362306a36Sopenharmony_ci */ 430462306a36Sopenharmony_ci allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 430562306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 430662306a36Sopenharmony_ci if (num_devices >= btrfs_raid_array[i].devs_min) 430762306a36Sopenharmony_ci allowed |= btrfs_raid_array[i].bg_flag; 430862306a36Sopenharmony_ci 430962306a36Sopenharmony_ci if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 431062306a36Sopenharmony_ci !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 431162306a36Sopenharmony_ci !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 431262306a36Sopenharmony_ci ret = -EINVAL; 431362306a36Sopenharmony_ci goto out; 431462306a36Sopenharmony_ci } 431562306a36Sopenharmony_ci 431662306a36Sopenharmony_ci /* 431762306a36Sopenharmony_ci * Allow to reduce metadata or system integrity only if force set for 431862306a36Sopenharmony_ci * profiles with redundancy (copies, parity) 431962306a36Sopenharmony_ci */ 432062306a36Sopenharmony_ci allowed = 0; 432162306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 432262306a36Sopenharmony_ci if (btrfs_raid_array[i].ncopies >= 2 || 432362306a36Sopenharmony_ci btrfs_raid_array[i].tolerated_failures >= 1) 432462306a36Sopenharmony_ci allowed |= btrfs_raid_array[i].bg_flag; 432562306a36Sopenharmony_ci } 432662306a36Sopenharmony_ci do { 432762306a36Sopenharmony_ci seq = read_seqbegin(&fs_info->profiles_lock); 432862306a36Sopenharmony_ci 432962306a36Sopenharmony_ci if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 433062306a36Sopenharmony_ci (fs_info->avail_system_alloc_bits & allowed) && 433162306a36Sopenharmony_ci !(bctl->sys.target & allowed)) || 433262306a36Sopenharmony_ci ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 433362306a36Sopenharmony_ci (fs_info->avail_metadata_alloc_bits & allowed) && 433462306a36Sopenharmony_ci !(bctl->meta.target & allowed))) 433562306a36Sopenharmony_ci 
reducing_redundancy = true; 433662306a36Sopenharmony_ci else 433762306a36Sopenharmony_ci reducing_redundancy = false; 433862306a36Sopenharmony_ci 433962306a36Sopenharmony_ci /* if we're not converting, the target field is uninitialized */ 434062306a36Sopenharmony_ci meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 434162306a36Sopenharmony_ci bctl->meta.target : fs_info->avail_metadata_alloc_bits; 434262306a36Sopenharmony_ci data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 434362306a36Sopenharmony_ci bctl->data.target : fs_info->avail_data_alloc_bits; 434462306a36Sopenharmony_ci } while (read_seqretry(&fs_info->profiles_lock, seq)); 434562306a36Sopenharmony_ci 434662306a36Sopenharmony_ci if (reducing_redundancy) { 434762306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_FORCE) { 434862306a36Sopenharmony_ci btrfs_info(fs_info, 434962306a36Sopenharmony_ci "balance: force reducing metadata redundancy"); 435062306a36Sopenharmony_ci } else { 435162306a36Sopenharmony_ci btrfs_err(fs_info, 435262306a36Sopenharmony_ci "balance: reduces metadata redundancy, use --force if you want this"); 435362306a36Sopenharmony_ci ret = -EINVAL; 435462306a36Sopenharmony_ci goto out; 435562306a36Sopenharmony_ci } 435662306a36Sopenharmony_ci } 435762306a36Sopenharmony_ci 435862306a36Sopenharmony_ci if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 435962306a36Sopenharmony_ci btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 436062306a36Sopenharmony_ci btrfs_warn(fs_info, 436162306a36Sopenharmony_ci "balance: metadata profile %s has lower redundancy than data profile %s", 436262306a36Sopenharmony_ci btrfs_bg_type_to_raid_name(meta_target), 436362306a36Sopenharmony_ci btrfs_bg_type_to_raid_name(data_target)); 436462306a36Sopenharmony_ci } 436562306a36Sopenharmony_ci 436662306a36Sopenharmony_ci ret = insert_balance_item(fs_info, bctl); 436762306a36Sopenharmony_ci if (ret && ret != -EEXIST) 436862306a36Sopenharmony_ci goto out; 
436962306a36Sopenharmony_ci 437062306a36Sopenharmony_ci if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 437162306a36Sopenharmony_ci BUG_ON(ret == -EEXIST); 437262306a36Sopenharmony_ci BUG_ON(fs_info->balance_ctl); 437362306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 437462306a36Sopenharmony_ci fs_info->balance_ctl = bctl; 437562306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 437662306a36Sopenharmony_ci } else { 437762306a36Sopenharmony_ci BUG_ON(ret != -EEXIST); 437862306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 437962306a36Sopenharmony_ci update_balance_args(bctl); 438062306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 438162306a36Sopenharmony_ci } 438262306a36Sopenharmony_ci 438362306a36Sopenharmony_ci ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 438462306a36Sopenharmony_ci set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 438562306a36Sopenharmony_ci describe_balance_start_or_resume(fs_info); 438662306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 438762306a36Sopenharmony_ci 438862306a36Sopenharmony_ci ret = __btrfs_balance(fs_info); 438962306a36Sopenharmony_ci 439062306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 439162306a36Sopenharmony_ci if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 439262306a36Sopenharmony_ci btrfs_info(fs_info, "balance: paused"); 439362306a36Sopenharmony_ci btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 439462306a36Sopenharmony_ci paused = true; 439562306a36Sopenharmony_ci } 439662306a36Sopenharmony_ci /* 439762306a36Sopenharmony_ci * Balance can be canceled by: 439862306a36Sopenharmony_ci * 439962306a36Sopenharmony_ci * - Regular cancel request 440062306a36Sopenharmony_ci * Then ret == -ECANCELED and balance_cancel_req > 0 440162306a36Sopenharmony_ci * 440262306a36Sopenharmony_ci * - Fatal signal to "btrfs" process 440362306a36Sopenharmony_ci * Either the signal caught by wait_reserve_ticket() and callers 
440462306a36Sopenharmony_ci * got -EINTR, or caught by btrfs_should_cancel_balance() and 440562306a36Sopenharmony_ci * got -ECANCELED. 440662306a36Sopenharmony_ci * Either way, in this case balance_cancel_req = 0, and 440762306a36Sopenharmony_ci * ret == -EINTR or ret == -ECANCELED. 440862306a36Sopenharmony_ci * 440962306a36Sopenharmony_ci * So here we only check the return value to catch canceled balance. 441062306a36Sopenharmony_ci */ 441162306a36Sopenharmony_ci else if (ret == -ECANCELED || ret == -EINTR) 441262306a36Sopenharmony_ci btrfs_info(fs_info, "balance: canceled"); 441362306a36Sopenharmony_ci else 441462306a36Sopenharmony_ci btrfs_info(fs_info, "balance: ended with status: %d", ret); 441562306a36Sopenharmony_ci 441662306a36Sopenharmony_ci clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 441762306a36Sopenharmony_ci 441862306a36Sopenharmony_ci if (bargs) { 441962306a36Sopenharmony_ci memset(bargs, 0, sizeof(*bargs)); 442062306a36Sopenharmony_ci btrfs_update_ioctl_balance_args(fs_info, bargs); 442162306a36Sopenharmony_ci } 442262306a36Sopenharmony_ci 442362306a36Sopenharmony_ci /* We didn't pause, we can clean everything up. 
*/ 442462306a36Sopenharmony_ci if (!paused) { 442562306a36Sopenharmony_ci reset_balance_state(fs_info); 442662306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 442762306a36Sopenharmony_ci } 442862306a36Sopenharmony_ci 442962306a36Sopenharmony_ci wake_up(&fs_info->balance_wait_q); 443062306a36Sopenharmony_ci 443162306a36Sopenharmony_ci return ret; 443262306a36Sopenharmony_ciout: 443362306a36Sopenharmony_ci if (bctl->flags & BTRFS_BALANCE_RESUME) 443462306a36Sopenharmony_ci reset_balance_state(fs_info); 443562306a36Sopenharmony_ci else 443662306a36Sopenharmony_ci kfree(bctl); 443762306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 443862306a36Sopenharmony_ci 443962306a36Sopenharmony_ci return ret; 444062306a36Sopenharmony_ci} 444162306a36Sopenharmony_ci 444262306a36Sopenharmony_cistatic int balance_kthread(void *data) 444362306a36Sopenharmony_ci{ 444462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = data; 444562306a36Sopenharmony_ci int ret = 0; 444662306a36Sopenharmony_ci 444762306a36Sopenharmony_ci sb_start_write(fs_info->sb); 444862306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 444962306a36Sopenharmony_ci if (fs_info->balance_ctl) 445062306a36Sopenharmony_ci ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 445162306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 445262306a36Sopenharmony_ci sb_end_write(fs_info->sb); 445362306a36Sopenharmony_ci 445462306a36Sopenharmony_ci return ret; 445562306a36Sopenharmony_ci} 445662306a36Sopenharmony_ci 445762306a36Sopenharmony_ciint btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 445862306a36Sopenharmony_ci{ 445962306a36Sopenharmony_ci struct task_struct *tsk; 446062306a36Sopenharmony_ci 446162306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 446262306a36Sopenharmony_ci if (!fs_info->balance_ctl) { 446362306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 446462306a36Sopenharmony_ci return 0; 446562306a36Sopenharmony_ci } 446662306a36Sopenharmony_ci 
mutex_unlock(&fs_info->balance_mutex); 446762306a36Sopenharmony_ci 446862306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 446962306a36Sopenharmony_ci btrfs_info(fs_info, "balance: resume skipped"); 447062306a36Sopenharmony_ci return 0; 447162306a36Sopenharmony_ci } 447262306a36Sopenharmony_ci 447362306a36Sopenharmony_ci spin_lock(&fs_info->super_lock); 447462306a36Sopenharmony_ci ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 447562306a36Sopenharmony_ci fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 447662306a36Sopenharmony_ci spin_unlock(&fs_info->super_lock); 447762306a36Sopenharmony_ci /* 447862306a36Sopenharmony_ci * A ro->rw remount sequence should continue with the paused balance 447962306a36Sopenharmony_ci * regardless of who pauses it, system or the user as of now, so set 448062306a36Sopenharmony_ci * the resume flag. 448162306a36Sopenharmony_ci */ 448262306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 448362306a36Sopenharmony_ci fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 448462306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 448562306a36Sopenharmony_ci 448662306a36Sopenharmony_ci tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 448762306a36Sopenharmony_ci return PTR_ERR_OR_ZERO(tsk); 448862306a36Sopenharmony_ci} 448962306a36Sopenharmony_ci 449062306a36Sopenharmony_ciint btrfs_recover_balance(struct btrfs_fs_info *fs_info) 449162306a36Sopenharmony_ci{ 449262306a36Sopenharmony_ci struct btrfs_balance_control *bctl; 449362306a36Sopenharmony_ci struct btrfs_balance_item *item; 449462306a36Sopenharmony_ci struct btrfs_disk_balance_args disk_bargs; 449562306a36Sopenharmony_ci struct btrfs_path *path; 449662306a36Sopenharmony_ci struct extent_buffer *leaf; 449762306a36Sopenharmony_ci struct btrfs_key key; 449862306a36Sopenharmony_ci int ret; 449962306a36Sopenharmony_ci 450062306a36Sopenharmony_ci path = btrfs_alloc_path(); 450162306a36Sopenharmony_ci if (!path) 
450262306a36Sopenharmony_ci return -ENOMEM; 450362306a36Sopenharmony_ci 450462306a36Sopenharmony_ci key.objectid = BTRFS_BALANCE_OBJECTID; 450562306a36Sopenharmony_ci key.type = BTRFS_TEMPORARY_ITEM_KEY; 450662306a36Sopenharmony_ci key.offset = 0; 450762306a36Sopenharmony_ci 450862306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 450962306a36Sopenharmony_ci if (ret < 0) 451062306a36Sopenharmony_ci goto out; 451162306a36Sopenharmony_ci if (ret > 0) { /* ret = -ENOENT; */ 451262306a36Sopenharmony_ci ret = 0; 451362306a36Sopenharmony_ci goto out; 451462306a36Sopenharmony_ci } 451562306a36Sopenharmony_ci 451662306a36Sopenharmony_ci bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 451762306a36Sopenharmony_ci if (!bctl) { 451862306a36Sopenharmony_ci ret = -ENOMEM; 451962306a36Sopenharmony_ci goto out; 452062306a36Sopenharmony_ci } 452162306a36Sopenharmony_ci 452262306a36Sopenharmony_ci leaf = path->nodes[0]; 452362306a36Sopenharmony_ci item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 452462306a36Sopenharmony_ci 452562306a36Sopenharmony_ci bctl->flags = btrfs_balance_flags(leaf, item); 452662306a36Sopenharmony_ci bctl->flags |= BTRFS_BALANCE_RESUME; 452762306a36Sopenharmony_ci 452862306a36Sopenharmony_ci btrfs_balance_data(leaf, item, &disk_bargs); 452962306a36Sopenharmony_ci btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 453062306a36Sopenharmony_ci btrfs_balance_meta(leaf, item, &disk_bargs); 453162306a36Sopenharmony_ci btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 453262306a36Sopenharmony_ci btrfs_balance_sys(leaf, item, &disk_bargs); 453362306a36Sopenharmony_ci btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 453462306a36Sopenharmony_ci 453562306a36Sopenharmony_ci /* 453662306a36Sopenharmony_ci * This should never happen, as the paused balance state is recovered 453762306a36Sopenharmony_ci * during mount without any chance of other exclusive ops to collide. 
453862306a36Sopenharmony_ci * 453962306a36Sopenharmony_ci * This gives the exclusive op status to balance and keeps in paused 454062306a36Sopenharmony_ci * state until user intervention (cancel or umount). If the ownership 454162306a36Sopenharmony_ci * cannot be assigned, show a message but do not fail. The balance 454262306a36Sopenharmony_ci * is in a paused state and must have fs_info::balance_ctl properly 454362306a36Sopenharmony_ci * set up. 454462306a36Sopenharmony_ci */ 454562306a36Sopenharmony_ci if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 454662306a36Sopenharmony_ci btrfs_warn(fs_info, 454762306a36Sopenharmony_ci "balance: cannot set exclusive op status, resume manually"); 454862306a36Sopenharmony_ci 454962306a36Sopenharmony_ci btrfs_release_path(path); 455062306a36Sopenharmony_ci 455162306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 455262306a36Sopenharmony_ci BUG_ON(fs_info->balance_ctl); 455362306a36Sopenharmony_ci spin_lock(&fs_info->balance_lock); 455462306a36Sopenharmony_ci fs_info->balance_ctl = bctl; 455562306a36Sopenharmony_ci spin_unlock(&fs_info->balance_lock); 455662306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 455762306a36Sopenharmony_ciout: 455862306a36Sopenharmony_ci btrfs_free_path(path); 455962306a36Sopenharmony_ci return ret; 456062306a36Sopenharmony_ci} 456162306a36Sopenharmony_ci 456262306a36Sopenharmony_ciint btrfs_pause_balance(struct btrfs_fs_info *fs_info) 456362306a36Sopenharmony_ci{ 456462306a36Sopenharmony_ci int ret = 0; 456562306a36Sopenharmony_ci 456662306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 456762306a36Sopenharmony_ci if (!fs_info->balance_ctl) { 456862306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 456962306a36Sopenharmony_ci return -ENOTCONN; 457062306a36Sopenharmony_ci } 457162306a36Sopenharmony_ci 457262306a36Sopenharmony_ci if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 457362306a36Sopenharmony_ci 
atomic_inc(&fs_info->balance_pause_req); 457462306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 457562306a36Sopenharmony_ci 457662306a36Sopenharmony_ci wait_event(fs_info->balance_wait_q, 457762306a36Sopenharmony_ci !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 457862306a36Sopenharmony_ci 457962306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 458062306a36Sopenharmony_ci /* we are good with balance_ctl ripped off from under us */ 458162306a36Sopenharmony_ci BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 458262306a36Sopenharmony_ci atomic_dec(&fs_info->balance_pause_req); 458362306a36Sopenharmony_ci } else { 458462306a36Sopenharmony_ci ret = -ENOTCONN; 458562306a36Sopenharmony_ci } 458662306a36Sopenharmony_ci 458762306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 458862306a36Sopenharmony_ci return ret; 458962306a36Sopenharmony_ci} 459062306a36Sopenharmony_ci 459162306a36Sopenharmony_ciint btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 459262306a36Sopenharmony_ci{ 459362306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 459462306a36Sopenharmony_ci if (!fs_info->balance_ctl) { 459562306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 459662306a36Sopenharmony_ci return -ENOTCONN; 459762306a36Sopenharmony_ci } 459862306a36Sopenharmony_ci 459962306a36Sopenharmony_ci /* 460062306a36Sopenharmony_ci * A paused balance with the item stored on disk can be resumed at 460162306a36Sopenharmony_ci * mount time if the mount is read-write. Otherwise it's still paused 460262306a36Sopenharmony_ci * and we must not allow cancelling as it deletes the item. 
460362306a36Sopenharmony_ci */ 460462306a36Sopenharmony_ci if (sb_rdonly(fs_info->sb)) { 460562306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 460662306a36Sopenharmony_ci return -EROFS; 460762306a36Sopenharmony_ci } 460862306a36Sopenharmony_ci 460962306a36Sopenharmony_ci atomic_inc(&fs_info->balance_cancel_req); 461062306a36Sopenharmony_ci /* 461162306a36Sopenharmony_ci * if we are running just wait and return, balance item is 461262306a36Sopenharmony_ci * deleted in btrfs_balance in this case 461362306a36Sopenharmony_ci */ 461462306a36Sopenharmony_ci if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 461562306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 461662306a36Sopenharmony_ci wait_event(fs_info->balance_wait_q, 461762306a36Sopenharmony_ci !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 461862306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 461962306a36Sopenharmony_ci } else { 462062306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 462162306a36Sopenharmony_ci /* 462262306a36Sopenharmony_ci * Lock released to allow other waiters to continue, we'll 462362306a36Sopenharmony_ci * reexamine the status again. 
462462306a36Sopenharmony_ci */ 462562306a36Sopenharmony_ci mutex_lock(&fs_info->balance_mutex); 462662306a36Sopenharmony_ci 462762306a36Sopenharmony_ci if (fs_info->balance_ctl) { 462862306a36Sopenharmony_ci reset_balance_state(fs_info); 462962306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 463062306a36Sopenharmony_ci btrfs_info(fs_info, "balance: canceled"); 463162306a36Sopenharmony_ci } 463262306a36Sopenharmony_ci } 463362306a36Sopenharmony_ci 463462306a36Sopenharmony_ci ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 463562306a36Sopenharmony_ci atomic_dec(&fs_info->balance_cancel_req); 463662306a36Sopenharmony_ci mutex_unlock(&fs_info->balance_mutex); 463762306a36Sopenharmony_ci return 0; 463862306a36Sopenharmony_ci} 463962306a36Sopenharmony_ci 464062306a36Sopenharmony_ciint btrfs_uuid_scan_kthread(void *data) 464162306a36Sopenharmony_ci{ 464262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = data; 464362306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 464462306a36Sopenharmony_ci struct btrfs_key key; 464562306a36Sopenharmony_ci struct btrfs_path *path = NULL; 464662306a36Sopenharmony_ci int ret = 0; 464762306a36Sopenharmony_ci struct extent_buffer *eb; 464862306a36Sopenharmony_ci int slot; 464962306a36Sopenharmony_ci struct btrfs_root_item root_item; 465062306a36Sopenharmony_ci u32 item_size; 465162306a36Sopenharmony_ci struct btrfs_trans_handle *trans = NULL; 465262306a36Sopenharmony_ci bool closing = false; 465362306a36Sopenharmony_ci 465462306a36Sopenharmony_ci path = btrfs_alloc_path(); 465562306a36Sopenharmony_ci if (!path) { 465662306a36Sopenharmony_ci ret = -ENOMEM; 465762306a36Sopenharmony_ci goto out; 465862306a36Sopenharmony_ci } 465962306a36Sopenharmony_ci 466062306a36Sopenharmony_ci key.objectid = 0; 466162306a36Sopenharmony_ci key.type = BTRFS_ROOT_ITEM_KEY; 466262306a36Sopenharmony_ci key.offset = 0; 466362306a36Sopenharmony_ci 466462306a36Sopenharmony_ci while (1) { 466562306a36Sopenharmony_ci if 
(btrfs_fs_closing(fs_info)) { 466662306a36Sopenharmony_ci closing = true; 466762306a36Sopenharmony_ci break; 466862306a36Sopenharmony_ci } 466962306a36Sopenharmony_ci ret = btrfs_search_forward(root, &key, path, 467062306a36Sopenharmony_ci BTRFS_OLDEST_GENERATION); 467162306a36Sopenharmony_ci if (ret) { 467262306a36Sopenharmony_ci if (ret > 0) 467362306a36Sopenharmony_ci ret = 0; 467462306a36Sopenharmony_ci break; 467562306a36Sopenharmony_ci } 467662306a36Sopenharmony_ci 467762306a36Sopenharmony_ci if (key.type != BTRFS_ROOT_ITEM_KEY || 467862306a36Sopenharmony_ci (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 467962306a36Sopenharmony_ci key.objectid != BTRFS_FS_TREE_OBJECTID) || 468062306a36Sopenharmony_ci key.objectid > BTRFS_LAST_FREE_OBJECTID) 468162306a36Sopenharmony_ci goto skip; 468262306a36Sopenharmony_ci 468362306a36Sopenharmony_ci eb = path->nodes[0]; 468462306a36Sopenharmony_ci slot = path->slots[0]; 468562306a36Sopenharmony_ci item_size = btrfs_item_size(eb, slot); 468662306a36Sopenharmony_ci if (item_size < sizeof(root_item)) 468762306a36Sopenharmony_ci goto skip; 468862306a36Sopenharmony_ci 468962306a36Sopenharmony_ci read_extent_buffer(eb, &root_item, 469062306a36Sopenharmony_ci btrfs_item_ptr_offset(eb, slot), 469162306a36Sopenharmony_ci (int)sizeof(root_item)); 469262306a36Sopenharmony_ci if (btrfs_root_refs(&root_item) == 0) 469362306a36Sopenharmony_ci goto skip; 469462306a36Sopenharmony_ci 469562306a36Sopenharmony_ci if (!btrfs_is_empty_uuid(root_item.uuid) || 469662306a36Sopenharmony_ci !btrfs_is_empty_uuid(root_item.received_uuid)) { 469762306a36Sopenharmony_ci if (trans) 469862306a36Sopenharmony_ci goto update_tree; 469962306a36Sopenharmony_ci 470062306a36Sopenharmony_ci btrfs_release_path(path); 470162306a36Sopenharmony_ci /* 470262306a36Sopenharmony_ci * 1 - subvol uuid item 470362306a36Sopenharmony_ci * 1 - received_subvol uuid item 470462306a36Sopenharmony_ci */ 470562306a36Sopenharmony_ci trans = btrfs_start_transaction(fs_info->uuid_root, 
2); 470662306a36Sopenharmony_ci if (IS_ERR(trans)) { 470762306a36Sopenharmony_ci ret = PTR_ERR(trans); 470862306a36Sopenharmony_ci break; 470962306a36Sopenharmony_ci } 471062306a36Sopenharmony_ci continue; 471162306a36Sopenharmony_ci } else { 471262306a36Sopenharmony_ci goto skip; 471362306a36Sopenharmony_ci } 471462306a36Sopenharmony_ciupdate_tree: 471562306a36Sopenharmony_ci btrfs_release_path(path); 471662306a36Sopenharmony_ci if (!btrfs_is_empty_uuid(root_item.uuid)) { 471762306a36Sopenharmony_ci ret = btrfs_uuid_tree_add(trans, root_item.uuid, 471862306a36Sopenharmony_ci BTRFS_UUID_KEY_SUBVOL, 471962306a36Sopenharmony_ci key.objectid); 472062306a36Sopenharmony_ci if (ret < 0) { 472162306a36Sopenharmony_ci btrfs_warn(fs_info, "uuid_tree_add failed %d", 472262306a36Sopenharmony_ci ret); 472362306a36Sopenharmony_ci break; 472462306a36Sopenharmony_ci } 472562306a36Sopenharmony_ci } 472662306a36Sopenharmony_ci 472762306a36Sopenharmony_ci if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 472862306a36Sopenharmony_ci ret = btrfs_uuid_tree_add(trans, 472962306a36Sopenharmony_ci root_item.received_uuid, 473062306a36Sopenharmony_ci BTRFS_UUID_KEY_RECEIVED_SUBVOL, 473162306a36Sopenharmony_ci key.objectid); 473262306a36Sopenharmony_ci if (ret < 0) { 473362306a36Sopenharmony_ci btrfs_warn(fs_info, "uuid_tree_add failed %d", 473462306a36Sopenharmony_ci ret); 473562306a36Sopenharmony_ci break; 473662306a36Sopenharmony_ci } 473762306a36Sopenharmony_ci } 473862306a36Sopenharmony_ci 473962306a36Sopenharmony_ciskip: 474062306a36Sopenharmony_ci btrfs_release_path(path); 474162306a36Sopenharmony_ci if (trans) { 474262306a36Sopenharmony_ci ret = btrfs_end_transaction(trans); 474362306a36Sopenharmony_ci trans = NULL; 474462306a36Sopenharmony_ci if (ret) 474562306a36Sopenharmony_ci break; 474662306a36Sopenharmony_ci } 474762306a36Sopenharmony_ci 474862306a36Sopenharmony_ci if (key.offset < (u64)-1) { 474962306a36Sopenharmony_ci key.offset++; 475062306a36Sopenharmony_ci } else if 
(key.type < BTRFS_ROOT_ITEM_KEY) { 475162306a36Sopenharmony_ci key.offset = 0; 475262306a36Sopenharmony_ci key.type = BTRFS_ROOT_ITEM_KEY; 475362306a36Sopenharmony_ci } else if (key.objectid < (u64)-1) { 475462306a36Sopenharmony_ci key.offset = 0; 475562306a36Sopenharmony_ci key.type = BTRFS_ROOT_ITEM_KEY; 475662306a36Sopenharmony_ci key.objectid++; 475762306a36Sopenharmony_ci } else { 475862306a36Sopenharmony_ci break; 475962306a36Sopenharmony_ci } 476062306a36Sopenharmony_ci cond_resched(); 476162306a36Sopenharmony_ci } 476262306a36Sopenharmony_ci 476362306a36Sopenharmony_ciout: 476462306a36Sopenharmony_ci btrfs_free_path(path); 476562306a36Sopenharmony_ci if (trans && !IS_ERR(trans)) 476662306a36Sopenharmony_ci btrfs_end_transaction(trans); 476762306a36Sopenharmony_ci if (ret) 476862306a36Sopenharmony_ci btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 476962306a36Sopenharmony_ci else if (!closing) 477062306a36Sopenharmony_ci set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 477162306a36Sopenharmony_ci up(&fs_info->uuid_tree_rescan_sem); 477262306a36Sopenharmony_ci return 0; 477362306a36Sopenharmony_ci} 477462306a36Sopenharmony_ci 477562306a36Sopenharmony_ciint btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 477662306a36Sopenharmony_ci{ 477762306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 477862306a36Sopenharmony_ci struct btrfs_root *tree_root = fs_info->tree_root; 477962306a36Sopenharmony_ci struct btrfs_root *uuid_root; 478062306a36Sopenharmony_ci struct task_struct *task; 478162306a36Sopenharmony_ci int ret; 478262306a36Sopenharmony_ci 478362306a36Sopenharmony_ci /* 478462306a36Sopenharmony_ci * 1 - root node 478562306a36Sopenharmony_ci * 1 - root item 478662306a36Sopenharmony_ci */ 478762306a36Sopenharmony_ci trans = btrfs_start_transaction(tree_root, 2); 478862306a36Sopenharmony_ci if (IS_ERR(trans)) 478962306a36Sopenharmony_ci return PTR_ERR(trans); 479062306a36Sopenharmony_ci 479162306a36Sopenharmony_ci uuid_root = 
btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 479262306a36Sopenharmony_ci if (IS_ERR(uuid_root)) { 479362306a36Sopenharmony_ci ret = PTR_ERR(uuid_root); 479462306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 479562306a36Sopenharmony_ci btrfs_end_transaction(trans); 479662306a36Sopenharmony_ci return ret; 479762306a36Sopenharmony_ci } 479862306a36Sopenharmony_ci 479962306a36Sopenharmony_ci fs_info->uuid_root = uuid_root; 480062306a36Sopenharmony_ci 480162306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 480262306a36Sopenharmony_ci if (ret) 480362306a36Sopenharmony_ci return ret; 480462306a36Sopenharmony_ci 480562306a36Sopenharmony_ci down(&fs_info->uuid_tree_rescan_sem); 480662306a36Sopenharmony_ci task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 480762306a36Sopenharmony_ci if (IS_ERR(task)) { 480862306a36Sopenharmony_ci /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 480962306a36Sopenharmony_ci btrfs_warn(fs_info, "failed to start uuid_scan task"); 481062306a36Sopenharmony_ci up(&fs_info->uuid_tree_rescan_sem); 481162306a36Sopenharmony_ci return PTR_ERR(task); 481262306a36Sopenharmony_ci } 481362306a36Sopenharmony_ci 481462306a36Sopenharmony_ci return 0; 481562306a36Sopenharmony_ci} 481662306a36Sopenharmony_ci 481762306a36Sopenharmony_ci/* 481862306a36Sopenharmony_ci * shrinking a device means finding all of the device extents past 481962306a36Sopenharmony_ci * the new size, and then following the back refs to the chunks. 
482062306a36Sopenharmony_ci * The chunk relocation code actually frees the device extent 482162306a36Sopenharmony_ci */ 482262306a36Sopenharmony_ciint btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 482362306a36Sopenharmony_ci{ 482462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = device->fs_info; 482562306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 482662306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 482762306a36Sopenharmony_ci struct btrfs_dev_extent *dev_extent = NULL; 482862306a36Sopenharmony_ci struct btrfs_path *path; 482962306a36Sopenharmony_ci u64 length; 483062306a36Sopenharmony_ci u64 chunk_offset; 483162306a36Sopenharmony_ci int ret; 483262306a36Sopenharmony_ci int slot; 483362306a36Sopenharmony_ci int failed = 0; 483462306a36Sopenharmony_ci bool retried = false; 483562306a36Sopenharmony_ci struct extent_buffer *l; 483662306a36Sopenharmony_ci struct btrfs_key key; 483762306a36Sopenharmony_ci struct btrfs_super_block *super_copy = fs_info->super_copy; 483862306a36Sopenharmony_ci u64 old_total = btrfs_super_total_bytes(super_copy); 483962306a36Sopenharmony_ci u64 old_size = btrfs_device_get_total_bytes(device); 484062306a36Sopenharmony_ci u64 diff; 484162306a36Sopenharmony_ci u64 start; 484262306a36Sopenharmony_ci 484362306a36Sopenharmony_ci new_size = round_down(new_size, fs_info->sectorsize); 484462306a36Sopenharmony_ci start = new_size; 484562306a36Sopenharmony_ci diff = round_down(old_size - new_size, fs_info->sectorsize); 484662306a36Sopenharmony_ci 484762306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 484862306a36Sopenharmony_ci return -EINVAL; 484962306a36Sopenharmony_ci 485062306a36Sopenharmony_ci path = btrfs_alloc_path(); 485162306a36Sopenharmony_ci if (!path) 485262306a36Sopenharmony_ci return -ENOMEM; 485362306a36Sopenharmony_ci 485462306a36Sopenharmony_ci path->reada = READA_BACK; 485562306a36Sopenharmony_ci 485662306a36Sopenharmony_ci trans = 
btrfs_start_transaction(root, 0); 485762306a36Sopenharmony_ci if (IS_ERR(trans)) { 485862306a36Sopenharmony_ci btrfs_free_path(path); 485962306a36Sopenharmony_ci return PTR_ERR(trans); 486062306a36Sopenharmony_ci } 486162306a36Sopenharmony_ci 486262306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 486362306a36Sopenharmony_ci 486462306a36Sopenharmony_ci btrfs_device_set_total_bytes(device, new_size); 486562306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 486662306a36Sopenharmony_ci device->fs_devices->total_rw_bytes -= diff; 486762306a36Sopenharmony_ci atomic64_sub(diff, &fs_info->free_chunk_space); 486862306a36Sopenharmony_ci } 486962306a36Sopenharmony_ci 487062306a36Sopenharmony_ci /* 487162306a36Sopenharmony_ci * Once the device's size has been set to the new size, ensure all 487262306a36Sopenharmony_ci * in-memory chunks are synced to disk so that the loop below sees them 487362306a36Sopenharmony_ci * and relocates them accordingly. 487462306a36Sopenharmony_ci */ 487562306a36Sopenharmony_ci if (contains_pending_extent(device, &start, diff)) { 487662306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 487762306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 487862306a36Sopenharmony_ci if (ret) 487962306a36Sopenharmony_ci goto done; 488062306a36Sopenharmony_ci } else { 488162306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 488262306a36Sopenharmony_ci btrfs_end_transaction(trans); 488362306a36Sopenharmony_ci } 488462306a36Sopenharmony_ci 488562306a36Sopenharmony_ciagain: 488662306a36Sopenharmony_ci key.objectid = device->devid; 488762306a36Sopenharmony_ci key.offset = (u64)-1; 488862306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 488962306a36Sopenharmony_ci 489062306a36Sopenharmony_ci do { 489162306a36Sopenharmony_ci mutex_lock(&fs_info->reclaim_bgs_lock); 489262306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 489362306a36Sopenharmony_ci if (ret < 0) { 
489462306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 489562306a36Sopenharmony_ci goto done; 489662306a36Sopenharmony_ci } 489762306a36Sopenharmony_ci 489862306a36Sopenharmony_ci ret = btrfs_previous_item(root, path, 0, key.type); 489962306a36Sopenharmony_ci if (ret) { 490062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 490162306a36Sopenharmony_ci if (ret < 0) 490262306a36Sopenharmony_ci goto done; 490362306a36Sopenharmony_ci ret = 0; 490462306a36Sopenharmony_ci btrfs_release_path(path); 490562306a36Sopenharmony_ci break; 490662306a36Sopenharmony_ci } 490762306a36Sopenharmony_ci 490862306a36Sopenharmony_ci l = path->nodes[0]; 490962306a36Sopenharmony_ci slot = path->slots[0]; 491062306a36Sopenharmony_ci btrfs_item_key_to_cpu(l, &key, path->slots[0]); 491162306a36Sopenharmony_ci 491262306a36Sopenharmony_ci if (key.objectid != device->devid) { 491362306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 491462306a36Sopenharmony_ci btrfs_release_path(path); 491562306a36Sopenharmony_ci break; 491662306a36Sopenharmony_ci } 491762306a36Sopenharmony_ci 491862306a36Sopenharmony_ci dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 491962306a36Sopenharmony_ci length = btrfs_dev_extent_length(l, dev_extent); 492062306a36Sopenharmony_ci 492162306a36Sopenharmony_ci if (key.offset + length <= new_size) { 492262306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 492362306a36Sopenharmony_ci btrfs_release_path(path); 492462306a36Sopenharmony_ci break; 492562306a36Sopenharmony_ci } 492662306a36Sopenharmony_ci 492762306a36Sopenharmony_ci chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 492862306a36Sopenharmony_ci btrfs_release_path(path); 492962306a36Sopenharmony_ci 493062306a36Sopenharmony_ci /* 493162306a36Sopenharmony_ci * We may be relocating the only data chunk we have, 493262306a36Sopenharmony_ci * which could potentially end up with losing data's 493362306a36Sopenharmony_ci * raid profile, so lets 
allocate an empty one in 493462306a36Sopenharmony_ci * advance. 493562306a36Sopenharmony_ci */ 493662306a36Sopenharmony_ci ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 493762306a36Sopenharmony_ci if (ret < 0) { 493862306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 493962306a36Sopenharmony_ci goto done; 494062306a36Sopenharmony_ci } 494162306a36Sopenharmony_ci 494262306a36Sopenharmony_ci ret = btrfs_relocate_chunk(fs_info, chunk_offset); 494362306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 494462306a36Sopenharmony_ci if (ret == -ENOSPC) { 494562306a36Sopenharmony_ci failed++; 494662306a36Sopenharmony_ci } else if (ret) { 494762306a36Sopenharmony_ci if (ret == -ETXTBSY) { 494862306a36Sopenharmony_ci btrfs_warn(fs_info, 494962306a36Sopenharmony_ci "could not shrink block group %llu due to active swapfile", 495062306a36Sopenharmony_ci chunk_offset); 495162306a36Sopenharmony_ci } 495262306a36Sopenharmony_ci goto done; 495362306a36Sopenharmony_ci } 495462306a36Sopenharmony_ci } while (key.offset-- > 0); 495562306a36Sopenharmony_ci 495662306a36Sopenharmony_ci if (failed && !retried) { 495762306a36Sopenharmony_ci failed = 0; 495862306a36Sopenharmony_ci retried = true; 495962306a36Sopenharmony_ci goto again; 496062306a36Sopenharmony_ci } else if (failed && retried) { 496162306a36Sopenharmony_ci ret = -ENOSPC; 496262306a36Sopenharmony_ci goto done; 496362306a36Sopenharmony_ci } 496462306a36Sopenharmony_ci 496562306a36Sopenharmony_ci /* Shrinking succeeded, else we would be at "done". 
*/ 496662306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 496762306a36Sopenharmony_ci if (IS_ERR(trans)) { 496862306a36Sopenharmony_ci ret = PTR_ERR(trans); 496962306a36Sopenharmony_ci goto done; 497062306a36Sopenharmony_ci } 497162306a36Sopenharmony_ci 497262306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 497362306a36Sopenharmony_ci /* Clear all state bits beyond the shrunk device size */ 497462306a36Sopenharmony_ci clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 497562306a36Sopenharmony_ci CHUNK_STATE_MASK); 497662306a36Sopenharmony_ci 497762306a36Sopenharmony_ci btrfs_device_set_disk_total_bytes(device, new_size); 497862306a36Sopenharmony_ci if (list_empty(&device->post_commit_list)) 497962306a36Sopenharmony_ci list_add_tail(&device->post_commit_list, 498062306a36Sopenharmony_ci &trans->transaction->dev_update_list); 498162306a36Sopenharmony_ci 498262306a36Sopenharmony_ci WARN_ON(diff > old_total); 498362306a36Sopenharmony_ci btrfs_set_super_total_bytes(super_copy, 498462306a36Sopenharmony_ci round_down(old_total - diff, fs_info->sectorsize)); 498562306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 498662306a36Sopenharmony_ci 498762306a36Sopenharmony_ci btrfs_reserve_chunk_metadata(trans, false); 498862306a36Sopenharmony_ci /* Now btrfs_update_device() will change the on-disk size. 
*/ 498962306a36Sopenharmony_ci ret = btrfs_update_device(trans, device); 499062306a36Sopenharmony_ci btrfs_trans_release_chunk_metadata(trans); 499162306a36Sopenharmony_ci if (ret < 0) { 499262306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 499362306a36Sopenharmony_ci btrfs_end_transaction(trans); 499462306a36Sopenharmony_ci } else { 499562306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 499662306a36Sopenharmony_ci } 499762306a36Sopenharmony_cidone: 499862306a36Sopenharmony_ci btrfs_free_path(path); 499962306a36Sopenharmony_ci if (ret) { 500062306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 500162306a36Sopenharmony_ci btrfs_device_set_total_bytes(device, old_size); 500262306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 500362306a36Sopenharmony_ci device->fs_devices->total_rw_bytes += diff; 500462306a36Sopenharmony_ci atomic64_add(diff, &fs_info->free_chunk_space); 500562306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 500662306a36Sopenharmony_ci } 500762306a36Sopenharmony_ci return ret; 500862306a36Sopenharmony_ci} 500962306a36Sopenharmony_ci 501062306a36Sopenharmony_cistatic int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 501162306a36Sopenharmony_ci struct btrfs_key *key, 501262306a36Sopenharmony_ci struct btrfs_chunk *chunk, int item_size) 501362306a36Sopenharmony_ci{ 501462306a36Sopenharmony_ci struct btrfs_super_block *super_copy = fs_info->super_copy; 501562306a36Sopenharmony_ci struct btrfs_disk_key disk_key; 501662306a36Sopenharmony_ci u32 array_size; 501762306a36Sopenharmony_ci u8 *ptr; 501862306a36Sopenharmony_ci 501962306a36Sopenharmony_ci lockdep_assert_held(&fs_info->chunk_mutex); 502062306a36Sopenharmony_ci 502162306a36Sopenharmony_ci array_size = btrfs_super_sys_array_size(super_copy); 502262306a36Sopenharmony_ci if (array_size + item_size + sizeof(disk_key) 502362306a36Sopenharmony_ci > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 502462306a36Sopenharmony_ci return -EFBIG; 
502562306a36Sopenharmony_ci 502662306a36Sopenharmony_ci ptr = super_copy->sys_chunk_array + array_size; 502762306a36Sopenharmony_ci btrfs_cpu_key_to_disk(&disk_key, key); 502862306a36Sopenharmony_ci memcpy(ptr, &disk_key, sizeof(disk_key)); 502962306a36Sopenharmony_ci ptr += sizeof(disk_key); 503062306a36Sopenharmony_ci memcpy(ptr, chunk, item_size); 503162306a36Sopenharmony_ci item_size += sizeof(disk_key); 503262306a36Sopenharmony_ci btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 503362306a36Sopenharmony_ci 503462306a36Sopenharmony_ci return 0; 503562306a36Sopenharmony_ci} 503662306a36Sopenharmony_ci 503762306a36Sopenharmony_ci/* 503862306a36Sopenharmony_ci * sort the devices in descending order by max_avail, total_avail 503962306a36Sopenharmony_ci */ 504062306a36Sopenharmony_cistatic int btrfs_cmp_device_info(const void *a, const void *b) 504162306a36Sopenharmony_ci{ 504262306a36Sopenharmony_ci const struct btrfs_device_info *di_a = a; 504362306a36Sopenharmony_ci const struct btrfs_device_info *di_b = b; 504462306a36Sopenharmony_ci 504562306a36Sopenharmony_ci if (di_a->max_avail > di_b->max_avail) 504662306a36Sopenharmony_ci return -1; 504762306a36Sopenharmony_ci if (di_a->max_avail < di_b->max_avail) 504862306a36Sopenharmony_ci return 1; 504962306a36Sopenharmony_ci if (di_a->total_avail > di_b->total_avail) 505062306a36Sopenharmony_ci return -1; 505162306a36Sopenharmony_ci if (di_a->total_avail < di_b->total_avail) 505262306a36Sopenharmony_ci return 1; 505362306a36Sopenharmony_ci return 0; 505462306a36Sopenharmony_ci} 505562306a36Sopenharmony_ci 505662306a36Sopenharmony_cistatic void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 505762306a36Sopenharmony_ci{ 505862306a36Sopenharmony_ci if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 505962306a36Sopenharmony_ci return; 506062306a36Sopenharmony_ci 506162306a36Sopenharmony_ci btrfs_set_fs_incompat(info, RAID56); 506262306a36Sopenharmony_ci} 506362306a36Sopenharmony_ci 
506462306a36Sopenharmony_cistatic void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 506562306a36Sopenharmony_ci{ 506662306a36Sopenharmony_ci if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 506762306a36Sopenharmony_ci return; 506862306a36Sopenharmony_ci 506962306a36Sopenharmony_ci btrfs_set_fs_incompat(info, RAID1C34); 507062306a36Sopenharmony_ci} 507162306a36Sopenharmony_ci 507262306a36Sopenharmony_ci/* 507362306a36Sopenharmony_ci * Structure used internally for btrfs_create_chunk() function. 507462306a36Sopenharmony_ci * Wraps needed parameters. 507562306a36Sopenharmony_ci */ 507662306a36Sopenharmony_cistruct alloc_chunk_ctl { 507762306a36Sopenharmony_ci u64 start; 507862306a36Sopenharmony_ci u64 type; 507962306a36Sopenharmony_ci /* Total number of stripes to allocate */ 508062306a36Sopenharmony_ci int num_stripes; 508162306a36Sopenharmony_ci /* sub_stripes info for map */ 508262306a36Sopenharmony_ci int sub_stripes; 508362306a36Sopenharmony_ci /* Stripes per device */ 508462306a36Sopenharmony_ci int dev_stripes; 508562306a36Sopenharmony_ci /* Maximum number of devices to use */ 508662306a36Sopenharmony_ci int devs_max; 508762306a36Sopenharmony_ci /* Minimum number of devices to use */ 508862306a36Sopenharmony_ci int devs_min; 508962306a36Sopenharmony_ci /* ndevs has to be a multiple of this */ 509062306a36Sopenharmony_ci int devs_increment; 509162306a36Sopenharmony_ci /* Number of copies */ 509262306a36Sopenharmony_ci int ncopies; 509362306a36Sopenharmony_ci /* Number of stripes worth of bytes to store parity information */ 509462306a36Sopenharmony_ci int nparity; 509562306a36Sopenharmony_ci u64 max_stripe_size; 509662306a36Sopenharmony_ci u64 max_chunk_size; 509762306a36Sopenharmony_ci u64 dev_extent_min; 509862306a36Sopenharmony_ci u64 stripe_size; 509962306a36Sopenharmony_ci u64 chunk_size; 510062306a36Sopenharmony_ci int ndevs; 510162306a36Sopenharmony_ci}; 510262306a36Sopenharmony_ci 
510362306a36Sopenharmony_cistatic void init_alloc_chunk_ctl_policy_regular( 510462306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices, 510562306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl) 510662306a36Sopenharmony_ci{ 510762306a36Sopenharmony_ci struct btrfs_space_info *space_info; 510862306a36Sopenharmony_ci 510962306a36Sopenharmony_ci space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); 511062306a36Sopenharmony_ci ASSERT(space_info); 511162306a36Sopenharmony_ci 511262306a36Sopenharmony_ci ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); 511362306a36Sopenharmony_ci ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); 511462306a36Sopenharmony_ci 511562306a36Sopenharmony_ci if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) 511662306a36Sopenharmony_ci ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); 511762306a36Sopenharmony_ci 511862306a36Sopenharmony_ci /* We don't want a chunk larger than 10% of writable space */ 511962306a36Sopenharmony_ci ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), 512062306a36Sopenharmony_ci ctl->max_chunk_size); 512162306a36Sopenharmony_ci ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); 512262306a36Sopenharmony_ci} 512362306a36Sopenharmony_ci 512462306a36Sopenharmony_cistatic void init_alloc_chunk_ctl_policy_zoned( 512562306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices, 512662306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl) 512762306a36Sopenharmony_ci{ 512862306a36Sopenharmony_ci u64 zone_size = fs_devices->fs_info->zone_size; 512962306a36Sopenharmony_ci u64 limit; 513062306a36Sopenharmony_ci int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 513162306a36Sopenharmony_ci int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 513262306a36Sopenharmony_ci u64 min_chunk_size = min_data_stripes * zone_size; 513362306a36Sopenharmony_ci u64 type = ctl->type; 513462306a36Sopenharmony_ci 513562306a36Sopenharmony_ci 
ctl->max_stripe_size = zone_size; 513662306a36Sopenharmony_ci if (type & BTRFS_BLOCK_GROUP_DATA) { 513762306a36Sopenharmony_ci ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 513862306a36Sopenharmony_ci zone_size); 513962306a36Sopenharmony_ci } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 514062306a36Sopenharmony_ci ctl->max_chunk_size = ctl->max_stripe_size; 514162306a36Sopenharmony_ci } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 514262306a36Sopenharmony_ci ctl->max_chunk_size = 2 * ctl->max_stripe_size; 514362306a36Sopenharmony_ci ctl->devs_max = min_t(int, ctl->devs_max, 514462306a36Sopenharmony_ci BTRFS_MAX_DEVS_SYS_CHUNK); 514562306a36Sopenharmony_ci } else { 514662306a36Sopenharmony_ci BUG(); 514762306a36Sopenharmony_ci } 514862306a36Sopenharmony_ci 514962306a36Sopenharmony_ci /* We don't want a chunk larger than 10% of writable space */ 515062306a36Sopenharmony_ci limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), 515162306a36Sopenharmony_ci zone_size), 515262306a36Sopenharmony_ci min_chunk_size); 515362306a36Sopenharmony_ci ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 515462306a36Sopenharmony_ci ctl->dev_extent_min = zone_size * ctl->dev_stripes; 515562306a36Sopenharmony_ci} 515662306a36Sopenharmony_ci 515762306a36Sopenharmony_cistatic void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 515862306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl) 515962306a36Sopenharmony_ci{ 516062306a36Sopenharmony_ci int index = btrfs_bg_flags_to_raid_index(ctl->type); 516162306a36Sopenharmony_ci 516262306a36Sopenharmony_ci ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 516362306a36Sopenharmony_ci ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 516462306a36Sopenharmony_ci ctl->devs_max = btrfs_raid_array[index].devs_max; 516562306a36Sopenharmony_ci if (!ctl->devs_max) 516662306a36Sopenharmony_ci ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 516762306a36Sopenharmony_ci ctl->devs_min = 
btrfs_raid_array[index].devs_min; 516862306a36Sopenharmony_ci ctl->devs_increment = btrfs_raid_array[index].devs_increment; 516962306a36Sopenharmony_ci ctl->ncopies = btrfs_raid_array[index].ncopies; 517062306a36Sopenharmony_ci ctl->nparity = btrfs_raid_array[index].nparity; 517162306a36Sopenharmony_ci ctl->ndevs = 0; 517262306a36Sopenharmony_ci 517362306a36Sopenharmony_ci switch (fs_devices->chunk_alloc_policy) { 517462306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_REGULAR: 517562306a36Sopenharmony_ci init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 517662306a36Sopenharmony_ci break; 517762306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_ZONED: 517862306a36Sopenharmony_ci init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 517962306a36Sopenharmony_ci break; 518062306a36Sopenharmony_ci default: 518162306a36Sopenharmony_ci BUG(); 518262306a36Sopenharmony_ci } 518362306a36Sopenharmony_ci} 518462306a36Sopenharmony_ci 518562306a36Sopenharmony_cistatic int gather_device_info(struct btrfs_fs_devices *fs_devices, 518662306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl, 518762306a36Sopenharmony_ci struct btrfs_device_info *devices_info) 518862306a36Sopenharmony_ci{ 518962306a36Sopenharmony_ci struct btrfs_fs_info *info = fs_devices->fs_info; 519062306a36Sopenharmony_ci struct btrfs_device *device; 519162306a36Sopenharmony_ci u64 total_avail; 519262306a36Sopenharmony_ci u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 519362306a36Sopenharmony_ci int ret; 519462306a36Sopenharmony_ci int ndevs = 0; 519562306a36Sopenharmony_ci u64 max_avail; 519662306a36Sopenharmony_ci u64 dev_offset; 519762306a36Sopenharmony_ci 519862306a36Sopenharmony_ci /* 519962306a36Sopenharmony_ci * in the first pass through the devices list, we gather information 520062306a36Sopenharmony_ci * about the available holes on each device. 
520162306a36Sopenharmony_ci */ 520262306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 520362306a36Sopenharmony_ci if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 520462306a36Sopenharmony_ci WARN(1, KERN_ERR 520562306a36Sopenharmony_ci "BTRFS: read-only device in alloc_list\n"); 520662306a36Sopenharmony_ci continue; 520762306a36Sopenharmony_ci } 520862306a36Sopenharmony_ci 520962306a36Sopenharmony_ci if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 521062306a36Sopenharmony_ci &device->dev_state) || 521162306a36Sopenharmony_ci test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 521262306a36Sopenharmony_ci continue; 521362306a36Sopenharmony_ci 521462306a36Sopenharmony_ci if (device->total_bytes > device->bytes_used) 521562306a36Sopenharmony_ci total_avail = device->total_bytes - device->bytes_used; 521662306a36Sopenharmony_ci else 521762306a36Sopenharmony_ci total_avail = 0; 521862306a36Sopenharmony_ci 521962306a36Sopenharmony_ci /* If there is no space on this device, skip it. 
*/ 522062306a36Sopenharmony_ci if (total_avail < ctl->dev_extent_min) 522162306a36Sopenharmony_ci continue; 522262306a36Sopenharmony_ci 522362306a36Sopenharmony_ci ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 522462306a36Sopenharmony_ci &max_avail); 522562306a36Sopenharmony_ci if (ret && ret != -ENOSPC) 522662306a36Sopenharmony_ci return ret; 522762306a36Sopenharmony_ci 522862306a36Sopenharmony_ci if (ret == 0) 522962306a36Sopenharmony_ci max_avail = dev_extent_want; 523062306a36Sopenharmony_ci 523162306a36Sopenharmony_ci if (max_avail < ctl->dev_extent_min) { 523262306a36Sopenharmony_ci if (btrfs_test_opt(info, ENOSPC_DEBUG)) 523362306a36Sopenharmony_ci btrfs_debug(info, 523462306a36Sopenharmony_ci "%s: devid %llu has no free space, have=%llu want=%llu", 523562306a36Sopenharmony_ci __func__, device->devid, max_avail, 523662306a36Sopenharmony_ci ctl->dev_extent_min); 523762306a36Sopenharmony_ci continue; 523862306a36Sopenharmony_ci } 523962306a36Sopenharmony_ci 524062306a36Sopenharmony_ci if (ndevs == fs_devices->rw_devices) { 524162306a36Sopenharmony_ci WARN(1, "%s: found more than %llu devices\n", 524262306a36Sopenharmony_ci __func__, fs_devices->rw_devices); 524362306a36Sopenharmony_ci break; 524462306a36Sopenharmony_ci } 524562306a36Sopenharmony_ci devices_info[ndevs].dev_offset = dev_offset; 524662306a36Sopenharmony_ci devices_info[ndevs].max_avail = max_avail; 524762306a36Sopenharmony_ci devices_info[ndevs].total_avail = total_avail; 524862306a36Sopenharmony_ci devices_info[ndevs].dev = device; 524962306a36Sopenharmony_ci ++ndevs; 525062306a36Sopenharmony_ci } 525162306a36Sopenharmony_ci ctl->ndevs = ndevs; 525262306a36Sopenharmony_ci 525362306a36Sopenharmony_ci /* 525462306a36Sopenharmony_ci * now sort the devices by hole size / available space 525562306a36Sopenharmony_ci */ 525662306a36Sopenharmony_ci sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 525762306a36Sopenharmony_ci btrfs_cmp_device_info, NULL); 
525862306a36Sopenharmony_ci 525962306a36Sopenharmony_ci return 0; 526062306a36Sopenharmony_ci} 526162306a36Sopenharmony_ci 526262306a36Sopenharmony_cistatic int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 526362306a36Sopenharmony_ci struct btrfs_device_info *devices_info) 526462306a36Sopenharmony_ci{ 526562306a36Sopenharmony_ci /* Number of stripes that count for block group size */ 526662306a36Sopenharmony_ci int data_stripes; 526762306a36Sopenharmony_ci 526862306a36Sopenharmony_ci /* 526962306a36Sopenharmony_ci * The primary goal is to maximize the number of stripes, so use as 527062306a36Sopenharmony_ci * many devices as possible, even if the stripes are not maximum sized. 527162306a36Sopenharmony_ci * 527262306a36Sopenharmony_ci * The DUP profile stores more than one stripe per device, the 527362306a36Sopenharmony_ci * max_avail is the total size so we have to adjust. 527462306a36Sopenharmony_ci */ 527562306a36Sopenharmony_ci ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 527662306a36Sopenharmony_ci ctl->dev_stripes); 527762306a36Sopenharmony_ci ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 527862306a36Sopenharmony_ci 527962306a36Sopenharmony_ci /* This will have to be fixed for RAID1 and RAID10 over more drives */ 528062306a36Sopenharmony_ci data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 528162306a36Sopenharmony_ci 528262306a36Sopenharmony_ci /* 528362306a36Sopenharmony_ci * Use the number of data stripes to figure out how big this chunk is 528462306a36Sopenharmony_ci * really going to be in terms of logical address space, and compare 528562306a36Sopenharmony_ci * that answer with the max chunk size. If it's higher, we try to 528662306a36Sopenharmony_ci * reduce stripe_size. 
528762306a36Sopenharmony_ci */ 528862306a36Sopenharmony_ci if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 528962306a36Sopenharmony_ci /* 529062306a36Sopenharmony_ci * Reduce stripe_size, round it up to a 16MB boundary again and 529162306a36Sopenharmony_ci * then use it, unless it ends up being even bigger than the 529262306a36Sopenharmony_ci * previous value we had already. 529362306a36Sopenharmony_ci */ 529462306a36Sopenharmony_ci ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 529562306a36Sopenharmony_ci data_stripes), SZ_16M), 529662306a36Sopenharmony_ci ctl->stripe_size); 529762306a36Sopenharmony_ci } 529862306a36Sopenharmony_ci 529962306a36Sopenharmony_ci /* Stripe size should not go beyond 1G. */ 530062306a36Sopenharmony_ci ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); 530162306a36Sopenharmony_ci 530262306a36Sopenharmony_ci /* Align to BTRFS_STRIPE_LEN */ 530362306a36Sopenharmony_ci ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 530462306a36Sopenharmony_ci ctl->chunk_size = ctl->stripe_size * data_stripes; 530562306a36Sopenharmony_ci 530662306a36Sopenharmony_ci return 0; 530762306a36Sopenharmony_ci} 530862306a36Sopenharmony_ci 530962306a36Sopenharmony_cistatic int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 531062306a36Sopenharmony_ci struct btrfs_device_info *devices_info) 531162306a36Sopenharmony_ci{ 531262306a36Sopenharmony_ci u64 zone_size = devices_info[0].dev->zone_info->zone_size; 531362306a36Sopenharmony_ci /* Number of stripes that count for block group size */ 531462306a36Sopenharmony_ci int data_stripes; 531562306a36Sopenharmony_ci 531662306a36Sopenharmony_ci /* 531762306a36Sopenharmony_ci * It should hold because: 531862306a36Sopenharmony_ci * dev_extent_min == dev_extent_want == zone_size * dev_stripes 531962306a36Sopenharmony_ci */ 532062306a36Sopenharmony_ci ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 532162306a36Sopenharmony_ci 
532262306a36Sopenharmony_ci ctl->stripe_size = zone_size; 532362306a36Sopenharmony_ci ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 532462306a36Sopenharmony_ci data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 532562306a36Sopenharmony_ci 532662306a36Sopenharmony_ci /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ 532762306a36Sopenharmony_ci if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 532862306a36Sopenharmony_ci ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 532962306a36Sopenharmony_ci ctl->stripe_size) + ctl->nparity, 533062306a36Sopenharmony_ci ctl->dev_stripes); 533162306a36Sopenharmony_ci ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 533262306a36Sopenharmony_ci data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 533362306a36Sopenharmony_ci ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 533462306a36Sopenharmony_ci } 533562306a36Sopenharmony_ci 533662306a36Sopenharmony_ci ctl->chunk_size = ctl->stripe_size * data_stripes; 533762306a36Sopenharmony_ci 533862306a36Sopenharmony_ci return 0; 533962306a36Sopenharmony_ci} 534062306a36Sopenharmony_ci 534162306a36Sopenharmony_cistatic int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 534262306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl, 534362306a36Sopenharmony_ci struct btrfs_device_info *devices_info) 534462306a36Sopenharmony_ci{ 534562306a36Sopenharmony_ci struct btrfs_fs_info *info = fs_devices->fs_info; 534662306a36Sopenharmony_ci 534762306a36Sopenharmony_ci /* 534862306a36Sopenharmony_ci * Round down to number of usable stripes, devs_increment can be any 534962306a36Sopenharmony_ci * number so we can't use round_down() that requires power of 2, while 535062306a36Sopenharmony_ci * rounddown is safe. 
535162306a36Sopenharmony_ci */ 535262306a36Sopenharmony_ci ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 535362306a36Sopenharmony_ci 535462306a36Sopenharmony_ci if (ctl->ndevs < ctl->devs_min) { 535562306a36Sopenharmony_ci if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 535662306a36Sopenharmony_ci btrfs_debug(info, 535762306a36Sopenharmony_ci "%s: not enough devices with free space: have=%d minimum required=%d", 535862306a36Sopenharmony_ci __func__, ctl->ndevs, ctl->devs_min); 535962306a36Sopenharmony_ci } 536062306a36Sopenharmony_ci return -ENOSPC; 536162306a36Sopenharmony_ci } 536262306a36Sopenharmony_ci 536362306a36Sopenharmony_ci ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 536462306a36Sopenharmony_ci 536562306a36Sopenharmony_ci switch (fs_devices->chunk_alloc_policy) { 536662306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_REGULAR: 536762306a36Sopenharmony_ci return decide_stripe_size_regular(ctl, devices_info); 536862306a36Sopenharmony_ci case BTRFS_CHUNK_ALLOC_ZONED: 536962306a36Sopenharmony_ci return decide_stripe_size_zoned(ctl, devices_info); 537062306a36Sopenharmony_ci default: 537162306a36Sopenharmony_ci BUG(); 537262306a36Sopenharmony_ci } 537362306a36Sopenharmony_ci} 537462306a36Sopenharmony_ci 537562306a36Sopenharmony_cistatic struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 537662306a36Sopenharmony_ci struct alloc_chunk_ctl *ctl, 537762306a36Sopenharmony_ci struct btrfs_device_info *devices_info) 537862306a36Sopenharmony_ci{ 537962306a36Sopenharmony_ci struct btrfs_fs_info *info = trans->fs_info; 538062306a36Sopenharmony_ci struct map_lookup *map = NULL; 538162306a36Sopenharmony_ci struct extent_map_tree *em_tree; 538262306a36Sopenharmony_ci struct btrfs_block_group *block_group; 538362306a36Sopenharmony_ci struct extent_map *em; 538462306a36Sopenharmony_ci u64 start = ctl->start; 538562306a36Sopenharmony_ci u64 type = ctl->type; 538662306a36Sopenharmony_ci int ret; 538762306a36Sopenharmony_ci int i; 
538862306a36Sopenharmony_ci int j; 538962306a36Sopenharmony_ci 539062306a36Sopenharmony_ci map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 539162306a36Sopenharmony_ci if (!map) 539262306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 539362306a36Sopenharmony_ci map->num_stripes = ctl->num_stripes; 539462306a36Sopenharmony_ci 539562306a36Sopenharmony_ci for (i = 0; i < ctl->ndevs; ++i) { 539662306a36Sopenharmony_ci for (j = 0; j < ctl->dev_stripes; ++j) { 539762306a36Sopenharmony_ci int s = i * ctl->dev_stripes + j; 539862306a36Sopenharmony_ci map->stripes[s].dev = devices_info[i].dev; 539962306a36Sopenharmony_ci map->stripes[s].physical = devices_info[i].dev_offset + 540062306a36Sopenharmony_ci j * ctl->stripe_size; 540162306a36Sopenharmony_ci } 540262306a36Sopenharmony_ci } 540362306a36Sopenharmony_ci map->io_align = BTRFS_STRIPE_LEN; 540462306a36Sopenharmony_ci map->io_width = BTRFS_STRIPE_LEN; 540562306a36Sopenharmony_ci map->type = type; 540662306a36Sopenharmony_ci map->sub_stripes = ctl->sub_stripes; 540762306a36Sopenharmony_ci 540862306a36Sopenharmony_ci trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 540962306a36Sopenharmony_ci 541062306a36Sopenharmony_ci em = alloc_extent_map(); 541162306a36Sopenharmony_ci if (!em) { 541262306a36Sopenharmony_ci kfree(map); 541362306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 541462306a36Sopenharmony_ci } 541562306a36Sopenharmony_ci set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 541662306a36Sopenharmony_ci em->map_lookup = map; 541762306a36Sopenharmony_ci em->start = start; 541862306a36Sopenharmony_ci em->len = ctl->chunk_size; 541962306a36Sopenharmony_ci em->block_start = 0; 542062306a36Sopenharmony_ci em->block_len = em->len; 542162306a36Sopenharmony_ci em->orig_block_len = ctl->stripe_size; 542262306a36Sopenharmony_ci 542362306a36Sopenharmony_ci em_tree = &info->mapping_tree; 542462306a36Sopenharmony_ci write_lock(&em_tree->lock); 542562306a36Sopenharmony_ci ret = add_extent_mapping(em_tree, em, 0); 
542662306a36Sopenharmony_ci if (ret) { 542762306a36Sopenharmony_ci write_unlock(&em_tree->lock); 542862306a36Sopenharmony_ci free_extent_map(em); 542962306a36Sopenharmony_ci return ERR_PTR(ret); 543062306a36Sopenharmony_ci } 543162306a36Sopenharmony_ci write_unlock(&em_tree->lock); 543262306a36Sopenharmony_ci 543362306a36Sopenharmony_ci block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); 543462306a36Sopenharmony_ci if (IS_ERR(block_group)) 543562306a36Sopenharmony_ci goto error_del_extent; 543662306a36Sopenharmony_ci 543762306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 543862306a36Sopenharmony_ci struct btrfs_device *dev = map->stripes[i].dev; 543962306a36Sopenharmony_ci 544062306a36Sopenharmony_ci btrfs_device_set_bytes_used(dev, 544162306a36Sopenharmony_ci dev->bytes_used + ctl->stripe_size); 544262306a36Sopenharmony_ci if (list_empty(&dev->post_commit_list)) 544362306a36Sopenharmony_ci list_add_tail(&dev->post_commit_list, 544462306a36Sopenharmony_ci &trans->transaction->dev_update_list); 544562306a36Sopenharmony_ci } 544662306a36Sopenharmony_ci 544762306a36Sopenharmony_ci atomic64_sub(ctl->stripe_size * map->num_stripes, 544862306a36Sopenharmony_ci &info->free_chunk_space); 544962306a36Sopenharmony_ci 545062306a36Sopenharmony_ci free_extent_map(em); 545162306a36Sopenharmony_ci check_raid56_incompat_flag(info, type); 545262306a36Sopenharmony_ci check_raid1c34_incompat_flag(info, type); 545362306a36Sopenharmony_ci 545462306a36Sopenharmony_ci return block_group; 545562306a36Sopenharmony_ci 545662306a36Sopenharmony_cierror_del_extent: 545762306a36Sopenharmony_ci write_lock(&em_tree->lock); 545862306a36Sopenharmony_ci remove_extent_mapping(em_tree, em); 545962306a36Sopenharmony_ci write_unlock(&em_tree->lock); 546062306a36Sopenharmony_ci 546162306a36Sopenharmony_ci /* One for our allocation */ 546262306a36Sopenharmony_ci free_extent_map(em); 546362306a36Sopenharmony_ci /* One for the tree reference */ 
546462306a36Sopenharmony_ci free_extent_map(em); 546562306a36Sopenharmony_ci 546662306a36Sopenharmony_ci return block_group; 546762306a36Sopenharmony_ci} 546862306a36Sopenharmony_ci 546962306a36Sopenharmony_cistruct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 547062306a36Sopenharmony_ci u64 type) 547162306a36Sopenharmony_ci{ 547262306a36Sopenharmony_ci struct btrfs_fs_info *info = trans->fs_info; 547362306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = info->fs_devices; 547462306a36Sopenharmony_ci struct btrfs_device_info *devices_info = NULL; 547562306a36Sopenharmony_ci struct alloc_chunk_ctl ctl; 547662306a36Sopenharmony_ci struct btrfs_block_group *block_group; 547762306a36Sopenharmony_ci int ret; 547862306a36Sopenharmony_ci 547962306a36Sopenharmony_ci lockdep_assert_held(&info->chunk_mutex); 548062306a36Sopenharmony_ci 548162306a36Sopenharmony_ci if (!alloc_profile_is_valid(type, 0)) { 548262306a36Sopenharmony_ci ASSERT(0); 548362306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 548462306a36Sopenharmony_ci } 548562306a36Sopenharmony_ci 548662306a36Sopenharmony_ci if (list_empty(&fs_devices->alloc_list)) { 548762306a36Sopenharmony_ci if (btrfs_test_opt(info, ENOSPC_DEBUG)) 548862306a36Sopenharmony_ci btrfs_debug(info, "%s: no writable device", __func__); 548962306a36Sopenharmony_ci return ERR_PTR(-ENOSPC); 549062306a36Sopenharmony_ci } 549162306a36Sopenharmony_ci 549262306a36Sopenharmony_ci if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 549362306a36Sopenharmony_ci btrfs_err(info, "invalid chunk type 0x%llx requested", type); 549462306a36Sopenharmony_ci ASSERT(0); 549562306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 549662306a36Sopenharmony_ci } 549762306a36Sopenharmony_ci 549862306a36Sopenharmony_ci ctl.start = find_next_chunk(info); 549962306a36Sopenharmony_ci ctl.type = type; 550062306a36Sopenharmony_ci init_alloc_chunk_ctl(fs_devices, &ctl); 550162306a36Sopenharmony_ci 550262306a36Sopenharmony_ci devices_info = 
kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 550362306a36Sopenharmony_ci GFP_NOFS); 550462306a36Sopenharmony_ci if (!devices_info) 550562306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 550662306a36Sopenharmony_ci 550762306a36Sopenharmony_ci ret = gather_device_info(fs_devices, &ctl, devices_info); 550862306a36Sopenharmony_ci if (ret < 0) { 550962306a36Sopenharmony_ci block_group = ERR_PTR(ret); 551062306a36Sopenharmony_ci goto out; 551162306a36Sopenharmony_ci } 551262306a36Sopenharmony_ci 551362306a36Sopenharmony_ci ret = decide_stripe_size(fs_devices, &ctl, devices_info); 551462306a36Sopenharmony_ci if (ret < 0) { 551562306a36Sopenharmony_ci block_group = ERR_PTR(ret); 551662306a36Sopenharmony_ci goto out; 551762306a36Sopenharmony_ci } 551862306a36Sopenharmony_ci 551962306a36Sopenharmony_ci block_group = create_chunk(trans, &ctl, devices_info); 552062306a36Sopenharmony_ci 552162306a36Sopenharmony_ciout: 552262306a36Sopenharmony_ci kfree(devices_info); 552362306a36Sopenharmony_ci return block_group; 552462306a36Sopenharmony_ci} 552562306a36Sopenharmony_ci 552662306a36Sopenharmony_ci/* 552762306a36Sopenharmony_ci * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 552862306a36Sopenharmony_ci * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 552962306a36Sopenharmony_ci * chunks. 553062306a36Sopenharmony_ci * 553162306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 553262306a36Sopenharmony_ci * phases. 
553362306a36Sopenharmony_ci */ 553462306a36Sopenharmony_ciint btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 553562306a36Sopenharmony_ci struct btrfs_block_group *bg) 553662306a36Sopenharmony_ci{ 553762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 553862306a36Sopenharmony_ci struct btrfs_root *chunk_root = fs_info->chunk_root; 553962306a36Sopenharmony_ci struct btrfs_key key; 554062306a36Sopenharmony_ci struct btrfs_chunk *chunk; 554162306a36Sopenharmony_ci struct btrfs_stripe *stripe; 554262306a36Sopenharmony_ci struct extent_map *em; 554362306a36Sopenharmony_ci struct map_lookup *map; 554462306a36Sopenharmony_ci size_t item_size; 554562306a36Sopenharmony_ci int i; 554662306a36Sopenharmony_ci int ret; 554762306a36Sopenharmony_ci 554862306a36Sopenharmony_ci /* 554962306a36Sopenharmony_ci * We take the chunk_mutex for 2 reasons: 555062306a36Sopenharmony_ci * 555162306a36Sopenharmony_ci * 1) Updates and insertions in the chunk btree must be done while holding 555262306a36Sopenharmony_ci * the chunk_mutex, as well as updating the system chunk array in the 555362306a36Sopenharmony_ci * superblock. 
See the comment on top of btrfs_chunk_alloc() for the 555462306a36Sopenharmony_ci * details; 555562306a36Sopenharmony_ci * 555662306a36Sopenharmony_ci * 2) To prevent races with the final phase of a device replace operation 555762306a36Sopenharmony_ci * that replaces the device object associated with the map's stripes, 555862306a36Sopenharmony_ci * because the device object's id can change at any time during that 555962306a36Sopenharmony_ci * final phase of the device replace operation 556062306a36Sopenharmony_ci * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 556162306a36Sopenharmony_ci * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 556262306a36Sopenharmony_ci * which would cause a failure when updating the device item, which does 556362306a36Sopenharmony_ci * not exists, or persisting a stripe of the chunk item with such ID. 556462306a36Sopenharmony_ci * Here we can't use the device_list_mutex because our caller already 556562306a36Sopenharmony_ci * has locked the chunk_mutex, and the final phase of device replace 556662306a36Sopenharmony_ci * acquires both mutexes - first the device_list_mutex and then the 556762306a36Sopenharmony_ci * chunk_mutex. Using any of those two mutexes protects us from a 556862306a36Sopenharmony_ci * concurrent device replace. 
556962306a36Sopenharmony_ci */ 557062306a36Sopenharmony_ci lockdep_assert_held(&fs_info->chunk_mutex); 557162306a36Sopenharmony_ci 557262306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 557362306a36Sopenharmony_ci if (IS_ERR(em)) { 557462306a36Sopenharmony_ci ret = PTR_ERR(em); 557562306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 557662306a36Sopenharmony_ci return ret; 557762306a36Sopenharmony_ci } 557862306a36Sopenharmony_ci 557962306a36Sopenharmony_ci map = em->map_lookup; 558062306a36Sopenharmony_ci item_size = btrfs_chunk_item_size(map->num_stripes); 558162306a36Sopenharmony_ci 558262306a36Sopenharmony_ci chunk = kzalloc(item_size, GFP_NOFS); 558362306a36Sopenharmony_ci if (!chunk) { 558462306a36Sopenharmony_ci ret = -ENOMEM; 558562306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 558662306a36Sopenharmony_ci goto out; 558762306a36Sopenharmony_ci } 558862306a36Sopenharmony_ci 558962306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 559062306a36Sopenharmony_ci struct btrfs_device *device = map->stripes[i].dev; 559162306a36Sopenharmony_ci 559262306a36Sopenharmony_ci ret = btrfs_update_device(trans, device); 559362306a36Sopenharmony_ci if (ret) 559462306a36Sopenharmony_ci goto out; 559562306a36Sopenharmony_ci } 559662306a36Sopenharmony_ci 559762306a36Sopenharmony_ci stripe = &chunk->stripe; 559862306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 559962306a36Sopenharmony_ci struct btrfs_device *device = map->stripes[i].dev; 560062306a36Sopenharmony_ci const u64 dev_offset = map->stripes[i].physical; 560162306a36Sopenharmony_ci 560262306a36Sopenharmony_ci btrfs_set_stack_stripe_devid(stripe, device->devid); 560362306a36Sopenharmony_ci btrfs_set_stack_stripe_offset(stripe, dev_offset); 560462306a36Sopenharmony_ci memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 560562306a36Sopenharmony_ci stripe++; 560662306a36Sopenharmony_ci } 560762306a36Sopenharmony_ci 560862306a36Sopenharmony_ci 
btrfs_set_stack_chunk_length(chunk, bg->length); 560962306a36Sopenharmony_ci btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 561062306a36Sopenharmony_ci btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); 561162306a36Sopenharmony_ci btrfs_set_stack_chunk_type(chunk, map->type); 561262306a36Sopenharmony_ci btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 561362306a36Sopenharmony_ci btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); 561462306a36Sopenharmony_ci btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); 561562306a36Sopenharmony_ci btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 561662306a36Sopenharmony_ci btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 561762306a36Sopenharmony_ci 561862306a36Sopenharmony_ci key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 561962306a36Sopenharmony_ci key.type = BTRFS_CHUNK_ITEM_KEY; 562062306a36Sopenharmony_ci key.offset = bg->start; 562162306a36Sopenharmony_ci 562262306a36Sopenharmony_ci ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 562362306a36Sopenharmony_ci if (ret) 562462306a36Sopenharmony_ci goto out; 562562306a36Sopenharmony_ci 562662306a36Sopenharmony_ci set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); 562762306a36Sopenharmony_ci 562862306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 562962306a36Sopenharmony_ci ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 563062306a36Sopenharmony_ci if (ret) 563162306a36Sopenharmony_ci goto out; 563262306a36Sopenharmony_ci } 563362306a36Sopenharmony_ci 563462306a36Sopenharmony_ciout: 563562306a36Sopenharmony_ci kfree(chunk); 563662306a36Sopenharmony_ci free_extent_map(em); 563762306a36Sopenharmony_ci return ret; 563862306a36Sopenharmony_ci} 563962306a36Sopenharmony_ci 564062306a36Sopenharmony_cistatic noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 564162306a36Sopenharmony_ci{ 564262306a36Sopenharmony_ci struct 
btrfs_fs_info *fs_info = trans->fs_info; 564362306a36Sopenharmony_ci u64 alloc_profile; 564462306a36Sopenharmony_ci struct btrfs_block_group *meta_bg; 564562306a36Sopenharmony_ci struct btrfs_block_group *sys_bg; 564662306a36Sopenharmony_ci 564762306a36Sopenharmony_ci /* 564862306a36Sopenharmony_ci * When adding a new device for sprouting, the seed device is read-only 564962306a36Sopenharmony_ci * so we must first allocate a metadata and a system chunk. But before 565062306a36Sopenharmony_ci * adding the block group items to the extent, device and chunk btrees, 565162306a36Sopenharmony_ci * we must first: 565262306a36Sopenharmony_ci * 565362306a36Sopenharmony_ci * 1) Create both chunks without doing any changes to the btrees, as 565462306a36Sopenharmony_ci * otherwise we would get -ENOSPC since the block groups from the 565562306a36Sopenharmony_ci * seed device are read-only; 565662306a36Sopenharmony_ci * 565762306a36Sopenharmony_ci * 2) Add the device item for the new sprout device - finishing the setup 565862306a36Sopenharmony_ci * of a new block group requires updating the device item in the chunk 565962306a36Sopenharmony_ci * btree, so it must exist when we attempt to do it. The previous step 566062306a36Sopenharmony_ci * ensures this does not fail with -ENOSPC. 566162306a36Sopenharmony_ci * 566262306a36Sopenharmony_ci * After that we can add the block group items to their btrees: 566362306a36Sopenharmony_ci * update existing device item in the chunk btree, add a new block group 566462306a36Sopenharmony_ci * item to the extent btree, add a new chunk item to the chunk btree and 566562306a36Sopenharmony_ci * finally add the new device extent items to the devices btree. 
566662306a36Sopenharmony_ci */ 566762306a36Sopenharmony_ci 566862306a36Sopenharmony_ci alloc_profile = btrfs_metadata_alloc_profile(fs_info); 566962306a36Sopenharmony_ci meta_bg = btrfs_create_chunk(trans, alloc_profile); 567062306a36Sopenharmony_ci if (IS_ERR(meta_bg)) 567162306a36Sopenharmony_ci return PTR_ERR(meta_bg); 567262306a36Sopenharmony_ci 567362306a36Sopenharmony_ci alloc_profile = btrfs_system_alloc_profile(fs_info); 567462306a36Sopenharmony_ci sys_bg = btrfs_create_chunk(trans, alloc_profile); 567562306a36Sopenharmony_ci if (IS_ERR(sys_bg)) 567662306a36Sopenharmony_ci return PTR_ERR(sys_bg); 567762306a36Sopenharmony_ci 567862306a36Sopenharmony_ci return 0; 567962306a36Sopenharmony_ci} 568062306a36Sopenharmony_ci 568162306a36Sopenharmony_cistatic inline int btrfs_chunk_max_errors(struct map_lookup *map) 568262306a36Sopenharmony_ci{ 568362306a36Sopenharmony_ci const int index = btrfs_bg_flags_to_raid_index(map->type); 568462306a36Sopenharmony_ci 568562306a36Sopenharmony_ci return btrfs_raid_array[index].tolerated_failures; 568662306a36Sopenharmony_ci} 568762306a36Sopenharmony_ci 568862306a36Sopenharmony_cibool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 568962306a36Sopenharmony_ci{ 569062306a36Sopenharmony_ci struct extent_map *em; 569162306a36Sopenharmony_ci struct map_lookup *map; 569262306a36Sopenharmony_ci int miss_ndevs = 0; 569362306a36Sopenharmony_ci int i; 569462306a36Sopenharmony_ci bool ret = true; 569562306a36Sopenharmony_ci 569662306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 569762306a36Sopenharmony_ci if (IS_ERR(em)) 569862306a36Sopenharmony_ci return false; 569962306a36Sopenharmony_ci 570062306a36Sopenharmony_ci map = em->map_lookup; 570162306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 570262306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_MISSING, 570362306a36Sopenharmony_ci &map->stripes[i].dev->dev_state)) { 570462306a36Sopenharmony_ci miss_ndevs++; 
570562306a36Sopenharmony_ci continue; 570662306a36Sopenharmony_ci } 570762306a36Sopenharmony_ci if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 570862306a36Sopenharmony_ci &map->stripes[i].dev->dev_state)) { 570962306a36Sopenharmony_ci ret = false; 571062306a36Sopenharmony_ci goto end; 571162306a36Sopenharmony_ci } 571262306a36Sopenharmony_ci } 571362306a36Sopenharmony_ci 571462306a36Sopenharmony_ci /* 571562306a36Sopenharmony_ci * If the number of missing devices is larger than max errors, we can 571662306a36Sopenharmony_ci * not write the data into that chunk successfully. 571762306a36Sopenharmony_ci */ 571862306a36Sopenharmony_ci if (miss_ndevs > btrfs_chunk_max_errors(map)) 571962306a36Sopenharmony_ci ret = false; 572062306a36Sopenharmony_ciend: 572162306a36Sopenharmony_ci free_extent_map(em); 572262306a36Sopenharmony_ci return ret; 572362306a36Sopenharmony_ci} 572462306a36Sopenharmony_ci 572562306a36Sopenharmony_civoid btrfs_mapping_tree_free(struct extent_map_tree *tree) 572662306a36Sopenharmony_ci{ 572762306a36Sopenharmony_ci struct extent_map *em; 572862306a36Sopenharmony_ci 572962306a36Sopenharmony_ci while (1) { 573062306a36Sopenharmony_ci write_lock(&tree->lock); 573162306a36Sopenharmony_ci em = lookup_extent_mapping(tree, 0, (u64)-1); 573262306a36Sopenharmony_ci if (em) 573362306a36Sopenharmony_ci remove_extent_mapping(tree, em); 573462306a36Sopenharmony_ci write_unlock(&tree->lock); 573562306a36Sopenharmony_ci if (!em) 573662306a36Sopenharmony_ci break; 573762306a36Sopenharmony_ci /* once for us */ 573862306a36Sopenharmony_ci free_extent_map(em); 573962306a36Sopenharmony_ci /* once for the tree */ 574062306a36Sopenharmony_ci free_extent_map(em); 574162306a36Sopenharmony_ci } 574262306a36Sopenharmony_ci} 574362306a36Sopenharmony_ci 574462306a36Sopenharmony_ciint btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 574562306a36Sopenharmony_ci{ 574662306a36Sopenharmony_ci struct extent_map *em; 574762306a36Sopenharmony_ci struct map_lookup *map; 
574862306a36Sopenharmony_ci enum btrfs_raid_types index; 574962306a36Sopenharmony_ci int ret = 1; 575062306a36Sopenharmony_ci 575162306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, logical, len); 575262306a36Sopenharmony_ci if (IS_ERR(em)) 575362306a36Sopenharmony_ci /* 575462306a36Sopenharmony_ci * We could return errors for these cases, but that could get 575562306a36Sopenharmony_ci * ugly and we'd probably do the same thing which is just not do 575662306a36Sopenharmony_ci * anything else and exit, so return 1 so the callers don't try 575762306a36Sopenharmony_ci * to use other copies. 575862306a36Sopenharmony_ci */ 575962306a36Sopenharmony_ci return 1; 576062306a36Sopenharmony_ci 576162306a36Sopenharmony_ci map = em->map_lookup; 576262306a36Sopenharmony_ci index = btrfs_bg_flags_to_raid_index(map->type); 576362306a36Sopenharmony_ci 576462306a36Sopenharmony_ci /* Non-RAID56, use their ncopies from btrfs_raid_array. */ 576562306a36Sopenharmony_ci if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 576662306a36Sopenharmony_ci ret = btrfs_raid_array[index].ncopies; 576762306a36Sopenharmony_ci else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 576862306a36Sopenharmony_ci ret = 2; 576962306a36Sopenharmony_ci else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 577062306a36Sopenharmony_ci /* 577162306a36Sopenharmony_ci * There could be two corrupted data stripes, we need 577262306a36Sopenharmony_ci * to loop retry in order to rebuild the correct data. 577362306a36Sopenharmony_ci * 577462306a36Sopenharmony_ci * Fail a stripe at a time on every retry except the 577562306a36Sopenharmony_ci * stripe under reconstruction. 
577662306a36Sopenharmony_ci */ 577762306a36Sopenharmony_ci ret = map->num_stripes; 577862306a36Sopenharmony_ci free_extent_map(em); 577962306a36Sopenharmony_ci return ret; 578062306a36Sopenharmony_ci} 578162306a36Sopenharmony_ci 578262306a36Sopenharmony_ciunsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 578362306a36Sopenharmony_ci u64 logical) 578462306a36Sopenharmony_ci{ 578562306a36Sopenharmony_ci struct extent_map *em; 578662306a36Sopenharmony_ci struct map_lookup *map; 578762306a36Sopenharmony_ci unsigned long len = fs_info->sectorsize; 578862306a36Sopenharmony_ci 578962306a36Sopenharmony_ci if (!btrfs_fs_incompat(fs_info, RAID56)) 579062306a36Sopenharmony_ci return len; 579162306a36Sopenharmony_ci 579262306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, logical, len); 579362306a36Sopenharmony_ci 579462306a36Sopenharmony_ci if (!WARN_ON(IS_ERR(em))) { 579562306a36Sopenharmony_ci map = em->map_lookup; 579662306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 579762306a36Sopenharmony_ci len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 579862306a36Sopenharmony_ci free_extent_map(em); 579962306a36Sopenharmony_ci } 580062306a36Sopenharmony_ci return len; 580162306a36Sopenharmony_ci} 580262306a36Sopenharmony_ci 580362306a36Sopenharmony_ciint btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 580462306a36Sopenharmony_ci{ 580562306a36Sopenharmony_ci struct extent_map *em; 580662306a36Sopenharmony_ci struct map_lookup *map; 580762306a36Sopenharmony_ci int ret = 0; 580862306a36Sopenharmony_ci 580962306a36Sopenharmony_ci if (!btrfs_fs_incompat(fs_info, RAID56)) 581062306a36Sopenharmony_ci return 0; 581162306a36Sopenharmony_ci 581262306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, logical, len); 581362306a36Sopenharmony_ci 581462306a36Sopenharmony_ci if(!WARN_ON(IS_ERR(em))) { 581562306a36Sopenharmony_ci map = em->map_lookup; 581662306a36Sopenharmony_ci if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) 581762306a36Sopenharmony_ci ret = 1; 581862306a36Sopenharmony_ci free_extent_map(em); 581962306a36Sopenharmony_ci } 582062306a36Sopenharmony_ci return ret; 582162306a36Sopenharmony_ci} 582262306a36Sopenharmony_ci 582362306a36Sopenharmony_cistatic int find_live_mirror(struct btrfs_fs_info *fs_info, 582462306a36Sopenharmony_ci struct map_lookup *map, int first, 582562306a36Sopenharmony_ci int dev_replace_is_ongoing) 582662306a36Sopenharmony_ci{ 582762306a36Sopenharmony_ci int i; 582862306a36Sopenharmony_ci int num_stripes; 582962306a36Sopenharmony_ci int preferred_mirror; 583062306a36Sopenharmony_ci int tolerance; 583162306a36Sopenharmony_ci struct btrfs_device *srcdev; 583262306a36Sopenharmony_ci 583362306a36Sopenharmony_ci ASSERT((map->type & 583462306a36Sopenharmony_ci (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 583562306a36Sopenharmony_ci 583662306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID10) 583762306a36Sopenharmony_ci num_stripes = map->sub_stripes; 583862306a36Sopenharmony_ci else 583962306a36Sopenharmony_ci num_stripes = map->num_stripes; 584062306a36Sopenharmony_ci 584162306a36Sopenharmony_ci switch (fs_info->fs_devices->read_policy) { 584262306a36Sopenharmony_ci default: 584362306a36Sopenharmony_ci /* Shouldn't happen, just warn and use pid instead of failing */ 584462306a36Sopenharmony_ci btrfs_warn_rl(fs_info, 584562306a36Sopenharmony_ci "unknown read_policy type %u, reset to pid", 584662306a36Sopenharmony_ci fs_info->fs_devices->read_policy); 584762306a36Sopenharmony_ci fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 584862306a36Sopenharmony_ci fallthrough; 584962306a36Sopenharmony_ci case BTRFS_READ_POLICY_PID: 585062306a36Sopenharmony_ci preferred_mirror = first + (current->pid % num_stripes); 585162306a36Sopenharmony_ci break; 585262306a36Sopenharmony_ci } 585362306a36Sopenharmony_ci 585462306a36Sopenharmony_ci if (dev_replace_is_ongoing && 585562306a36Sopenharmony_ci 
fs_info->dev_replace.cont_reading_from_srcdev_mode == 585662306a36Sopenharmony_ci BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 585762306a36Sopenharmony_ci srcdev = fs_info->dev_replace.srcdev; 585862306a36Sopenharmony_ci else 585962306a36Sopenharmony_ci srcdev = NULL; 586062306a36Sopenharmony_ci 586162306a36Sopenharmony_ci /* 586262306a36Sopenharmony_ci * try to avoid the drive that is the source drive for a 586362306a36Sopenharmony_ci * dev-replace procedure, only choose it if no other non-missing 586462306a36Sopenharmony_ci * mirror is available 586562306a36Sopenharmony_ci */ 586662306a36Sopenharmony_ci for (tolerance = 0; tolerance < 2; tolerance++) { 586762306a36Sopenharmony_ci if (map->stripes[preferred_mirror].dev->bdev && 586862306a36Sopenharmony_ci (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 586962306a36Sopenharmony_ci return preferred_mirror; 587062306a36Sopenharmony_ci for (i = first; i < first + num_stripes; i++) { 587162306a36Sopenharmony_ci if (map->stripes[i].dev->bdev && 587262306a36Sopenharmony_ci (tolerance || map->stripes[i].dev != srcdev)) 587362306a36Sopenharmony_ci return i; 587462306a36Sopenharmony_ci } 587562306a36Sopenharmony_ci } 587662306a36Sopenharmony_ci 587762306a36Sopenharmony_ci /* we couldn't find one that doesn't fail. 
Just return something 587862306a36Sopenharmony_ci * and the io error handling code will clean up eventually 587962306a36Sopenharmony_ci */ 588062306a36Sopenharmony_ci return preferred_mirror; 588162306a36Sopenharmony_ci} 588262306a36Sopenharmony_ci 588362306a36Sopenharmony_cistatic struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 588462306a36Sopenharmony_ci u16 total_stripes) 588562306a36Sopenharmony_ci{ 588662306a36Sopenharmony_ci struct btrfs_io_context *bioc; 588762306a36Sopenharmony_ci 588862306a36Sopenharmony_ci bioc = kzalloc( 588962306a36Sopenharmony_ci /* The size of btrfs_io_context */ 589062306a36Sopenharmony_ci sizeof(struct btrfs_io_context) + 589162306a36Sopenharmony_ci /* Plus the variable array for the stripes */ 589262306a36Sopenharmony_ci sizeof(struct btrfs_io_stripe) * (total_stripes), 589362306a36Sopenharmony_ci GFP_NOFS); 589462306a36Sopenharmony_ci 589562306a36Sopenharmony_ci if (!bioc) 589662306a36Sopenharmony_ci return NULL; 589762306a36Sopenharmony_ci 589862306a36Sopenharmony_ci refcount_set(&bioc->refs, 1); 589962306a36Sopenharmony_ci 590062306a36Sopenharmony_ci bioc->fs_info = fs_info; 590162306a36Sopenharmony_ci bioc->replace_stripe_src = -1; 590262306a36Sopenharmony_ci bioc->full_stripe_logical = (u64)-1; 590362306a36Sopenharmony_ci 590462306a36Sopenharmony_ci return bioc; 590562306a36Sopenharmony_ci} 590662306a36Sopenharmony_ci 590762306a36Sopenharmony_civoid btrfs_get_bioc(struct btrfs_io_context *bioc) 590862306a36Sopenharmony_ci{ 590962306a36Sopenharmony_ci WARN_ON(!refcount_read(&bioc->refs)); 591062306a36Sopenharmony_ci refcount_inc(&bioc->refs); 591162306a36Sopenharmony_ci} 591262306a36Sopenharmony_ci 591362306a36Sopenharmony_civoid btrfs_put_bioc(struct btrfs_io_context *bioc) 591462306a36Sopenharmony_ci{ 591562306a36Sopenharmony_ci if (!bioc) 591662306a36Sopenharmony_ci return; 591762306a36Sopenharmony_ci if (refcount_dec_and_test(&bioc->refs)) 591862306a36Sopenharmony_ci kfree(bioc); 
591962306a36Sopenharmony_ci} 592062306a36Sopenharmony_ci 592162306a36Sopenharmony_ci/* 592262306a36Sopenharmony_ci * Please note that, discard won't be sent to target device of device 592362306a36Sopenharmony_ci * replace. 592462306a36Sopenharmony_ci */ 592562306a36Sopenharmony_cistruct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 592662306a36Sopenharmony_ci u64 logical, u64 *length_ret, 592762306a36Sopenharmony_ci u32 *num_stripes) 592862306a36Sopenharmony_ci{ 592962306a36Sopenharmony_ci struct extent_map *em; 593062306a36Sopenharmony_ci struct map_lookup *map; 593162306a36Sopenharmony_ci struct btrfs_discard_stripe *stripes; 593262306a36Sopenharmony_ci u64 length = *length_ret; 593362306a36Sopenharmony_ci u64 offset; 593462306a36Sopenharmony_ci u32 stripe_nr; 593562306a36Sopenharmony_ci u32 stripe_nr_end; 593662306a36Sopenharmony_ci u32 stripe_cnt; 593762306a36Sopenharmony_ci u64 stripe_end_offset; 593862306a36Sopenharmony_ci u64 stripe_offset; 593962306a36Sopenharmony_ci u32 stripe_index; 594062306a36Sopenharmony_ci u32 factor = 0; 594162306a36Sopenharmony_ci u32 sub_stripes = 0; 594262306a36Sopenharmony_ci u32 stripes_per_dev = 0; 594362306a36Sopenharmony_ci u32 remaining_stripes = 0; 594462306a36Sopenharmony_ci u32 last_stripe = 0; 594562306a36Sopenharmony_ci int ret; 594662306a36Sopenharmony_ci int i; 594762306a36Sopenharmony_ci 594862306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, logical, length); 594962306a36Sopenharmony_ci if (IS_ERR(em)) 595062306a36Sopenharmony_ci return ERR_CAST(em); 595162306a36Sopenharmony_ci 595262306a36Sopenharmony_ci map = em->map_lookup; 595362306a36Sopenharmony_ci 595462306a36Sopenharmony_ci /* we don't discard raid56 yet */ 595562306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 595662306a36Sopenharmony_ci ret = -EOPNOTSUPP; 595762306a36Sopenharmony_ci goto out_free_map; 595862306a36Sopenharmony_ci } 595962306a36Sopenharmony_ci 596062306a36Sopenharmony_ci offset = logical - 
em->start; 596162306a36Sopenharmony_ci length = min_t(u64, em->start + em->len - logical, length); 596262306a36Sopenharmony_ci *length_ret = length; 596362306a36Sopenharmony_ci 596462306a36Sopenharmony_ci /* 596562306a36Sopenharmony_ci * stripe_nr counts the total number of stripes we have to stride 596662306a36Sopenharmony_ci * to get to this block 596762306a36Sopenharmony_ci */ 596862306a36Sopenharmony_ci stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; 596962306a36Sopenharmony_ci 597062306a36Sopenharmony_ci /* stripe_offset is the offset of this block in its stripe */ 597162306a36Sopenharmony_ci stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); 597262306a36Sopenharmony_ci 597362306a36Sopenharmony_ci stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> 597462306a36Sopenharmony_ci BTRFS_STRIPE_LEN_SHIFT; 597562306a36Sopenharmony_ci stripe_cnt = stripe_nr_end - stripe_nr; 597662306a36Sopenharmony_ci stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - 597762306a36Sopenharmony_ci (offset + length); 597862306a36Sopenharmony_ci /* 597962306a36Sopenharmony_ci * after this, stripe_nr is the number of stripes on this 598062306a36Sopenharmony_ci * device we have to walk to find the data, and stripe_index is 598162306a36Sopenharmony_ci * the number of our device in the stripe array 598262306a36Sopenharmony_ci */ 598362306a36Sopenharmony_ci *num_stripes = 1; 598462306a36Sopenharmony_ci stripe_index = 0; 598562306a36Sopenharmony_ci if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 598662306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID10)) { 598762306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID0) 598862306a36Sopenharmony_ci sub_stripes = 1; 598962306a36Sopenharmony_ci else 599062306a36Sopenharmony_ci sub_stripes = map->sub_stripes; 599162306a36Sopenharmony_ci 599262306a36Sopenharmony_ci factor = map->num_stripes / sub_stripes; 599362306a36Sopenharmony_ci *num_stripes = min_t(u64, map->num_stripes, 599462306a36Sopenharmony_ci sub_stripes * 
stripe_cnt); 599562306a36Sopenharmony_ci stripe_index = stripe_nr % factor; 599662306a36Sopenharmony_ci stripe_nr /= factor; 599762306a36Sopenharmony_ci stripe_index *= sub_stripes; 599862306a36Sopenharmony_ci 599962306a36Sopenharmony_ci remaining_stripes = stripe_cnt % factor; 600062306a36Sopenharmony_ci stripes_per_dev = stripe_cnt / factor; 600162306a36Sopenharmony_ci last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; 600262306a36Sopenharmony_ci } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 600362306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_DUP)) { 600462306a36Sopenharmony_ci *num_stripes = map->num_stripes; 600562306a36Sopenharmony_ci } else { 600662306a36Sopenharmony_ci stripe_index = stripe_nr % map->num_stripes; 600762306a36Sopenharmony_ci stripe_nr /= map->num_stripes; 600862306a36Sopenharmony_ci } 600962306a36Sopenharmony_ci 601062306a36Sopenharmony_ci stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); 601162306a36Sopenharmony_ci if (!stripes) { 601262306a36Sopenharmony_ci ret = -ENOMEM; 601362306a36Sopenharmony_ci goto out_free_map; 601462306a36Sopenharmony_ci } 601562306a36Sopenharmony_ci 601662306a36Sopenharmony_ci for (i = 0; i < *num_stripes; i++) { 601762306a36Sopenharmony_ci stripes[i].physical = 601862306a36Sopenharmony_ci map->stripes[stripe_index].physical + 601962306a36Sopenharmony_ci stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); 602062306a36Sopenharmony_ci stripes[i].dev = map->stripes[stripe_index].dev; 602162306a36Sopenharmony_ci 602262306a36Sopenharmony_ci if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 602362306a36Sopenharmony_ci BTRFS_BLOCK_GROUP_RAID10)) { 602462306a36Sopenharmony_ci stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); 602562306a36Sopenharmony_ci 602662306a36Sopenharmony_ci if (i / sub_stripes < remaining_stripes) 602762306a36Sopenharmony_ci stripes[i].length += BTRFS_STRIPE_LEN; 602862306a36Sopenharmony_ci 602962306a36Sopenharmony_ci /* 603062306a36Sopenharmony_ci * Special for the 
first stripe and 603162306a36Sopenharmony_ci * the last stripe: 603262306a36Sopenharmony_ci * 603362306a36Sopenharmony_ci * |-------|...|-------| 603462306a36Sopenharmony_ci * |----------| 603562306a36Sopenharmony_ci * off end_off 603662306a36Sopenharmony_ci */ 603762306a36Sopenharmony_ci if (i < sub_stripes) 603862306a36Sopenharmony_ci stripes[i].length -= stripe_offset; 603962306a36Sopenharmony_ci 604062306a36Sopenharmony_ci if (stripe_index >= last_stripe && 604162306a36Sopenharmony_ci stripe_index <= (last_stripe + 604262306a36Sopenharmony_ci sub_stripes - 1)) 604362306a36Sopenharmony_ci stripes[i].length -= stripe_end_offset; 604462306a36Sopenharmony_ci 604562306a36Sopenharmony_ci if (i == sub_stripes - 1) 604662306a36Sopenharmony_ci stripe_offset = 0; 604762306a36Sopenharmony_ci } else { 604862306a36Sopenharmony_ci stripes[i].length = length; 604962306a36Sopenharmony_ci } 605062306a36Sopenharmony_ci 605162306a36Sopenharmony_ci stripe_index++; 605262306a36Sopenharmony_ci if (stripe_index == map->num_stripes) { 605362306a36Sopenharmony_ci stripe_index = 0; 605462306a36Sopenharmony_ci stripe_nr++; 605562306a36Sopenharmony_ci } 605662306a36Sopenharmony_ci } 605762306a36Sopenharmony_ci 605862306a36Sopenharmony_ci free_extent_map(em); 605962306a36Sopenharmony_ci return stripes; 606062306a36Sopenharmony_ciout_free_map: 606162306a36Sopenharmony_ci free_extent_map(em); 606262306a36Sopenharmony_ci return ERR_PTR(ret); 606362306a36Sopenharmony_ci} 606462306a36Sopenharmony_ci 606562306a36Sopenharmony_cistatic bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 606662306a36Sopenharmony_ci{ 606762306a36Sopenharmony_ci struct btrfs_block_group *cache; 606862306a36Sopenharmony_ci bool ret; 606962306a36Sopenharmony_ci 607062306a36Sopenharmony_ci /* Non zoned filesystem does not use "to_copy" flag */ 607162306a36Sopenharmony_ci if (!btrfs_is_zoned(fs_info)) 607262306a36Sopenharmony_ci return false; 607362306a36Sopenharmony_ci 607462306a36Sopenharmony_ci 
cache = btrfs_lookup_block_group(fs_info, logical); 607562306a36Sopenharmony_ci 607662306a36Sopenharmony_ci ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 607762306a36Sopenharmony_ci 607862306a36Sopenharmony_ci btrfs_put_block_group(cache); 607962306a36Sopenharmony_ci return ret; 608062306a36Sopenharmony_ci} 608162306a36Sopenharmony_ci 608262306a36Sopenharmony_cistatic void handle_ops_on_dev_replace(enum btrfs_map_op op, 608362306a36Sopenharmony_ci struct btrfs_io_context *bioc, 608462306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace, 608562306a36Sopenharmony_ci u64 logical, 608662306a36Sopenharmony_ci int *num_stripes_ret, int *max_errors_ret) 608762306a36Sopenharmony_ci{ 608862306a36Sopenharmony_ci u64 srcdev_devid = dev_replace->srcdev->devid; 608962306a36Sopenharmony_ci /* 609062306a36Sopenharmony_ci * At this stage, num_stripes is still the real number of stripes, 609162306a36Sopenharmony_ci * excluding the duplicated stripes. 609262306a36Sopenharmony_ci */ 609362306a36Sopenharmony_ci int num_stripes = *num_stripes_ret; 609462306a36Sopenharmony_ci int nr_extra_stripes = 0; 609562306a36Sopenharmony_ci int max_errors = *max_errors_ret; 609662306a36Sopenharmony_ci int i; 609762306a36Sopenharmony_ci 609862306a36Sopenharmony_ci /* 609962306a36Sopenharmony_ci * A block group which has "to_copy" set will eventually be copied by 610062306a36Sopenharmony_ci * the dev-replace process. We can avoid cloning IO here. 610162306a36Sopenharmony_ci */ 610262306a36Sopenharmony_ci if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 610362306a36Sopenharmony_ci return; 610462306a36Sopenharmony_ci 610562306a36Sopenharmony_ci /* 610662306a36Sopenharmony_ci * Duplicate the write operations while the dev-replace procedure is 610762306a36Sopenharmony_ci * running. 
Since the copying of the old disk to the new disk takes 610862306a36Sopenharmony_ci * place at run time while the filesystem is mounted writable, the 610962306a36Sopenharmony_ci * regular write operations to the old disk have to be duplicated to go 611062306a36Sopenharmony_ci * to the new disk as well. 611162306a36Sopenharmony_ci * 611262306a36Sopenharmony_ci * Note that device->missing is handled by the caller, and that the 611362306a36Sopenharmony_ci * write to the old disk is already set up in the stripes array. 611462306a36Sopenharmony_ci */ 611562306a36Sopenharmony_ci for (i = 0; i < num_stripes; i++) { 611662306a36Sopenharmony_ci struct btrfs_io_stripe *old = &bioc->stripes[i]; 611762306a36Sopenharmony_ci struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; 611862306a36Sopenharmony_ci 611962306a36Sopenharmony_ci if (old->dev->devid != srcdev_devid) 612062306a36Sopenharmony_ci continue; 612162306a36Sopenharmony_ci 612262306a36Sopenharmony_ci new->physical = old->physical; 612362306a36Sopenharmony_ci new->dev = dev_replace->tgtdev; 612462306a36Sopenharmony_ci if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) 612562306a36Sopenharmony_ci bioc->replace_stripe_src = i; 612662306a36Sopenharmony_ci nr_extra_stripes++; 612762306a36Sopenharmony_ci } 612862306a36Sopenharmony_ci 612962306a36Sopenharmony_ci /* We can only have at most 2 extra nr_stripes (for DUP). */ 613062306a36Sopenharmony_ci ASSERT(nr_extra_stripes <= 2); 613162306a36Sopenharmony_ci /* 613262306a36Sopenharmony_ci * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for 613362306a36Sopenharmony_ci * replace. 613462306a36Sopenharmony_ci * If we have 2 extra stripes, only choose the one with smaller physical. 
613562306a36Sopenharmony_ci */ 613662306a36Sopenharmony_ci if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { 613762306a36Sopenharmony_ci struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; 613862306a36Sopenharmony_ci struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; 613962306a36Sopenharmony_ci 614062306a36Sopenharmony_ci /* Only DUP can have two extra stripes. */ 614162306a36Sopenharmony_ci ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); 614262306a36Sopenharmony_ci 614362306a36Sopenharmony_ci /* 614462306a36Sopenharmony_ci * Swap the last stripe stripes and reduce @nr_extra_stripes. 614562306a36Sopenharmony_ci * The extra stripe would still be there, but won't be accessed. 614662306a36Sopenharmony_ci */ 614762306a36Sopenharmony_ci if (first->physical > second->physical) { 614862306a36Sopenharmony_ci swap(second->physical, first->physical); 614962306a36Sopenharmony_ci swap(second->dev, first->dev); 615062306a36Sopenharmony_ci nr_extra_stripes--; 615162306a36Sopenharmony_ci } 615262306a36Sopenharmony_ci } 615362306a36Sopenharmony_ci 615462306a36Sopenharmony_ci *num_stripes_ret = num_stripes + nr_extra_stripes; 615562306a36Sopenharmony_ci *max_errors_ret = max_errors + nr_extra_stripes; 615662306a36Sopenharmony_ci bioc->replace_nr_stripes = nr_extra_stripes; 615762306a36Sopenharmony_ci} 615862306a36Sopenharmony_ci 615962306a36Sopenharmony_cistatic u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, 616062306a36Sopenharmony_ci u64 offset, u32 *stripe_nr, u64 *stripe_offset, 616162306a36Sopenharmony_ci u64 *full_stripe_start) 616262306a36Sopenharmony_ci{ 616362306a36Sopenharmony_ci /* 616462306a36Sopenharmony_ci * Stripe_nr is the stripe where this block falls. stripe_offset is 616562306a36Sopenharmony_ci * the offset of this block in its stripe. 
616662306a36Sopenharmony_ci */ 616762306a36Sopenharmony_ci *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; 616862306a36Sopenharmony_ci *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; 616962306a36Sopenharmony_ci ASSERT(*stripe_offset < U32_MAX); 617062306a36Sopenharmony_ci 617162306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 617262306a36Sopenharmony_ci unsigned long full_stripe_len = 617362306a36Sopenharmony_ci btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 617462306a36Sopenharmony_ci 617562306a36Sopenharmony_ci /* 617662306a36Sopenharmony_ci * For full stripe start, we use previously calculated 617762306a36Sopenharmony_ci * @stripe_nr. Align it to nr_data_stripes, then multiply with 617862306a36Sopenharmony_ci * STRIPE_LEN. 617962306a36Sopenharmony_ci * 618062306a36Sopenharmony_ci * By this we can avoid u64 division completely. And we have 618162306a36Sopenharmony_ci * to go rounddown(), not round_down(), as nr_data_stripes is 618262306a36Sopenharmony_ci * not ensured to be power of 2. 618362306a36Sopenharmony_ci */ 618462306a36Sopenharmony_ci *full_stripe_start = 618562306a36Sopenharmony_ci btrfs_stripe_nr_to_offset( 618662306a36Sopenharmony_ci rounddown(*stripe_nr, nr_data_stripes(map))); 618762306a36Sopenharmony_ci 618862306a36Sopenharmony_ci ASSERT(*full_stripe_start + full_stripe_len > offset); 618962306a36Sopenharmony_ci ASSERT(*full_stripe_start <= offset); 619062306a36Sopenharmony_ci /* 619162306a36Sopenharmony_ci * For writes to RAID56, allow to write a full stripe set, but 619262306a36Sopenharmony_ci * no straddling of stripe sets. 619362306a36Sopenharmony_ci */ 619462306a36Sopenharmony_ci if (op == BTRFS_MAP_WRITE) 619562306a36Sopenharmony_ci return full_stripe_len - (offset - *full_stripe_start); 619662306a36Sopenharmony_ci } 619762306a36Sopenharmony_ci 619862306a36Sopenharmony_ci /* 619962306a36Sopenharmony_ci * For other RAID types and for RAID56 reads, allow a single stripe (on 620062306a36Sopenharmony_ci * a single disk). 
620162306a36Sopenharmony_ci */ 620262306a36Sopenharmony_ci if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) 620362306a36Sopenharmony_ci return BTRFS_STRIPE_LEN - *stripe_offset; 620462306a36Sopenharmony_ci return U64_MAX; 620562306a36Sopenharmony_ci} 620662306a36Sopenharmony_ci 620762306a36Sopenharmony_cistatic void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, 620862306a36Sopenharmony_ci u32 stripe_index, u64 stripe_offset, u32 stripe_nr) 620962306a36Sopenharmony_ci{ 621062306a36Sopenharmony_ci dst->dev = map->stripes[stripe_index].dev; 621162306a36Sopenharmony_ci dst->physical = map->stripes[stripe_index].physical + 621262306a36Sopenharmony_ci stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); 621362306a36Sopenharmony_ci} 621462306a36Sopenharmony_ci 621562306a36Sopenharmony_ci/* 621662306a36Sopenharmony_ci * Map one logical range to one or more physical ranges. 621762306a36Sopenharmony_ci * 621862306a36Sopenharmony_ci * @length: (Mandatory) mapped length of this run. 621962306a36Sopenharmony_ci * One logical range can be split into different segments 622062306a36Sopenharmony_ci * due to factors like zones and RAID0/5/6/10 stripe 622162306a36Sopenharmony_ci * boundaries. 622262306a36Sopenharmony_ci * 622362306a36Sopenharmony_ci * @bioc_ret: (Mandatory) returned btrfs_io_context structure. 622462306a36Sopenharmony_ci * which has one or more physical ranges (btrfs_io_stripe) 622562306a36Sopenharmony_ci * recorded inside. 622662306a36Sopenharmony_ci * Caller should call btrfs_put_bioc() to free it after use. 622762306a36Sopenharmony_ci * 622862306a36Sopenharmony_ci * @smap: (Optional) single physical range optimization. 622962306a36Sopenharmony_ci * If the map request can be fulfilled by one single 623062306a36Sopenharmony_ci * physical range, and this is parameter is not NULL, 623162306a36Sopenharmony_ci * then @bioc_ret would be NULL, and @smap would be 623262306a36Sopenharmony_ci * updated. 
 *
 * @mirror_num_ret:	(Mandatory) returned mirror number if the original
 *			value is 0.
 *
 *			Mirror number 0 means to choose any live mirrors.
 *
 *			For non-RAID56 profiles, non-zero mirror_num means
 *			the Nth mirror. (e.g. mirror_num 1 means the first
 *			copy).
 *
 *			For RAID56 profile, mirror 1 means rebuild from P and
 *			the remaining data stripes.
 *
 *			For RAID6 profile, mirror > 2 means mark another
 *			data/P stripe error and rebuild from the remaining
 *			stripes.
 *
 * @need_raid_map:	(Used only for integrity checker) whether the map wants
 *			a full stripe map (including all data and P/Q stripes)
 *			for RAID56. Should always be 1 except integrity checker.
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		    u64 logical, u64 *length,
		    struct btrfs_io_context **bioc_ret,
		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
		    int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 map_offset;
	u64 stripe_offset;
	u32 stripe_nr;
	u32 stripe_index;
	int data_stripes;
	int i;
	int ret = 0;
	/* A NULL @mirror_num_ret is treated like a requested mirror of 0. */
	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
	int num_stripes;
	int num_copies;
	int max_errors = 0;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	u16 num_alloc_stripes;
	u64 raid56_full_stripe_start = (u64)-1;
	u64 max_len;

	ASSERT(bioc_ret);

	/* Reject a mirror number larger than the number of copies. */
	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
	if (mirror_num > num_copies)
		return -EINVAL;

	/* The reference taken here is dropped at the out label. */
	em = btrfs_get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	data_stripes = nr_data_stripes(map);

	/*
	 * Work in chunk-relative offsets from here on, and clamp the returned
	 * length to both the end of the chunk and the per-profile limit.
	 */
	map_offset = logical - em->start;
	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
				   &stripe_offset, &raid56_full_stripe_start);
	*length = min_t(u64, em->len - map_offset, max_len);

	down_read(&dev_replace->rwsem);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 */
	if (!dev_replace_is_ongoing)
		up_read(&dev_replace->rwsem);

	/*
	 * Per-profile selection of the first stripe (@stripe_index), the number
	 * of stripes to return (@num_stripes) and the effective @mirror_num.
	 * @stripe_nr is converted in place to a per-device stripe number.
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		if (op == BTRFS_MAP_READ)
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
		if (op != BTRFS_MAP_READ) {
			/* Non-read operations get every copy. */
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (op != BTRFS_MAP_READ) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_index = (stripe_nr % factor) * map->sub_stripes;
		stripe_nr /= factor;

		if (op != BTRFS_MAP_READ)
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			/* Mirror number is relative to the sub-stripe group. */
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
			/*
			 * Push stripe_nr back to the start of the full stripe
			 * For those cases needing a full stripe, @stripe_nr
			 * is the full stripe number.
			 *
			 * Originally we go raid56_full_stripe_start / full_stripe_len,
			 * but that can be expensive.  Here we just divide
			 * @stripe_nr with @data_stripes.
			 */
			stripe_nr /= data_stripes;

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = btrfs_chunk_max_errors(map);

			/* Return the length to the full stripe end */
			*length = min(logical + *length,
				      raid56_full_stripe_start + em->start +
				      btrfs_stripe_nr_to_offset(data_stripes)) -
				  logical;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_index = stripe_nr % data_stripes;
			stripe_nr /= data_stripes;
			if (mirror_num > 1)
				stripe_index = data_stripes + mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
			if (op == BTRFS_MAP_READ && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * After this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		mirror_num = stripe_index + 1;
	}
	/* Sanity check before @stripe_index is used to index map->stripes[]. */
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ)
		/*
		 * For replace case, we need to add extra stripes for extra
		 * duplicated stripes.
		 *
		 * For both WRITE and GET_READ_MIRRORS, we may have at most
		 * 2 more stripes (DUP types, otherwise 1).
		 */
		num_alloc_stripes += 2;

	/*
	 * If this I/O maps to a single device, try to return the device and
	 * physical block information on the stack instead of allocating an
	 * I/O context structure.
	 */
	if (smap && num_alloc_stripes == 1 &&
	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
		if (mirror_num_ret)
			*mirror_num_ret = mirror_num;
		/* Tell the caller that @smap holds the result, not a bioc. */
		*bioc_ret = NULL;
		ret = 0;
		goto out;
	}

	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
	if (!bioc) {
		ret = -ENOMEM;
		goto out;
	}
	bioc->map_type = map->type;

	/*
	 * For RAID56 full map, we need to make sure the stripes[] follows the
	 * rule that data stripes are all ordered, then followed with P and Q
	 * (if we have).
	 *
	 * It's still mostly the same as other profiles, just with extra rotation.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
		/*
		 * For RAID56 @stripe_nr is already the number of full stripes
		 * before us, which is also the rotation value (needs to modulo
		 * with num_stripes).
		 *
		 * In this case, we just add @stripe_nr with @i, then do the
		 * modulo, to reduce one modulo call.
		 */
		bioc->full_stripe_logical = em->start +
			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
		for (i = 0; i < num_stripes; i++)
			set_io_stripe(&bioc->stripes[i], map,
				      (i + stripe_nr) % num_stripes,
				      stripe_offset, stripe_nr);
	} else {
		/*
		 * For all other non-RAID56 profiles, just copy the target
		 * stripe into the bioc.
		 */
		for (i = 0; i < num_stripes; i++) {
			set_io_stripe(&bioc->stripes[i], map, stripe_index,
				      stripe_offset, stripe_nr);
			stripe_index++;
		}
	}

	if (op != BTRFS_MAP_READ)
		max_errors = btrfs_chunk_max_errors(map);

	/* May raise @num_stripes and @max_errors through the pointers passed. */
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ) {
		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
					  &num_stripes, &max_errors);
	}

	*bioc_ret = bioc;
	bioc->num_stripes = num_stripes;
	bioc->max_errors = max_errors;
	bioc->mirror_num = mirror_num;

out:
	/*
	 * Common exit for success and failure: release the rwsem if it was
	 * kept because a replace is running, then drop the chunk map reference.
	 */
	if (dev_replace_is_ongoing) {
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
		up_read(&dev_replace->rwsem);
	}
	free_extent_map(em);
	return ret;
}

/*
 * Check whether @fs_devices is a candidate for the lookup described by @args.
 *
 * Return true when @args carries no fsid, or when its fsid equals the
 * metadata_uuid of @fs_devices; false otherwise.
 */
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
				      const struct btrfs_fs_devices *fs_devices)
{
	if (args->fsid == NULL)
		return true;
	if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
		return true;
	return false;
}

/*
 * Check whether @device satisfies the lookup described by @args.
 *
 * With args->missing set, a device matches when it has
 * BTRFS_DEV_STATE_IN_FS_METADATA set and no bdev; devid and uuid are not
 * consulted.  Otherwise the devid must be equal and, if args->uuid is given,
 * the uuid must be equal too.
 */
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
				  const struct btrfs_device *device)
{
	if (args->missing) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
		    !device->bdev)
			return true;
		return false;
	}

	if (device->devid != args->devid)
		return false;
	if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
		return false;
	return true;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
653662306a36Sopenharmony_ci */ 653762306a36Sopenharmony_cistruct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 653862306a36Sopenharmony_ci const struct btrfs_dev_lookup_args *args) 653962306a36Sopenharmony_ci{ 654062306a36Sopenharmony_ci struct btrfs_device *device; 654162306a36Sopenharmony_ci struct btrfs_fs_devices *seed_devs; 654262306a36Sopenharmony_ci 654362306a36Sopenharmony_ci if (dev_args_match_fs_devices(args, fs_devices)) { 654462306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->devices, dev_list) { 654562306a36Sopenharmony_ci if (dev_args_match_device(args, device)) 654662306a36Sopenharmony_ci return device; 654762306a36Sopenharmony_ci } 654862306a36Sopenharmony_ci } 654962306a36Sopenharmony_ci 655062306a36Sopenharmony_ci list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 655162306a36Sopenharmony_ci if (!dev_args_match_fs_devices(args, seed_devs)) 655262306a36Sopenharmony_ci continue; 655362306a36Sopenharmony_ci list_for_each_entry(device, &seed_devs->devices, dev_list) { 655462306a36Sopenharmony_ci if (dev_args_match_device(args, device)) 655562306a36Sopenharmony_ci return device; 655662306a36Sopenharmony_ci } 655762306a36Sopenharmony_ci } 655862306a36Sopenharmony_ci 655962306a36Sopenharmony_ci return NULL; 656062306a36Sopenharmony_ci} 656162306a36Sopenharmony_ci 656262306a36Sopenharmony_cistatic struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 656362306a36Sopenharmony_ci u64 devid, u8 *dev_uuid) 656462306a36Sopenharmony_ci{ 656562306a36Sopenharmony_ci struct btrfs_device *device; 656662306a36Sopenharmony_ci unsigned int nofs_flag; 656762306a36Sopenharmony_ci 656862306a36Sopenharmony_ci /* 656962306a36Sopenharmony_ci * We call this under the chunk_mutex, so we want to use NOFS for this 657062306a36Sopenharmony_ci * allocation, however we don't want to change btrfs_alloc_device() to 657162306a36Sopenharmony_ci * always do NOFS because we use it in a lot of other 
GFP_KERNEL safe 657262306a36Sopenharmony_ci * places. 657362306a36Sopenharmony_ci */ 657462306a36Sopenharmony_ci 657562306a36Sopenharmony_ci nofs_flag = memalloc_nofs_save(); 657662306a36Sopenharmony_ci device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); 657762306a36Sopenharmony_ci memalloc_nofs_restore(nofs_flag); 657862306a36Sopenharmony_ci if (IS_ERR(device)) 657962306a36Sopenharmony_ci return device; 658062306a36Sopenharmony_ci 658162306a36Sopenharmony_ci list_add(&device->dev_list, &fs_devices->devices); 658262306a36Sopenharmony_ci device->fs_devices = fs_devices; 658362306a36Sopenharmony_ci fs_devices->num_devices++; 658462306a36Sopenharmony_ci 658562306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 658662306a36Sopenharmony_ci fs_devices->missing_devices++; 658762306a36Sopenharmony_ci 658862306a36Sopenharmony_ci return device; 658962306a36Sopenharmony_ci} 659062306a36Sopenharmony_ci 659162306a36Sopenharmony_ci/* 659262306a36Sopenharmony_ci * Allocate new device struct, set up devid and UUID. 659362306a36Sopenharmony_ci * 659462306a36Sopenharmony_ci * @fs_info: used only for generating a new devid, can be NULL if 659562306a36Sopenharmony_ci * devid is provided (i.e. @devid != NULL). 659662306a36Sopenharmony_ci * @devid: a pointer to devid for this device. If NULL a new devid 659762306a36Sopenharmony_ci * is generated. 659862306a36Sopenharmony_ci * @uuid: a pointer to UUID for this device. If NULL a new UUID 659962306a36Sopenharmony_ci * is generated. 660062306a36Sopenharmony_ci * @path: a pointer to device path if available, NULL otherwise. 660162306a36Sopenharmony_ci * 660262306a36Sopenharmony_ci * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 660362306a36Sopenharmony_ci * on error. Returned struct is not linked onto any lists and must be 660462306a36Sopenharmony_ci * destroyed with btrfs_free_device. 
660562306a36Sopenharmony_ci */ 660662306a36Sopenharmony_cistruct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 660762306a36Sopenharmony_ci const u64 *devid, const u8 *uuid, 660862306a36Sopenharmony_ci const char *path) 660962306a36Sopenharmony_ci{ 661062306a36Sopenharmony_ci struct btrfs_device *dev; 661162306a36Sopenharmony_ci u64 tmp; 661262306a36Sopenharmony_ci 661362306a36Sopenharmony_ci if (WARN_ON(!devid && !fs_info)) 661462306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 661562306a36Sopenharmony_ci 661662306a36Sopenharmony_ci dev = kzalloc(sizeof(*dev), GFP_KERNEL); 661762306a36Sopenharmony_ci if (!dev) 661862306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 661962306a36Sopenharmony_ci 662062306a36Sopenharmony_ci INIT_LIST_HEAD(&dev->dev_list); 662162306a36Sopenharmony_ci INIT_LIST_HEAD(&dev->dev_alloc_list); 662262306a36Sopenharmony_ci INIT_LIST_HEAD(&dev->post_commit_list); 662362306a36Sopenharmony_ci 662462306a36Sopenharmony_ci atomic_set(&dev->dev_stats_ccnt, 0); 662562306a36Sopenharmony_ci btrfs_device_data_ordered_init(dev); 662662306a36Sopenharmony_ci extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); 662762306a36Sopenharmony_ci 662862306a36Sopenharmony_ci if (devid) 662962306a36Sopenharmony_ci tmp = *devid; 663062306a36Sopenharmony_ci else { 663162306a36Sopenharmony_ci int ret; 663262306a36Sopenharmony_ci 663362306a36Sopenharmony_ci ret = find_next_devid(fs_info, &tmp); 663462306a36Sopenharmony_ci if (ret) { 663562306a36Sopenharmony_ci btrfs_free_device(dev); 663662306a36Sopenharmony_ci return ERR_PTR(ret); 663762306a36Sopenharmony_ci } 663862306a36Sopenharmony_ci } 663962306a36Sopenharmony_ci dev->devid = tmp; 664062306a36Sopenharmony_ci 664162306a36Sopenharmony_ci if (uuid) 664262306a36Sopenharmony_ci memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 664362306a36Sopenharmony_ci else 664462306a36Sopenharmony_ci generate_random_uuid(dev->uuid); 664562306a36Sopenharmony_ci 664662306a36Sopenharmony_ci if (path) { 
664762306a36Sopenharmony_ci struct rcu_string *name; 664862306a36Sopenharmony_ci 664962306a36Sopenharmony_ci name = rcu_string_strdup(path, GFP_KERNEL); 665062306a36Sopenharmony_ci if (!name) { 665162306a36Sopenharmony_ci btrfs_free_device(dev); 665262306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 665362306a36Sopenharmony_ci } 665462306a36Sopenharmony_ci rcu_assign_pointer(dev->name, name); 665562306a36Sopenharmony_ci } 665662306a36Sopenharmony_ci 665762306a36Sopenharmony_ci return dev; 665862306a36Sopenharmony_ci} 665962306a36Sopenharmony_ci 666062306a36Sopenharmony_cistatic void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 666162306a36Sopenharmony_ci u64 devid, u8 *uuid, bool error) 666262306a36Sopenharmony_ci{ 666362306a36Sopenharmony_ci if (error) 666462306a36Sopenharmony_ci btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 666562306a36Sopenharmony_ci devid, uuid); 666662306a36Sopenharmony_ci else 666762306a36Sopenharmony_ci btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 666862306a36Sopenharmony_ci devid, uuid); 666962306a36Sopenharmony_ci} 667062306a36Sopenharmony_ci 667162306a36Sopenharmony_ciu64 btrfs_calc_stripe_length(const struct extent_map *em) 667262306a36Sopenharmony_ci{ 667362306a36Sopenharmony_ci const struct map_lookup *map = em->map_lookup; 667462306a36Sopenharmony_ci const int data_stripes = calc_data_stripes(map->type, map->num_stripes); 667562306a36Sopenharmony_ci 667662306a36Sopenharmony_ci return div_u64(em->len, data_stripes); 667762306a36Sopenharmony_ci} 667862306a36Sopenharmony_ci 667962306a36Sopenharmony_ci#if BITS_PER_LONG == 32 668062306a36Sopenharmony_ci/* 668162306a36Sopenharmony_ci * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 668262306a36Sopenharmony_ci * can't be accessed on 32bit systems. 668362306a36Sopenharmony_ci * 668462306a36Sopenharmony_ci * This function do mount time check to reject the fs if it already has 668562306a36Sopenharmony_ci * metadata chunk beyond that limit. 
668662306a36Sopenharmony_ci */ 668762306a36Sopenharmony_cistatic int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 668862306a36Sopenharmony_ci u64 logical, u64 length, u64 type) 668962306a36Sopenharmony_ci{ 669062306a36Sopenharmony_ci if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 669162306a36Sopenharmony_ci return 0; 669262306a36Sopenharmony_ci 669362306a36Sopenharmony_ci if (logical + length < MAX_LFS_FILESIZE) 669462306a36Sopenharmony_ci return 0; 669562306a36Sopenharmony_ci 669662306a36Sopenharmony_ci btrfs_err_32bit_limit(fs_info); 669762306a36Sopenharmony_ci return -EOVERFLOW; 669862306a36Sopenharmony_ci} 669962306a36Sopenharmony_ci 670062306a36Sopenharmony_ci/* 670162306a36Sopenharmony_ci * This is to give early warning for any metadata chunk reaching 670262306a36Sopenharmony_ci * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 670362306a36Sopenharmony_ci * Although we can still access the metadata, it's not going to be possible 670462306a36Sopenharmony_ci * once the limit is reached. 670562306a36Sopenharmony_ci */ 670662306a36Sopenharmony_cistatic void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 670762306a36Sopenharmony_ci u64 logical, u64 length, u64 type) 670862306a36Sopenharmony_ci{ 670962306a36Sopenharmony_ci if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 671062306a36Sopenharmony_ci return; 671162306a36Sopenharmony_ci 671262306a36Sopenharmony_ci if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 671362306a36Sopenharmony_ci return; 671462306a36Sopenharmony_ci 671562306a36Sopenharmony_ci btrfs_warn_32bit_limit(fs_info); 671662306a36Sopenharmony_ci} 671762306a36Sopenharmony_ci#endif 671862306a36Sopenharmony_ci 671962306a36Sopenharmony_cistatic struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 672062306a36Sopenharmony_ci u64 devid, u8 *uuid) 672162306a36Sopenharmony_ci{ 672262306a36Sopenharmony_ci struct btrfs_device *dev; 672362306a36Sopenharmony_ci 672462306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, DEGRADED)) { 
672562306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, devid, uuid, true); 672662306a36Sopenharmony_ci return ERR_PTR(-ENOENT); 672762306a36Sopenharmony_ci } 672862306a36Sopenharmony_ci 672962306a36Sopenharmony_ci dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 673062306a36Sopenharmony_ci if (IS_ERR(dev)) { 673162306a36Sopenharmony_ci btrfs_err(fs_info, "failed to init missing device %llu: %ld", 673262306a36Sopenharmony_ci devid, PTR_ERR(dev)); 673362306a36Sopenharmony_ci return dev; 673462306a36Sopenharmony_ci } 673562306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, devid, uuid, false); 673662306a36Sopenharmony_ci 673762306a36Sopenharmony_ci return dev; 673862306a36Sopenharmony_ci} 673962306a36Sopenharmony_ci 674062306a36Sopenharmony_cistatic int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 674162306a36Sopenharmony_ci struct btrfs_chunk *chunk) 674262306a36Sopenharmony_ci{ 674362306a36Sopenharmony_ci BTRFS_DEV_LOOKUP_ARGS(args); 674462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = leaf->fs_info; 674562306a36Sopenharmony_ci struct extent_map_tree *map_tree = &fs_info->mapping_tree; 674662306a36Sopenharmony_ci struct map_lookup *map; 674762306a36Sopenharmony_ci struct extent_map *em; 674862306a36Sopenharmony_ci u64 logical; 674962306a36Sopenharmony_ci u64 length; 675062306a36Sopenharmony_ci u64 devid; 675162306a36Sopenharmony_ci u64 type; 675262306a36Sopenharmony_ci u8 uuid[BTRFS_UUID_SIZE]; 675362306a36Sopenharmony_ci int index; 675462306a36Sopenharmony_ci int num_stripes; 675562306a36Sopenharmony_ci int ret; 675662306a36Sopenharmony_ci int i; 675762306a36Sopenharmony_ci 675862306a36Sopenharmony_ci logical = key->offset; 675962306a36Sopenharmony_ci length = btrfs_chunk_length(leaf, chunk); 676062306a36Sopenharmony_ci type = btrfs_chunk_type(leaf, chunk); 676162306a36Sopenharmony_ci index = btrfs_bg_flags_to_raid_index(type); 676262306a36Sopenharmony_ci num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 
676362306a36Sopenharmony_ci 676462306a36Sopenharmony_ci#if BITS_PER_LONG == 32 676562306a36Sopenharmony_ci ret = check_32bit_meta_chunk(fs_info, logical, length, type); 676662306a36Sopenharmony_ci if (ret < 0) 676762306a36Sopenharmony_ci return ret; 676862306a36Sopenharmony_ci warn_32bit_meta_chunk(fs_info, logical, length, type); 676962306a36Sopenharmony_ci#endif 677062306a36Sopenharmony_ci 677162306a36Sopenharmony_ci /* 677262306a36Sopenharmony_ci * Only need to verify chunk item if we're reading from sys chunk array, 677362306a36Sopenharmony_ci * as chunk item in tree block is already verified by tree-checker. 677462306a36Sopenharmony_ci */ 677562306a36Sopenharmony_ci if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 677662306a36Sopenharmony_ci ret = btrfs_check_chunk_valid(leaf, chunk, logical); 677762306a36Sopenharmony_ci if (ret) 677862306a36Sopenharmony_ci return ret; 677962306a36Sopenharmony_ci } 678062306a36Sopenharmony_ci 678162306a36Sopenharmony_ci read_lock(&map_tree->lock); 678262306a36Sopenharmony_ci em = lookup_extent_mapping(map_tree, logical, 1); 678362306a36Sopenharmony_ci read_unlock(&map_tree->lock); 678462306a36Sopenharmony_ci 678562306a36Sopenharmony_ci /* already mapped? 
*/ 678662306a36Sopenharmony_ci if (em && em->start <= logical && em->start + em->len > logical) { 678762306a36Sopenharmony_ci free_extent_map(em); 678862306a36Sopenharmony_ci return 0; 678962306a36Sopenharmony_ci } else if (em) { 679062306a36Sopenharmony_ci free_extent_map(em); 679162306a36Sopenharmony_ci } 679262306a36Sopenharmony_ci 679362306a36Sopenharmony_ci em = alloc_extent_map(); 679462306a36Sopenharmony_ci if (!em) 679562306a36Sopenharmony_ci return -ENOMEM; 679662306a36Sopenharmony_ci map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 679762306a36Sopenharmony_ci if (!map) { 679862306a36Sopenharmony_ci free_extent_map(em); 679962306a36Sopenharmony_ci return -ENOMEM; 680062306a36Sopenharmony_ci } 680162306a36Sopenharmony_ci 680262306a36Sopenharmony_ci set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 680362306a36Sopenharmony_ci em->map_lookup = map; 680462306a36Sopenharmony_ci em->start = logical; 680562306a36Sopenharmony_ci em->len = length; 680662306a36Sopenharmony_ci em->orig_start = 0; 680762306a36Sopenharmony_ci em->block_start = 0; 680862306a36Sopenharmony_ci em->block_len = em->len; 680962306a36Sopenharmony_ci 681062306a36Sopenharmony_ci map->num_stripes = num_stripes; 681162306a36Sopenharmony_ci map->io_width = btrfs_chunk_io_width(leaf, chunk); 681262306a36Sopenharmony_ci map->io_align = btrfs_chunk_io_align(leaf, chunk); 681362306a36Sopenharmony_ci map->type = type; 681462306a36Sopenharmony_ci /* 681562306a36Sopenharmony_ci * We can't use the sub_stripes value, as for profiles other than 681662306a36Sopenharmony_ci * RAID10, they may have 0 as sub_stripes for filesystems created by 681762306a36Sopenharmony_ci * older mkfs (<v5.4). 681862306a36Sopenharmony_ci * In that case, it can cause divide-by-zero errors later. 681962306a36Sopenharmony_ci * Since currently sub_stripes is fixed for each profile, let's 682062306a36Sopenharmony_ci * use the trusted value instead. 
682162306a36Sopenharmony_ci */ 682262306a36Sopenharmony_ci map->sub_stripes = btrfs_raid_array[index].sub_stripes; 682362306a36Sopenharmony_ci map->verified_stripes = 0; 682462306a36Sopenharmony_ci em->orig_block_len = btrfs_calc_stripe_length(em); 682562306a36Sopenharmony_ci for (i = 0; i < num_stripes; i++) { 682662306a36Sopenharmony_ci map->stripes[i].physical = 682762306a36Sopenharmony_ci btrfs_stripe_offset_nr(leaf, chunk, i); 682862306a36Sopenharmony_ci devid = btrfs_stripe_devid_nr(leaf, chunk, i); 682962306a36Sopenharmony_ci args.devid = devid; 683062306a36Sopenharmony_ci read_extent_buffer(leaf, uuid, (unsigned long) 683162306a36Sopenharmony_ci btrfs_stripe_dev_uuid_nr(chunk, i), 683262306a36Sopenharmony_ci BTRFS_UUID_SIZE); 683362306a36Sopenharmony_ci args.uuid = uuid; 683462306a36Sopenharmony_ci map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 683562306a36Sopenharmony_ci if (!map->stripes[i].dev) { 683662306a36Sopenharmony_ci map->stripes[i].dev = handle_missing_device(fs_info, 683762306a36Sopenharmony_ci devid, uuid); 683862306a36Sopenharmony_ci if (IS_ERR(map->stripes[i].dev)) { 683962306a36Sopenharmony_ci ret = PTR_ERR(map->stripes[i].dev); 684062306a36Sopenharmony_ci free_extent_map(em); 684162306a36Sopenharmony_ci return ret; 684262306a36Sopenharmony_ci } 684362306a36Sopenharmony_ci } 684462306a36Sopenharmony_ci 684562306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 684662306a36Sopenharmony_ci &(map->stripes[i].dev->dev_state)); 684762306a36Sopenharmony_ci } 684862306a36Sopenharmony_ci 684962306a36Sopenharmony_ci write_lock(&map_tree->lock); 685062306a36Sopenharmony_ci ret = add_extent_mapping(map_tree, em, 0); 685162306a36Sopenharmony_ci write_unlock(&map_tree->lock); 685262306a36Sopenharmony_ci if (ret < 0) { 685362306a36Sopenharmony_ci btrfs_err(fs_info, 685462306a36Sopenharmony_ci "failed to add chunk map, start=%llu len=%llu: %d", 685562306a36Sopenharmony_ci em->start, em->len, ret); 685662306a36Sopenharmony_ci 
} 685762306a36Sopenharmony_ci free_extent_map(em); 685862306a36Sopenharmony_ci 685962306a36Sopenharmony_ci return ret; 686062306a36Sopenharmony_ci} 686162306a36Sopenharmony_ci 686262306a36Sopenharmony_cistatic void fill_device_from_item(struct extent_buffer *leaf, 686362306a36Sopenharmony_ci struct btrfs_dev_item *dev_item, 686462306a36Sopenharmony_ci struct btrfs_device *device) 686562306a36Sopenharmony_ci{ 686662306a36Sopenharmony_ci unsigned long ptr; 686762306a36Sopenharmony_ci 686862306a36Sopenharmony_ci device->devid = btrfs_device_id(leaf, dev_item); 686962306a36Sopenharmony_ci device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 687062306a36Sopenharmony_ci device->total_bytes = device->disk_total_bytes; 687162306a36Sopenharmony_ci device->commit_total_bytes = device->disk_total_bytes; 687262306a36Sopenharmony_ci device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 687362306a36Sopenharmony_ci device->commit_bytes_used = device->bytes_used; 687462306a36Sopenharmony_ci device->type = btrfs_device_type(leaf, dev_item); 687562306a36Sopenharmony_ci device->io_align = btrfs_device_io_align(leaf, dev_item); 687662306a36Sopenharmony_ci device->io_width = btrfs_device_io_width(leaf, dev_item); 687762306a36Sopenharmony_ci device->sector_size = btrfs_device_sector_size(leaf, dev_item); 687862306a36Sopenharmony_ci WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 687962306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 688062306a36Sopenharmony_ci 688162306a36Sopenharmony_ci ptr = btrfs_device_uuid(dev_item); 688262306a36Sopenharmony_ci read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 688362306a36Sopenharmony_ci} 688462306a36Sopenharmony_ci 688562306a36Sopenharmony_cistatic struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 688662306a36Sopenharmony_ci u8 *fsid) 688762306a36Sopenharmony_ci{ 688862306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices; 
688962306a36Sopenharmony_ci int ret; 689062306a36Sopenharmony_ci 689162306a36Sopenharmony_ci lockdep_assert_held(&uuid_mutex); 689262306a36Sopenharmony_ci ASSERT(fsid); 689362306a36Sopenharmony_ci 689462306a36Sopenharmony_ci /* This will match only for multi-device seed fs */ 689562306a36Sopenharmony_ci list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 689662306a36Sopenharmony_ci if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 689762306a36Sopenharmony_ci return fs_devices; 689862306a36Sopenharmony_ci 689962306a36Sopenharmony_ci 690062306a36Sopenharmony_ci fs_devices = find_fsid(fsid, NULL); 690162306a36Sopenharmony_ci if (!fs_devices) { 690262306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, DEGRADED)) 690362306a36Sopenharmony_ci return ERR_PTR(-ENOENT); 690462306a36Sopenharmony_ci 690562306a36Sopenharmony_ci fs_devices = alloc_fs_devices(fsid, NULL); 690662306a36Sopenharmony_ci if (IS_ERR(fs_devices)) 690762306a36Sopenharmony_ci return fs_devices; 690862306a36Sopenharmony_ci 690962306a36Sopenharmony_ci fs_devices->seeding = true; 691062306a36Sopenharmony_ci fs_devices->opened = 1; 691162306a36Sopenharmony_ci return fs_devices; 691262306a36Sopenharmony_ci } 691362306a36Sopenharmony_ci 691462306a36Sopenharmony_ci /* 691562306a36Sopenharmony_ci * Upon first call for a seed fs fsid, just create a private copy of the 691662306a36Sopenharmony_ci * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 691762306a36Sopenharmony_ci */ 691862306a36Sopenharmony_ci fs_devices = clone_fs_devices(fs_devices); 691962306a36Sopenharmony_ci if (IS_ERR(fs_devices)) 692062306a36Sopenharmony_ci return fs_devices; 692162306a36Sopenharmony_ci 692262306a36Sopenharmony_ci ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); 692362306a36Sopenharmony_ci if (ret) { 692462306a36Sopenharmony_ci free_fs_devices(fs_devices); 692562306a36Sopenharmony_ci return ERR_PTR(ret); 692662306a36Sopenharmony_ci } 692762306a36Sopenharmony_ci 
692862306a36Sopenharmony_ci if (!fs_devices->seeding) { 692962306a36Sopenharmony_ci close_fs_devices(fs_devices); 693062306a36Sopenharmony_ci free_fs_devices(fs_devices); 693162306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 693262306a36Sopenharmony_ci } 693362306a36Sopenharmony_ci 693462306a36Sopenharmony_ci list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 693562306a36Sopenharmony_ci 693662306a36Sopenharmony_ci return fs_devices; 693762306a36Sopenharmony_ci} 693862306a36Sopenharmony_ci 693962306a36Sopenharmony_cistatic int read_one_dev(struct extent_buffer *leaf, 694062306a36Sopenharmony_ci struct btrfs_dev_item *dev_item) 694162306a36Sopenharmony_ci{ 694262306a36Sopenharmony_ci BTRFS_DEV_LOOKUP_ARGS(args); 694362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = leaf->fs_info; 694462306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 694562306a36Sopenharmony_ci struct btrfs_device *device; 694662306a36Sopenharmony_ci u64 devid; 694762306a36Sopenharmony_ci int ret; 694862306a36Sopenharmony_ci u8 fs_uuid[BTRFS_FSID_SIZE]; 694962306a36Sopenharmony_ci u8 dev_uuid[BTRFS_UUID_SIZE]; 695062306a36Sopenharmony_ci 695162306a36Sopenharmony_ci devid = btrfs_device_id(leaf, dev_item); 695262306a36Sopenharmony_ci args.devid = devid; 695362306a36Sopenharmony_ci read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 695462306a36Sopenharmony_ci BTRFS_UUID_SIZE); 695562306a36Sopenharmony_ci read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 695662306a36Sopenharmony_ci BTRFS_FSID_SIZE); 695762306a36Sopenharmony_ci args.uuid = dev_uuid; 695862306a36Sopenharmony_ci args.fsid = fs_uuid; 695962306a36Sopenharmony_ci 696062306a36Sopenharmony_ci if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 696162306a36Sopenharmony_ci fs_devices = open_seed_devices(fs_info, fs_uuid); 696262306a36Sopenharmony_ci if (IS_ERR(fs_devices)) 696362306a36Sopenharmony_ci return PTR_ERR(fs_devices); 
696462306a36Sopenharmony_ci } 696562306a36Sopenharmony_ci 696662306a36Sopenharmony_ci device = btrfs_find_device(fs_info->fs_devices, &args); 696762306a36Sopenharmony_ci if (!device) { 696862306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, DEGRADED)) { 696962306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, devid, 697062306a36Sopenharmony_ci dev_uuid, true); 697162306a36Sopenharmony_ci return -ENOENT; 697262306a36Sopenharmony_ci } 697362306a36Sopenharmony_ci 697462306a36Sopenharmony_ci device = add_missing_dev(fs_devices, devid, dev_uuid); 697562306a36Sopenharmony_ci if (IS_ERR(device)) { 697662306a36Sopenharmony_ci btrfs_err(fs_info, 697762306a36Sopenharmony_ci "failed to add missing dev %llu: %ld", 697862306a36Sopenharmony_ci devid, PTR_ERR(device)); 697962306a36Sopenharmony_ci return PTR_ERR(device); 698062306a36Sopenharmony_ci } 698162306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 698262306a36Sopenharmony_ci } else { 698362306a36Sopenharmony_ci if (!device->bdev) { 698462306a36Sopenharmony_ci if (!btrfs_test_opt(fs_info, DEGRADED)) { 698562306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, 698662306a36Sopenharmony_ci devid, dev_uuid, true); 698762306a36Sopenharmony_ci return -ENOENT; 698862306a36Sopenharmony_ci } 698962306a36Sopenharmony_ci btrfs_report_missing_device(fs_info, devid, 699062306a36Sopenharmony_ci dev_uuid, false); 699162306a36Sopenharmony_ci } 699262306a36Sopenharmony_ci 699362306a36Sopenharmony_ci if (!device->bdev && 699462306a36Sopenharmony_ci !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 699562306a36Sopenharmony_ci /* 699662306a36Sopenharmony_ci * this happens when a device that was properly setup 699762306a36Sopenharmony_ci * in the device info lists suddenly goes bad. 
699862306a36Sopenharmony_ci * device->bdev is NULL, and so we have to set 699962306a36Sopenharmony_ci * device->missing to one here 700062306a36Sopenharmony_ci */ 700162306a36Sopenharmony_ci device->fs_devices->missing_devices++; 700262306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 700362306a36Sopenharmony_ci } 700462306a36Sopenharmony_ci 700562306a36Sopenharmony_ci /* Move the device to its own fs_devices */ 700662306a36Sopenharmony_ci if (device->fs_devices != fs_devices) { 700762306a36Sopenharmony_ci ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 700862306a36Sopenharmony_ci &device->dev_state)); 700962306a36Sopenharmony_ci 701062306a36Sopenharmony_ci list_move(&device->dev_list, &fs_devices->devices); 701162306a36Sopenharmony_ci device->fs_devices->num_devices--; 701262306a36Sopenharmony_ci fs_devices->num_devices++; 701362306a36Sopenharmony_ci 701462306a36Sopenharmony_ci device->fs_devices->missing_devices--; 701562306a36Sopenharmony_ci fs_devices->missing_devices++; 701662306a36Sopenharmony_ci 701762306a36Sopenharmony_ci device->fs_devices = fs_devices; 701862306a36Sopenharmony_ci } 701962306a36Sopenharmony_ci } 702062306a36Sopenharmony_ci 702162306a36Sopenharmony_ci if (device->fs_devices != fs_info->fs_devices) { 702262306a36Sopenharmony_ci BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 702362306a36Sopenharmony_ci if (device->generation != 702462306a36Sopenharmony_ci btrfs_device_generation(leaf, dev_item)) 702562306a36Sopenharmony_ci return -EINVAL; 702662306a36Sopenharmony_ci } 702762306a36Sopenharmony_ci 702862306a36Sopenharmony_ci fill_device_from_item(leaf, dev_item, device); 702962306a36Sopenharmony_ci if (device->bdev) { 703062306a36Sopenharmony_ci u64 max_total_bytes = bdev_nr_bytes(device->bdev); 703162306a36Sopenharmony_ci 703262306a36Sopenharmony_ci if (device->total_bytes > max_total_bytes) { 703362306a36Sopenharmony_ci btrfs_err(fs_info, 703462306a36Sopenharmony_ci "device total_bytes should be at most 
%llu but found %llu", 703562306a36Sopenharmony_ci max_total_bytes, device->total_bytes); 703662306a36Sopenharmony_ci return -EINVAL; 703762306a36Sopenharmony_ci } 703862306a36Sopenharmony_ci } 703962306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 704062306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 704162306a36Sopenharmony_ci !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 704262306a36Sopenharmony_ci device->fs_devices->total_rw_bytes += device->total_bytes; 704362306a36Sopenharmony_ci atomic64_add(device->total_bytes - device->bytes_used, 704462306a36Sopenharmony_ci &fs_info->free_chunk_space); 704562306a36Sopenharmony_ci } 704662306a36Sopenharmony_ci ret = 0; 704762306a36Sopenharmony_ci return ret; 704862306a36Sopenharmony_ci} 704962306a36Sopenharmony_ci 705062306a36Sopenharmony_ciint btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 705162306a36Sopenharmony_ci{ 705262306a36Sopenharmony_ci struct btrfs_super_block *super_copy = fs_info->super_copy; 705362306a36Sopenharmony_ci struct extent_buffer *sb; 705462306a36Sopenharmony_ci struct btrfs_disk_key *disk_key; 705562306a36Sopenharmony_ci struct btrfs_chunk *chunk; 705662306a36Sopenharmony_ci u8 *array_ptr; 705762306a36Sopenharmony_ci unsigned long sb_array_offset; 705862306a36Sopenharmony_ci int ret = 0; 705962306a36Sopenharmony_ci u32 num_stripes; 706062306a36Sopenharmony_ci u32 array_size; 706162306a36Sopenharmony_ci u32 len = 0; 706262306a36Sopenharmony_ci u32 cur_offset; 706362306a36Sopenharmony_ci u64 type; 706462306a36Sopenharmony_ci struct btrfs_key key; 706562306a36Sopenharmony_ci 706662306a36Sopenharmony_ci ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 706762306a36Sopenharmony_ci 706862306a36Sopenharmony_ci /* 706962306a36Sopenharmony_ci * We allocated a dummy extent, just to use extent buffer accessors. 
707062306a36Sopenharmony_ci * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 707162306a36Sopenharmony_ci * that's fine, we will not go beyond system chunk array anyway. 707262306a36Sopenharmony_ci */ 707362306a36Sopenharmony_ci sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 707462306a36Sopenharmony_ci if (!sb) 707562306a36Sopenharmony_ci return -ENOMEM; 707662306a36Sopenharmony_ci set_extent_buffer_uptodate(sb); 707762306a36Sopenharmony_ci 707862306a36Sopenharmony_ci write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 707962306a36Sopenharmony_ci array_size = btrfs_super_sys_array_size(super_copy); 708062306a36Sopenharmony_ci 708162306a36Sopenharmony_ci array_ptr = super_copy->sys_chunk_array; 708262306a36Sopenharmony_ci sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 708362306a36Sopenharmony_ci cur_offset = 0; 708462306a36Sopenharmony_ci 708562306a36Sopenharmony_ci while (cur_offset < array_size) { 708662306a36Sopenharmony_ci disk_key = (struct btrfs_disk_key *)array_ptr; 708762306a36Sopenharmony_ci len = sizeof(*disk_key); 708862306a36Sopenharmony_ci if (cur_offset + len > array_size) 708962306a36Sopenharmony_ci goto out_short_read; 709062306a36Sopenharmony_ci 709162306a36Sopenharmony_ci btrfs_disk_key_to_cpu(&key, disk_key); 709262306a36Sopenharmony_ci 709362306a36Sopenharmony_ci array_ptr += len; 709462306a36Sopenharmony_ci sb_array_offset += len; 709562306a36Sopenharmony_ci cur_offset += len; 709662306a36Sopenharmony_ci 709762306a36Sopenharmony_ci if (key.type != BTRFS_CHUNK_ITEM_KEY) { 709862306a36Sopenharmony_ci btrfs_err(fs_info, 709962306a36Sopenharmony_ci "unexpected item type %u in sys_array at offset %u", 710062306a36Sopenharmony_ci (u32)key.type, cur_offset); 710162306a36Sopenharmony_ci ret = -EIO; 710262306a36Sopenharmony_ci break; 710362306a36Sopenharmony_ci } 710462306a36Sopenharmony_ci 710562306a36Sopenharmony_ci chunk = (struct btrfs_chunk *)sb_array_offset; 
710662306a36Sopenharmony_ci /* 710762306a36Sopenharmony_ci * At least one btrfs_chunk with one stripe must be present, 710862306a36Sopenharmony_ci * exact stripe count check comes afterwards 710962306a36Sopenharmony_ci */ 711062306a36Sopenharmony_ci len = btrfs_chunk_item_size(1); 711162306a36Sopenharmony_ci if (cur_offset + len > array_size) 711262306a36Sopenharmony_ci goto out_short_read; 711362306a36Sopenharmony_ci 711462306a36Sopenharmony_ci num_stripes = btrfs_chunk_num_stripes(sb, chunk); 711562306a36Sopenharmony_ci if (!num_stripes) { 711662306a36Sopenharmony_ci btrfs_err(fs_info, 711762306a36Sopenharmony_ci "invalid number of stripes %u in sys_array at offset %u", 711862306a36Sopenharmony_ci num_stripes, cur_offset); 711962306a36Sopenharmony_ci ret = -EIO; 712062306a36Sopenharmony_ci break; 712162306a36Sopenharmony_ci } 712262306a36Sopenharmony_ci 712362306a36Sopenharmony_ci type = btrfs_chunk_type(sb, chunk); 712462306a36Sopenharmony_ci if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 712562306a36Sopenharmony_ci btrfs_err(fs_info, 712662306a36Sopenharmony_ci "invalid chunk type %llu in sys_array at offset %u", 712762306a36Sopenharmony_ci type, cur_offset); 712862306a36Sopenharmony_ci ret = -EIO; 712962306a36Sopenharmony_ci break; 713062306a36Sopenharmony_ci } 713162306a36Sopenharmony_ci 713262306a36Sopenharmony_ci len = btrfs_chunk_item_size(num_stripes); 713362306a36Sopenharmony_ci if (cur_offset + len > array_size) 713462306a36Sopenharmony_ci goto out_short_read; 713562306a36Sopenharmony_ci 713662306a36Sopenharmony_ci ret = read_one_chunk(&key, sb, chunk); 713762306a36Sopenharmony_ci if (ret) 713862306a36Sopenharmony_ci break; 713962306a36Sopenharmony_ci 714062306a36Sopenharmony_ci array_ptr += len; 714162306a36Sopenharmony_ci sb_array_offset += len; 714262306a36Sopenharmony_ci cur_offset += len; 714362306a36Sopenharmony_ci } 714462306a36Sopenharmony_ci clear_extent_buffer_uptodate(sb); 714562306a36Sopenharmony_ci free_extent_buffer_stale(sb); 
714662306a36Sopenharmony_ci return ret; 714762306a36Sopenharmony_ci 714862306a36Sopenharmony_ciout_short_read: 714962306a36Sopenharmony_ci btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 715062306a36Sopenharmony_ci len, cur_offset); 715162306a36Sopenharmony_ci clear_extent_buffer_uptodate(sb); 715262306a36Sopenharmony_ci free_extent_buffer_stale(sb); 715362306a36Sopenharmony_ci return -EIO; 715462306a36Sopenharmony_ci} 715562306a36Sopenharmony_ci 715662306a36Sopenharmony_ci/* 715762306a36Sopenharmony_ci * Check if all chunks in the fs are OK for read-write degraded mount 715862306a36Sopenharmony_ci * 715962306a36Sopenharmony_ci * If the @failing_dev is specified, it's accounted as missing. 716062306a36Sopenharmony_ci * 716162306a36Sopenharmony_ci * Return true if all chunks meet the minimal RW mount requirements. 716262306a36Sopenharmony_ci * Return false if any chunk doesn't meet the minimal RW mount requirements. 716362306a36Sopenharmony_ci */ 716462306a36Sopenharmony_cibool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 716562306a36Sopenharmony_ci struct btrfs_device *failing_dev) 716662306a36Sopenharmony_ci{ 716762306a36Sopenharmony_ci struct extent_map_tree *map_tree = &fs_info->mapping_tree; 716862306a36Sopenharmony_ci struct extent_map *em; 716962306a36Sopenharmony_ci u64 next_start = 0; 717062306a36Sopenharmony_ci bool ret = true; 717162306a36Sopenharmony_ci 717262306a36Sopenharmony_ci read_lock(&map_tree->lock); 717362306a36Sopenharmony_ci em = lookup_extent_mapping(map_tree, 0, (u64)-1); 717462306a36Sopenharmony_ci read_unlock(&map_tree->lock); 717562306a36Sopenharmony_ci /* No chunk at all? 
Return false anyway */ 717662306a36Sopenharmony_ci if (!em) { 717762306a36Sopenharmony_ci ret = false; 717862306a36Sopenharmony_ci goto out; 717962306a36Sopenharmony_ci } 718062306a36Sopenharmony_ci while (em) { 718162306a36Sopenharmony_ci struct map_lookup *map; 718262306a36Sopenharmony_ci int missing = 0; 718362306a36Sopenharmony_ci int max_tolerated; 718462306a36Sopenharmony_ci int i; 718562306a36Sopenharmony_ci 718662306a36Sopenharmony_ci map = em->map_lookup; 718762306a36Sopenharmony_ci max_tolerated = 718862306a36Sopenharmony_ci btrfs_get_num_tolerated_disk_barrier_failures( 718962306a36Sopenharmony_ci map->type); 719062306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 719162306a36Sopenharmony_ci struct btrfs_device *dev = map->stripes[i].dev; 719262306a36Sopenharmony_ci 719362306a36Sopenharmony_ci if (!dev || !dev->bdev || 719462306a36Sopenharmony_ci test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 719562306a36Sopenharmony_ci dev->last_flush_error) 719662306a36Sopenharmony_ci missing++; 719762306a36Sopenharmony_ci else if (failing_dev && failing_dev == dev) 719862306a36Sopenharmony_ci missing++; 719962306a36Sopenharmony_ci } 720062306a36Sopenharmony_ci if (missing > max_tolerated) { 720162306a36Sopenharmony_ci if (!failing_dev) 720262306a36Sopenharmony_ci btrfs_warn(fs_info, 720362306a36Sopenharmony_ci "chunk %llu missing %d devices, max tolerance is %d for writable mount", 720462306a36Sopenharmony_ci em->start, missing, max_tolerated); 720562306a36Sopenharmony_ci free_extent_map(em); 720662306a36Sopenharmony_ci ret = false; 720762306a36Sopenharmony_ci goto out; 720862306a36Sopenharmony_ci } 720962306a36Sopenharmony_ci next_start = extent_map_end(em); 721062306a36Sopenharmony_ci free_extent_map(em); 721162306a36Sopenharmony_ci 721262306a36Sopenharmony_ci read_lock(&map_tree->lock); 721362306a36Sopenharmony_ci em = lookup_extent_mapping(map_tree, next_start, 721462306a36Sopenharmony_ci (u64)(-1) - next_start); 721562306a36Sopenharmony_ci 
read_unlock(&map_tree->lock); 721662306a36Sopenharmony_ci } 721762306a36Sopenharmony_ciout: 721862306a36Sopenharmony_ci return ret; 721962306a36Sopenharmony_ci} 722062306a36Sopenharmony_ci 722162306a36Sopenharmony_cistatic void readahead_tree_node_children(struct extent_buffer *node) 722262306a36Sopenharmony_ci{ 722362306a36Sopenharmony_ci int i; 722462306a36Sopenharmony_ci const int nr_items = btrfs_header_nritems(node); 722562306a36Sopenharmony_ci 722662306a36Sopenharmony_ci for (i = 0; i < nr_items; i++) 722762306a36Sopenharmony_ci btrfs_readahead_node_child(node, i); 722862306a36Sopenharmony_ci} 722962306a36Sopenharmony_ci 723062306a36Sopenharmony_ciint btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 723162306a36Sopenharmony_ci{ 723262306a36Sopenharmony_ci struct btrfs_root *root = fs_info->chunk_root; 723362306a36Sopenharmony_ci struct btrfs_path *path; 723462306a36Sopenharmony_ci struct extent_buffer *leaf; 723562306a36Sopenharmony_ci struct btrfs_key key; 723662306a36Sopenharmony_ci struct btrfs_key found_key; 723762306a36Sopenharmony_ci int ret; 723862306a36Sopenharmony_ci int slot; 723962306a36Sopenharmony_ci int iter_ret = 0; 724062306a36Sopenharmony_ci u64 total_dev = 0; 724162306a36Sopenharmony_ci u64 last_ra_node = 0; 724262306a36Sopenharmony_ci 724362306a36Sopenharmony_ci path = btrfs_alloc_path(); 724462306a36Sopenharmony_ci if (!path) 724562306a36Sopenharmony_ci return -ENOMEM; 724662306a36Sopenharmony_ci 724762306a36Sopenharmony_ci /* 724862306a36Sopenharmony_ci * uuid_mutex is needed only if we are mounting a sprout FS 724962306a36Sopenharmony_ci * otherwise we don't need it. 
725062306a36Sopenharmony_ci */ 725162306a36Sopenharmony_ci mutex_lock(&uuid_mutex); 725262306a36Sopenharmony_ci 725362306a36Sopenharmony_ci /* 725462306a36Sopenharmony_ci * It is possible for mount and umount to race in such a way that 725562306a36Sopenharmony_ci * we execute this code path, but open_fs_devices failed to clear 725662306a36Sopenharmony_ci * total_rw_bytes. We certainly want it cleared before reading the 725762306a36Sopenharmony_ci * device items, so clear it here. 725862306a36Sopenharmony_ci */ 725962306a36Sopenharmony_ci fs_info->fs_devices->total_rw_bytes = 0; 726062306a36Sopenharmony_ci 726162306a36Sopenharmony_ci /* 726262306a36Sopenharmony_ci * Lockdep complains about possible circular locking dependency between 726362306a36Sopenharmony_ci * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 726462306a36Sopenharmony_ci * used for freeze procection of a fs (struct super_block.s_writers), 726562306a36Sopenharmony_ci * which we take when starting a transaction, and extent buffers of the 726662306a36Sopenharmony_ci * chunk tree if we call read_one_dev() while holding a lock on an 726762306a36Sopenharmony_ci * extent buffer of the chunk tree. Since we are mounting the filesystem 726862306a36Sopenharmony_ci * and at this point there can't be any concurrent task modifying the 726962306a36Sopenharmony_ci * chunk tree, to keep it simple, just skip locking on the chunk tree. 727062306a36Sopenharmony_ci */ 727162306a36Sopenharmony_ci ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 727262306a36Sopenharmony_ci path->skip_locking = 1; 727362306a36Sopenharmony_ci 727462306a36Sopenharmony_ci /* 727562306a36Sopenharmony_ci * Read all device items, and then all the chunk items. All 727662306a36Sopenharmony_ci * device items are found before any chunk item (their object id 727762306a36Sopenharmony_ci * is smaller than the lowest possible object id for a chunk 727862306a36Sopenharmony_ci * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
727962306a36Sopenharmony_ci */ 728062306a36Sopenharmony_ci key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 728162306a36Sopenharmony_ci key.offset = 0; 728262306a36Sopenharmony_ci key.type = 0; 728362306a36Sopenharmony_ci btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 728462306a36Sopenharmony_ci struct extent_buffer *node = path->nodes[1]; 728562306a36Sopenharmony_ci 728662306a36Sopenharmony_ci leaf = path->nodes[0]; 728762306a36Sopenharmony_ci slot = path->slots[0]; 728862306a36Sopenharmony_ci 728962306a36Sopenharmony_ci if (node) { 729062306a36Sopenharmony_ci if (last_ra_node != node->start) { 729162306a36Sopenharmony_ci readahead_tree_node_children(node); 729262306a36Sopenharmony_ci last_ra_node = node->start; 729362306a36Sopenharmony_ci } 729462306a36Sopenharmony_ci } 729562306a36Sopenharmony_ci if (found_key.type == BTRFS_DEV_ITEM_KEY) { 729662306a36Sopenharmony_ci struct btrfs_dev_item *dev_item; 729762306a36Sopenharmony_ci dev_item = btrfs_item_ptr(leaf, slot, 729862306a36Sopenharmony_ci struct btrfs_dev_item); 729962306a36Sopenharmony_ci ret = read_one_dev(leaf, dev_item); 730062306a36Sopenharmony_ci if (ret) 730162306a36Sopenharmony_ci goto error; 730262306a36Sopenharmony_ci total_dev++; 730362306a36Sopenharmony_ci } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 730462306a36Sopenharmony_ci struct btrfs_chunk *chunk; 730562306a36Sopenharmony_ci 730662306a36Sopenharmony_ci /* 730762306a36Sopenharmony_ci * We are only called at mount time, so no need to take 730862306a36Sopenharmony_ci * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 730962306a36Sopenharmony_ci * we always lock first fs_info->chunk_mutex before 731062306a36Sopenharmony_ci * acquiring any locks on the chunk tree. This is a 731162306a36Sopenharmony_ci * requirement for chunk allocation, see the comment on 731262306a36Sopenharmony_ci * top of btrfs_chunk_alloc() for details. 
731362306a36Sopenharmony_ci */ 731462306a36Sopenharmony_ci chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 731562306a36Sopenharmony_ci ret = read_one_chunk(&found_key, leaf, chunk); 731662306a36Sopenharmony_ci if (ret) 731762306a36Sopenharmony_ci goto error; 731862306a36Sopenharmony_ci } 731962306a36Sopenharmony_ci } 732062306a36Sopenharmony_ci /* Catch error found during iteration */ 732162306a36Sopenharmony_ci if (iter_ret < 0) { 732262306a36Sopenharmony_ci ret = iter_ret; 732362306a36Sopenharmony_ci goto error; 732462306a36Sopenharmony_ci } 732562306a36Sopenharmony_ci 732662306a36Sopenharmony_ci /* 732762306a36Sopenharmony_ci * After loading chunk tree, we've got all device information, 732862306a36Sopenharmony_ci * do another round of validation checks. 732962306a36Sopenharmony_ci */ 733062306a36Sopenharmony_ci if (total_dev != fs_info->fs_devices->total_devices) { 733162306a36Sopenharmony_ci btrfs_warn(fs_info, 733262306a36Sopenharmony_ci"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", 733362306a36Sopenharmony_ci btrfs_super_num_devices(fs_info->super_copy), 733462306a36Sopenharmony_ci total_dev); 733562306a36Sopenharmony_ci fs_info->fs_devices->total_devices = total_dev; 733662306a36Sopenharmony_ci btrfs_set_super_num_devices(fs_info->super_copy, total_dev); 733762306a36Sopenharmony_ci } 733862306a36Sopenharmony_ci if (btrfs_super_total_bytes(fs_info->super_copy) < 733962306a36Sopenharmony_ci fs_info->fs_devices->total_rw_bytes) { 734062306a36Sopenharmony_ci btrfs_err(fs_info, 734162306a36Sopenharmony_ci "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 734262306a36Sopenharmony_ci btrfs_super_total_bytes(fs_info->super_copy), 734362306a36Sopenharmony_ci fs_info->fs_devices->total_rw_bytes); 734462306a36Sopenharmony_ci ret = -EINVAL; 734562306a36Sopenharmony_ci goto error; 734662306a36Sopenharmony_ci } 734762306a36Sopenharmony_ci ret = 0; 
734862306a36Sopenharmony_cierror: 734962306a36Sopenharmony_ci mutex_unlock(&uuid_mutex); 735062306a36Sopenharmony_ci 735162306a36Sopenharmony_ci btrfs_free_path(path); 735262306a36Sopenharmony_ci return ret; 735362306a36Sopenharmony_ci} 735462306a36Sopenharmony_ci 735562306a36Sopenharmony_ciint btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 735662306a36Sopenharmony_ci{ 735762306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 735862306a36Sopenharmony_ci struct btrfs_device *device; 735962306a36Sopenharmony_ci int ret = 0; 736062306a36Sopenharmony_ci 736162306a36Sopenharmony_ci fs_devices->fs_info = fs_info; 736262306a36Sopenharmony_ci 736362306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 736462306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->devices, dev_list) 736562306a36Sopenharmony_ci device->fs_info = fs_info; 736662306a36Sopenharmony_ci 736762306a36Sopenharmony_ci list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 736862306a36Sopenharmony_ci list_for_each_entry(device, &seed_devs->devices, dev_list) { 736962306a36Sopenharmony_ci device->fs_info = fs_info; 737062306a36Sopenharmony_ci ret = btrfs_get_dev_zone_info(device, false); 737162306a36Sopenharmony_ci if (ret) 737262306a36Sopenharmony_ci break; 737362306a36Sopenharmony_ci } 737462306a36Sopenharmony_ci 737562306a36Sopenharmony_ci seed_devs->fs_info = fs_info; 737662306a36Sopenharmony_ci } 737762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 737862306a36Sopenharmony_ci 737962306a36Sopenharmony_ci return ret; 738062306a36Sopenharmony_ci} 738162306a36Sopenharmony_ci 738262306a36Sopenharmony_cistatic u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 738362306a36Sopenharmony_ci const struct btrfs_dev_stats_item *ptr, 738462306a36Sopenharmony_ci int index) 738562306a36Sopenharmony_ci{ 738662306a36Sopenharmony_ci u64 val; 738762306a36Sopenharmony_ci 738862306a36Sopenharmony_ci 
read_extent_buffer(eb, &val, 738962306a36Sopenharmony_ci offsetof(struct btrfs_dev_stats_item, values) + 739062306a36Sopenharmony_ci ((unsigned long)ptr) + (index * sizeof(u64)), 739162306a36Sopenharmony_ci sizeof(val)); 739262306a36Sopenharmony_ci return val; 739362306a36Sopenharmony_ci} 739462306a36Sopenharmony_ci 739562306a36Sopenharmony_cistatic void btrfs_set_dev_stats_value(struct extent_buffer *eb, 739662306a36Sopenharmony_ci struct btrfs_dev_stats_item *ptr, 739762306a36Sopenharmony_ci int index, u64 val) 739862306a36Sopenharmony_ci{ 739962306a36Sopenharmony_ci write_extent_buffer(eb, &val, 740062306a36Sopenharmony_ci offsetof(struct btrfs_dev_stats_item, values) + 740162306a36Sopenharmony_ci ((unsigned long)ptr) + (index * sizeof(u64)), 740262306a36Sopenharmony_ci sizeof(val)); 740362306a36Sopenharmony_ci} 740462306a36Sopenharmony_ci 740562306a36Sopenharmony_cistatic int btrfs_device_init_dev_stats(struct btrfs_device *device, 740662306a36Sopenharmony_ci struct btrfs_path *path) 740762306a36Sopenharmony_ci{ 740862306a36Sopenharmony_ci struct btrfs_dev_stats_item *ptr; 740962306a36Sopenharmony_ci struct extent_buffer *eb; 741062306a36Sopenharmony_ci struct btrfs_key key; 741162306a36Sopenharmony_ci int item_size; 741262306a36Sopenharmony_ci int i, ret, slot; 741362306a36Sopenharmony_ci 741462306a36Sopenharmony_ci if (!device->fs_info->dev_root) 741562306a36Sopenharmony_ci return 0; 741662306a36Sopenharmony_ci 741762306a36Sopenharmony_ci key.objectid = BTRFS_DEV_STATS_OBJECTID; 741862306a36Sopenharmony_ci key.type = BTRFS_PERSISTENT_ITEM_KEY; 741962306a36Sopenharmony_ci key.offset = device->devid; 742062306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 742162306a36Sopenharmony_ci if (ret) { 742262306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 742362306a36Sopenharmony_ci btrfs_dev_stat_set(device, i, 0); 742462306a36Sopenharmony_ci device->dev_stats_valid = 1; 742562306a36Sopenharmony_ci 
btrfs_release_path(path); 742662306a36Sopenharmony_ci return ret < 0 ? ret : 0; 742762306a36Sopenharmony_ci } 742862306a36Sopenharmony_ci slot = path->slots[0]; 742962306a36Sopenharmony_ci eb = path->nodes[0]; 743062306a36Sopenharmony_ci item_size = btrfs_item_size(eb, slot); 743162306a36Sopenharmony_ci 743262306a36Sopenharmony_ci ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 743362306a36Sopenharmony_ci 743462306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 743562306a36Sopenharmony_ci if (item_size >= (1 + i) * sizeof(__le64)) 743662306a36Sopenharmony_ci btrfs_dev_stat_set(device, i, 743762306a36Sopenharmony_ci btrfs_dev_stats_value(eb, ptr, i)); 743862306a36Sopenharmony_ci else 743962306a36Sopenharmony_ci btrfs_dev_stat_set(device, i, 0); 744062306a36Sopenharmony_ci } 744162306a36Sopenharmony_ci 744262306a36Sopenharmony_ci device->dev_stats_valid = 1; 744362306a36Sopenharmony_ci btrfs_dev_stat_print_on_load(device); 744462306a36Sopenharmony_ci btrfs_release_path(path); 744562306a36Sopenharmony_ci 744662306a36Sopenharmony_ci return 0; 744762306a36Sopenharmony_ci} 744862306a36Sopenharmony_ci 744962306a36Sopenharmony_ciint btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 745062306a36Sopenharmony_ci{ 745162306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 745262306a36Sopenharmony_ci struct btrfs_device *device; 745362306a36Sopenharmony_ci struct btrfs_path *path = NULL; 745462306a36Sopenharmony_ci int ret = 0; 745562306a36Sopenharmony_ci 745662306a36Sopenharmony_ci path = btrfs_alloc_path(); 745762306a36Sopenharmony_ci if (!path) 745862306a36Sopenharmony_ci return -ENOMEM; 745962306a36Sopenharmony_ci 746062306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 746162306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->devices, dev_list) { 746262306a36Sopenharmony_ci ret = btrfs_device_init_dev_stats(device, path); 746362306a36Sopenharmony_ci if (ret) 
746462306a36Sopenharmony_ci goto out; 746562306a36Sopenharmony_ci } 746662306a36Sopenharmony_ci list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 746762306a36Sopenharmony_ci list_for_each_entry(device, &seed_devs->devices, dev_list) { 746862306a36Sopenharmony_ci ret = btrfs_device_init_dev_stats(device, path); 746962306a36Sopenharmony_ci if (ret) 747062306a36Sopenharmony_ci goto out; 747162306a36Sopenharmony_ci } 747262306a36Sopenharmony_ci } 747362306a36Sopenharmony_ciout: 747462306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 747562306a36Sopenharmony_ci 747662306a36Sopenharmony_ci btrfs_free_path(path); 747762306a36Sopenharmony_ci return ret; 747862306a36Sopenharmony_ci} 747962306a36Sopenharmony_ci 748062306a36Sopenharmony_cistatic int update_dev_stat_item(struct btrfs_trans_handle *trans, 748162306a36Sopenharmony_ci struct btrfs_device *device) 748262306a36Sopenharmony_ci{ 748362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 748462306a36Sopenharmony_ci struct btrfs_root *dev_root = fs_info->dev_root; 748562306a36Sopenharmony_ci struct btrfs_path *path; 748662306a36Sopenharmony_ci struct btrfs_key key; 748762306a36Sopenharmony_ci struct extent_buffer *eb; 748862306a36Sopenharmony_ci struct btrfs_dev_stats_item *ptr; 748962306a36Sopenharmony_ci int ret; 749062306a36Sopenharmony_ci int i; 749162306a36Sopenharmony_ci 749262306a36Sopenharmony_ci key.objectid = BTRFS_DEV_STATS_OBJECTID; 749362306a36Sopenharmony_ci key.type = BTRFS_PERSISTENT_ITEM_KEY; 749462306a36Sopenharmony_ci key.offset = device->devid; 749562306a36Sopenharmony_ci 749662306a36Sopenharmony_ci path = btrfs_alloc_path(); 749762306a36Sopenharmony_ci if (!path) 749862306a36Sopenharmony_ci return -ENOMEM; 749962306a36Sopenharmony_ci ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 750062306a36Sopenharmony_ci if (ret < 0) { 750162306a36Sopenharmony_ci btrfs_warn_in_rcu(fs_info, 750262306a36Sopenharmony_ci "error %d while searching for 
dev_stats item for device %s", 750362306a36Sopenharmony_ci ret, btrfs_dev_name(device)); 750462306a36Sopenharmony_ci goto out; 750562306a36Sopenharmony_ci } 750662306a36Sopenharmony_ci 750762306a36Sopenharmony_ci if (ret == 0 && 750862306a36Sopenharmony_ci btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 750962306a36Sopenharmony_ci /* need to delete old one and insert a new one */ 751062306a36Sopenharmony_ci ret = btrfs_del_item(trans, dev_root, path); 751162306a36Sopenharmony_ci if (ret != 0) { 751262306a36Sopenharmony_ci btrfs_warn_in_rcu(fs_info, 751362306a36Sopenharmony_ci "delete too small dev_stats item for device %s failed %d", 751462306a36Sopenharmony_ci btrfs_dev_name(device), ret); 751562306a36Sopenharmony_ci goto out; 751662306a36Sopenharmony_ci } 751762306a36Sopenharmony_ci ret = 1; 751862306a36Sopenharmony_ci } 751962306a36Sopenharmony_ci 752062306a36Sopenharmony_ci if (ret == 1) { 752162306a36Sopenharmony_ci /* need to insert a new item */ 752262306a36Sopenharmony_ci btrfs_release_path(path); 752362306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, dev_root, path, 752462306a36Sopenharmony_ci &key, sizeof(*ptr)); 752562306a36Sopenharmony_ci if (ret < 0) { 752662306a36Sopenharmony_ci btrfs_warn_in_rcu(fs_info, 752762306a36Sopenharmony_ci "insert dev_stats item for device %s failed %d", 752862306a36Sopenharmony_ci btrfs_dev_name(device), ret); 752962306a36Sopenharmony_ci goto out; 753062306a36Sopenharmony_ci } 753162306a36Sopenharmony_ci } 753262306a36Sopenharmony_ci 753362306a36Sopenharmony_ci eb = path->nodes[0]; 753462306a36Sopenharmony_ci ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 753562306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 753662306a36Sopenharmony_ci btrfs_set_dev_stats_value(eb, ptr, i, 753762306a36Sopenharmony_ci btrfs_dev_stat_read(device, i)); 753862306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, eb); 753962306a36Sopenharmony_ci 
754062306a36Sopenharmony_ciout: 754162306a36Sopenharmony_ci btrfs_free_path(path); 754262306a36Sopenharmony_ci return ret; 754362306a36Sopenharmony_ci} 754462306a36Sopenharmony_ci 754562306a36Sopenharmony_ci/* 754662306a36Sopenharmony_ci * called from commit_transaction. Writes all changed device stats to disk. 754762306a36Sopenharmony_ci */ 754862306a36Sopenharmony_ciint btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 754962306a36Sopenharmony_ci{ 755062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 755162306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 755262306a36Sopenharmony_ci struct btrfs_device *device; 755362306a36Sopenharmony_ci int stats_cnt; 755462306a36Sopenharmony_ci int ret = 0; 755562306a36Sopenharmony_ci 755662306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 755762306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->devices, dev_list) { 755862306a36Sopenharmony_ci stats_cnt = atomic_read(&device->dev_stats_ccnt); 755962306a36Sopenharmony_ci if (!device->dev_stats_valid || stats_cnt == 0) 756062306a36Sopenharmony_ci continue; 756162306a36Sopenharmony_ci 756262306a36Sopenharmony_ci 756362306a36Sopenharmony_ci /* 756462306a36Sopenharmony_ci * There is a LOAD-LOAD control dependency between the value of 756562306a36Sopenharmony_ci * dev_stats_ccnt and updating the on-disk values which requires 756662306a36Sopenharmony_ci * reading the in-memory counters. Such control dependencies 756762306a36Sopenharmony_ci * require explicit read memory barriers. 
756862306a36Sopenharmony_ci * 756962306a36Sopenharmony_ci * This memory barriers pairs with smp_mb__before_atomic in 757062306a36Sopenharmony_ci * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 757162306a36Sopenharmony_ci * barrier implied by atomic_xchg in 757262306a36Sopenharmony_ci * btrfs_dev_stats_read_and_reset 757362306a36Sopenharmony_ci */ 757462306a36Sopenharmony_ci smp_rmb(); 757562306a36Sopenharmony_ci 757662306a36Sopenharmony_ci ret = update_dev_stat_item(trans, device); 757762306a36Sopenharmony_ci if (!ret) 757862306a36Sopenharmony_ci atomic_sub(stats_cnt, &device->dev_stats_ccnt); 757962306a36Sopenharmony_ci } 758062306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 758162306a36Sopenharmony_ci 758262306a36Sopenharmony_ci return ret; 758362306a36Sopenharmony_ci} 758462306a36Sopenharmony_ci 758562306a36Sopenharmony_civoid btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 758662306a36Sopenharmony_ci{ 758762306a36Sopenharmony_ci btrfs_dev_stat_inc(dev, index); 758862306a36Sopenharmony_ci 758962306a36Sopenharmony_ci if (!dev->dev_stats_valid) 759062306a36Sopenharmony_ci return; 759162306a36Sopenharmony_ci btrfs_err_rl_in_rcu(dev->fs_info, 759262306a36Sopenharmony_ci "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 759362306a36Sopenharmony_ci btrfs_dev_name(dev), 759462306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 759562306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 759662306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 759762306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 759862306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 759962306a36Sopenharmony_ci} 760062306a36Sopenharmony_ci 760162306a36Sopenharmony_cistatic void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 760262306a36Sopenharmony_ci{ 760362306a36Sopenharmony_ci int i; 
760462306a36Sopenharmony_ci 760562306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 760662306a36Sopenharmony_ci if (btrfs_dev_stat_read(dev, i) != 0) 760762306a36Sopenharmony_ci break; 760862306a36Sopenharmony_ci if (i == BTRFS_DEV_STAT_VALUES_MAX) 760962306a36Sopenharmony_ci return; /* all values == 0, suppress message */ 761062306a36Sopenharmony_ci 761162306a36Sopenharmony_ci btrfs_info_in_rcu(dev->fs_info, 761262306a36Sopenharmony_ci "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 761362306a36Sopenharmony_ci btrfs_dev_name(dev), 761462306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 761562306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 761662306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 761762306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 761862306a36Sopenharmony_ci btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 761962306a36Sopenharmony_ci} 762062306a36Sopenharmony_ci 762162306a36Sopenharmony_ciint btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 762262306a36Sopenharmony_ci struct btrfs_ioctl_get_dev_stats *stats) 762362306a36Sopenharmony_ci{ 762462306a36Sopenharmony_ci BTRFS_DEV_LOOKUP_ARGS(args); 762562306a36Sopenharmony_ci struct btrfs_device *dev; 762662306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 762762306a36Sopenharmony_ci int i; 762862306a36Sopenharmony_ci 762962306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 763062306a36Sopenharmony_ci args.devid = stats->devid; 763162306a36Sopenharmony_ci dev = btrfs_find_device(fs_info->fs_devices, &args); 763262306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 763362306a36Sopenharmony_ci 763462306a36Sopenharmony_ci if (!dev) { 763562306a36Sopenharmony_ci btrfs_warn(fs_info, "get dev_stats failed, device not found"); 763662306a36Sopenharmony_ci return -ENODEV; 763762306a36Sopenharmony_ci } 
else if (!dev->dev_stats_valid) { 763862306a36Sopenharmony_ci btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 763962306a36Sopenharmony_ci return -ENODEV; 764062306a36Sopenharmony_ci } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 764162306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 764262306a36Sopenharmony_ci if (stats->nr_items > i) 764362306a36Sopenharmony_ci stats->values[i] = 764462306a36Sopenharmony_ci btrfs_dev_stat_read_and_reset(dev, i); 764562306a36Sopenharmony_ci else 764662306a36Sopenharmony_ci btrfs_dev_stat_set(dev, i, 0); 764762306a36Sopenharmony_ci } 764862306a36Sopenharmony_ci btrfs_info(fs_info, "device stats zeroed by %s (%d)", 764962306a36Sopenharmony_ci current->comm, task_pid_nr(current)); 765062306a36Sopenharmony_ci } else { 765162306a36Sopenharmony_ci for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 765262306a36Sopenharmony_ci if (stats->nr_items > i) 765362306a36Sopenharmony_ci stats->values[i] = btrfs_dev_stat_read(dev, i); 765462306a36Sopenharmony_ci } 765562306a36Sopenharmony_ci if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 765662306a36Sopenharmony_ci stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 765762306a36Sopenharmony_ci return 0; 765862306a36Sopenharmony_ci} 765962306a36Sopenharmony_ci 766062306a36Sopenharmony_ci/* 766162306a36Sopenharmony_ci * Update the size and bytes used for each device where it changed. This is 766262306a36Sopenharmony_ci * delayed since we would otherwise get errors while writing out the 766362306a36Sopenharmony_ci * superblocks. 766462306a36Sopenharmony_ci * 766562306a36Sopenharmony_ci * Must be invoked during transaction commit. 
766662306a36Sopenharmony_ci */ 766762306a36Sopenharmony_civoid btrfs_commit_device_sizes(struct btrfs_transaction *trans) 766862306a36Sopenharmony_ci{ 766962306a36Sopenharmony_ci struct btrfs_device *curr, *next; 767062306a36Sopenharmony_ci 767162306a36Sopenharmony_ci ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 767262306a36Sopenharmony_ci 767362306a36Sopenharmony_ci if (list_empty(&trans->dev_update_list)) 767462306a36Sopenharmony_ci return; 767562306a36Sopenharmony_ci 767662306a36Sopenharmony_ci /* 767762306a36Sopenharmony_ci * We don't need the device_list_mutex here. This list is owned by the 767862306a36Sopenharmony_ci * transaction and the transaction must complete before the device is 767962306a36Sopenharmony_ci * released. 768062306a36Sopenharmony_ci */ 768162306a36Sopenharmony_ci mutex_lock(&trans->fs_info->chunk_mutex); 768262306a36Sopenharmony_ci list_for_each_entry_safe(curr, next, &trans->dev_update_list, 768362306a36Sopenharmony_ci post_commit_list) { 768462306a36Sopenharmony_ci list_del_init(&curr->post_commit_list); 768562306a36Sopenharmony_ci curr->commit_total_bytes = curr->disk_total_bytes; 768662306a36Sopenharmony_ci curr->commit_bytes_used = curr->bytes_used; 768762306a36Sopenharmony_ci } 768862306a36Sopenharmony_ci mutex_unlock(&trans->fs_info->chunk_mutex); 768962306a36Sopenharmony_ci} 769062306a36Sopenharmony_ci 769162306a36Sopenharmony_ci/* 769262306a36Sopenharmony_ci * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
769362306a36Sopenharmony_ci */ 769462306a36Sopenharmony_ciint btrfs_bg_type_to_factor(u64 flags) 769562306a36Sopenharmony_ci{ 769662306a36Sopenharmony_ci const int index = btrfs_bg_flags_to_raid_index(flags); 769762306a36Sopenharmony_ci 769862306a36Sopenharmony_ci return btrfs_raid_array[index].ncopies; 769962306a36Sopenharmony_ci} 770062306a36Sopenharmony_ci 770162306a36Sopenharmony_ci 770262306a36Sopenharmony_ci 770362306a36Sopenharmony_cistatic int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 770462306a36Sopenharmony_ci u64 chunk_offset, u64 devid, 770562306a36Sopenharmony_ci u64 physical_offset, u64 physical_len) 770662306a36Sopenharmony_ci{ 770762306a36Sopenharmony_ci struct btrfs_dev_lookup_args args = { .devid = devid }; 770862306a36Sopenharmony_ci struct extent_map_tree *em_tree = &fs_info->mapping_tree; 770962306a36Sopenharmony_ci struct extent_map *em; 771062306a36Sopenharmony_ci struct map_lookup *map; 771162306a36Sopenharmony_ci struct btrfs_device *dev; 771262306a36Sopenharmony_ci u64 stripe_len; 771362306a36Sopenharmony_ci bool found = false; 771462306a36Sopenharmony_ci int ret = 0; 771562306a36Sopenharmony_ci int i; 771662306a36Sopenharmony_ci 771762306a36Sopenharmony_ci read_lock(&em_tree->lock); 771862306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, chunk_offset, 1); 771962306a36Sopenharmony_ci read_unlock(&em_tree->lock); 772062306a36Sopenharmony_ci 772162306a36Sopenharmony_ci if (!em) { 772262306a36Sopenharmony_ci btrfs_err(fs_info, 772362306a36Sopenharmony_ci"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 772462306a36Sopenharmony_ci physical_offset, devid); 772562306a36Sopenharmony_ci ret = -EUCLEAN; 772662306a36Sopenharmony_ci goto out; 772762306a36Sopenharmony_ci } 772862306a36Sopenharmony_ci 772962306a36Sopenharmony_ci map = em->map_lookup; 773062306a36Sopenharmony_ci stripe_len = btrfs_calc_stripe_length(em); 773162306a36Sopenharmony_ci if (physical_len != stripe_len) { 
773262306a36Sopenharmony_ci btrfs_err(fs_info, 773362306a36Sopenharmony_ci"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 773462306a36Sopenharmony_ci physical_offset, devid, em->start, physical_len, 773562306a36Sopenharmony_ci stripe_len); 773662306a36Sopenharmony_ci ret = -EUCLEAN; 773762306a36Sopenharmony_ci goto out; 773862306a36Sopenharmony_ci } 773962306a36Sopenharmony_ci 774062306a36Sopenharmony_ci /* 774162306a36Sopenharmony_ci * Very old mkfs.btrfs (before v4.1) will not respect the reserved 774262306a36Sopenharmony_ci * space. Although kernel can handle it without problem, better to warn 774362306a36Sopenharmony_ci * the users. 774462306a36Sopenharmony_ci */ 774562306a36Sopenharmony_ci if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) 774662306a36Sopenharmony_ci btrfs_warn(fs_info, 774762306a36Sopenharmony_ci "devid %llu physical %llu len %llu inside the reserved space", 774862306a36Sopenharmony_ci devid, physical_offset, physical_len); 774962306a36Sopenharmony_ci 775062306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) { 775162306a36Sopenharmony_ci if (map->stripes[i].dev->devid == devid && 775262306a36Sopenharmony_ci map->stripes[i].physical == physical_offset) { 775362306a36Sopenharmony_ci found = true; 775462306a36Sopenharmony_ci if (map->verified_stripes >= map->num_stripes) { 775562306a36Sopenharmony_ci btrfs_err(fs_info, 775662306a36Sopenharmony_ci "too many dev extents for chunk %llu found", 775762306a36Sopenharmony_ci em->start); 775862306a36Sopenharmony_ci ret = -EUCLEAN; 775962306a36Sopenharmony_ci goto out; 776062306a36Sopenharmony_ci } 776162306a36Sopenharmony_ci map->verified_stripes++; 776262306a36Sopenharmony_ci break; 776362306a36Sopenharmony_ci } 776462306a36Sopenharmony_ci } 776562306a36Sopenharmony_ci if (!found) { 776662306a36Sopenharmony_ci btrfs_err(fs_info, 776762306a36Sopenharmony_ci "dev extent physical offset %llu devid %llu has no corresponding chunk", 
776862306a36Sopenharmony_ci physical_offset, devid); 776962306a36Sopenharmony_ci ret = -EUCLEAN; 777062306a36Sopenharmony_ci } 777162306a36Sopenharmony_ci 777262306a36Sopenharmony_ci /* Make sure no dev extent is beyond device boundary */ 777362306a36Sopenharmony_ci dev = btrfs_find_device(fs_info->fs_devices, &args); 777462306a36Sopenharmony_ci if (!dev) { 777562306a36Sopenharmony_ci btrfs_err(fs_info, "failed to find devid %llu", devid); 777662306a36Sopenharmony_ci ret = -EUCLEAN; 777762306a36Sopenharmony_ci goto out; 777862306a36Sopenharmony_ci } 777962306a36Sopenharmony_ci 778062306a36Sopenharmony_ci if (physical_offset + physical_len > dev->disk_total_bytes) { 778162306a36Sopenharmony_ci btrfs_err(fs_info, 778262306a36Sopenharmony_ci"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 778362306a36Sopenharmony_ci devid, physical_offset, physical_len, 778462306a36Sopenharmony_ci dev->disk_total_bytes); 778562306a36Sopenharmony_ci ret = -EUCLEAN; 778662306a36Sopenharmony_ci goto out; 778762306a36Sopenharmony_ci } 778862306a36Sopenharmony_ci 778962306a36Sopenharmony_ci if (dev->zone_info) { 779062306a36Sopenharmony_ci u64 zone_size = dev->zone_info->zone_size; 779162306a36Sopenharmony_ci 779262306a36Sopenharmony_ci if (!IS_ALIGNED(physical_offset, zone_size) || 779362306a36Sopenharmony_ci !IS_ALIGNED(physical_len, zone_size)) { 779462306a36Sopenharmony_ci btrfs_err(fs_info, 779562306a36Sopenharmony_ci"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", 779662306a36Sopenharmony_ci devid, physical_offset, physical_len); 779762306a36Sopenharmony_ci ret = -EUCLEAN; 779862306a36Sopenharmony_ci goto out; 779962306a36Sopenharmony_ci } 780062306a36Sopenharmony_ci } 780162306a36Sopenharmony_ci 780262306a36Sopenharmony_ciout: 780362306a36Sopenharmony_ci free_extent_map(em); 780462306a36Sopenharmony_ci return ret; 780562306a36Sopenharmony_ci} 780662306a36Sopenharmony_ci 780762306a36Sopenharmony_cistatic 
int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 780862306a36Sopenharmony_ci{ 780962306a36Sopenharmony_ci struct extent_map_tree *em_tree = &fs_info->mapping_tree; 781062306a36Sopenharmony_ci struct extent_map *em; 781162306a36Sopenharmony_ci struct rb_node *node; 781262306a36Sopenharmony_ci int ret = 0; 781362306a36Sopenharmony_ci 781462306a36Sopenharmony_ci read_lock(&em_tree->lock); 781562306a36Sopenharmony_ci for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 781662306a36Sopenharmony_ci em = rb_entry(node, struct extent_map, rb_node); 781762306a36Sopenharmony_ci if (em->map_lookup->num_stripes != 781862306a36Sopenharmony_ci em->map_lookup->verified_stripes) { 781962306a36Sopenharmony_ci btrfs_err(fs_info, 782062306a36Sopenharmony_ci "chunk %llu has missing dev extent, have %d expect %d", 782162306a36Sopenharmony_ci em->start, em->map_lookup->verified_stripes, 782262306a36Sopenharmony_ci em->map_lookup->num_stripes); 782362306a36Sopenharmony_ci ret = -EUCLEAN; 782462306a36Sopenharmony_ci goto out; 782562306a36Sopenharmony_ci } 782662306a36Sopenharmony_ci } 782762306a36Sopenharmony_ciout: 782862306a36Sopenharmony_ci read_unlock(&em_tree->lock); 782962306a36Sopenharmony_ci return ret; 783062306a36Sopenharmony_ci} 783162306a36Sopenharmony_ci 783262306a36Sopenharmony_ci/* 783362306a36Sopenharmony_ci * Ensure that all dev extents are mapped to correct chunk, otherwise 783462306a36Sopenharmony_ci * later chunk allocation/free would cause unexpected behavior. 783562306a36Sopenharmony_ci * 783662306a36Sopenharmony_ci * NOTE: This will iterate through the whole device tree, which should be of 783762306a36Sopenharmony_ci * the same size level as the chunk tree. This slightly increases mount time. 
783862306a36Sopenharmony_ci */ 783962306a36Sopenharmony_ciint btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 784062306a36Sopenharmony_ci{ 784162306a36Sopenharmony_ci struct btrfs_path *path; 784262306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 784362306a36Sopenharmony_ci struct btrfs_key key; 784462306a36Sopenharmony_ci u64 prev_devid = 0; 784562306a36Sopenharmony_ci u64 prev_dev_ext_end = 0; 784662306a36Sopenharmony_ci int ret = 0; 784762306a36Sopenharmony_ci 784862306a36Sopenharmony_ci /* 784962306a36Sopenharmony_ci * We don't have a dev_root because we mounted with ignorebadroots and 785062306a36Sopenharmony_ci * failed to load the root, so we want to skip the verification in this 785162306a36Sopenharmony_ci * case for sure. 785262306a36Sopenharmony_ci * 785362306a36Sopenharmony_ci * However if the dev root is fine, but the tree itself is corrupted 785462306a36Sopenharmony_ci * we'd still fail to mount. This verification is only to make sure 785562306a36Sopenharmony_ci * writes can happen safely, so instead just bypass this check 785662306a36Sopenharmony_ci * completely in the case of IGNOREBADROOTS. 
785762306a36Sopenharmony_ci */ 785862306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) 785962306a36Sopenharmony_ci return 0; 786062306a36Sopenharmony_ci 786162306a36Sopenharmony_ci key.objectid = 1; 786262306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 786362306a36Sopenharmony_ci key.offset = 0; 786462306a36Sopenharmony_ci 786562306a36Sopenharmony_ci path = btrfs_alloc_path(); 786662306a36Sopenharmony_ci if (!path) 786762306a36Sopenharmony_ci return -ENOMEM; 786862306a36Sopenharmony_ci 786962306a36Sopenharmony_ci path->reada = READA_FORWARD; 787062306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 787162306a36Sopenharmony_ci if (ret < 0) 787262306a36Sopenharmony_ci goto out; 787362306a36Sopenharmony_ci 787462306a36Sopenharmony_ci if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 787562306a36Sopenharmony_ci ret = btrfs_next_leaf(root, path); 787662306a36Sopenharmony_ci if (ret < 0) 787762306a36Sopenharmony_ci goto out; 787862306a36Sopenharmony_ci /* No dev extents at all? 
Not good */ 787962306a36Sopenharmony_ci if (ret > 0) { 788062306a36Sopenharmony_ci ret = -EUCLEAN; 788162306a36Sopenharmony_ci goto out; 788262306a36Sopenharmony_ci } 788362306a36Sopenharmony_ci } 788462306a36Sopenharmony_ci while (1) { 788562306a36Sopenharmony_ci struct extent_buffer *leaf = path->nodes[0]; 788662306a36Sopenharmony_ci struct btrfs_dev_extent *dext; 788762306a36Sopenharmony_ci int slot = path->slots[0]; 788862306a36Sopenharmony_ci u64 chunk_offset; 788962306a36Sopenharmony_ci u64 physical_offset; 789062306a36Sopenharmony_ci u64 physical_len; 789162306a36Sopenharmony_ci u64 devid; 789262306a36Sopenharmony_ci 789362306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, slot); 789462306a36Sopenharmony_ci if (key.type != BTRFS_DEV_EXTENT_KEY) 789562306a36Sopenharmony_ci break; 789662306a36Sopenharmony_ci devid = key.objectid; 789762306a36Sopenharmony_ci physical_offset = key.offset; 789862306a36Sopenharmony_ci 789962306a36Sopenharmony_ci dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 790062306a36Sopenharmony_ci chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 790162306a36Sopenharmony_ci physical_len = btrfs_dev_extent_length(leaf, dext); 790262306a36Sopenharmony_ci 790362306a36Sopenharmony_ci /* Check if this dev extent overlaps with the previous one */ 790462306a36Sopenharmony_ci if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 790562306a36Sopenharmony_ci btrfs_err(fs_info, 790662306a36Sopenharmony_ci"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 790762306a36Sopenharmony_ci devid, physical_offset, prev_dev_ext_end); 790862306a36Sopenharmony_ci ret = -EUCLEAN; 790962306a36Sopenharmony_ci goto out; 791062306a36Sopenharmony_ci } 791162306a36Sopenharmony_ci 791262306a36Sopenharmony_ci ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 791362306a36Sopenharmony_ci physical_offset, physical_len); 791462306a36Sopenharmony_ci if (ret < 0) 791562306a36Sopenharmony_ci goto 
out; 791662306a36Sopenharmony_ci prev_devid = devid; 791762306a36Sopenharmony_ci prev_dev_ext_end = physical_offset + physical_len; 791862306a36Sopenharmony_ci 791962306a36Sopenharmony_ci ret = btrfs_next_item(root, path); 792062306a36Sopenharmony_ci if (ret < 0) 792162306a36Sopenharmony_ci goto out; 792262306a36Sopenharmony_ci if (ret > 0) { 792362306a36Sopenharmony_ci ret = 0; 792462306a36Sopenharmony_ci break; 792562306a36Sopenharmony_ci } 792662306a36Sopenharmony_ci } 792762306a36Sopenharmony_ci 792862306a36Sopenharmony_ci /* Ensure all chunks have corresponding dev extents */ 792962306a36Sopenharmony_ci ret = verify_chunk_dev_extent_mapping(fs_info); 793062306a36Sopenharmony_ciout: 793162306a36Sopenharmony_ci btrfs_free_path(path); 793262306a36Sopenharmony_ci return ret; 793362306a36Sopenharmony_ci} 793462306a36Sopenharmony_ci 793562306a36Sopenharmony_ci/* 793662306a36Sopenharmony_ci * Check whether the given block group or device is pinned by any inode being 793762306a36Sopenharmony_ci * used as a swapfile. 
793862306a36Sopenharmony_ci */ 793962306a36Sopenharmony_cibool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 794062306a36Sopenharmony_ci{ 794162306a36Sopenharmony_ci struct btrfs_swapfile_pin *sp; 794262306a36Sopenharmony_ci struct rb_node *node; 794362306a36Sopenharmony_ci 794462306a36Sopenharmony_ci spin_lock(&fs_info->swapfile_pins_lock); 794562306a36Sopenharmony_ci node = fs_info->swapfile_pins.rb_node; 794662306a36Sopenharmony_ci while (node) { 794762306a36Sopenharmony_ci sp = rb_entry(node, struct btrfs_swapfile_pin, node); 794862306a36Sopenharmony_ci if (ptr < sp->ptr) 794962306a36Sopenharmony_ci node = node->rb_left; 795062306a36Sopenharmony_ci else if (ptr > sp->ptr) 795162306a36Sopenharmony_ci node = node->rb_right; 795262306a36Sopenharmony_ci else 795362306a36Sopenharmony_ci break; 795462306a36Sopenharmony_ci } 795562306a36Sopenharmony_ci spin_unlock(&fs_info->swapfile_pins_lock); 795662306a36Sopenharmony_ci return node != NULL; 795762306a36Sopenharmony_ci} 795862306a36Sopenharmony_ci 795962306a36Sopenharmony_cistatic int relocating_repair_kthread(void *data) 796062306a36Sopenharmony_ci{ 796162306a36Sopenharmony_ci struct btrfs_block_group *cache = data; 796262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 796362306a36Sopenharmony_ci u64 target; 796462306a36Sopenharmony_ci int ret = 0; 796562306a36Sopenharmony_ci 796662306a36Sopenharmony_ci target = cache->start; 796762306a36Sopenharmony_ci btrfs_put_block_group(cache); 796862306a36Sopenharmony_ci 796962306a36Sopenharmony_ci sb_start_write(fs_info->sb); 797062306a36Sopenharmony_ci if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 797162306a36Sopenharmony_ci btrfs_info(fs_info, 797262306a36Sopenharmony_ci "zoned: skip relocating block group %llu to repair: EBUSY", 797362306a36Sopenharmony_ci target); 797462306a36Sopenharmony_ci sb_end_write(fs_info->sb); 797562306a36Sopenharmony_ci return -EBUSY; 797662306a36Sopenharmony_ci } 797762306a36Sopenharmony_ci 
797862306a36Sopenharmony_ci mutex_lock(&fs_info->reclaim_bgs_lock); 797962306a36Sopenharmony_ci 798062306a36Sopenharmony_ci /* Ensure block group still exists */ 798162306a36Sopenharmony_ci cache = btrfs_lookup_block_group(fs_info, target); 798262306a36Sopenharmony_ci if (!cache) 798362306a36Sopenharmony_ci goto out; 798462306a36Sopenharmony_ci 798562306a36Sopenharmony_ci if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) 798662306a36Sopenharmony_ci goto out; 798762306a36Sopenharmony_ci 798862306a36Sopenharmony_ci ret = btrfs_may_alloc_data_chunk(fs_info, target); 798962306a36Sopenharmony_ci if (ret < 0) 799062306a36Sopenharmony_ci goto out; 799162306a36Sopenharmony_ci 799262306a36Sopenharmony_ci btrfs_info(fs_info, 799362306a36Sopenharmony_ci "zoned: relocating block group %llu to repair IO failure", 799462306a36Sopenharmony_ci target); 799562306a36Sopenharmony_ci ret = btrfs_relocate_chunk(fs_info, target); 799662306a36Sopenharmony_ci 799762306a36Sopenharmony_ciout: 799862306a36Sopenharmony_ci if (cache) 799962306a36Sopenharmony_ci btrfs_put_block_group(cache); 800062306a36Sopenharmony_ci mutex_unlock(&fs_info->reclaim_bgs_lock); 800162306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 800262306a36Sopenharmony_ci sb_end_write(fs_info->sb); 800362306a36Sopenharmony_ci 800462306a36Sopenharmony_ci return ret; 800562306a36Sopenharmony_ci} 800662306a36Sopenharmony_ci 800762306a36Sopenharmony_cibool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) 800862306a36Sopenharmony_ci{ 800962306a36Sopenharmony_ci struct btrfs_block_group *cache; 801062306a36Sopenharmony_ci 801162306a36Sopenharmony_ci if (!btrfs_is_zoned(fs_info)) 801262306a36Sopenharmony_ci return false; 801362306a36Sopenharmony_ci 801462306a36Sopenharmony_ci /* Do not attempt to repair in degraded state */ 801562306a36Sopenharmony_ci if (btrfs_test_opt(fs_info, DEGRADED)) 801662306a36Sopenharmony_ci return true; 801762306a36Sopenharmony_ci 801862306a36Sopenharmony_ci 
cache = btrfs_lookup_block_group(fs_info, logical); 801962306a36Sopenharmony_ci if (!cache) 802062306a36Sopenharmony_ci return true; 802162306a36Sopenharmony_ci 802262306a36Sopenharmony_ci if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { 802362306a36Sopenharmony_ci btrfs_put_block_group(cache); 802462306a36Sopenharmony_ci return true; 802562306a36Sopenharmony_ci } 802662306a36Sopenharmony_ci 802762306a36Sopenharmony_ci kthread_run(relocating_repair_kthread, cache, 802862306a36Sopenharmony_ci "btrfs-relocating-repair"); 802962306a36Sopenharmony_ci 803062306a36Sopenharmony_ci return true; 803162306a36Sopenharmony_ci} 803262306a36Sopenharmony_ci 803362306a36Sopenharmony_cistatic void map_raid56_repair_block(struct btrfs_io_context *bioc, 803462306a36Sopenharmony_ci struct btrfs_io_stripe *smap, 803562306a36Sopenharmony_ci u64 logical) 803662306a36Sopenharmony_ci{ 803762306a36Sopenharmony_ci int data_stripes = nr_bioc_data_stripes(bioc); 803862306a36Sopenharmony_ci int i; 803962306a36Sopenharmony_ci 804062306a36Sopenharmony_ci for (i = 0; i < data_stripes; i++) { 804162306a36Sopenharmony_ci u64 stripe_start = bioc->full_stripe_logical + 804262306a36Sopenharmony_ci btrfs_stripe_nr_to_offset(i); 804362306a36Sopenharmony_ci 804462306a36Sopenharmony_ci if (logical >= stripe_start && 804562306a36Sopenharmony_ci logical < stripe_start + BTRFS_STRIPE_LEN) 804662306a36Sopenharmony_ci break; 804762306a36Sopenharmony_ci } 804862306a36Sopenharmony_ci ASSERT(i < data_stripes); 804962306a36Sopenharmony_ci smap->dev = bioc->stripes[i].dev; 805062306a36Sopenharmony_ci smap->physical = bioc->stripes[i].physical + 805162306a36Sopenharmony_ci ((logical - bioc->full_stripe_logical) & 805262306a36Sopenharmony_ci BTRFS_STRIPE_LEN_MASK); 805362306a36Sopenharmony_ci} 805462306a36Sopenharmony_ci 805562306a36Sopenharmony_ci/* 805662306a36Sopenharmony_ci * Map a repair write into a single device. 
805762306a36Sopenharmony_ci * 805862306a36Sopenharmony_ci * A repair write is triggered by read time repair or scrub, which would only 805962306a36Sopenharmony_ci * update the contents of a single device. 806062306a36Sopenharmony_ci * Not update any other mirrors nor go through RMW path. 806162306a36Sopenharmony_ci * 806262306a36Sopenharmony_ci * Callers should ensure: 806362306a36Sopenharmony_ci * 806462306a36Sopenharmony_ci * - Call btrfs_bio_counter_inc_blocked() first 806562306a36Sopenharmony_ci * - The range does not cross stripe boundary 806662306a36Sopenharmony_ci * - Has a valid @mirror_num passed in. 806762306a36Sopenharmony_ci */ 806862306a36Sopenharmony_ciint btrfs_map_repair_block(struct btrfs_fs_info *fs_info, 806962306a36Sopenharmony_ci struct btrfs_io_stripe *smap, u64 logical, 807062306a36Sopenharmony_ci u32 length, int mirror_num) 807162306a36Sopenharmony_ci{ 807262306a36Sopenharmony_ci struct btrfs_io_context *bioc = NULL; 807362306a36Sopenharmony_ci u64 map_length = length; 807462306a36Sopenharmony_ci int mirror_ret = mirror_num; 807562306a36Sopenharmony_ci int ret; 807662306a36Sopenharmony_ci 807762306a36Sopenharmony_ci ASSERT(mirror_num > 0); 807862306a36Sopenharmony_ci 807962306a36Sopenharmony_ci ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, 808062306a36Sopenharmony_ci &bioc, smap, &mirror_ret, true); 808162306a36Sopenharmony_ci if (ret < 0) 808262306a36Sopenharmony_ci return ret; 808362306a36Sopenharmony_ci 808462306a36Sopenharmony_ci /* The map range should not cross stripe boundary. */ 808562306a36Sopenharmony_ci ASSERT(map_length >= length); 808662306a36Sopenharmony_ci 808762306a36Sopenharmony_ci /* Already mapped to single stripe. */ 808862306a36Sopenharmony_ci if (!bioc) 808962306a36Sopenharmony_ci goto out; 809062306a36Sopenharmony_ci 809162306a36Sopenharmony_ci /* Map the RAID56 multi-stripe writes to a single one. 
*/ 809262306a36Sopenharmony_ci if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 809362306a36Sopenharmony_ci map_raid56_repair_block(bioc, smap, logical); 809462306a36Sopenharmony_ci goto out; 809562306a36Sopenharmony_ci } 809662306a36Sopenharmony_ci 809762306a36Sopenharmony_ci ASSERT(mirror_num <= bioc->num_stripes); 809862306a36Sopenharmony_ci smap->dev = bioc->stripes[mirror_num - 1].dev; 809962306a36Sopenharmony_ci smap->physical = bioc->stripes[mirror_num - 1].physical; 810062306a36Sopenharmony_ciout: 810162306a36Sopenharmony_ci btrfs_put_bioc(bioc); 810262306a36Sopenharmony_ci ASSERT(smap->dev); 810362306a36Sopenharmony_ci return 0; 810462306a36Sopenharmony_ci} 8105