162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) STRATO AG 2012. All rights reserved. 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci#include <linux/sched.h> 762306a36Sopenharmony_ci#include <linux/bio.h> 862306a36Sopenharmony_ci#include <linux/slab.h> 962306a36Sopenharmony_ci#include <linux/blkdev.h> 1062306a36Sopenharmony_ci#include <linux/kthread.h> 1162306a36Sopenharmony_ci#include <linux/math64.h> 1262306a36Sopenharmony_ci#include "misc.h" 1362306a36Sopenharmony_ci#include "ctree.h" 1462306a36Sopenharmony_ci#include "extent_map.h" 1562306a36Sopenharmony_ci#include "disk-io.h" 1662306a36Sopenharmony_ci#include "transaction.h" 1762306a36Sopenharmony_ci#include "print-tree.h" 1862306a36Sopenharmony_ci#include "volumes.h" 1962306a36Sopenharmony_ci#include "async-thread.h" 2062306a36Sopenharmony_ci#include "check-integrity.h" 2162306a36Sopenharmony_ci#include "dev-replace.h" 2262306a36Sopenharmony_ci#include "sysfs.h" 2362306a36Sopenharmony_ci#include "zoned.h" 2462306a36Sopenharmony_ci#include "block-group.h" 2562306a36Sopenharmony_ci#include "fs.h" 2662306a36Sopenharmony_ci#include "accessors.h" 2762306a36Sopenharmony_ci#include "scrub.h" 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci/* 3062306a36Sopenharmony_ci * Device replace overview 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci * [Objective] 3362306a36Sopenharmony_ci * To copy all extents (both new and on-disk) from source device to target 3462306a36Sopenharmony_ci * device, while still keeping the filesystem read-write. 
*
 * [Method]
 * There are two main methods involved:
 *
 * - Write duplication
 *
 *   All new writes will be written to both target and source devices, so even
 *   if replace gets canceled, the source device still contains up-to-date data.
 *
 *   Location:	handle_ops_on_dev_replace() from btrfs_map_block()
 *   Start:	btrfs_dev_replace_start()
 *   End:	btrfs_dev_replace_finishing()
 *   Content:	Latest data/metadata
 *
 * - Copy existing extents
 *
 *   This happens by re-using scrub facility, as scrub also iterates through
 *   existing extents from commit root.
 *
 *   Location:	scrub_write_block_to_dev_replace() from
 *		scrub_block_complete()
 *   Content:	Data/meta from commit root.
 *
 * Due to the content difference, we need to avoid nocow write when dev-replace
 * is happening.  This is done by marking the block group read-only and waiting
 * for NOCOW writes.
 *
 * After replace is done, the finishing part is done by swapping the target and
 * source devices.
6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * Location: btrfs_dev_replace_update_device_in_mapping_tree() from 6662306a36Sopenharmony_ci * btrfs_dev_replace_finishing() 6762306a36Sopenharmony_ci */ 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_cistatic int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 7062306a36Sopenharmony_ci int scrub_ret); 7162306a36Sopenharmony_cistatic int btrfs_dev_replace_kthread(void *data); 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_ciint btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) 7462306a36Sopenharmony_ci{ 7562306a36Sopenharmony_ci struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; 7662306a36Sopenharmony_ci struct btrfs_key key; 7762306a36Sopenharmony_ci struct btrfs_root *dev_root = fs_info->dev_root; 7862306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 7962306a36Sopenharmony_ci struct extent_buffer *eb; 8062306a36Sopenharmony_ci int slot; 8162306a36Sopenharmony_ci int ret = 0; 8262306a36Sopenharmony_ci struct btrfs_path *path = NULL; 8362306a36Sopenharmony_ci int item_size; 8462306a36Sopenharmony_ci struct btrfs_dev_replace_item *ptr; 8562306a36Sopenharmony_ci u64 src_devid; 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci if (!dev_root) 8862306a36Sopenharmony_ci return 0; 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci path = btrfs_alloc_path(); 9162306a36Sopenharmony_ci if (!path) { 9262306a36Sopenharmony_ci ret = -ENOMEM; 9362306a36Sopenharmony_ci goto out; 9462306a36Sopenharmony_ci } 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci key.objectid = 0; 9762306a36Sopenharmony_ci key.type = BTRFS_DEV_REPLACE_KEY; 9862306a36Sopenharmony_ci key.offset = 0; 9962306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 10062306a36Sopenharmony_ci if (ret) { 10162306a36Sopenharmony_cino_valid_dev_replace_entry_found: 10262306a36Sopenharmony_ci /* 10362306a36Sopenharmony_ci * We don't have a replace item or it's 
corrupted. If there is 10462306a36Sopenharmony_ci * a replace target, fail the mount. 10562306a36Sopenharmony_ci */ 10662306a36Sopenharmony_ci if (btrfs_find_device(fs_info->fs_devices, &args)) { 10762306a36Sopenharmony_ci btrfs_err(fs_info, 10862306a36Sopenharmony_ci "found replace target device without a valid replace item"); 10962306a36Sopenharmony_ci ret = -EUCLEAN; 11062306a36Sopenharmony_ci goto out; 11162306a36Sopenharmony_ci } 11262306a36Sopenharmony_ci ret = 0; 11362306a36Sopenharmony_ci dev_replace->replace_state = 11462306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 11562306a36Sopenharmony_ci dev_replace->cont_reading_from_srcdev_mode = 11662306a36Sopenharmony_ci BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; 11762306a36Sopenharmony_ci dev_replace->time_started = 0; 11862306a36Sopenharmony_ci dev_replace->time_stopped = 0; 11962306a36Sopenharmony_ci atomic64_set(&dev_replace->num_write_errors, 0); 12062306a36Sopenharmony_ci atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 12162306a36Sopenharmony_ci dev_replace->cursor_left = 0; 12262306a36Sopenharmony_ci dev_replace->committed_cursor_left = 0; 12362306a36Sopenharmony_ci dev_replace->cursor_left_last_write_of_item = 0; 12462306a36Sopenharmony_ci dev_replace->cursor_right = 0; 12562306a36Sopenharmony_ci dev_replace->srcdev = NULL; 12662306a36Sopenharmony_ci dev_replace->tgtdev = NULL; 12762306a36Sopenharmony_ci dev_replace->is_valid = 0; 12862306a36Sopenharmony_ci dev_replace->item_needs_writeback = 0; 12962306a36Sopenharmony_ci goto out; 13062306a36Sopenharmony_ci } 13162306a36Sopenharmony_ci slot = path->slots[0]; 13262306a36Sopenharmony_ci eb = path->nodes[0]; 13362306a36Sopenharmony_ci item_size = btrfs_item_size(eb, slot); 13462306a36Sopenharmony_ci ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 13562306a36Sopenharmony_ci 13662306a36Sopenharmony_ci if (item_size != sizeof(struct btrfs_dev_replace_item)) { 13762306a36Sopenharmony_ci 
btrfs_warn(fs_info, 13862306a36Sopenharmony_ci "dev_replace entry found has unexpected size, ignore entry"); 13962306a36Sopenharmony_ci goto no_valid_dev_replace_entry_found; 14062306a36Sopenharmony_ci } 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci src_devid = btrfs_dev_replace_src_devid(eb, ptr); 14362306a36Sopenharmony_ci dev_replace->cont_reading_from_srcdev_mode = 14462306a36Sopenharmony_ci btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); 14562306a36Sopenharmony_ci dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); 14662306a36Sopenharmony_ci dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); 14762306a36Sopenharmony_ci dev_replace->time_stopped = 14862306a36Sopenharmony_ci btrfs_dev_replace_time_stopped(eb, ptr); 14962306a36Sopenharmony_ci atomic64_set(&dev_replace->num_write_errors, 15062306a36Sopenharmony_ci btrfs_dev_replace_num_write_errors(eb, ptr)); 15162306a36Sopenharmony_ci atomic64_set(&dev_replace->num_uncorrectable_read_errors, 15262306a36Sopenharmony_ci btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); 15362306a36Sopenharmony_ci dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); 15462306a36Sopenharmony_ci dev_replace->committed_cursor_left = dev_replace->cursor_left; 15562306a36Sopenharmony_ci dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; 15662306a36Sopenharmony_ci dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); 15762306a36Sopenharmony_ci dev_replace->is_valid = 1; 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci dev_replace->item_needs_writeback = 0; 16062306a36Sopenharmony_ci switch (dev_replace->replace_state) { 16162306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 16262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 16362306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 16462306a36Sopenharmony_ci /* 16562306a36Sopenharmony_ci * We don't have an active 
replace item but if there is a 16662306a36Sopenharmony_ci * replace target, fail the mount. 16762306a36Sopenharmony_ci */ 16862306a36Sopenharmony_ci if (btrfs_find_device(fs_info->fs_devices, &args)) { 16962306a36Sopenharmony_ci btrfs_err(fs_info, 17062306a36Sopenharmony_ci"replace without active item, run 'device scan --forget' on the target device"); 17162306a36Sopenharmony_ci ret = -EUCLEAN; 17262306a36Sopenharmony_ci } else { 17362306a36Sopenharmony_ci dev_replace->srcdev = NULL; 17462306a36Sopenharmony_ci dev_replace->tgtdev = NULL; 17562306a36Sopenharmony_ci } 17662306a36Sopenharmony_ci break; 17762306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 17862306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 17962306a36Sopenharmony_ci dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); 18062306a36Sopenharmony_ci args.devid = src_devid; 18162306a36Sopenharmony_ci dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci /* 18462306a36Sopenharmony_ci * allow 'btrfs dev replace_cancel' if src/tgt device is 18562306a36Sopenharmony_ci * missing 18662306a36Sopenharmony_ci */ 18762306a36Sopenharmony_ci if (!dev_replace->srcdev && 18862306a36Sopenharmony_ci !btrfs_test_opt(fs_info, DEGRADED)) { 18962306a36Sopenharmony_ci ret = -EIO; 19062306a36Sopenharmony_ci btrfs_warn(fs_info, 19162306a36Sopenharmony_ci "cannot mount because device replace operation is ongoing and"); 19262306a36Sopenharmony_ci btrfs_warn(fs_info, 19362306a36Sopenharmony_ci "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 19462306a36Sopenharmony_ci src_devid); 19562306a36Sopenharmony_ci } 19662306a36Sopenharmony_ci if (!dev_replace->tgtdev && 19762306a36Sopenharmony_ci !btrfs_test_opt(fs_info, DEGRADED)) { 19862306a36Sopenharmony_ci ret = -EIO; 19962306a36Sopenharmony_ci btrfs_warn(fs_info, 20062306a36Sopenharmony_ci "cannot mount because device replace operation is ongoing 
and"); 20162306a36Sopenharmony_ci btrfs_warn(fs_info, 20262306a36Sopenharmony_ci "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 20362306a36Sopenharmony_ci BTRFS_DEV_REPLACE_DEVID); 20462306a36Sopenharmony_ci } 20562306a36Sopenharmony_ci if (dev_replace->tgtdev) { 20662306a36Sopenharmony_ci if (dev_replace->srcdev) { 20762306a36Sopenharmony_ci dev_replace->tgtdev->total_bytes = 20862306a36Sopenharmony_ci dev_replace->srcdev->total_bytes; 20962306a36Sopenharmony_ci dev_replace->tgtdev->disk_total_bytes = 21062306a36Sopenharmony_ci dev_replace->srcdev->disk_total_bytes; 21162306a36Sopenharmony_ci dev_replace->tgtdev->commit_total_bytes = 21262306a36Sopenharmony_ci dev_replace->srcdev->commit_total_bytes; 21362306a36Sopenharmony_ci dev_replace->tgtdev->bytes_used = 21462306a36Sopenharmony_ci dev_replace->srcdev->bytes_used; 21562306a36Sopenharmony_ci dev_replace->tgtdev->commit_bytes_used = 21662306a36Sopenharmony_ci dev_replace->srcdev->commit_bytes_used; 21762306a36Sopenharmony_ci } 21862306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_REPLACE_TGT, 21962306a36Sopenharmony_ci &dev_replace->tgtdev->dev_state); 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci WARN_ON(fs_info->fs_devices->rw_devices == 0); 22262306a36Sopenharmony_ci dev_replace->tgtdev->io_width = fs_info->sectorsize; 22362306a36Sopenharmony_ci dev_replace->tgtdev->io_align = fs_info->sectorsize; 22462306a36Sopenharmony_ci dev_replace->tgtdev->sector_size = fs_info->sectorsize; 22562306a36Sopenharmony_ci dev_replace->tgtdev->fs_info = fs_info; 22662306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 22762306a36Sopenharmony_ci &dev_replace->tgtdev->dev_state); 22862306a36Sopenharmony_ci } 22962306a36Sopenharmony_ci break; 23062306a36Sopenharmony_ci } 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ciout: 23362306a36Sopenharmony_ci btrfs_free_path(path); 23462306a36Sopenharmony_ci return ret; 23562306a36Sopenharmony_ci} 23662306a36Sopenharmony_ci 
23762306a36Sopenharmony_ci/* 23862306a36Sopenharmony_ci * Initialize a new device for device replace target from a given source dev 23962306a36Sopenharmony_ci * and path. 24062306a36Sopenharmony_ci * 24162306a36Sopenharmony_ci * Return 0 and new device in @device_out, otherwise return < 0 24262306a36Sopenharmony_ci */ 24362306a36Sopenharmony_cistatic int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 24462306a36Sopenharmony_ci const char *device_path, 24562306a36Sopenharmony_ci struct btrfs_device *srcdev, 24662306a36Sopenharmony_ci struct btrfs_device **device_out) 24762306a36Sopenharmony_ci{ 24862306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 24962306a36Sopenharmony_ci struct btrfs_device *device; 25062306a36Sopenharmony_ci struct block_device *bdev; 25162306a36Sopenharmony_ci u64 devid = BTRFS_DEV_REPLACE_DEVID; 25262306a36Sopenharmony_ci int ret = 0; 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci *device_out = NULL; 25562306a36Sopenharmony_ci if (srcdev->fs_devices->seeding) { 25662306a36Sopenharmony_ci btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 25762306a36Sopenharmony_ci return -EINVAL; 25862306a36Sopenharmony_ci } 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, 26162306a36Sopenharmony_ci fs_info->bdev_holder, NULL); 26262306a36Sopenharmony_ci if (IS_ERR(bdev)) { 26362306a36Sopenharmony_ci btrfs_err(fs_info, "target device %s is invalid!", device_path); 26462306a36Sopenharmony_ci return PTR_ERR(bdev); 26562306a36Sopenharmony_ci } 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci if (!btrfs_check_device_zone_type(fs_info, bdev)) { 26862306a36Sopenharmony_ci btrfs_err(fs_info, 26962306a36Sopenharmony_ci "dev-replace: zoned type of target device mismatch with filesystem"); 27062306a36Sopenharmony_ci ret = -EINVAL; 27162306a36Sopenharmony_ci goto error; 27262306a36Sopenharmony_ci } 27362306a36Sopenharmony_ci 
27462306a36Sopenharmony_ci sync_blockdev(bdev); 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci list_for_each_entry(device, &fs_devices->devices, dev_list) { 27762306a36Sopenharmony_ci if (device->bdev == bdev) { 27862306a36Sopenharmony_ci btrfs_err(fs_info, 27962306a36Sopenharmony_ci "target device is in the filesystem!"); 28062306a36Sopenharmony_ci ret = -EEXIST; 28162306a36Sopenharmony_ci goto error; 28262306a36Sopenharmony_ci } 28362306a36Sopenharmony_ci } 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { 28762306a36Sopenharmony_ci btrfs_err(fs_info, 28862306a36Sopenharmony_ci "target device is smaller than source device!"); 28962306a36Sopenharmony_ci ret = -EINVAL; 29062306a36Sopenharmony_ci goto error; 29162306a36Sopenharmony_ci } 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci 29462306a36Sopenharmony_ci device = btrfs_alloc_device(NULL, &devid, NULL, device_path); 29562306a36Sopenharmony_ci if (IS_ERR(device)) { 29662306a36Sopenharmony_ci ret = PTR_ERR(device); 29762306a36Sopenharmony_ci goto error; 29862306a36Sopenharmony_ci } 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci ret = lookup_bdev(device_path, &device->devt); 30162306a36Sopenharmony_ci if (ret) 30262306a36Sopenharmony_ci goto error; 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 30562306a36Sopenharmony_ci device->generation = 0; 30662306a36Sopenharmony_ci device->io_width = fs_info->sectorsize; 30762306a36Sopenharmony_ci device->io_align = fs_info->sectorsize; 30862306a36Sopenharmony_ci device->sector_size = fs_info->sectorsize; 30962306a36Sopenharmony_ci device->total_bytes = btrfs_device_get_total_bytes(srcdev); 31062306a36Sopenharmony_ci device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 31162306a36Sopenharmony_ci device->bytes_used = btrfs_device_get_bytes_used(srcdev); 31262306a36Sopenharmony_ci 
device->commit_total_bytes = srcdev->commit_total_bytes; 31362306a36Sopenharmony_ci device->commit_bytes_used = device->bytes_used; 31462306a36Sopenharmony_ci device->fs_info = fs_info; 31562306a36Sopenharmony_ci device->bdev = bdev; 31662306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 31762306a36Sopenharmony_ci set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 31862306a36Sopenharmony_ci device->holder = fs_info->bdev_holder; 31962306a36Sopenharmony_ci device->dev_stats_valid = 1; 32062306a36Sopenharmony_ci set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 32162306a36Sopenharmony_ci device->fs_devices = fs_devices; 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci ret = btrfs_get_dev_zone_info(device, false); 32462306a36Sopenharmony_ci if (ret) 32562306a36Sopenharmony_ci goto error; 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 32862306a36Sopenharmony_ci list_add(&device->dev_list, &fs_devices->devices); 32962306a36Sopenharmony_ci fs_devices->num_devices++; 33062306a36Sopenharmony_ci fs_devices->open_devices++; 33162306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 33262306a36Sopenharmony_ci 33362306a36Sopenharmony_ci *device_out = device; 33462306a36Sopenharmony_ci return 0; 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_cierror: 33762306a36Sopenharmony_ci blkdev_put(bdev, fs_info->bdev_holder); 33862306a36Sopenharmony_ci return ret; 33962306a36Sopenharmony_ci} 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci/* 34262306a36Sopenharmony_ci * called from commit_transaction. Writes changed device replace state to 34362306a36Sopenharmony_ci * disk. 
34462306a36Sopenharmony_ci */ 34562306a36Sopenharmony_ciint btrfs_run_dev_replace(struct btrfs_trans_handle *trans) 34662306a36Sopenharmony_ci{ 34762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 34862306a36Sopenharmony_ci int ret; 34962306a36Sopenharmony_ci struct btrfs_root *dev_root = fs_info->dev_root; 35062306a36Sopenharmony_ci struct btrfs_path *path; 35162306a36Sopenharmony_ci struct btrfs_key key; 35262306a36Sopenharmony_ci struct extent_buffer *eb; 35362306a36Sopenharmony_ci struct btrfs_dev_replace_item *ptr; 35462306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 35562306a36Sopenharmony_ci 35662306a36Sopenharmony_ci down_read(&dev_replace->rwsem); 35762306a36Sopenharmony_ci if (!dev_replace->is_valid || 35862306a36Sopenharmony_ci !dev_replace->item_needs_writeback) { 35962306a36Sopenharmony_ci up_read(&dev_replace->rwsem); 36062306a36Sopenharmony_ci return 0; 36162306a36Sopenharmony_ci } 36262306a36Sopenharmony_ci up_read(&dev_replace->rwsem); 36362306a36Sopenharmony_ci 36462306a36Sopenharmony_ci key.objectid = 0; 36562306a36Sopenharmony_ci key.type = BTRFS_DEV_REPLACE_KEY; 36662306a36Sopenharmony_ci key.offset = 0; 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci path = btrfs_alloc_path(); 36962306a36Sopenharmony_ci if (!path) { 37062306a36Sopenharmony_ci ret = -ENOMEM; 37162306a36Sopenharmony_ci goto out; 37262306a36Sopenharmony_ci } 37362306a36Sopenharmony_ci ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 37462306a36Sopenharmony_ci if (ret < 0) { 37562306a36Sopenharmony_ci btrfs_warn(fs_info, 37662306a36Sopenharmony_ci "error %d while searching for dev_replace item!", 37762306a36Sopenharmony_ci ret); 37862306a36Sopenharmony_ci goto out; 37962306a36Sopenharmony_ci } 38062306a36Sopenharmony_ci 38162306a36Sopenharmony_ci if (ret == 0 && 38262306a36Sopenharmony_ci btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 38362306a36Sopenharmony_ci /* 
38462306a36Sopenharmony_ci * need to delete old one and insert a new one. 38562306a36Sopenharmony_ci * Since no attempt is made to recover any old state, if the 38662306a36Sopenharmony_ci * dev_replace state is 'running', the data on the target 38762306a36Sopenharmony_ci * drive is lost. 38862306a36Sopenharmony_ci * It would be possible to recover the state: just make sure 38962306a36Sopenharmony_ci * that the beginning of the item is never changed and always 39062306a36Sopenharmony_ci * contains all the essential information. Then read this 39162306a36Sopenharmony_ci * minimal set of information and use it as a base for the 39262306a36Sopenharmony_ci * new state. 39362306a36Sopenharmony_ci */ 39462306a36Sopenharmony_ci ret = btrfs_del_item(trans, dev_root, path); 39562306a36Sopenharmony_ci if (ret != 0) { 39662306a36Sopenharmony_ci btrfs_warn(fs_info, 39762306a36Sopenharmony_ci "delete too small dev_replace item failed %d!", 39862306a36Sopenharmony_ci ret); 39962306a36Sopenharmony_ci goto out; 40062306a36Sopenharmony_ci } 40162306a36Sopenharmony_ci ret = 1; 40262306a36Sopenharmony_ci } 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_ci if (ret == 1) { 40562306a36Sopenharmony_ci /* need to insert a new item */ 40662306a36Sopenharmony_ci btrfs_release_path(path); 40762306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, dev_root, path, 40862306a36Sopenharmony_ci &key, sizeof(*ptr)); 40962306a36Sopenharmony_ci if (ret < 0) { 41062306a36Sopenharmony_ci btrfs_warn(fs_info, 41162306a36Sopenharmony_ci "insert dev_replace item failed %d!", ret); 41262306a36Sopenharmony_ci goto out; 41362306a36Sopenharmony_ci } 41462306a36Sopenharmony_ci } 41562306a36Sopenharmony_ci 41662306a36Sopenharmony_ci eb = path->nodes[0]; 41762306a36Sopenharmony_ci ptr = btrfs_item_ptr(eb, path->slots[0], 41862306a36Sopenharmony_ci struct btrfs_dev_replace_item); 41962306a36Sopenharmony_ci 42062306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 42162306a36Sopenharmony_ci if 
(dev_replace->srcdev) 42262306a36Sopenharmony_ci btrfs_set_dev_replace_src_devid(eb, ptr, 42362306a36Sopenharmony_ci dev_replace->srcdev->devid); 42462306a36Sopenharmony_ci else 42562306a36Sopenharmony_ci btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); 42662306a36Sopenharmony_ci btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, 42762306a36Sopenharmony_ci dev_replace->cont_reading_from_srcdev_mode); 42862306a36Sopenharmony_ci btrfs_set_dev_replace_replace_state(eb, ptr, 42962306a36Sopenharmony_ci dev_replace->replace_state); 43062306a36Sopenharmony_ci btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); 43162306a36Sopenharmony_ci btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); 43262306a36Sopenharmony_ci btrfs_set_dev_replace_num_write_errors(eb, ptr, 43362306a36Sopenharmony_ci atomic64_read(&dev_replace->num_write_errors)); 43462306a36Sopenharmony_ci btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, 43562306a36Sopenharmony_ci atomic64_read(&dev_replace->num_uncorrectable_read_errors)); 43662306a36Sopenharmony_ci dev_replace->cursor_left_last_write_of_item = 43762306a36Sopenharmony_ci dev_replace->cursor_left; 43862306a36Sopenharmony_ci btrfs_set_dev_replace_cursor_left(eb, ptr, 43962306a36Sopenharmony_ci dev_replace->cursor_left_last_write_of_item); 44062306a36Sopenharmony_ci btrfs_set_dev_replace_cursor_right(eb, ptr, 44162306a36Sopenharmony_ci dev_replace->cursor_right); 44262306a36Sopenharmony_ci dev_replace->item_needs_writeback = 0; 44362306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, eb); 44662306a36Sopenharmony_ci 44762306a36Sopenharmony_ciout: 44862306a36Sopenharmony_ci btrfs_free_path(path); 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci return ret; 45162306a36Sopenharmony_ci} 45262306a36Sopenharmony_ci 45362306a36Sopenharmony_cistatic int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, 
45462306a36Sopenharmony_ci struct btrfs_device *src_dev) 45562306a36Sopenharmony_ci{ 45662306a36Sopenharmony_ci struct btrfs_path *path; 45762306a36Sopenharmony_ci struct btrfs_key key; 45862306a36Sopenharmony_ci struct btrfs_key found_key; 45962306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 46062306a36Sopenharmony_ci struct btrfs_dev_extent *dev_extent = NULL; 46162306a36Sopenharmony_ci struct btrfs_block_group *cache; 46262306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 46362306a36Sopenharmony_ci int iter_ret = 0; 46462306a36Sopenharmony_ci int ret = 0; 46562306a36Sopenharmony_ci u64 chunk_offset; 46662306a36Sopenharmony_ci 46762306a36Sopenharmony_ci /* Do not use "to_copy" on non zoned filesystem for now */ 46862306a36Sopenharmony_ci if (!btrfs_is_zoned(fs_info)) 46962306a36Sopenharmony_ci return 0; 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci /* Ensure we don't have pending new block group */ 47462306a36Sopenharmony_ci spin_lock(&fs_info->trans_lock); 47562306a36Sopenharmony_ci while (fs_info->running_transaction && 47662306a36Sopenharmony_ci !list_empty(&fs_info->running_transaction->dev_update_list)) { 47762306a36Sopenharmony_ci spin_unlock(&fs_info->trans_lock); 47862306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 47962306a36Sopenharmony_ci trans = btrfs_attach_transaction(root); 48062306a36Sopenharmony_ci if (IS_ERR(trans)) { 48162306a36Sopenharmony_ci ret = PTR_ERR(trans); 48262306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 48362306a36Sopenharmony_ci if (ret == -ENOENT) { 48462306a36Sopenharmony_ci spin_lock(&fs_info->trans_lock); 48562306a36Sopenharmony_ci continue; 48662306a36Sopenharmony_ci } else { 48762306a36Sopenharmony_ci goto unlock; 48862306a36Sopenharmony_ci } 48962306a36Sopenharmony_ci } 49062306a36Sopenharmony_ci 49162306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 
49262306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 49362306a36Sopenharmony_ci if (ret) 49462306a36Sopenharmony_ci goto unlock; 49562306a36Sopenharmony_ci 49662306a36Sopenharmony_ci spin_lock(&fs_info->trans_lock); 49762306a36Sopenharmony_ci } 49862306a36Sopenharmony_ci spin_unlock(&fs_info->trans_lock); 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci path = btrfs_alloc_path(); 50162306a36Sopenharmony_ci if (!path) { 50262306a36Sopenharmony_ci ret = -ENOMEM; 50362306a36Sopenharmony_ci goto unlock; 50462306a36Sopenharmony_ci } 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci path->reada = READA_FORWARD; 50762306a36Sopenharmony_ci path->search_commit_root = 1; 50862306a36Sopenharmony_ci path->skip_locking = 1; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci key.objectid = src_dev->devid; 51162306a36Sopenharmony_ci key.type = BTRFS_DEV_EXTENT_KEY; 51262306a36Sopenharmony_ci key.offset = 0; 51362306a36Sopenharmony_ci 51462306a36Sopenharmony_ci btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 51562306a36Sopenharmony_ci struct extent_buffer *leaf = path->nodes[0]; 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci if (found_key.objectid != src_dev->devid) 51862306a36Sopenharmony_ci break; 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci if (found_key.type != BTRFS_DEV_EXTENT_KEY) 52162306a36Sopenharmony_ci break; 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_ci if (found_key.offset < key.offset) 52462306a36Sopenharmony_ci break; 52562306a36Sopenharmony_ci 52662306a36Sopenharmony_ci dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 52762306a36Sopenharmony_ci 52862306a36Sopenharmony_ci chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); 52962306a36Sopenharmony_ci 53062306a36Sopenharmony_ci cache = btrfs_lookup_block_group(fs_info, chunk_offset); 53162306a36Sopenharmony_ci if (!cache) 53262306a36Sopenharmony_ci continue; 53362306a36Sopenharmony_ci 53462306a36Sopenharmony_ci 
set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 53562306a36Sopenharmony_ci btrfs_put_block_group(cache); 53662306a36Sopenharmony_ci } 53762306a36Sopenharmony_ci if (iter_ret < 0) 53862306a36Sopenharmony_ci ret = iter_ret; 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci btrfs_free_path(path); 54162306a36Sopenharmony_ciunlock: 54262306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_ci return ret; 54562306a36Sopenharmony_ci} 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_cibool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, 54862306a36Sopenharmony_ci struct btrfs_block_group *cache, 54962306a36Sopenharmony_ci u64 physical) 55062306a36Sopenharmony_ci{ 55162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = cache->fs_info; 55262306a36Sopenharmony_ci struct extent_map *em; 55362306a36Sopenharmony_ci struct map_lookup *map; 55462306a36Sopenharmony_ci u64 chunk_offset = cache->start; 55562306a36Sopenharmony_ci int num_extents, cur_extent; 55662306a36Sopenharmony_ci int i; 55762306a36Sopenharmony_ci 55862306a36Sopenharmony_ci /* Do not use "to_copy" on non zoned filesystem for now */ 55962306a36Sopenharmony_ci if (!btrfs_is_zoned(fs_info)) 56062306a36Sopenharmony_ci return true; 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_ci spin_lock(&cache->lock); 56362306a36Sopenharmony_ci if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 56462306a36Sopenharmony_ci spin_unlock(&cache->lock); 56562306a36Sopenharmony_ci return true; 56662306a36Sopenharmony_ci } 56762306a36Sopenharmony_ci spin_unlock(&cache->lock); 56862306a36Sopenharmony_ci 56962306a36Sopenharmony_ci em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 57062306a36Sopenharmony_ci ASSERT(!IS_ERR(em)); 57162306a36Sopenharmony_ci map = em->map_lookup; 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci num_extents = 0; 57462306a36Sopenharmony_ci cur_extent = 0; 57562306a36Sopenharmony_ci for (i = 0; i < 
map->num_stripes; i++) { 57662306a36Sopenharmony_ci /* We have more device extent to copy */ 57762306a36Sopenharmony_ci if (srcdev != map->stripes[i].dev) 57862306a36Sopenharmony_ci continue; 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_ci num_extents++; 58162306a36Sopenharmony_ci if (physical == map->stripes[i].physical) 58262306a36Sopenharmony_ci cur_extent = i; 58362306a36Sopenharmony_ci } 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci free_extent_map(em); 58662306a36Sopenharmony_ci 58762306a36Sopenharmony_ci if (num_extents > 1 && cur_extent < num_extents - 1) { 58862306a36Sopenharmony_ci /* 58962306a36Sopenharmony_ci * Has more stripes on this device. Keep this block group 59062306a36Sopenharmony_ci * readonly until we finish all the stripes. 59162306a36Sopenharmony_ci */ 59262306a36Sopenharmony_ci return false; 59362306a36Sopenharmony_ci } 59462306a36Sopenharmony_ci 59562306a36Sopenharmony_ci /* Last stripe on this device */ 59662306a36Sopenharmony_ci clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci return true; 59962306a36Sopenharmony_ci} 60062306a36Sopenharmony_ci 60162306a36Sopenharmony_cistatic int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, 60262306a36Sopenharmony_ci const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, 60362306a36Sopenharmony_ci int read_src) 60462306a36Sopenharmony_ci{ 60562306a36Sopenharmony_ci struct btrfs_root *root = fs_info->dev_root; 60662306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 60762306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 60862306a36Sopenharmony_ci int ret; 60962306a36Sopenharmony_ci struct btrfs_device *tgt_device = NULL; 61062306a36Sopenharmony_ci struct btrfs_device *src_device = NULL; 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_ci src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, 61362306a36Sopenharmony_ci srcdev_name); 61462306a36Sopenharmony_ci if 
(IS_ERR(src_device)) 61562306a36Sopenharmony_ci return PTR_ERR(src_device); 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci if (btrfs_pinned_by_swapfile(fs_info, src_device)) { 61862306a36Sopenharmony_ci btrfs_warn_in_rcu(fs_info, 61962306a36Sopenharmony_ci "cannot replace device %s (devid %llu) due to active swapfile", 62062306a36Sopenharmony_ci btrfs_dev_name(src_device), src_device->devid); 62162306a36Sopenharmony_ci return -ETXTBSY; 62262306a36Sopenharmony_ci } 62362306a36Sopenharmony_ci 62462306a36Sopenharmony_ci /* 62562306a36Sopenharmony_ci * Here we commit the transaction to make sure commit_total_bytes 62662306a36Sopenharmony_ci * of all the devices are updated. 62762306a36Sopenharmony_ci */ 62862306a36Sopenharmony_ci trans = btrfs_attach_transaction(root); 62962306a36Sopenharmony_ci if (!IS_ERR(trans)) { 63062306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 63162306a36Sopenharmony_ci if (ret) 63262306a36Sopenharmony_ci return ret; 63362306a36Sopenharmony_ci } else if (PTR_ERR(trans) != -ENOENT) { 63462306a36Sopenharmony_ci return PTR_ERR(trans); 63562306a36Sopenharmony_ci } 63662306a36Sopenharmony_ci 63762306a36Sopenharmony_ci ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, 63862306a36Sopenharmony_ci src_device, &tgt_device); 63962306a36Sopenharmony_ci if (ret) 64062306a36Sopenharmony_ci return ret; 64162306a36Sopenharmony_ci 64262306a36Sopenharmony_ci ret = mark_block_group_to_copy(fs_info, src_device); 64362306a36Sopenharmony_ci if (ret) 64462306a36Sopenharmony_ci return ret; 64562306a36Sopenharmony_ci 64662306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 64762306a36Sopenharmony_ci switch (dev_replace->replace_state) { 64862306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 64962306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 65062306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 65162306a36Sopenharmony_ci break; 65262306a36Sopenharmony_ci case 
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 65362306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 65462306a36Sopenharmony_ci ASSERT(0); 65562306a36Sopenharmony_ci ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 65662306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 65762306a36Sopenharmony_ci goto leave; 65862306a36Sopenharmony_ci } 65962306a36Sopenharmony_ci 66062306a36Sopenharmony_ci dev_replace->cont_reading_from_srcdev_mode = read_src; 66162306a36Sopenharmony_ci dev_replace->srcdev = src_device; 66262306a36Sopenharmony_ci dev_replace->tgtdev = tgt_device; 66362306a36Sopenharmony_ci 66462306a36Sopenharmony_ci btrfs_info_in_rcu(fs_info, 66562306a36Sopenharmony_ci "dev_replace from %s (devid %llu) to %s started", 66662306a36Sopenharmony_ci btrfs_dev_name(src_device), 66762306a36Sopenharmony_ci src_device->devid, 66862306a36Sopenharmony_ci btrfs_dev_name(tgt_device)); 66962306a36Sopenharmony_ci 67062306a36Sopenharmony_ci /* 67162306a36Sopenharmony_ci * from now on, the writes to the srcdev are all duplicated to 67262306a36Sopenharmony_ci * go to the tgtdev as well (refer to btrfs_map_block()). 
67362306a36Sopenharmony_ci */ 67462306a36Sopenharmony_ci dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 67562306a36Sopenharmony_ci dev_replace->time_started = ktime_get_real_seconds(); 67662306a36Sopenharmony_ci dev_replace->cursor_left = 0; 67762306a36Sopenharmony_ci dev_replace->committed_cursor_left = 0; 67862306a36Sopenharmony_ci dev_replace->cursor_left_last_write_of_item = 0; 67962306a36Sopenharmony_ci dev_replace->cursor_right = 0; 68062306a36Sopenharmony_ci dev_replace->is_valid = 1; 68162306a36Sopenharmony_ci dev_replace->item_needs_writeback = 1; 68262306a36Sopenharmony_ci atomic64_set(&dev_replace->num_write_errors, 0); 68362306a36Sopenharmony_ci atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 68462306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci ret = btrfs_sysfs_add_device(tgt_device); 68762306a36Sopenharmony_ci if (ret) 68862306a36Sopenharmony_ci btrfs_err(fs_info, "kobj add dev failed %d", ret); 68962306a36Sopenharmony_ci 69062306a36Sopenharmony_ci btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 69162306a36Sopenharmony_ci 69262306a36Sopenharmony_ci /* 69362306a36Sopenharmony_ci * Commit dev_replace state and reserve 1 item for it. 69462306a36Sopenharmony_ci * This is crucial to ensure we won't miss copying extents for new block 69562306a36Sopenharmony_ci * groups that are allocated after we started the device replace, and 69662306a36Sopenharmony_ci * must be done after setting up the device replace state. 
69762306a36Sopenharmony_ci */ 69862306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 1); 69962306a36Sopenharmony_ci if (IS_ERR(trans)) { 70062306a36Sopenharmony_ci ret = PTR_ERR(trans); 70162306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 70262306a36Sopenharmony_ci dev_replace->replace_state = 70362306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 70462306a36Sopenharmony_ci dev_replace->srcdev = NULL; 70562306a36Sopenharmony_ci dev_replace->tgtdev = NULL; 70662306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 70762306a36Sopenharmony_ci goto leave; 70862306a36Sopenharmony_ci } 70962306a36Sopenharmony_ci 71062306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 71162306a36Sopenharmony_ci WARN_ON(ret); 71262306a36Sopenharmony_ci 71362306a36Sopenharmony_ci /* the disk copy procedure reuses the scrub code */ 71462306a36Sopenharmony_ci ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 71562306a36Sopenharmony_ci btrfs_device_get_total_bytes(src_device), 71662306a36Sopenharmony_ci &dev_replace->scrub_progress, 0, 1); 71762306a36Sopenharmony_ci 71862306a36Sopenharmony_ci ret = btrfs_dev_replace_finishing(fs_info, ret); 71962306a36Sopenharmony_ci if (ret == -EINPROGRESS) 72062306a36Sopenharmony_ci ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci return ret; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_cileave: 72562306a36Sopenharmony_ci btrfs_destroy_dev_replace_tgtdev(tgt_device); 72662306a36Sopenharmony_ci return ret; 72762306a36Sopenharmony_ci} 72862306a36Sopenharmony_ci 72962306a36Sopenharmony_cistatic int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args) 73062306a36Sopenharmony_ci{ 73162306a36Sopenharmony_ci if (args->start.srcdevid == 0) { 73262306a36Sopenharmony_ci if (memchr(args->start.srcdev_name, 0, 73362306a36Sopenharmony_ci sizeof(args->start.srcdev_name)) == NULL) 73462306a36Sopenharmony_ci return -ENAMETOOLONG; 
73562306a36Sopenharmony_ci } else { 73662306a36Sopenharmony_ci args->start.srcdev_name[0] = 0; 73762306a36Sopenharmony_ci } 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci if (memchr(args->start.tgtdev_name, 0, 74062306a36Sopenharmony_ci sizeof(args->start.tgtdev_name)) == NULL) 74162306a36Sopenharmony_ci return -ENAMETOOLONG; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci return 0; 74462306a36Sopenharmony_ci} 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_ciint btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, 74762306a36Sopenharmony_ci struct btrfs_ioctl_dev_replace_args *args) 74862306a36Sopenharmony_ci{ 74962306a36Sopenharmony_ci int ret; 75062306a36Sopenharmony_ci 75162306a36Sopenharmony_ci switch (args->start.cont_reading_from_srcdev_mode) { 75262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 75362306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 75462306a36Sopenharmony_ci break; 75562306a36Sopenharmony_ci default: 75662306a36Sopenharmony_ci return -EINVAL; 75762306a36Sopenharmony_ci } 75862306a36Sopenharmony_ci ret = btrfs_check_replace_dev_names(args); 75962306a36Sopenharmony_ci if (ret < 0) 76062306a36Sopenharmony_ci return ret; 76162306a36Sopenharmony_ci 76262306a36Sopenharmony_ci ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, 76362306a36Sopenharmony_ci args->start.srcdevid, 76462306a36Sopenharmony_ci args->start.srcdev_name, 76562306a36Sopenharmony_ci args->start.cont_reading_from_srcdev_mode); 76662306a36Sopenharmony_ci args->result = ret; 76762306a36Sopenharmony_ci /* don't warn if EINPROGRESS, someone else might be running scrub */ 76862306a36Sopenharmony_ci if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || 76962306a36Sopenharmony_ci ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) 77062306a36Sopenharmony_ci return 0; 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci return ret; 77362306a36Sopenharmony_ci} 
77462306a36Sopenharmony_ci 77562306a36Sopenharmony_ci/* 77662306a36Sopenharmony_ci * blocked until all in-flight bios operations are finished. 77762306a36Sopenharmony_ci */ 77862306a36Sopenharmony_cistatic void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) 77962306a36Sopenharmony_ci{ 78062306a36Sopenharmony_ci set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 78162306a36Sopenharmony_ci wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum( 78262306a36Sopenharmony_ci &fs_info->dev_replace.bio_counter)); 78362306a36Sopenharmony_ci} 78462306a36Sopenharmony_ci 78562306a36Sopenharmony_ci/* 78662306a36Sopenharmony_ci * we have removed target device, it is safe to allow new bios request. 78762306a36Sopenharmony_ci */ 78862306a36Sopenharmony_cistatic void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) 78962306a36Sopenharmony_ci{ 79062306a36Sopenharmony_ci clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 79162306a36Sopenharmony_ci wake_up(&fs_info->dev_replace.replace_wait); 79262306a36Sopenharmony_ci} 79362306a36Sopenharmony_ci 79462306a36Sopenharmony_ci/* 79562306a36Sopenharmony_ci * When finishing the device replace, before swapping the source device with the 79662306a36Sopenharmony_ci * target device we must update the chunk allocation state in the target device, 79762306a36Sopenharmony_ci * as it is empty because replace works by directly copying the chunks and not 79862306a36Sopenharmony_ci * through the normal chunk allocation path. 
79962306a36Sopenharmony_ci */ 80062306a36Sopenharmony_cistatic int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, 80162306a36Sopenharmony_ci struct btrfs_device *tgtdev) 80262306a36Sopenharmony_ci{ 80362306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 80462306a36Sopenharmony_ci u64 start = 0; 80562306a36Sopenharmony_ci u64 found_start; 80662306a36Sopenharmony_ci u64 found_end; 80762306a36Sopenharmony_ci int ret = 0; 80862306a36Sopenharmony_ci 80962306a36Sopenharmony_ci lockdep_assert_held(&srcdev->fs_info->chunk_mutex); 81062306a36Sopenharmony_ci 81162306a36Sopenharmony_ci while (find_first_extent_bit(&srcdev->alloc_state, start, 81262306a36Sopenharmony_ci &found_start, &found_end, 81362306a36Sopenharmony_ci CHUNK_ALLOCATED, &cached_state)) { 81462306a36Sopenharmony_ci ret = set_extent_bit(&tgtdev->alloc_state, found_start, 81562306a36Sopenharmony_ci found_end, CHUNK_ALLOCATED, NULL); 81662306a36Sopenharmony_ci if (ret) 81762306a36Sopenharmony_ci break; 81862306a36Sopenharmony_ci start = found_end + 1; 81962306a36Sopenharmony_ci } 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci free_extent_state(cached_state); 82262306a36Sopenharmony_ci return ret; 82362306a36Sopenharmony_ci} 82462306a36Sopenharmony_ci 82562306a36Sopenharmony_cistatic void btrfs_dev_replace_update_device_in_mapping_tree( 82662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info, 82762306a36Sopenharmony_ci struct btrfs_device *srcdev, 82862306a36Sopenharmony_ci struct btrfs_device *tgtdev) 82962306a36Sopenharmony_ci{ 83062306a36Sopenharmony_ci struct extent_map_tree *em_tree = &fs_info->mapping_tree; 83162306a36Sopenharmony_ci struct extent_map *em; 83262306a36Sopenharmony_ci struct map_lookup *map; 83362306a36Sopenharmony_ci u64 start = 0; 83462306a36Sopenharmony_ci int i; 83562306a36Sopenharmony_ci 83662306a36Sopenharmony_ci write_lock(&em_tree->lock); 83762306a36Sopenharmony_ci do { 83862306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, start, (u64)-1); 
83962306a36Sopenharmony_ci if (!em) 84062306a36Sopenharmony_ci break; 84162306a36Sopenharmony_ci map = em->map_lookup; 84262306a36Sopenharmony_ci for (i = 0; i < map->num_stripes; i++) 84362306a36Sopenharmony_ci if (srcdev == map->stripes[i].dev) 84462306a36Sopenharmony_ci map->stripes[i].dev = tgtdev; 84562306a36Sopenharmony_ci start = em->start + em->len; 84662306a36Sopenharmony_ci free_extent_map(em); 84762306a36Sopenharmony_ci } while (start); 84862306a36Sopenharmony_ci write_unlock(&em_tree->lock); 84962306a36Sopenharmony_ci} 85062306a36Sopenharmony_ci 85162306a36Sopenharmony_cistatic int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 85262306a36Sopenharmony_ci int scrub_ret) 85362306a36Sopenharmony_ci{ 85462306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 85562306a36Sopenharmony_ci struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 85662306a36Sopenharmony_ci struct btrfs_device *tgt_device; 85762306a36Sopenharmony_ci struct btrfs_device *src_device; 85862306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 85962306a36Sopenharmony_ci u8 uuid_tmp[BTRFS_UUID_SIZE]; 86062306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 86162306a36Sopenharmony_ci int ret = 0; 86262306a36Sopenharmony_ci 86362306a36Sopenharmony_ci /* don't allow cancel or unmount to disturb the finishing procedure */ 86462306a36Sopenharmony_ci mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci down_read(&dev_replace->rwsem); 86762306a36Sopenharmony_ci /* was the operation canceled, or is it finished? 
*/ 86862306a36Sopenharmony_ci if (dev_replace->replace_state != 86962306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 87062306a36Sopenharmony_ci up_read(&dev_replace->rwsem); 87162306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 87262306a36Sopenharmony_ci return 0; 87362306a36Sopenharmony_ci } 87462306a36Sopenharmony_ci 87562306a36Sopenharmony_ci tgt_device = dev_replace->tgtdev; 87662306a36Sopenharmony_ci src_device = dev_replace->srcdev; 87762306a36Sopenharmony_ci up_read(&dev_replace->rwsem); 87862306a36Sopenharmony_ci 87962306a36Sopenharmony_ci /* 88062306a36Sopenharmony_ci * flush all outstanding I/O and inode extent mappings before the 88162306a36Sopenharmony_ci * copy operation is declared as being finished 88262306a36Sopenharmony_ci */ 88362306a36Sopenharmony_ci ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); 88462306a36Sopenharmony_ci if (ret) { 88562306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 88662306a36Sopenharmony_ci return ret; 88762306a36Sopenharmony_ci } 88862306a36Sopenharmony_ci btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 88962306a36Sopenharmony_ci 89062306a36Sopenharmony_ci /* 89162306a36Sopenharmony_ci * We have to use this loop approach because at this point src_device 89262306a36Sopenharmony_ci * has to be available for transaction commit to complete, yet new 89362306a36Sopenharmony_ci * chunks shouldn't be allocated on the device. 
89462306a36Sopenharmony_ci */ 89562306a36Sopenharmony_ci while (1) { 89662306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 89762306a36Sopenharmony_ci if (IS_ERR(trans)) { 89862306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 89962306a36Sopenharmony_ci return PTR_ERR(trans); 90062306a36Sopenharmony_ci } 90162306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 90262306a36Sopenharmony_ci WARN_ON(ret); 90362306a36Sopenharmony_ci 90462306a36Sopenharmony_ci /* Prevent write_all_supers() during the finishing procedure */ 90562306a36Sopenharmony_ci mutex_lock(&fs_devices->device_list_mutex); 90662306a36Sopenharmony_ci /* Prevent new chunks being allocated on the source device */ 90762306a36Sopenharmony_ci mutex_lock(&fs_info->chunk_mutex); 90862306a36Sopenharmony_ci 90962306a36Sopenharmony_ci if (!list_empty(&src_device->post_commit_list)) { 91062306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 91162306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 91262306a36Sopenharmony_ci } else { 91362306a36Sopenharmony_ci break; 91462306a36Sopenharmony_ci } 91562306a36Sopenharmony_ci } 91662306a36Sopenharmony_ci 91762306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 91862306a36Sopenharmony_ci dev_replace->replace_state = 91962306a36Sopenharmony_ci scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 92062306a36Sopenharmony_ci : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 92162306a36Sopenharmony_ci dev_replace->tgtdev = NULL; 92262306a36Sopenharmony_ci dev_replace->srcdev = NULL; 92362306a36Sopenharmony_ci dev_replace->time_stopped = ktime_get_real_seconds(); 92462306a36Sopenharmony_ci dev_replace->item_needs_writeback = 1; 92562306a36Sopenharmony_ci 92662306a36Sopenharmony_ci /* 92762306a36Sopenharmony_ci * Update allocation state in the new device and replace the old device 92862306a36Sopenharmony_ci * with the new one in the mapping tree. 
92962306a36Sopenharmony_ci */ 93062306a36Sopenharmony_ci if (!scrub_ret) { 93162306a36Sopenharmony_ci scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); 93262306a36Sopenharmony_ci if (scrub_ret) 93362306a36Sopenharmony_ci goto error; 93462306a36Sopenharmony_ci btrfs_dev_replace_update_device_in_mapping_tree(fs_info, 93562306a36Sopenharmony_ci src_device, 93662306a36Sopenharmony_ci tgt_device); 93762306a36Sopenharmony_ci } else { 93862306a36Sopenharmony_ci if (scrub_ret != -ECANCELED) 93962306a36Sopenharmony_ci btrfs_err_in_rcu(fs_info, 94062306a36Sopenharmony_ci "btrfs_scrub_dev(%s, %llu, %s) failed %d", 94162306a36Sopenharmony_ci btrfs_dev_name(src_device), 94262306a36Sopenharmony_ci src_device->devid, 94362306a36Sopenharmony_ci btrfs_dev_name(tgt_device), scrub_ret); 94462306a36Sopenharmony_cierror: 94562306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 94662306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 94762306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 94862306a36Sopenharmony_ci btrfs_rm_dev_replace_blocked(fs_info); 94962306a36Sopenharmony_ci if (tgt_device) 95062306a36Sopenharmony_ci btrfs_destroy_dev_replace_tgtdev(tgt_device); 95162306a36Sopenharmony_ci btrfs_rm_dev_replace_unblocked(fs_info); 95262306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 95362306a36Sopenharmony_ci 95462306a36Sopenharmony_ci return scrub_ret; 95562306a36Sopenharmony_ci } 95662306a36Sopenharmony_ci 95762306a36Sopenharmony_ci btrfs_info_in_rcu(fs_info, 95862306a36Sopenharmony_ci "dev_replace from %s (devid %llu) to %s finished", 95962306a36Sopenharmony_ci btrfs_dev_name(src_device), 96062306a36Sopenharmony_ci src_device->devid, 96162306a36Sopenharmony_ci btrfs_dev_name(tgt_device)); 96262306a36Sopenharmony_ci clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); 96362306a36Sopenharmony_ci tgt_device->devid = src_device->devid; 96462306a36Sopenharmony_ci src_device->devid = 
BTRFS_DEV_REPLACE_DEVID; 96562306a36Sopenharmony_ci memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 96662306a36Sopenharmony_ci memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 96762306a36Sopenharmony_ci memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 96862306a36Sopenharmony_ci btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); 96962306a36Sopenharmony_ci btrfs_device_set_disk_total_bytes(tgt_device, 97062306a36Sopenharmony_ci src_device->disk_total_bytes); 97162306a36Sopenharmony_ci btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); 97262306a36Sopenharmony_ci tgt_device->commit_bytes_used = src_device->bytes_used; 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci btrfs_assign_next_active_device(src_device, tgt_device); 97562306a36Sopenharmony_ci 97662306a36Sopenharmony_ci list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); 97762306a36Sopenharmony_ci fs_devices->rw_devices++; 97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 98062306a36Sopenharmony_ci btrfs_rm_dev_replace_blocked(fs_info); 98162306a36Sopenharmony_ci 98262306a36Sopenharmony_ci btrfs_rm_dev_replace_remove_srcdev(src_device); 98362306a36Sopenharmony_ci 98462306a36Sopenharmony_ci btrfs_rm_dev_replace_unblocked(fs_info); 98562306a36Sopenharmony_ci 98662306a36Sopenharmony_ci /* 98762306a36Sopenharmony_ci * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will 98862306a36Sopenharmony_ci * update on-disk dev stats value during commit transaction 98962306a36Sopenharmony_ci */ 99062306a36Sopenharmony_ci atomic_inc(&tgt_device->dev_stats_ccnt); 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci /* 99362306a36Sopenharmony_ci * this is again a consistent state where no dev_replace procedure 99462306a36Sopenharmony_ci * is running, the target device is part of the filesystem, the 99562306a36Sopenharmony_ci * source device is not part of the filesystem anymore and its 1st 
99662306a36Sopenharmony_ci * superblock is scratched out so that it is no longer marked to 99762306a36Sopenharmony_ci * belong to this filesystem. 99862306a36Sopenharmony_ci */ 99962306a36Sopenharmony_ci mutex_unlock(&fs_info->chunk_mutex); 100062306a36Sopenharmony_ci mutex_unlock(&fs_devices->device_list_mutex); 100162306a36Sopenharmony_ci 100262306a36Sopenharmony_ci /* replace the sysfs entry */ 100362306a36Sopenharmony_ci btrfs_sysfs_remove_device(src_device); 100462306a36Sopenharmony_ci btrfs_sysfs_update_devid(tgt_device); 100562306a36Sopenharmony_ci if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) 100662306a36Sopenharmony_ci btrfs_scratch_superblocks(fs_info, src_device->bdev, 100762306a36Sopenharmony_ci src_device->name->str); 100862306a36Sopenharmony_ci 100962306a36Sopenharmony_ci /* write back the superblocks */ 101062306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 101162306a36Sopenharmony_ci if (!IS_ERR(trans)) 101262306a36Sopenharmony_ci btrfs_commit_transaction(trans); 101362306a36Sopenharmony_ci 101462306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci btrfs_rm_dev_replace_free_srcdev(src_device); 101762306a36Sopenharmony_ci 101862306a36Sopenharmony_ci return 0; 101962306a36Sopenharmony_ci} 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ci/* 102262306a36Sopenharmony_ci * Read progress of device replace status according to the state and last 102362306a36Sopenharmony_ci * stored position. 
The value format is the same as for 102462306a36Sopenharmony_ci * btrfs_dev_replace::progress_1000 102562306a36Sopenharmony_ci */ 102662306a36Sopenharmony_cistatic u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info) 102762306a36Sopenharmony_ci{ 102862306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 102962306a36Sopenharmony_ci u64 ret = 0; 103062306a36Sopenharmony_ci 103162306a36Sopenharmony_ci switch (dev_replace->replace_state) { 103262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 103362306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 103462306a36Sopenharmony_ci ret = 0; 103562306a36Sopenharmony_ci break; 103662306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 103762306a36Sopenharmony_ci ret = 1000; 103862306a36Sopenharmony_ci break; 103962306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 104062306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 104162306a36Sopenharmony_ci ret = div64_u64(dev_replace->cursor_left, 104262306a36Sopenharmony_ci div_u64(btrfs_device_get_total_bytes( 104362306a36Sopenharmony_ci dev_replace->srcdev), 1000)); 104462306a36Sopenharmony_ci break; 104562306a36Sopenharmony_ci } 104662306a36Sopenharmony_ci 104762306a36Sopenharmony_ci return ret; 104862306a36Sopenharmony_ci} 104962306a36Sopenharmony_ci 105062306a36Sopenharmony_civoid btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 105162306a36Sopenharmony_ci struct btrfs_ioctl_dev_replace_args *args) 105262306a36Sopenharmony_ci{ 105362306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 105462306a36Sopenharmony_ci 105562306a36Sopenharmony_ci down_read(&dev_replace->rwsem); 105662306a36Sopenharmony_ci /* even if !dev_replace_is_valid, the values are good enough for 105762306a36Sopenharmony_ci * the replace_status ioctl */ 105862306a36Sopenharmony_ci args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 
105962306a36Sopenharmony_ci args->status.replace_state = dev_replace->replace_state; 106062306a36Sopenharmony_ci args->status.time_started = dev_replace->time_started; 106162306a36Sopenharmony_ci args->status.time_stopped = dev_replace->time_stopped; 106262306a36Sopenharmony_ci args->status.num_write_errors = 106362306a36Sopenharmony_ci atomic64_read(&dev_replace->num_write_errors); 106462306a36Sopenharmony_ci args->status.num_uncorrectable_read_errors = 106562306a36Sopenharmony_ci atomic64_read(&dev_replace->num_uncorrectable_read_errors); 106662306a36Sopenharmony_ci args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); 106762306a36Sopenharmony_ci up_read(&dev_replace->rwsem); 106862306a36Sopenharmony_ci} 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_ciint btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) 107162306a36Sopenharmony_ci{ 107262306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 107362306a36Sopenharmony_ci struct btrfs_device *tgt_device = NULL; 107462306a36Sopenharmony_ci struct btrfs_device *src_device = NULL; 107562306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 107662306a36Sopenharmony_ci struct btrfs_root *root = fs_info->tree_root; 107762306a36Sopenharmony_ci int result; 107862306a36Sopenharmony_ci int ret; 107962306a36Sopenharmony_ci 108062306a36Sopenharmony_ci if (sb_rdonly(fs_info->sb)) 108162306a36Sopenharmony_ci return -EROFS; 108262306a36Sopenharmony_ci 108362306a36Sopenharmony_ci mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 108462306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 108562306a36Sopenharmony_ci switch (dev_replace->replace_state) { 108662306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 108762306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 108862306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 108962306a36Sopenharmony_ci result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 
109062306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 109162306a36Sopenharmony_ci break; 109262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 109362306a36Sopenharmony_ci tgt_device = dev_replace->tgtdev; 109462306a36Sopenharmony_ci src_device = dev_replace->srcdev; 109562306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 109662306a36Sopenharmony_ci ret = btrfs_scrub_cancel(fs_info); 109762306a36Sopenharmony_ci if (ret < 0) { 109862306a36Sopenharmony_ci result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 109962306a36Sopenharmony_ci } else { 110062306a36Sopenharmony_ci result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 110162306a36Sopenharmony_ci /* 110262306a36Sopenharmony_ci * btrfs_dev_replace_finishing() will handle the 110362306a36Sopenharmony_ci * cleanup part 110462306a36Sopenharmony_ci */ 110562306a36Sopenharmony_ci btrfs_info_in_rcu(fs_info, 110662306a36Sopenharmony_ci "dev_replace from %s (devid %llu) to %s canceled", 110762306a36Sopenharmony_ci btrfs_dev_name(src_device), src_device->devid, 110862306a36Sopenharmony_ci btrfs_dev_name(tgt_device)); 110962306a36Sopenharmony_ci } 111062306a36Sopenharmony_ci break; 111162306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 111262306a36Sopenharmony_ci /* 111362306a36Sopenharmony_ci * Scrub doing the replace isn't running so we need to do the 111462306a36Sopenharmony_ci * cleanup step of btrfs_dev_replace_finishing() here 111562306a36Sopenharmony_ci */ 111662306a36Sopenharmony_ci result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 111762306a36Sopenharmony_ci tgt_device = dev_replace->tgtdev; 111862306a36Sopenharmony_ci src_device = dev_replace->srcdev; 111962306a36Sopenharmony_ci dev_replace->tgtdev = NULL; 112062306a36Sopenharmony_ci dev_replace->srcdev = NULL; 112162306a36Sopenharmony_ci dev_replace->replace_state = 112262306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 112362306a36Sopenharmony_ci dev_replace->time_stopped = ktime_get_real_seconds(); 
112462306a36Sopenharmony_ci dev_replace->item_needs_writeback = 1; 112562306a36Sopenharmony_ci 112662306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_ci /* Scrub for replace must not be running in suspended state */ 112962306a36Sopenharmony_ci btrfs_scrub_cancel(fs_info); 113062306a36Sopenharmony_ci 113162306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 113262306a36Sopenharmony_ci if (IS_ERR(trans)) { 113362306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 113462306a36Sopenharmony_ci return PTR_ERR(trans); 113562306a36Sopenharmony_ci } 113662306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 113762306a36Sopenharmony_ci WARN_ON(ret); 113862306a36Sopenharmony_ci 113962306a36Sopenharmony_ci btrfs_info_in_rcu(fs_info, 114062306a36Sopenharmony_ci "suspended dev_replace from %s (devid %llu) to %s canceled", 114162306a36Sopenharmony_ci btrfs_dev_name(src_device), src_device->devid, 114262306a36Sopenharmony_ci btrfs_dev_name(tgt_device)); 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci if (tgt_device) 114562306a36Sopenharmony_ci btrfs_destroy_dev_replace_tgtdev(tgt_device); 114662306a36Sopenharmony_ci break; 114762306a36Sopenharmony_ci default: 114862306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 114962306a36Sopenharmony_ci result = -EINVAL; 115062306a36Sopenharmony_ci } 115162306a36Sopenharmony_ci 115262306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 115362306a36Sopenharmony_ci return result; 115462306a36Sopenharmony_ci} 115562306a36Sopenharmony_ci 115662306a36Sopenharmony_civoid btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) 115762306a36Sopenharmony_ci{ 115862306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 115962306a36Sopenharmony_ci 116062306a36Sopenharmony_ci mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 116162306a36Sopenharmony_ci 
down_write(&dev_replace->rwsem); 116262306a36Sopenharmony_ci 116362306a36Sopenharmony_ci switch (dev_replace->replace_state) { 116462306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 116562306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 116662306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 116762306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 116862306a36Sopenharmony_ci break; 116962306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 117062306a36Sopenharmony_ci dev_replace->replace_state = 117162306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 117262306a36Sopenharmony_ci dev_replace->time_stopped = ktime_get_real_seconds(); 117362306a36Sopenharmony_ci dev_replace->item_needs_writeback = 1; 117462306a36Sopenharmony_ci btrfs_info(fs_info, "suspending dev_replace for unmount"); 117562306a36Sopenharmony_ci break; 117662306a36Sopenharmony_ci } 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 117962306a36Sopenharmony_ci mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 118062306a36Sopenharmony_ci} 118162306a36Sopenharmony_ci 118262306a36Sopenharmony_ci/* resume dev_replace procedure that was interrupted by unmount */ 118362306a36Sopenharmony_ciint btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) 118462306a36Sopenharmony_ci{ 118562306a36Sopenharmony_ci struct task_struct *task; 118662306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 118962306a36Sopenharmony_ci 119062306a36Sopenharmony_ci switch (dev_replace->replace_state) { 119162306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 119262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 119362306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 119462306a36Sopenharmony_ci 
up_write(&dev_replace->rwsem); 119562306a36Sopenharmony_ci return 0; 119662306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 119762306a36Sopenharmony_ci break; 119862306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 119962306a36Sopenharmony_ci dev_replace->replace_state = 120062306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 120162306a36Sopenharmony_ci break; 120262306a36Sopenharmony_ci } 120362306a36Sopenharmony_ci if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 120462306a36Sopenharmony_ci btrfs_info(fs_info, 120562306a36Sopenharmony_ci "cannot continue dev_replace, tgtdev is missing"); 120662306a36Sopenharmony_ci btrfs_info(fs_info, 120762306a36Sopenharmony_ci "you may cancel the operation after 'mount -o degraded'"); 120862306a36Sopenharmony_ci dev_replace->replace_state = 120962306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 121062306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 121162306a36Sopenharmony_ci return 0; 121262306a36Sopenharmony_ci } 121362306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 121462306a36Sopenharmony_ci 121562306a36Sopenharmony_ci /* 121662306a36Sopenharmony_ci * This could collide with a paused balance, but the exclusive op logic 121762306a36Sopenharmony_ci * should never allow both to start and pause. We don't want to allow 121862306a36Sopenharmony_ci * dev-replace to start anyway. 
121962306a36Sopenharmony_ci */ 122062306a36Sopenharmony_ci if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { 122162306a36Sopenharmony_ci down_write(&dev_replace->rwsem); 122262306a36Sopenharmony_ci dev_replace->replace_state = 122362306a36Sopenharmony_ci BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 122462306a36Sopenharmony_ci up_write(&dev_replace->rwsem); 122562306a36Sopenharmony_ci btrfs_info(fs_info, 122662306a36Sopenharmony_ci "cannot resume dev-replace, other exclusive operation running"); 122762306a36Sopenharmony_ci return 0; 122862306a36Sopenharmony_ci } 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 123162306a36Sopenharmony_ci return PTR_ERR_OR_ZERO(task); 123262306a36Sopenharmony_ci} 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_cistatic int btrfs_dev_replace_kthread(void *data) 123562306a36Sopenharmony_ci{ 123662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = data; 123762306a36Sopenharmony_ci struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 123862306a36Sopenharmony_ci u64 progress; 123962306a36Sopenharmony_ci int ret; 124062306a36Sopenharmony_ci 124162306a36Sopenharmony_ci progress = btrfs_dev_replace_progress(fs_info); 124262306a36Sopenharmony_ci progress = div_u64(progress, 10); 124362306a36Sopenharmony_ci btrfs_info_in_rcu(fs_info, 124462306a36Sopenharmony_ci "continuing dev_replace from %s (devid %llu) to target %s @%u%%", 124562306a36Sopenharmony_ci btrfs_dev_name(dev_replace->srcdev), 124662306a36Sopenharmony_ci dev_replace->srcdev->devid, 124762306a36Sopenharmony_ci btrfs_dev_name(dev_replace->tgtdev), 124862306a36Sopenharmony_ci (unsigned int)progress); 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 125162306a36Sopenharmony_ci dev_replace->committed_cursor_left, 125262306a36Sopenharmony_ci btrfs_device_get_total_bytes(dev_replace->srcdev), 
125362306a36Sopenharmony_ci &dev_replace->scrub_progress, 0, 1); 125462306a36Sopenharmony_ci ret = btrfs_dev_replace_finishing(fs_info, ret); 125562306a36Sopenharmony_ci WARN_ON(ret && ret != -ECANCELED); 125662306a36Sopenharmony_ci 125762306a36Sopenharmony_ci btrfs_exclop_finish(fs_info); 125862306a36Sopenharmony_ci return 0; 125962306a36Sopenharmony_ci} 126062306a36Sopenharmony_ci 126162306a36Sopenharmony_ciint __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) 126262306a36Sopenharmony_ci{ 126362306a36Sopenharmony_ci if (!dev_replace->is_valid) 126462306a36Sopenharmony_ci return 0; 126562306a36Sopenharmony_ci 126662306a36Sopenharmony_ci switch (dev_replace->replace_state) { 126762306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 126862306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 126962306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 127062306a36Sopenharmony_ci return 0; 127162306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 127262306a36Sopenharmony_ci case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 127362306a36Sopenharmony_ci /* 127462306a36Sopenharmony_ci * return true even if tgtdev is missing (this is 127562306a36Sopenharmony_ci * something that can happen if the dev_replace 127662306a36Sopenharmony_ci * procedure is suspended by an umount and then 127762306a36Sopenharmony_ci * the tgtdev is missing (or "btrfs dev scan") was 127862306a36Sopenharmony_ci * not called and the filesystem is remounted 127962306a36Sopenharmony_ci * in degraded state. This does not stop the 128062306a36Sopenharmony_ci * dev_replace procedure. It needs to be canceled 128162306a36Sopenharmony_ci * manually if the cancellation is wanted. 
128262306a36Sopenharmony_ci */ 128362306a36Sopenharmony_ci break; 128462306a36Sopenharmony_ci } 128562306a36Sopenharmony_ci return 1; 128662306a36Sopenharmony_ci} 128762306a36Sopenharmony_ci 128862306a36Sopenharmony_civoid btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) 128962306a36Sopenharmony_ci{ 129062306a36Sopenharmony_ci percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); 129162306a36Sopenharmony_ci cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); 129262306a36Sopenharmony_ci} 129362306a36Sopenharmony_ci 129462306a36Sopenharmony_civoid btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) 129562306a36Sopenharmony_ci{ 129662306a36Sopenharmony_ci while (1) { 129762306a36Sopenharmony_ci percpu_counter_inc(&fs_info->dev_replace.bio_counter); 129862306a36Sopenharmony_ci if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, 129962306a36Sopenharmony_ci &fs_info->fs_state))) 130062306a36Sopenharmony_ci break; 130162306a36Sopenharmony_ci 130262306a36Sopenharmony_ci btrfs_bio_counter_dec(fs_info); 130362306a36Sopenharmony_ci wait_event(fs_info->dev_replace.replace_wait, 130462306a36Sopenharmony_ci !test_bit(BTRFS_FS_STATE_DEV_REPLACING, 130562306a36Sopenharmony_ci &fs_info->fs_state)); 130662306a36Sopenharmony_ci } 130762306a36Sopenharmony_ci} 1308