162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) STRATO AG 2012.  All rights reserved.
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/sched.h>
762306a36Sopenharmony_ci#include <linux/bio.h>
862306a36Sopenharmony_ci#include <linux/slab.h>
962306a36Sopenharmony_ci#include <linux/blkdev.h>
1062306a36Sopenharmony_ci#include <linux/kthread.h>
1162306a36Sopenharmony_ci#include <linux/math64.h>
1262306a36Sopenharmony_ci#include "misc.h"
1362306a36Sopenharmony_ci#include "ctree.h"
1462306a36Sopenharmony_ci#include "extent_map.h"
1562306a36Sopenharmony_ci#include "disk-io.h"
1662306a36Sopenharmony_ci#include "transaction.h"
1762306a36Sopenharmony_ci#include "print-tree.h"
1862306a36Sopenharmony_ci#include "volumes.h"
1962306a36Sopenharmony_ci#include "async-thread.h"
2062306a36Sopenharmony_ci#include "check-integrity.h"
2162306a36Sopenharmony_ci#include "dev-replace.h"
2262306a36Sopenharmony_ci#include "sysfs.h"
2362306a36Sopenharmony_ci#include "zoned.h"
2462306a36Sopenharmony_ci#include "block-group.h"
2562306a36Sopenharmony_ci#include "fs.h"
2662306a36Sopenharmony_ci#include "accessors.h"
2762306a36Sopenharmony_ci#include "scrub.h"
2862306a36Sopenharmony_ci
/*
 * Device replace overview
 *
 * [Objective]
 * To copy all extents (both new and on-disk) from source device to target
 * device, while still keeping the filesystem read-write.
 *
 * [Method]
 * There are two main methods involved:
 *
 * - Write duplication
 *
 *   All new writes will be written to both target and source devices, so even
 *   if replace gets canceled, the source device still contains up-to-date data.
 *
 *   Location:		handle_ops_on_dev_replace() from btrfs_map_block()
 *   Start:		btrfs_dev_replace_start()
 *   End:		btrfs_dev_replace_finishing()
 *   Content:		Latest data/metadata
 *
 * - Copy existing extents
 *
 *   This happens by re-using scrub facility, as scrub also iterates through
 *   existing extents from commit root.
 *
 *   Location:		scrub_write_block_to_dev_replace() from
 *   			scrub_block_complete()
 *   Content:		Data/meta from commit root.
 *
 * Due to the content difference, we need to avoid nocow write when dev-replace
 * is happening.  This is done by marking the block group read-only and waiting
 * for NOCOW writes.
 *
 * After replace is done, the finishing part is done by swapping the target and
 * source devices.
 *
 *   Location:		btrfs_dev_replace_update_device_in_mapping_tree() from
 *   			btrfs_dev_replace_finishing()
 */
6862306a36Sopenharmony_ci
/* Prototypes of file-local (static) functions. */
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret);
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ciint btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
7462306a36Sopenharmony_ci{
7562306a36Sopenharmony_ci	struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
7662306a36Sopenharmony_ci	struct btrfs_key key;
7762306a36Sopenharmony_ci	struct btrfs_root *dev_root = fs_info->dev_root;
7862306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
7962306a36Sopenharmony_ci	struct extent_buffer *eb;
8062306a36Sopenharmony_ci	int slot;
8162306a36Sopenharmony_ci	int ret = 0;
8262306a36Sopenharmony_ci	struct btrfs_path *path = NULL;
8362306a36Sopenharmony_ci	int item_size;
8462306a36Sopenharmony_ci	struct btrfs_dev_replace_item *ptr;
8562306a36Sopenharmony_ci	u64 src_devid;
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	if (!dev_root)
8862306a36Sopenharmony_ci		return 0;
8962306a36Sopenharmony_ci
9062306a36Sopenharmony_ci	path = btrfs_alloc_path();
9162306a36Sopenharmony_ci	if (!path) {
9262306a36Sopenharmony_ci		ret = -ENOMEM;
9362306a36Sopenharmony_ci		goto out;
9462306a36Sopenharmony_ci	}
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci	key.objectid = 0;
9762306a36Sopenharmony_ci	key.type = BTRFS_DEV_REPLACE_KEY;
9862306a36Sopenharmony_ci	key.offset = 0;
9962306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
10062306a36Sopenharmony_ci	if (ret) {
10162306a36Sopenharmony_cino_valid_dev_replace_entry_found:
10262306a36Sopenharmony_ci		/*
10362306a36Sopenharmony_ci		 * We don't have a replace item or it's corrupted.  If there is
10462306a36Sopenharmony_ci		 * a replace target, fail the mount.
10562306a36Sopenharmony_ci		 */
10662306a36Sopenharmony_ci		if (btrfs_find_device(fs_info->fs_devices, &args)) {
10762306a36Sopenharmony_ci			btrfs_err(fs_info,
10862306a36Sopenharmony_ci			"found replace target device without a valid replace item");
10962306a36Sopenharmony_ci			ret = -EUCLEAN;
11062306a36Sopenharmony_ci			goto out;
11162306a36Sopenharmony_ci		}
11262306a36Sopenharmony_ci		ret = 0;
11362306a36Sopenharmony_ci		dev_replace->replace_state =
11462306a36Sopenharmony_ci			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
11562306a36Sopenharmony_ci		dev_replace->cont_reading_from_srcdev_mode =
11662306a36Sopenharmony_ci		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
11762306a36Sopenharmony_ci		dev_replace->time_started = 0;
11862306a36Sopenharmony_ci		dev_replace->time_stopped = 0;
11962306a36Sopenharmony_ci		atomic64_set(&dev_replace->num_write_errors, 0);
12062306a36Sopenharmony_ci		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
12162306a36Sopenharmony_ci		dev_replace->cursor_left = 0;
12262306a36Sopenharmony_ci		dev_replace->committed_cursor_left = 0;
12362306a36Sopenharmony_ci		dev_replace->cursor_left_last_write_of_item = 0;
12462306a36Sopenharmony_ci		dev_replace->cursor_right = 0;
12562306a36Sopenharmony_ci		dev_replace->srcdev = NULL;
12662306a36Sopenharmony_ci		dev_replace->tgtdev = NULL;
12762306a36Sopenharmony_ci		dev_replace->is_valid = 0;
12862306a36Sopenharmony_ci		dev_replace->item_needs_writeback = 0;
12962306a36Sopenharmony_ci		goto out;
13062306a36Sopenharmony_ci	}
13162306a36Sopenharmony_ci	slot = path->slots[0];
13262306a36Sopenharmony_ci	eb = path->nodes[0];
13362306a36Sopenharmony_ci	item_size = btrfs_item_size(eb, slot);
13462306a36Sopenharmony_ci	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
13762306a36Sopenharmony_ci		btrfs_warn(fs_info,
13862306a36Sopenharmony_ci			"dev_replace entry found has unexpected size, ignore entry");
13962306a36Sopenharmony_ci		goto no_valid_dev_replace_entry_found;
14062306a36Sopenharmony_ci	}
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
14362306a36Sopenharmony_ci	dev_replace->cont_reading_from_srcdev_mode =
14462306a36Sopenharmony_ci		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
14562306a36Sopenharmony_ci	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
14662306a36Sopenharmony_ci	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
14762306a36Sopenharmony_ci	dev_replace->time_stopped =
14862306a36Sopenharmony_ci		btrfs_dev_replace_time_stopped(eb, ptr);
14962306a36Sopenharmony_ci	atomic64_set(&dev_replace->num_write_errors,
15062306a36Sopenharmony_ci		     btrfs_dev_replace_num_write_errors(eb, ptr));
15162306a36Sopenharmony_ci	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
15262306a36Sopenharmony_ci		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
15362306a36Sopenharmony_ci	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
15462306a36Sopenharmony_ci	dev_replace->committed_cursor_left = dev_replace->cursor_left;
15562306a36Sopenharmony_ci	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
15662306a36Sopenharmony_ci	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
15762306a36Sopenharmony_ci	dev_replace->is_valid = 1;
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	dev_replace->item_needs_writeback = 0;
16062306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
16162306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
16262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
16362306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
16462306a36Sopenharmony_ci		/*
16562306a36Sopenharmony_ci		 * We don't have an active replace item but if there is a
16662306a36Sopenharmony_ci		 * replace target, fail the mount.
16762306a36Sopenharmony_ci		 */
16862306a36Sopenharmony_ci		if (btrfs_find_device(fs_info->fs_devices, &args)) {
16962306a36Sopenharmony_ci			btrfs_err(fs_info,
17062306a36Sopenharmony_ci"replace without active item, run 'device scan --forget' on the target device");
17162306a36Sopenharmony_ci			ret = -EUCLEAN;
17262306a36Sopenharmony_ci		} else {
17362306a36Sopenharmony_ci			dev_replace->srcdev = NULL;
17462306a36Sopenharmony_ci			dev_replace->tgtdev = NULL;
17562306a36Sopenharmony_ci		}
17662306a36Sopenharmony_ci		break;
17762306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
17862306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
17962306a36Sopenharmony_ci		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
18062306a36Sopenharmony_ci		args.devid = src_devid;
18162306a36Sopenharmony_ci		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci		/*
18462306a36Sopenharmony_ci		 * allow 'btrfs dev replace_cancel' if src/tgt device is
18562306a36Sopenharmony_ci		 * missing
18662306a36Sopenharmony_ci		 */
18762306a36Sopenharmony_ci		if (!dev_replace->srcdev &&
18862306a36Sopenharmony_ci		    !btrfs_test_opt(fs_info, DEGRADED)) {
18962306a36Sopenharmony_ci			ret = -EIO;
19062306a36Sopenharmony_ci			btrfs_warn(fs_info,
19162306a36Sopenharmony_ci			   "cannot mount because device replace operation is ongoing and");
19262306a36Sopenharmony_ci			btrfs_warn(fs_info,
19362306a36Sopenharmony_ci			   "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
19462306a36Sopenharmony_ci			   src_devid);
19562306a36Sopenharmony_ci		}
19662306a36Sopenharmony_ci		if (!dev_replace->tgtdev &&
19762306a36Sopenharmony_ci		    !btrfs_test_opt(fs_info, DEGRADED)) {
19862306a36Sopenharmony_ci			ret = -EIO;
19962306a36Sopenharmony_ci			btrfs_warn(fs_info,
20062306a36Sopenharmony_ci			   "cannot mount because device replace operation is ongoing and");
20162306a36Sopenharmony_ci			btrfs_warn(fs_info,
20262306a36Sopenharmony_ci			   "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
20362306a36Sopenharmony_ci				BTRFS_DEV_REPLACE_DEVID);
20462306a36Sopenharmony_ci		}
20562306a36Sopenharmony_ci		if (dev_replace->tgtdev) {
20662306a36Sopenharmony_ci			if (dev_replace->srcdev) {
20762306a36Sopenharmony_ci				dev_replace->tgtdev->total_bytes =
20862306a36Sopenharmony_ci					dev_replace->srcdev->total_bytes;
20962306a36Sopenharmony_ci				dev_replace->tgtdev->disk_total_bytes =
21062306a36Sopenharmony_ci					dev_replace->srcdev->disk_total_bytes;
21162306a36Sopenharmony_ci				dev_replace->tgtdev->commit_total_bytes =
21262306a36Sopenharmony_ci					dev_replace->srcdev->commit_total_bytes;
21362306a36Sopenharmony_ci				dev_replace->tgtdev->bytes_used =
21462306a36Sopenharmony_ci					dev_replace->srcdev->bytes_used;
21562306a36Sopenharmony_ci				dev_replace->tgtdev->commit_bytes_used =
21662306a36Sopenharmony_ci					dev_replace->srcdev->commit_bytes_used;
21762306a36Sopenharmony_ci			}
21862306a36Sopenharmony_ci			set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
21962306a36Sopenharmony_ci				&dev_replace->tgtdev->dev_state);
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ci			WARN_ON(fs_info->fs_devices->rw_devices == 0);
22262306a36Sopenharmony_ci			dev_replace->tgtdev->io_width = fs_info->sectorsize;
22362306a36Sopenharmony_ci			dev_replace->tgtdev->io_align = fs_info->sectorsize;
22462306a36Sopenharmony_ci			dev_replace->tgtdev->sector_size = fs_info->sectorsize;
22562306a36Sopenharmony_ci			dev_replace->tgtdev->fs_info = fs_info;
22662306a36Sopenharmony_ci			set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
22762306a36Sopenharmony_ci				&dev_replace->tgtdev->dev_state);
22862306a36Sopenharmony_ci		}
22962306a36Sopenharmony_ci		break;
23062306a36Sopenharmony_ci	}
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ciout:
23362306a36Sopenharmony_ci	btrfs_free_path(path);
23462306a36Sopenharmony_ci	return ret;
23562306a36Sopenharmony_ci}
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_ci/*
23862306a36Sopenharmony_ci * Initialize a new device for device replace target from a given source dev
23962306a36Sopenharmony_ci * and path.
24062306a36Sopenharmony_ci *
24162306a36Sopenharmony_ci * Return 0 and new device in @device_out, otherwise return < 0
24262306a36Sopenharmony_ci */
24362306a36Sopenharmony_cistatic int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
24462306a36Sopenharmony_ci				  const char *device_path,
24562306a36Sopenharmony_ci				  struct btrfs_device *srcdev,
24662306a36Sopenharmony_ci				  struct btrfs_device **device_out)
24762306a36Sopenharmony_ci{
24862306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
24962306a36Sopenharmony_ci	struct btrfs_device *device;
25062306a36Sopenharmony_ci	struct block_device *bdev;
25162306a36Sopenharmony_ci	u64 devid = BTRFS_DEV_REPLACE_DEVID;
25262306a36Sopenharmony_ci	int ret = 0;
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci	*device_out = NULL;
25562306a36Sopenharmony_ci	if (srcdev->fs_devices->seeding) {
25662306a36Sopenharmony_ci		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
25762306a36Sopenharmony_ci		return -EINVAL;
25862306a36Sopenharmony_ci	}
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
26162306a36Sopenharmony_ci				  fs_info->bdev_holder, NULL);
26262306a36Sopenharmony_ci	if (IS_ERR(bdev)) {
26362306a36Sopenharmony_ci		btrfs_err(fs_info, "target device %s is invalid!", device_path);
26462306a36Sopenharmony_ci		return PTR_ERR(bdev);
26562306a36Sopenharmony_ci	}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
26862306a36Sopenharmony_ci		btrfs_err(fs_info,
26962306a36Sopenharmony_ci		"dev-replace: zoned type of target device mismatch with filesystem");
27062306a36Sopenharmony_ci		ret = -EINVAL;
27162306a36Sopenharmony_ci		goto error;
27262306a36Sopenharmony_ci	}
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci	sync_blockdev(bdev);
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
27762306a36Sopenharmony_ci		if (device->bdev == bdev) {
27862306a36Sopenharmony_ci			btrfs_err(fs_info,
27962306a36Sopenharmony_ci				  "target device is in the filesystem!");
28062306a36Sopenharmony_ci			ret = -EEXIST;
28162306a36Sopenharmony_ci			goto error;
28262306a36Sopenharmony_ci		}
28362306a36Sopenharmony_ci	}
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
28762306a36Sopenharmony_ci		btrfs_err(fs_info,
28862306a36Sopenharmony_ci			  "target device is smaller than source device!");
28962306a36Sopenharmony_ci		ret = -EINVAL;
29062306a36Sopenharmony_ci		goto error;
29162306a36Sopenharmony_ci	}
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci	device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
29562306a36Sopenharmony_ci	if (IS_ERR(device)) {
29662306a36Sopenharmony_ci		ret = PTR_ERR(device);
29762306a36Sopenharmony_ci		goto error;
29862306a36Sopenharmony_ci	}
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_ci	ret = lookup_bdev(device_path, &device->devt);
30162306a36Sopenharmony_ci	if (ret)
30262306a36Sopenharmony_ci		goto error;
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
30562306a36Sopenharmony_ci	device->generation = 0;
30662306a36Sopenharmony_ci	device->io_width = fs_info->sectorsize;
30762306a36Sopenharmony_ci	device->io_align = fs_info->sectorsize;
30862306a36Sopenharmony_ci	device->sector_size = fs_info->sectorsize;
30962306a36Sopenharmony_ci	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
31062306a36Sopenharmony_ci	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
31162306a36Sopenharmony_ci	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
31262306a36Sopenharmony_ci	device->commit_total_bytes = srcdev->commit_total_bytes;
31362306a36Sopenharmony_ci	device->commit_bytes_used = device->bytes_used;
31462306a36Sopenharmony_ci	device->fs_info = fs_info;
31562306a36Sopenharmony_ci	device->bdev = bdev;
31662306a36Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
31762306a36Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
31862306a36Sopenharmony_ci	device->holder = fs_info->bdev_holder;
31962306a36Sopenharmony_ci	device->dev_stats_valid = 1;
32062306a36Sopenharmony_ci	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
32162306a36Sopenharmony_ci	device->fs_devices = fs_devices;
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	ret = btrfs_get_dev_zone_info(device, false);
32462306a36Sopenharmony_ci	if (ret)
32562306a36Sopenharmony_ci		goto error;
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
32862306a36Sopenharmony_ci	list_add(&device->dev_list, &fs_devices->devices);
32962306a36Sopenharmony_ci	fs_devices->num_devices++;
33062306a36Sopenharmony_ci	fs_devices->open_devices++;
33162306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci	*device_out = device;
33462306a36Sopenharmony_ci	return 0;
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_cierror:
33762306a36Sopenharmony_ci	blkdev_put(bdev, fs_info->bdev_holder);
33862306a36Sopenharmony_ci	return ret;
33962306a36Sopenharmony_ci}
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci/*
34262306a36Sopenharmony_ci * called from commit_transaction. Writes changed device replace state to
34362306a36Sopenharmony_ci * disk.
34462306a36Sopenharmony_ci */
34562306a36Sopenharmony_ciint btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
34662306a36Sopenharmony_ci{
34762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
34862306a36Sopenharmony_ci	int ret;
34962306a36Sopenharmony_ci	struct btrfs_root *dev_root = fs_info->dev_root;
35062306a36Sopenharmony_ci	struct btrfs_path *path;
35162306a36Sopenharmony_ci	struct btrfs_key key;
35262306a36Sopenharmony_ci	struct extent_buffer *eb;
35362306a36Sopenharmony_ci	struct btrfs_dev_replace_item *ptr;
35462306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	down_read(&dev_replace->rwsem);
35762306a36Sopenharmony_ci	if (!dev_replace->is_valid ||
35862306a36Sopenharmony_ci	    !dev_replace->item_needs_writeback) {
35962306a36Sopenharmony_ci		up_read(&dev_replace->rwsem);
36062306a36Sopenharmony_ci		return 0;
36162306a36Sopenharmony_ci	}
36262306a36Sopenharmony_ci	up_read(&dev_replace->rwsem);
36362306a36Sopenharmony_ci
36462306a36Sopenharmony_ci	key.objectid = 0;
36562306a36Sopenharmony_ci	key.type = BTRFS_DEV_REPLACE_KEY;
36662306a36Sopenharmony_ci	key.offset = 0;
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	path = btrfs_alloc_path();
36962306a36Sopenharmony_ci	if (!path) {
37062306a36Sopenharmony_ci		ret = -ENOMEM;
37162306a36Sopenharmony_ci		goto out;
37262306a36Sopenharmony_ci	}
37362306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
37462306a36Sopenharmony_ci	if (ret < 0) {
37562306a36Sopenharmony_ci		btrfs_warn(fs_info,
37662306a36Sopenharmony_ci			   "error %d while searching for dev_replace item!",
37762306a36Sopenharmony_ci			   ret);
37862306a36Sopenharmony_ci		goto out;
37962306a36Sopenharmony_ci	}
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci	if (ret == 0 &&
38262306a36Sopenharmony_ci	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
38362306a36Sopenharmony_ci		/*
38462306a36Sopenharmony_ci		 * need to delete old one and insert a new one.
38562306a36Sopenharmony_ci		 * Since no attempt is made to recover any old state, if the
38662306a36Sopenharmony_ci		 * dev_replace state is 'running', the data on the target
38762306a36Sopenharmony_ci		 * drive is lost.
38862306a36Sopenharmony_ci		 * It would be possible to recover the state: just make sure
38962306a36Sopenharmony_ci		 * that the beginning of the item is never changed and always
39062306a36Sopenharmony_ci		 * contains all the essential information. Then read this
39162306a36Sopenharmony_ci		 * minimal set of information and use it as a base for the
39262306a36Sopenharmony_ci		 * new state.
39362306a36Sopenharmony_ci		 */
39462306a36Sopenharmony_ci		ret = btrfs_del_item(trans, dev_root, path);
39562306a36Sopenharmony_ci		if (ret != 0) {
39662306a36Sopenharmony_ci			btrfs_warn(fs_info,
39762306a36Sopenharmony_ci				   "delete too small dev_replace item failed %d!",
39862306a36Sopenharmony_ci				   ret);
39962306a36Sopenharmony_ci			goto out;
40062306a36Sopenharmony_ci		}
40162306a36Sopenharmony_ci		ret = 1;
40262306a36Sopenharmony_ci	}
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci	if (ret == 1) {
40562306a36Sopenharmony_ci		/* need to insert a new item */
40662306a36Sopenharmony_ci		btrfs_release_path(path);
40762306a36Sopenharmony_ci		ret = btrfs_insert_empty_item(trans, dev_root, path,
40862306a36Sopenharmony_ci					      &key, sizeof(*ptr));
40962306a36Sopenharmony_ci		if (ret < 0) {
41062306a36Sopenharmony_ci			btrfs_warn(fs_info,
41162306a36Sopenharmony_ci				   "insert dev_replace item failed %d!", ret);
41262306a36Sopenharmony_ci			goto out;
41362306a36Sopenharmony_ci		}
41462306a36Sopenharmony_ci	}
41562306a36Sopenharmony_ci
41662306a36Sopenharmony_ci	eb = path->nodes[0];
41762306a36Sopenharmony_ci	ptr = btrfs_item_ptr(eb, path->slots[0],
41862306a36Sopenharmony_ci			     struct btrfs_dev_replace_item);
41962306a36Sopenharmony_ci
42062306a36Sopenharmony_ci	down_write(&dev_replace->rwsem);
42162306a36Sopenharmony_ci	if (dev_replace->srcdev)
42262306a36Sopenharmony_ci		btrfs_set_dev_replace_src_devid(eb, ptr,
42362306a36Sopenharmony_ci			dev_replace->srcdev->devid);
42462306a36Sopenharmony_ci	else
42562306a36Sopenharmony_ci		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
42662306a36Sopenharmony_ci	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
42762306a36Sopenharmony_ci		dev_replace->cont_reading_from_srcdev_mode);
42862306a36Sopenharmony_ci	btrfs_set_dev_replace_replace_state(eb, ptr,
42962306a36Sopenharmony_ci		dev_replace->replace_state);
43062306a36Sopenharmony_ci	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
43162306a36Sopenharmony_ci	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
43262306a36Sopenharmony_ci	btrfs_set_dev_replace_num_write_errors(eb, ptr,
43362306a36Sopenharmony_ci		atomic64_read(&dev_replace->num_write_errors));
43462306a36Sopenharmony_ci	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
43562306a36Sopenharmony_ci		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
43662306a36Sopenharmony_ci	dev_replace->cursor_left_last_write_of_item =
43762306a36Sopenharmony_ci		dev_replace->cursor_left;
43862306a36Sopenharmony_ci	btrfs_set_dev_replace_cursor_left(eb, ptr,
43962306a36Sopenharmony_ci		dev_replace->cursor_left_last_write_of_item);
44062306a36Sopenharmony_ci	btrfs_set_dev_replace_cursor_right(eb, ptr,
44162306a36Sopenharmony_ci		dev_replace->cursor_right);
44262306a36Sopenharmony_ci	dev_replace->item_needs_writeback = 0;
44362306a36Sopenharmony_ci	up_write(&dev_replace->rwsem);
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_ci	btrfs_mark_buffer_dirty(trans, eb);
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_ciout:
44862306a36Sopenharmony_ci	btrfs_free_path(path);
44962306a36Sopenharmony_ci
45062306a36Sopenharmony_ci	return ret;
45162306a36Sopenharmony_ci}
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_cistatic int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
45462306a36Sopenharmony_ci				    struct btrfs_device *src_dev)
45562306a36Sopenharmony_ci{
45662306a36Sopenharmony_ci	struct btrfs_path *path;
45762306a36Sopenharmony_ci	struct btrfs_key key;
45862306a36Sopenharmony_ci	struct btrfs_key found_key;
45962306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
46062306a36Sopenharmony_ci	struct btrfs_dev_extent *dev_extent = NULL;
46162306a36Sopenharmony_ci	struct btrfs_block_group *cache;
46262306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
46362306a36Sopenharmony_ci	int iter_ret = 0;
46462306a36Sopenharmony_ci	int ret = 0;
46562306a36Sopenharmony_ci	u64 chunk_offset;
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci	/* Do not use "to_copy" on non zoned filesystem for now */
46862306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
46962306a36Sopenharmony_ci		return 0;
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	/* Ensure we don't have pending new block group */
47462306a36Sopenharmony_ci	spin_lock(&fs_info->trans_lock);
47562306a36Sopenharmony_ci	while (fs_info->running_transaction &&
47662306a36Sopenharmony_ci	       !list_empty(&fs_info->running_transaction->dev_update_list)) {
47762306a36Sopenharmony_ci		spin_unlock(&fs_info->trans_lock);
47862306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
47962306a36Sopenharmony_ci		trans = btrfs_attach_transaction(root);
48062306a36Sopenharmony_ci		if (IS_ERR(trans)) {
48162306a36Sopenharmony_ci			ret = PTR_ERR(trans);
48262306a36Sopenharmony_ci			mutex_lock(&fs_info->chunk_mutex);
48362306a36Sopenharmony_ci			if (ret == -ENOENT) {
48462306a36Sopenharmony_ci				spin_lock(&fs_info->trans_lock);
48562306a36Sopenharmony_ci				continue;
48662306a36Sopenharmony_ci			} else {
48762306a36Sopenharmony_ci				goto unlock;
48862306a36Sopenharmony_ci			}
48962306a36Sopenharmony_ci		}
49062306a36Sopenharmony_ci
49162306a36Sopenharmony_ci		ret = btrfs_commit_transaction(trans);
49262306a36Sopenharmony_ci		mutex_lock(&fs_info->chunk_mutex);
49362306a36Sopenharmony_ci		if (ret)
49462306a36Sopenharmony_ci			goto unlock;
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci		spin_lock(&fs_info->trans_lock);
49762306a36Sopenharmony_ci	}
49862306a36Sopenharmony_ci	spin_unlock(&fs_info->trans_lock);
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	path = btrfs_alloc_path();
50162306a36Sopenharmony_ci	if (!path) {
50262306a36Sopenharmony_ci		ret = -ENOMEM;
50362306a36Sopenharmony_ci		goto unlock;
50462306a36Sopenharmony_ci	}
50562306a36Sopenharmony_ci
50662306a36Sopenharmony_ci	path->reada = READA_FORWARD;
50762306a36Sopenharmony_ci	path->search_commit_root = 1;
50862306a36Sopenharmony_ci	path->skip_locking = 1;
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	key.objectid = src_dev->devid;
51162306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
51262306a36Sopenharmony_ci	key.offset = 0;
51362306a36Sopenharmony_ci
51462306a36Sopenharmony_ci	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
51562306a36Sopenharmony_ci		struct extent_buffer *leaf = path->nodes[0];
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci		if (found_key.objectid != src_dev->devid)
51862306a36Sopenharmony_ci			break;
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
52162306a36Sopenharmony_ci			break;
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci		if (found_key.offset < key.offset)
52462306a36Sopenharmony_ci			break;
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci		dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
52762306a36Sopenharmony_ci
52862306a36Sopenharmony_ci		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
53162306a36Sopenharmony_ci		if (!cache)
53262306a36Sopenharmony_ci			continue;
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_ci		set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
53562306a36Sopenharmony_ci		btrfs_put_block_group(cache);
53662306a36Sopenharmony_ci	}
53762306a36Sopenharmony_ci	if (iter_ret < 0)
53862306a36Sopenharmony_ci		ret = iter_ret;
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci	btrfs_free_path(path);
54162306a36Sopenharmony_ciunlock:
54262306a36Sopenharmony_ci	mutex_unlock(&fs_info->chunk_mutex);
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci	return ret;
54562306a36Sopenharmony_ci}
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_cibool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
54862306a36Sopenharmony_ci				      struct btrfs_block_group *cache,
54962306a36Sopenharmony_ci				      u64 physical)
55062306a36Sopenharmony_ci{
55162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = cache->fs_info;
55262306a36Sopenharmony_ci	struct extent_map *em;
55362306a36Sopenharmony_ci	struct map_lookup *map;
55462306a36Sopenharmony_ci	u64 chunk_offset = cache->start;
55562306a36Sopenharmony_ci	int num_extents, cur_extent;
55662306a36Sopenharmony_ci	int i;
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_ci	/* Do not use "to_copy" on non zoned filesystem for now */
55962306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
56062306a36Sopenharmony_ci		return true;
56162306a36Sopenharmony_ci
56262306a36Sopenharmony_ci	spin_lock(&cache->lock);
56362306a36Sopenharmony_ci	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
56462306a36Sopenharmony_ci		spin_unlock(&cache->lock);
56562306a36Sopenharmony_ci		return true;
56662306a36Sopenharmony_ci	}
56762306a36Sopenharmony_ci	spin_unlock(&cache->lock);
56862306a36Sopenharmony_ci
56962306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
57062306a36Sopenharmony_ci	ASSERT(!IS_ERR(em));
57162306a36Sopenharmony_ci	map = em->map_lookup;
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	num_extents = 0;
57462306a36Sopenharmony_ci	cur_extent = 0;
57562306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
57662306a36Sopenharmony_ci		/* We have more device extent to copy */
57762306a36Sopenharmony_ci		if (srcdev != map->stripes[i].dev)
57862306a36Sopenharmony_ci			continue;
57962306a36Sopenharmony_ci
58062306a36Sopenharmony_ci		num_extents++;
58162306a36Sopenharmony_ci		if (physical == map->stripes[i].physical)
58262306a36Sopenharmony_ci			cur_extent = i;
58362306a36Sopenharmony_ci	}
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	free_extent_map(em);
58662306a36Sopenharmony_ci
58762306a36Sopenharmony_ci	if (num_extents > 1 && cur_extent < num_extents - 1) {
58862306a36Sopenharmony_ci		/*
58962306a36Sopenharmony_ci		 * Has more stripes on this device. Keep this block group
59062306a36Sopenharmony_ci		 * readonly until we finish all the stripes.
59162306a36Sopenharmony_ci		 */
59262306a36Sopenharmony_ci		return false;
59362306a36Sopenharmony_ci	}
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci	/* Last stripe on this device */
59662306a36Sopenharmony_ci	clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	return true;
59962306a36Sopenharmony_ci}
60062306a36Sopenharmony_ci
60162306a36Sopenharmony_cistatic int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
60262306a36Sopenharmony_ci		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
60362306a36Sopenharmony_ci		int read_src)
60462306a36Sopenharmony_ci{
60562306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
60662306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
60762306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
60862306a36Sopenharmony_ci	int ret;
60962306a36Sopenharmony_ci	struct btrfs_device *tgt_device = NULL;
61062306a36Sopenharmony_ci	struct btrfs_device *src_device = NULL;
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
61362306a36Sopenharmony_ci						  srcdev_name);
61462306a36Sopenharmony_ci	if (IS_ERR(src_device))
61562306a36Sopenharmony_ci		return PTR_ERR(src_device);
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
61862306a36Sopenharmony_ci		btrfs_warn_in_rcu(fs_info,
61962306a36Sopenharmony_ci	  "cannot replace device %s (devid %llu) due to active swapfile",
62062306a36Sopenharmony_ci			btrfs_dev_name(src_device), src_device->devid);
62162306a36Sopenharmony_ci		return -ETXTBSY;
62262306a36Sopenharmony_ci	}
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_ci	/*
62562306a36Sopenharmony_ci	 * Here we commit the transaction to make sure commit_total_bytes
62662306a36Sopenharmony_ci	 * of all the devices are updated.
62762306a36Sopenharmony_ci	 */
62862306a36Sopenharmony_ci	trans = btrfs_attach_transaction(root);
62962306a36Sopenharmony_ci	if (!IS_ERR(trans)) {
63062306a36Sopenharmony_ci		ret = btrfs_commit_transaction(trans);
63162306a36Sopenharmony_ci		if (ret)
63262306a36Sopenharmony_ci			return ret;
63362306a36Sopenharmony_ci	} else if (PTR_ERR(trans) != -ENOENT) {
63462306a36Sopenharmony_ci		return PTR_ERR(trans);
63562306a36Sopenharmony_ci	}
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
63862306a36Sopenharmony_ci					    src_device, &tgt_device);
63962306a36Sopenharmony_ci	if (ret)
64062306a36Sopenharmony_ci		return ret;
64162306a36Sopenharmony_ci
64262306a36Sopenharmony_ci	ret = mark_block_group_to_copy(fs_info, src_device);
64362306a36Sopenharmony_ci	if (ret)
64462306a36Sopenharmony_ci		return ret;
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_ci	down_write(&dev_replace->rwsem);
64762306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
64862306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
64962306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
65062306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
65162306a36Sopenharmony_ci		break;
65262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
65362306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
65462306a36Sopenharmony_ci		ASSERT(0);
65562306a36Sopenharmony_ci		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
65662306a36Sopenharmony_ci		up_write(&dev_replace->rwsem);
65762306a36Sopenharmony_ci		goto leave;
65862306a36Sopenharmony_ci	}
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_ci	dev_replace->cont_reading_from_srcdev_mode = read_src;
66162306a36Sopenharmony_ci	dev_replace->srcdev = src_device;
66262306a36Sopenharmony_ci	dev_replace->tgtdev = tgt_device;
66362306a36Sopenharmony_ci
66462306a36Sopenharmony_ci	btrfs_info_in_rcu(fs_info,
66562306a36Sopenharmony_ci		      "dev_replace from %s (devid %llu) to %s started",
66662306a36Sopenharmony_ci		      btrfs_dev_name(src_device),
66762306a36Sopenharmony_ci		      src_device->devid,
66862306a36Sopenharmony_ci		      btrfs_dev_name(tgt_device));
66962306a36Sopenharmony_ci
67062306a36Sopenharmony_ci	/*
67162306a36Sopenharmony_ci	 * from now on, the writes to the srcdev are all duplicated to
67262306a36Sopenharmony_ci	 * go to the tgtdev as well (refer to btrfs_map_block()).
67362306a36Sopenharmony_ci	 */
67462306a36Sopenharmony_ci	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
67562306a36Sopenharmony_ci	dev_replace->time_started = ktime_get_real_seconds();
67662306a36Sopenharmony_ci	dev_replace->cursor_left = 0;
67762306a36Sopenharmony_ci	dev_replace->committed_cursor_left = 0;
67862306a36Sopenharmony_ci	dev_replace->cursor_left_last_write_of_item = 0;
67962306a36Sopenharmony_ci	dev_replace->cursor_right = 0;
68062306a36Sopenharmony_ci	dev_replace->is_valid = 1;
68162306a36Sopenharmony_ci	dev_replace->item_needs_writeback = 1;
68262306a36Sopenharmony_ci	atomic64_set(&dev_replace->num_write_errors, 0);
68362306a36Sopenharmony_ci	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
68462306a36Sopenharmony_ci	up_write(&dev_replace->rwsem);
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_ci	ret = btrfs_sysfs_add_device(tgt_device);
68762306a36Sopenharmony_ci	if (ret)
68862306a36Sopenharmony_ci		btrfs_err(fs_info, "kobj add dev failed %d", ret);
68962306a36Sopenharmony_ci
69062306a36Sopenharmony_ci	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	/*
69362306a36Sopenharmony_ci	 * Commit dev_replace state and reserve 1 item for it.
69462306a36Sopenharmony_ci	 * This is crucial to ensure we won't miss copying extents for new block
69562306a36Sopenharmony_ci	 * groups that are allocated after we started the device replace, and
69662306a36Sopenharmony_ci	 * must be done after setting up the device replace state.
69762306a36Sopenharmony_ci	 */
69862306a36Sopenharmony_ci	trans = btrfs_start_transaction(root, 1);
69962306a36Sopenharmony_ci	if (IS_ERR(trans)) {
70062306a36Sopenharmony_ci		ret = PTR_ERR(trans);
70162306a36Sopenharmony_ci		down_write(&dev_replace->rwsem);
70262306a36Sopenharmony_ci		dev_replace->replace_state =
70362306a36Sopenharmony_ci			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
70462306a36Sopenharmony_ci		dev_replace->srcdev = NULL;
70562306a36Sopenharmony_ci		dev_replace->tgtdev = NULL;
70662306a36Sopenharmony_ci		up_write(&dev_replace->rwsem);
70762306a36Sopenharmony_ci		goto leave;
70862306a36Sopenharmony_ci	}
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci	ret = btrfs_commit_transaction(trans);
71162306a36Sopenharmony_ci	WARN_ON(ret);
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci	/* the disk copy procedure reuses the scrub code */
71462306a36Sopenharmony_ci	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
71562306a36Sopenharmony_ci			      btrfs_device_get_total_bytes(src_device),
71662306a36Sopenharmony_ci			      &dev_replace->scrub_progress, 0, 1);
71762306a36Sopenharmony_ci
71862306a36Sopenharmony_ci	ret = btrfs_dev_replace_finishing(fs_info, ret);
71962306a36Sopenharmony_ci	if (ret == -EINPROGRESS)
72062306a36Sopenharmony_ci		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	return ret;
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_cileave:
72562306a36Sopenharmony_ci	btrfs_destroy_dev_replace_tgtdev(tgt_device);
72662306a36Sopenharmony_ci	return ret;
72762306a36Sopenharmony_ci}
72862306a36Sopenharmony_ci
72962306a36Sopenharmony_cistatic int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
73062306a36Sopenharmony_ci{
73162306a36Sopenharmony_ci	if (args->start.srcdevid == 0) {
73262306a36Sopenharmony_ci		if (memchr(args->start.srcdev_name, 0,
73362306a36Sopenharmony_ci			   sizeof(args->start.srcdev_name)) == NULL)
73462306a36Sopenharmony_ci			return -ENAMETOOLONG;
73562306a36Sopenharmony_ci	} else {
73662306a36Sopenharmony_ci		args->start.srcdev_name[0] = 0;
73762306a36Sopenharmony_ci	}
73862306a36Sopenharmony_ci
73962306a36Sopenharmony_ci	if (memchr(args->start.tgtdev_name, 0,
74062306a36Sopenharmony_ci		   sizeof(args->start.tgtdev_name)) == NULL)
74162306a36Sopenharmony_ci	    return -ENAMETOOLONG;
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	return 0;
74462306a36Sopenharmony_ci}
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_ciint btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
74762306a36Sopenharmony_ci			    struct btrfs_ioctl_dev_replace_args *args)
74862306a36Sopenharmony_ci{
74962306a36Sopenharmony_ci	int ret;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	switch (args->start.cont_reading_from_srcdev_mode) {
75262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
75362306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
75462306a36Sopenharmony_ci		break;
75562306a36Sopenharmony_ci	default:
75662306a36Sopenharmony_ci		return -EINVAL;
75762306a36Sopenharmony_ci	}
75862306a36Sopenharmony_ci	ret = btrfs_check_replace_dev_names(args);
75962306a36Sopenharmony_ci	if (ret < 0)
76062306a36Sopenharmony_ci		return ret;
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
76362306a36Sopenharmony_ci					args->start.srcdevid,
76462306a36Sopenharmony_ci					args->start.srcdev_name,
76562306a36Sopenharmony_ci					args->start.cont_reading_from_srcdev_mode);
76662306a36Sopenharmony_ci	args->result = ret;
76762306a36Sopenharmony_ci	/* don't warn if EINPROGRESS, someone else might be running scrub */
76862306a36Sopenharmony_ci	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
76962306a36Sopenharmony_ci	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
77062306a36Sopenharmony_ci		return 0;
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	return ret;
77362306a36Sopenharmony_ci}
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci/*
77662306a36Sopenharmony_ci * blocked until all in-flight bios operations are finished.
77762306a36Sopenharmony_ci */
77862306a36Sopenharmony_cistatic void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
77962306a36Sopenharmony_ci{
78062306a36Sopenharmony_ci	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
78162306a36Sopenharmony_ci	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
78262306a36Sopenharmony_ci		   &fs_info->dev_replace.bio_counter));
78362306a36Sopenharmony_ci}
78462306a36Sopenharmony_ci
78562306a36Sopenharmony_ci/*
78662306a36Sopenharmony_ci * we have removed target device, it is safe to allow new bios request.
78762306a36Sopenharmony_ci */
78862306a36Sopenharmony_cistatic void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
78962306a36Sopenharmony_ci{
79062306a36Sopenharmony_ci	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
79162306a36Sopenharmony_ci	wake_up(&fs_info->dev_replace.replace_wait);
79262306a36Sopenharmony_ci}
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci/*
79562306a36Sopenharmony_ci * When finishing the device replace, before swapping the source device with the
79662306a36Sopenharmony_ci * target device we must update the chunk allocation state in the target device,
79762306a36Sopenharmony_ci * as it is empty because replace works by directly copying the chunks and not
79862306a36Sopenharmony_ci * through the normal chunk allocation path.
79962306a36Sopenharmony_ci */
80062306a36Sopenharmony_cistatic int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
80162306a36Sopenharmony_ci					struct btrfs_device *tgtdev)
80262306a36Sopenharmony_ci{
80362306a36Sopenharmony_ci	struct extent_state *cached_state = NULL;
80462306a36Sopenharmony_ci	u64 start = 0;
80562306a36Sopenharmony_ci	u64 found_start;
80662306a36Sopenharmony_ci	u64 found_end;
80762306a36Sopenharmony_ci	int ret = 0;
80862306a36Sopenharmony_ci
80962306a36Sopenharmony_ci	lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_ci	while (find_first_extent_bit(&srcdev->alloc_state, start,
81262306a36Sopenharmony_ci				     &found_start, &found_end,
81362306a36Sopenharmony_ci				     CHUNK_ALLOCATED, &cached_state)) {
81462306a36Sopenharmony_ci		ret = set_extent_bit(&tgtdev->alloc_state, found_start,
81562306a36Sopenharmony_ci				     found_end, CHUNK_ALLOCATED, NULL);
81662306a36Sopenharmony_ci		if (ret)
81762306a36Sopenharmony_ci			break;
81862306a36Sopenharmony_ci		start = found_end + 1;
81962306a36Sopenharmony_ci	}
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci	free_extent_state(cached_state);
82262306a36Sopenharmony_ci	return ret;
82362306a36Sopenharmony_ci}
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_cistatic void btrfs_dev_replace_update_device_in_mapping_tree(
82662306a36Sopenharmony_ci						struct btrfs_fs_info *fs_info,
82762306a36Sopenharmony_ci						struct btrfs_device *srcdev,
82862306a36Sopenharmony_ci						struct btrfs_device *tgtdev)
82962306a36Sopenharmony_ci{
83062306a36Sopenharmony_ci	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
83162306a36Sopenharmony_ci	struct extent_map *em;
83262306a36Sopenharmony_ci	struct map_lookup *map;
83362306a36Sopenharmony_ci	u64 start = 0;
83462306a36Sopenharmony_ci	int i;
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci	write_lock(&em_tree->lock);
83762306a36Sopenharmony_ci	do {
83862306a36Sopenharmony_ci		em = lookup_extent_mapping(em_tree, start, (u64)-1);
83962306a36Sopenharmony_ci		if (!em)
84062306a36Sopenharmony_ci			break;
84162306a36Sopenharmony_ci		map = em->map_lookup;
84262306a36Sopenharmony_ci		for (i = 0; i < map->num_stripes; i++)
84362306a36Sopenharmony_ci			if (srcdev == map->stripes[i].dev)
84462306a36Sopenharmony_ci				map->stripes[i].dev = tgtdev;
84562306a36Sopenharmony_ci		start = em->start + em->len;
84662306a36Sopenharmony_ci		free_extent_map(em);
84762306a36Sopenharmony_ci	} while (start);
84862306a36Sopenharmony_ci	write_unlock(&em_tree->lock);
84962306a36Sopenharmony_ci}
85062306a36Sopenharmony_ci
/*
 * Finish a device replace once the copy phase has returned.
 *
 * @fs_info:    filesystem the replace ran on
 * @scrub_ret:  result of the copy phase; zero is treated as success
 *
 * Does nothing if the replace is not in the STARTED state. Otherwise
 * delalloc and ordered extents are flushed and the replace state is set to
 * FINISHED (@scrub_ret == 0) or CANCELED (@scrub_ret != 0). On success the
 * target device takes over the devid, uuid and size accounting of the source
 * device and replaces it in the chunk mapping tree; the source device is then
 * removed. On failure the target device is destroyed instead.
 *
 * Return: 0 on success or if there was nothing to finish; otherwise the error
 * from flushing delalloc, from starting a transaction, from
 * btrfs_set_target_alloc_state(), or @scrub_ret itself.
 */
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *tgt_device;
	struct btrfs_device *src_device;
	struct btrfs_root *root = fs_info->tree_root;
	u8 uuid_tmp[BTRFS_UUID_SIZE];
	struct btrfs_trans_handle *trans;
	int ret = 0;

	/* don't allow cancel or unmount to disturb the finishing procedure */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);

	down_read(&dev_replace->rwsem);
	/* was the operation canceled, or is it finished? */
	if (dev_replace->replace_state !=
	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
		up_read(&dev_replace->rwsem);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return 0;
	}

	tgt_device = dev_replace->tgtdev;
	src_device = dev_replace->srcdev;
	up_read(&dev_replace->rwsem);

	/*
	 * flush all outstanding I/O and inode extent mappings before the
	 * copy operation is declared as being finished
	 */
	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
	if (ret) {
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return ret;
	}
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/*
	 * We have to use this loop approach because at this point src_device
	 * has to be available for transaction commit to complete, yet new
	 * chunks shouldn't be allocated on the device.
	 *
	 * The loop is left with device_list_mutex and chunk_mutex held.
	 */
	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans);
		WARN_ON(ret);

		/* Prevent write_all_supers() during the finishing procedure */
		mutex_lock(&fs_devices->device_list_mutex);
		/* Prevent new chunks being allocated on the source device */
		mutex_lock(&fs_info->chunk_mutex);

		/*
		 * The source device is still on a post-commit list: drop both
		 * mutexes and go through another transaction commit.
		 */
		if (!list_empty(&src_device->post_commit_list)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			break;
		}
	}

	down_write(&dev_replace->rwsem);
	dev_replace->replace_state =
		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
	dev_replace->tgtdev = NULL;
	dev_replace->srcdev = NULL;
	dev_replace->time_stopped = ktime_get_real_seconds();
	dev_replace->item_needs_writeback = 1;

	/*
	 * Update allocation state in the new device and replace the old device
	 * with the new one in the mapping tree.
	 */
	if (!scrub_ret) {
		scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
		/*
		 * NOTE(review): on this failure path replace_state has already
		 * been set to FINISHED above, since scrub_ret was zero then.
		 */
		if (scrub_ret)
			goto error;
		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
								src_device,
								tgt_device);
	} else {
		if (scrub_ret != -ECANCELED)
			btrfs_err_in_rcu(fs_info,
				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
				 btrfs_dev_name(src_device),
				 src_device->devid,
				 btrfs_dev_name(tgt_device), scrub_ret);
error:
		/*
		 * Failure exit: release all locks taken above, then destroy
		 * the target device while new bios are blocked.
		 */
		up_write(&dev_replace->rwsem);
		mutex_unlock(&fs_info->chunk_mutex);
		mutex_unlock(&fs_devices->device_list_mutex);
		btrfs_rm_dev_replace_blocked(fs_info);
		if (tgt_device)
			btrfs_destroy_dev_replace_tgtdev(tgt_device);
		btrfs_rm_dev_replace_unblocked(fs_info);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

		return scrub_ret;
	}

	btrfs_info_in_rcu(fs_info,
			  "dev_replace from %s (devid %llu) to %s finished",
			  btrfs_dev_name(src_device),
			  src_device->devid,
			  btrfs_dev_name(tgt_device));
	/*
	 * Swap identities: the target takes the devid of the source, the
	 * source gets BTRFS_DEV_REPLACE_DEVID, and the two uuids are
	 * exchanged through uuid_tmp.
	 */
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
	tgt_device->devid = src_device->devid;
	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
	/* Carry the size and usage accounting of the source over to the target */
	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
	btrfs_device_set_disk_total_bytes(tgt_device,
					  src_device->disk_total_bytes);
	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
	tgt_device->commit_bytes_used = src_device->bytes_used;

	btrfs_assign_next_active_device(src_device, tgt_device);

	/* Add the target to the allocation list and count it as a rw device */
	list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->rw_devices++;

	up_write(&dev_replace->rwsem);
	/* Remove the source device while new bios are blocked */
	btrfs_rm_dev_replace_blocked(fs_info);

	btrfs_rm_dev_replace_remove_srcdev(src_device);

	btrfs_rm_dev_replace_unblocked(fs_info);

	/*
	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
	 * update on-disk dev stats value during commit transaction
	 */
	atomic_inc(&tgt_device->dev_stats_ccnt);

	/*
	 * this is again a consistent state where no dev_replace procedure
	 * is running, the target device is part of the filesystem, the
	 * source device is not part of the filesystem anymore and its 1st
	 * superblock is scratched out so that it is no longer marked to
	 * belong to this filesystem.
	 */
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);

	/* replace the sysfs entry */
	btrfs_sysfs_remove_device(src_device);
	btrfs_sysfs_update_devid(tgt_device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
		btrfs_scratch_superblocks(fs_info, src_device->bdev,
					  src_device->name->str);

	/*
	 * write back the superblocks; a failure to start or commit this
	 * transaction is not reported to the caller
	 */
	trans = btrfs_start_transaction(root, 0);
	if (!IS_ERR(trans))
		btrfs_commit_transaction(trans);

	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

	btrfs_rm_dev_replace_free_srcdev(src_device);

	return 0;
}
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_ci/*
102262306a36Sopenharmony_ci * Read progress of device replace status according to the state and last
102362306a36Sopenharmony_ci * stored position. The value format is the same as for
102462306a36Sopenharmony_ci * btrfs_dev_replace::progress_1000
102562306a36Sopenharmony_ci */
102662306a36Sopenharmony_cistatic u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
102762306a36Sopenharmony_ci{
102862306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
102962306a36Sopenharmony_ci	u64 ret = 0;
103062306a36Sopenharmony_ci
103162306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
103262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
103362306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
103462306a36Sopenharmony_ci		ret = 0;
103562306a36Sopenharmony_ci		break;
103662306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
103762306a36Sopenharmony_ci		ret = 1000;
103862306a36Sopenharmony_ci		break;
103962306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
104062306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
104162306a36Sopenharmony_ci		ret = div64_u64(dev_replace->cursor_left,
104262306a36Sopenharmony_ci				div_u64(btrfs_device_get_total_bytes(
104362306a36Sopenharmony_ci						dev_replace->srcdev), 1000));
104462306a36Sopenharmony_ci		break;
104562306a36Sopenharmony_ci	}
104662306a36Sopenharmony_ci
104762306a36Sopenharmony_ci	return ret;
104862306a36Sopenharmony_ci}
104962306a36Sopenharmony_ci
105062306a36Sopenharmony_civoid btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
105162306a36Sopenharmony_ci			      struct btrfs_ioctl_dev_replace_args *args)
105262306a36Sopenharmony_ci{
105362306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci	down_read(&dev_replace->rwsem);
105662306a36Sopenharmony_ci	/* even if !dev_replace_is_valid, the values are good enough for
105762306a36Sopenharmony_ci	 * the replace_status ioctl */
105862306a36Sopenharmony_ci	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
105962306a36Sopenharmony_ci	args->status.replace_state = dev_replace->replace_state;
106062306a36Sopenharmony_ci	args->status.time_started = dev_replace->time_started;
106162306a36Sopenharmony_ci	args->status.time_stopped = dev_replace->time_stopped;
106262306a36Sopenharmony_ci	args->status.num_write_errors =
106362306a36Sopenharmony_ci		atomic64_read(&dev_replace->num_write_errors);
106462306a36Sopenharmony_ci	args->status.num_uncorrectable_read_errors =
106562306a36Sopenharmony_ci		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
106662306a36Sopenharmony_ci	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
106762306a36Sopenharmony_ci	up_read(&dev_replace->rwsem);
106862306a36Sopenharmony_ci}
106962306a36Sopenharmony_ci
/*
 * Cancel a device replace.
 *
 * What happens depends on the current replace state:
 * - never started / finished / canceled: nothing to do
 * - started: the scrub is canceled and btrfs_dev_replace_finishing() is left
 *   to do the cleanup
 * - suspended: no scrub is running, so the state is switched to CANCELED, a
 *   transaction is committed and the target device is destroyed here
 *
 * Return: -EROFS on a read-only filesystem;
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED if there was nothing to cancel
 * or btrfs_scrub_cancel() failed; BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR if
 * the replace was canceled; -EINVAL for an unknown state; or the error from
 * btrfs_start_transaction() in the suspended case.
 */
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = fs_info->tree_root;
	int result;
	int ret;

	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	/* Serialize against finishing and unmount, then lock the state */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
	down_write(&dev_replace->rwsem);
	/* Every case below releases dev_replace->rwsem itself */
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
		up_write(&dev_replace->rwsem);
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
		/* The devices are only needed for the log message below */
		tgt_device = dev_replace->tgtdev;
		src_device = dev_replace->srcdev;
		up_write(&dev_replace->rwsem);
		ret = btrfs_scrub_cancel(fs_info);
		if (ret < 0) {
			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
		} else {
			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			/*
			 * btrfs_dev_replace_finishing() will handle the
			 * cleanup part
			 */
			btrfs_info_in_rcu(fs_info,
				"dev_replace from %s (devid %llu) to %s canceled",
				btrfs_dev_name(src_device), src_device->devid,
				btrfs_dev_name(tgt_device));
		}
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		/*
		 * Scrub doing the replace isn't running so we need to do the
		 * cleanup step of btrfs_dev_replace_finishing() here
		 */
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
		tgt_device = dev_replace->tgtdev;
		src_device = dev_replace->srcdev;
		dev_replace->tgtdev = NULL;
		dev_replace->srcdev = NULL;
		dev_replace->replace_state =
				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
		dev_replace->time_stopped = ktime_get_real_seconds();
		dev_replace->item_needs_writeback = 1;

		up_write(&dev_replace->rwsem);

		/* Scrub for replace must not be running in suspended state */
		btrfs_scrub_cancel(fs_info);

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			/*
			 * NOTE(review): the state was already switched to
			 * CANCELED and dev_replace->tgtdev cleared above, but
			 * tgt_device is not destroyed on this early return.
			 */
			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans);
		WARN_ON(ret);

		btrfs_info_in_rcu(fs_info,
		"suspended dev_replace from %s (devid %llu) to %s canceled",
			btrfs_dev_name(src_device), src_device->devid,
			btrfs_dev_name(tgt_device));

		if (tgt_device)
			btrfs_destroy_dev_replace_tgtdev(tgt_device);
		break;
	default:
		up_write(&dev_replace->rwsem);
		result = -EINVAL;
	}

	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
	return result;
}
115562306a36Sopenharmony_ci
115662306a36Sopenharmony_civoid btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
115762306a36Sopenharmony_ci{
115862306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
116162306a36Sopenharmony_ci	down_write(&dev_replace->rwsem);
116262306a36Sopenharmony_ci
116362306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
116462306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
116562306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
116662306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
116762306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
116862306a36Sopenharmony_ci		break;
116962306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
117062306a36Sopenharmony_ci		dev_replace->replace_state =
117162306a36Sopenharmony_ci			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
117262306a36Sopenharmony_ci		dev_replace->time_stopped = ktime_get_real_seconds();
117362306a36Sopenharmony_ci		dev_replace->item_needs_writeback = 1;
117462306a36Sopenharmony_ci		btrfs_info(fs_info, "suspending dev_replace for unmount");
117562306a36Sopenharmony_ci		break;
117662306a36Sopenharmony_ci	}
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci	up_write(&dev_replace->rwsem);
117962306a36Sopenharmony_ci	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
118062306a36Sopenharmony_ci}
118162306a36Sopenharmony_ci
118262306a36Sopenharmony_ci/* resume dev_replace procedure that was interrupted by unmount */
118362306a36Sopenharmony_ciint btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
118462306a36Sopenharmony_ci{
118562306a36Sopenharmony_ci	struct task_struct *task;
118662306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
118762306a36Sopenharmony_ci
118862306a36Sopenharmony_ci	down_write(&dev_replace->rwsem);
118962306a36Sopenharmony_ci
119062306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
119162306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
119262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
119362306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
119462306a36Sopenharmony_ci		up_write(&dev_replace->rwsem);
119562306a36Sopenharmony_ci		return 0;
119662306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
119762306a36Sopenharmony_ci		break;
119862306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
119962306a36Sopenharmony_ci		dev_replace->replace_state =
120062306a36Sopenharmony_ci			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
120162306a36Sopenharmony_ci		break;
120262306a36Sopenharmony_ci	}
120362306a36Sopenharmony_ci	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
120462306a36Sopenharmony_ci		btrfs_info(fs_info,
120562306a36Sopenharmony_ci			   "cannot continue dev_replace, tgtdev is missing");
120662306a36Sopenharmony_ci		btrfs_info(fs_info,
120762306a36Sopenharmony_ci			   "you may cancel the operation after 'mount -o degraded'");
120862306a36Sopenharmony_ci		dev_replace->replace_state =
120962306a36Sopenharmony_ci					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
121062306a36Sopenharmony_ci		up_write(&dev_replace->rwsem);
121162306a36Sopenharmony_ci		return 0;
121262306a36Sopenharmony_ci	}
121362306a36Sopenharmony_ci	up_write(&dev_replace->rwsem);
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci	/*
121662306a36Sopenharmony_ci	 * This could collide with a paused balance, but the exclusive op logic
121762306a36Sopenharmony_ci	 * should never allow both to start and pause. We don't want to allow
121862306a36Sopenharmony_ci	 * dev-replace to start anyway.
121962306a36Sopenharmony_ci	 */
122062306a36Sopenharmony_ci	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
122162306a36Sopenharmony_ci		down_write(&dev_replace->rwsem);
122262306a36Sopenharmony_ci		dev_replace->replace_state =
122362306a36Sopenharmony_ci					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
122462306a36Sopenharmony_ci		up_write(&dev_replace->rwsem);
122562306a36Sopenharmony_ci		btrfs_info(fs_info,
122662306a36Sopenharmony_ci		"cannot resume dev-replace, other exclusive operation running");
122762306a36Sopenharmony_ci		return 0;
122862306a36Sopenharmony_ci	}
122962306a36Sopenharmony_ci
123062306a36Sopenharmony_ci	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
123162306a36Sopenharmony_ci	return PTR_ERR_OR_ZERO(task);
123262306a36Sopenharmony_ci}
123362306a36Sopenharmony_ci
123462306a36Sopenharmony_cistatic int btrfs_dev_replace_kthread(void *data)
123562306a36Sopenharmony_ci{
123662306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = data;
123762306a36Sopenharmony_ci	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
123862306a36Sopenharmony_ci	u64 progress;
123962306a36Sopenharmony_ci	int ret;
124062306a36Sopenharmony_ci
124162306a36Sopenharmony_ci	progress = btrfs_dev_replace_progress(fs_info);
124262306a36Sopenharmony_ci	progress = div_u64(progress, 10);
124362306a36Sopenharmony_ci	btrfs_info_in_rcu(fs_info,
124462306a36Sopenharmony_ci		"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
124562306a36Sopenharmony_ci		btrfs_dev_name(dev_replace->srcdev),
124662306a36Sopenharmony_ci		dev_replace->srcdev->devid,
124762306a36Sopenharmony_ci		btrfs_dev_name(dev_replace->tgtdev),
124862306a36Sopenharmony_ci		(unsigned int)progress);
124962306a36Sopenharmony_ci
125062306a36Sopenharmony_ci	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
125162306a36Sopenharmony_ci			      dev_replace->committed_cursor_left,
125262306a36Sopenharmony_ci			      btrfs_device_get_total_bytes(dev_replace->srcdev),
125362306a36Sopenharmony_ci			      &dev_replace->scrub_progress, 0, 1);
125462306a36Sopenharmony_ci	ret = btrfs_dev_replace_finishing(fs_info, ret);
125562306a36Sopenharmony_ci	WARN_ON(ret && ret != -ECANCELED);
125662306a36Sopenharmony_ci
125762306a36Sopenharmony_ci	btrfs_exclop_finish(fs_info);
125862306a36Sopenharmony_ci	return 0;
125962306a36Sopenharmony_ci}
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ciint __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
126262306a36Sopenharmony_ci{
126362306a36Sopenharmony_ci	if (!dev_replace->is_valid)
126462306a36Sopenharmony_ci		return 0;
126562306a36Sopenharmony_ci
126662306a36Sopenharmony_ci	switch (dev_replace->replace_state) {
126762306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
126862306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
126962306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
127062306a36Sopenharmony_ci		return 0;
127162306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
127262306a36Sopenharmony_ci	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
127362306a36Sopenharmony_ci		/*
127462306a36Sopenharmony_ci		 * return true even if tgtdev is missing (this is
127562306a36Sopenharmony_ci		 * something that can happen if the dev_replace
127662306a36Sopenharmony_ci		 * procedure is suspended by an umount and then
127762306a36Sopenharmony_ci		 * the tgtdev is missing (or "btrfs dev scan") was
127862306a36Sopenharmony_ci		 * not called and the filesystem is remounted
127962306a36Sopenharmony_ci		 * in degraded state. This does not stop the
128062306a36Sopenharmony_ci		 * dev_replace procedure. It needs to be canceled
128162306a36Sopenharmony_ci		 * manually if the cancellation is wanted.
128262306a36Sopenharmony_ci		 */
128362306a36Sopenharmony_ci		break;
128462306a36Sopenharmony_ci	}
128562306a36Sopenharmony_ci	return 1;
128662306a36Sopenharmony_ci}
128762306a36Sopenharmony_ci
128862306a36Sopenharmony_civoid btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
128962306a36Sopenharmony_ci{
129062306a36Sopenharmony_ci	percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
129162306a36Sopenharmony_ci	cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
129262306a36Sopenharmony_ci}
129362306a36Sopenharmony_ci
129462306a36Sopenharmony_civoid btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
129562306a36Sopenharmony_ci{
129662306a36Sopenharmony_ci	while (1) {
129762306a36Sopenharmony_ci		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
129862306a36Sopenharmony_ci		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
129962306a36Sopenharmony_ci				     &fs_info->fs_state)))
130062306a36Sopenharmony_ci			break;
130162306a36Sopenharmony_ci
130262306a36Sopenharmony_ci		btrfs_bio_counter_dec(fs_info);
130362306a36Sopenharmony_ci		wait_event(fs_info->dev_replace.replace_wait,
130462306a36Sopenharmony_ci			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
130562306a36Sopenharmony_ci				     &fs_info->fs_state));
130662306a36Sopenharmony_ci	}
130762306a36Sopenharmony_ci}
1308