162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2007 Oracle.  All rights reserved.
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/sched.h>
762306a36Sopenharmony_ci#include <linux/sched/mm.h>
862306a36Sopenharmony_ci#include <linux/slab.h>
962306a36Sopenharmony_ci#include <linux/ratelimit.h>
1062306a36Sopenharmony_ci#include <linux/kthread.h>
1162306a36Sopenharmony_ci#include <linux/semaphore.h>
1262306a36Sopenharmony_ci#include <linux/uuid.h>
1362306a36Sopenharmony_ci#include <linux/list_sort.h>
1462306a36Sopenharmony_ci#include <linux/namei.h>
1562306a36Sopenharmony_ci#include "misc.h"
1662306a36Sopenharmony_ci#include "ctree.h"
1762306a36Sopenharmony_ci#include "extent_map.h"
1862306a36Sopenharmony_ci#include "disk-io.h"
1962306a36Sopenharmony_ci#include "transaction.h"
2062306a36Sopenharmony_ci#include "print-tree.h"
2162306a36Sopenharmony_ci#include "volumes.h"
2262306a36Sopenharmony_ci#include "raid56.h"
2362306a36Sopenharmony_ci#include "rcu-string.h"
2462306a36Sopenharmony_ci#include "dev-replace.h"
2562306a36Sopenharmony_ci#include "sysfs.h"
2662306a36Sopenharmony_ci#include "tree-checker.h"
2762306a36Sopenharmony_ci#include "space-info.h"
2862306a36Sopenharmony_ci#include "block-group.h"
2962306a36Sopenharmony_ci#include "discard.h"
3062306a36Sopenharmony_ci#include "zoned.h"
3162306a36Sopenharmony_ci#include "fs.h"
3262306a36Sopenharmony_ci#include "accessors.h"
3362306a36Sopenharmony_ci#include "uuid-tree.h"
3462306a36Sopenharmony_ci#include "ioctl.h"
3562306a36Sopenharmony_ci#include "relocation.h"
3662306a36Sopenharmony_ci#include "scrub.h"
3762306a36Sopenharmony_ci#include "super.h"
3862306a36Sopenharmony_ci
/* Union of the RAID0, RAID10 and RAID56 block group profile bits. */
#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID56_MASK | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID0)
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ciconst struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4462306a36Sopenharmony_ci	[BTRFS_RAID_RAID10] = {
4562306a36Sopenharmony_ci		.sub_stripes	= 2,
4662306a36Sopenharmony_ci		.dev_stripes	= 1,
4762306a36Sopenharmony_ci		.devs_max	= 0,	/* 0 == as many as possible */
4862306a36Sopenharmony_ci		.devs_min	= 2,
4962306a36Sopenharmony_ci		.tolerated_failures = 1,
5062306a36Sopenharmony_ci		.devs_increment	= 2,
5162306a36Sopenharmony_ci		.ncopies	= 2,
5262306a36Sopenharmony_ci		.nparity        = 0,
5362306a36Sopenharmony_ci		.raid_name	= "raid10",
5462306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
5562306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
5662306a36Sopenharmony_ci	},
5762306a36Sopenharmony_ci	[BTRFS_RAID_RAID1] = {
5862306a36Sopenharmony_ci		.sub_stripes	= 1,
5962306a36Sopenharmony_ci		.dev_stripes	= 1,
6062306a36Sopenharmony_ci		.devs_max	= 2,
6162306a36Sopenharmony_ci		.devs_min	= 2,
6262306a36Sopenharmony_ci		.tolerated_failures = 1,
6362306a36Sopenharmony_ci		.devs_increment	= 2,
6462306a36Sopenharmony_ci		.ncopies	= 2,
6562306a36Sopenharmony_ci		.nparity        = 0,
6662306a36Sopenharmony_ci		.raid_name	= "raid1",
6762306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
6862306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
6962306a36Sopenharmony_ci	},
7062306a36Sopenharmony_ci	[BTRFS_RAID_RAID1C3] = {
7162306a36Sopenharmony_ci		.sub_stripes	= 1,
7262306a36Sopenharmony_ci		.dev_stripes	= 1,
7362306a36Sopenharmony_ci		.devs_max	= 3,
7462306a36Sopenharmony_ci		.devs_min	= 3,
7562306a36Sopenharmony_ci		.tolerated_failures = 2,
7662306a36Sopenharmony_ci		.devs_increment	= 3,
7762306a36Sopenharmony_ci		.ncopies	= 3,
7862306a36Sopenharmony_ci		.nparity        = 0,
7962306a36Sopenharmony_ci		.raid_name	= "raid1c3",
8062306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
8162306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
8262306a36Sopenharmony_ci	},
8362306a36Sopenharmony_ci	[BTRFS_RAID_RAID1C4] = {
8462306a36Sopenharmony_ci		.sub_stripes	= 1,
8562306a36Sopenharmony_ci		.dev_stripes	= 1,
8662306a36Sopenharmony_ci		.devs_max	= 4,
8762306a36Sopenharmony_ci		.devs_min	= 4,
8862306a36Sopenharmony_ci		.tolerated_failures = 3,
8962306a36Sopenharmony_ci		.devs_increment	= 4,
9062306a36Sopenharmony_ci		.ncopies	= 4,
9162306a36Sopenharmony_ci		.nparity        = 0,
9262306a36Sopenharmony_ci		.raid_name	= "raid1c4",
9362306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
9462306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
9562306a36Sopenharmony_ci	},
9662306a36Sopenharmony_ci	[BTRFS_RAID_DUP] = {
9762306a36Sopenharmony_ci		.sub_stripes	= 1,
9862306a36Sopenharmony_ci		.dev_stripes	= 2,
9962306a36Sopenharmony_ci		.devs_max	= 1,
10062306a36Sopenharmony_ci		.devs_min	= 1,
10162306a36Sopenharmony_ci		.tolerated_failures = 0,
10262306a36Sopenharmony_ci		.devs_increment	= 1,
10362306a36Sopenharmony_ci		.ncopies	= 2,
10462306a36Sopenharmony_ci		.nparity        = 0,
10562306a36Sopenharmony_ci		.raid_name	= "dup",
10662306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
10762306a36Sopenharmony_ci		.mindev_error	= 0,
10862306a36Sopenharmony_ci	},
10962306a36Sopenharmony_ci	[BTRFS_RAID_RAID0] = {
11062306a36Sopenharmony_ci		.sub_stripes	= 1,
11162306a36Sopenharmony_ci		.dev_stripes	= 1,
11262306a36Sopenharmony_ci		.devs_max	= 0,
11362306a36Sopenharmony_ci		.devs_min	= 1,
11462306a36Sopenharmony_ci		.tolerated_failures = 0,
11562306a36Sopenharmony_ci		.devs_increment	= 1,
11662306a36Sopenharmony_ci		.ncopies	= 1,
11762306a36Sopenharmony_ci		.nparity        = 0,
11862306a36Sopenharmony_ci		.raid_name	= "raid0",
11962306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
12062306a36Sopenharmony_ci		.mindev_error	= 0,
12162306a36Sopenharmony_ci	},
12262306a36Sopenharmony_ci	[BTRFS_RAID_SINGLE] = {
12362306a36Sopenharmony_ci		.sub_stripes	= 1,
12462306a36Sopenharmony_ci		.dev_stripes	= 1,
12562306a36Sopenharmony_ci		.devs_max	= 1,
12662306a36Sopenharmony_ci		.devs_min	= 1,
12762306a36Sopenharmony_ci		.tolerated_failures = 0,
12862306a36Sopenharmony_ci		.devs_increment	= 1,
12962306a36Sopenharmony_ci		.ncopies	= 1,
13062306a36Sopenharmony_ci		.nparity        = 0,
13162306a36Sopenharmony_ci		.raid_name	= "single",
13262306a36Sopenharmony_ci		.bg_flag	= 0,
13362306a36Sopenharmony_ci		.mindev_error	= 0,
13462306a36Sopenharmony_ci	},
13562306a36Sopenharmony_ci	[BTRFS_RAID_RAID5] = {
13662306a36Sopenharmony_ci		.sub_stripes	= 1,
13762306a36Sopenharmony_ci		.dev_stripes	= 1,
13862306a36Sopenharmony_ci		.devs_max	= 0,
13962306a36Sopenharmony_ci		.devs_min	= 2,
14062306a36Sopenharmony_ci		.tolerated_failures = 1,
14162306a36Sopenharmony_ci		.devs_increment	= 1,
14262306a36Sopenharmony_ci		.ncopies	= 1,
14362306a36Sopenharmony_ci		.nparity        = 1,
14462306a36Sopenharmony_ci		.raid_name	= "raid5",
14562306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
14662306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
14762306a36Sopenharmony_ci	},
14862306a36Sopenharmony_ci	[BTRFS_RAID_RAID6] = {
14962306a36Sopenharmony_ci		.sub_stripes	= 1,
15062306a36Sopenharmony_ci		.dev_stripes	= 1,
15162306a36Sopenharmony_ci		.devs_max	= 0,
15262306a36Sopenharmony_ci		.devs_min	= 3,
15362306a36Sopenharmony_ci		.tolerated_failures = 2,
15462306a36Sopenharmony_ci		.devs_increment	= 1,
15562306a36Sopenharmony_ci		.ncopies	= 1,
15662306a36Sopenharmony_ci		.nparity        = 2,
15762306a36Sopenharmony_ci		.raid_name	= "raid6",
15862306a36Sopenharmony_ci		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
15962306a36Sopenharmony_ci		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
16062306a36Sopenharmony_ci	},
16162306a36Sopenharmony_ci};
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci/*
16462306a36Sopenharmony_ci * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
16562306a36Sopenharmony_ci * can be used as index to access btrfs_raid_array[].
16662306a36Sopenharmony_ci */
16762306a36Sopenharmony_cienum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
16862306a36Sopenharmony_ci{
16962306a36Sopenharmony_ci	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ci	if (!profile)
17262306a36Sopenharmony_ci		return BTRFS_RAID_SINGLE;
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci	return BTRFS_BG_FLAG_TO_INDEX(profile);
17562306a36Sopenharmony_ci}
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ciconst char *btrfs_bg_type_to_raid_name(u64 flags)
17862306a36Sopenharmony_ci{
17962306a36Sopenharmony_ci	const int index = btrfs_bg_flags_to_raid_index(flags);
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	if (index >= BTRFS_NR_RAID_TYPES)
18262306a36Sopenharmony_ci		return NULL;
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci	return btrfs_raid_array[index].raid_name;
18562306a36Sopenharmony_ci}
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ciint btrfs_nr_parity_stripes(u64 type)
18862306a36Sopenharmony_ci{
18962306a36Sopenharmony_ci	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_ci	return btrfs_raid_array[index].nparity;
19262306a36Sopenharmony_ci}
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci/*
19562306a36Sopenharmony_ci * Fill @buf with textual description of @bg_flags, no more than @size_buf
19662306a36Sopenharmony_ci * bytes including terminating null byte.
19762306a36Sopenharmony_ci */
19862306a36Sopenharmony_civoid btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
19962306a36Sopenharmony_ci{
20062306a36Sopenharmony_ci	int i;
20162306a36Sopenharmony_ci	int ret;
20262306a36Sopenharmony_ci	char *bp = buf;
20362306a36Sopenharmony_ci	u64 flags = bg_flags;
20462306a36Sopenharmony_ci	u32 size_bp = size_buf;
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci	if (!flags) {
20762306a36Sopenharmony_ci		strcpy(bp, "NONE");
20862306a36Sopenharmony_ci		return;
20962306a36Sopenharmony_ci	}
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci#define DESCRIBE_FLAG(flag, desc)						\
21262306a36Sopenharmony_ci	do {								\
21362306a36Sopenharmony_ci		if (flags & (flag)) {					\
21462306a36Sopenharmony_ci			ret = snprintf(bp, size_bp, "%s|", (desc));	\
21562306a36Sopenharmony_ci			if (ret < 0 || ret >= size_bp)			\
21662306a36Sopenharmony_ci				goto out_overflow;			\
21762306a36Sopenharmony_ci			size_bp -= ret;					\
21862306a36Sopenharmony_ci			bp += ret;					\
21962306a36Sopenharmony_ci			flags &= ~(flag);				\
22062306a36Sopenharmony_ci		}							\
22162306a36Sopenharmony_ci	} while (0)
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
22462306a36Sopenharmony_ci	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
22562306a36Sopenharmony_ci	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_ci	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
22862306a36Sopenharmony_ci	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
22962306a36Sopenharmony_ci		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
23062306a36Sopenharmony_ci			      btrfs_raid_array[i].raid_name);
23162306a36Sopenharmony_ci#undef DESCRIBE_FLAG
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	if (flags) {
23462306a36Sopenharmony_ci		ret = snprintf(bp, size_bp, "0x%llx|", flags);
23562306a36Sopenharmony_ci		size_bp -= ret;
23662306a36Sopenharmony_ci	}
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci	if (size_bp < size_buf)
23962306a36Sopenharmony_ci		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_ci	/*
24262306a36Sopenharmony_ci	 * The text is trimmed, it's up to the caller to provide sufficiently
24362306a36Sopenharmony_ci	 * large buffer
24462306a36Sopenharmony_ci	 */
24562306a36Sopenharmony_ciout_overflow:;
24662306a36Sopenharmony_ci}
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_cistatic int init_first_rw_device(struct btrfs_trans_handle *trans);
24962306a36Sopenharmony_cistatic int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
25062306a36Sopenharmony_cistatic void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci/*
25362306a36Sopenharmony_ci * Device locking
25462306a36Sopenharmony_ci * ==============
25562306a36Sopenharmony_ci *
25662306a36Sopenharmony_ci * There are several mutexes that protect manipulation of devices and low-level
25762306a36Sopenharmony_ci * structures like chunks but not block groups, extents or files
25862306a36Sopenharmony_ci *
25962306a36Sopenharmony_ci * uuid_mutex (global lock)
26062306a36Sopenharmony_ci * ------------------------
26162306a36Sopenharmony_ci * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
26262306a36Sopenharmony_ci * the SCAN_DEV ioctl registration or from mount either implicitly (the first
26362306a36Sopenharmony_ci * device) or requested by the device= mount option
26462306a36Sopenharmony_ci *
26562306a36Sopenharmony_ci * the mutex can be very coarse and can cover long-running operations
26662306a36Sopenharmony_ci *
26762306a36Sopenharmony_ci * protects: updates to fs_devices counters like missing devices, rw devices,
26862306a36Sopenharmony_ci * seeding, structure cloning, opening/closing devices at mount/umount time
26962306a36Sopenharmony_ci *
27062306a36Sopenharmony_ci * global::fs_devs - add, remove, updates to the global list
27162306a36Sopenharmony_ci *
27262306a36Sopenharmony_ci * does not protect: manipulation of the fs_devices::devices list in general
27362306a36Sopenharmony_ci * but in mount context it could be used to exclude list modifications by eg.
27462306a36Sopenharmony_ci * scan ioctl
27562306a36Sopenharmony_ci *
27662306a36Sopenharmony_ci * btrfs_device::name - renames (write side), read is RCU
27762306a36Sopenharmony_ci *
27862306a36Sopenharmony_ci * fs_devices::device_list_mutex (per-fs, with RCU)
27962306a36Sopenharmony_ci * ------------------------------------------------
28062306a36Sopenharmony_ci * protects updates to fs_devices::devices, ie. adding and deleting
28162306a36Sopenharmony_ci *
28262306a36Sopenharmony_ci * simple list traversal with read-only actions can be done with RCU protection
28362306a36Sopenharmony_ci *
28462306a36Sopenharmony_ci * may be used to exclude some operations from running concurrently without any
28562306a36Sopenharmony_ci * modifications to the list (see write_all_supers)
28662306a36Sopenharmony_ci *
28762306a36Sopenharmony_ci * Is not required at mount and close times, because our device list is
28862306a36Sopenharmony_ci * protected by the uuid_mutex at that point.
28962306a36Sopenharmony_ci *
29062306a36Sopenharmony_ci * balance_mutex
29162306a36Sopenharmony_ci * -------------
29262306a36Sopenharmony_ci * protects balance structures (status, state) and context accessed from
29362306a36Sopenharmony_ci * several places (internally, ioctl)
29462306a36Sopenharmony_ci *
29562306a36Sopenharmony_ci * chunk_mutex
29662306a36Sopenharmony_ci * -----------
29762306a36Sopenharmony_ci * protects chunks, adding or removing during allocation, trim or when a new
29862306a36Sopenharmony_ci * device is added/removed. Additionally it also protects post_commit_list of
29962306a36Sopenharmony_ci * individual devices, since they can be added to the transaction's
30062306a36Sopenharmony_ci * post_commit_list only with chunk_mutex held.
30162306a36Sopenharmony_ci *
30262306a36Sopenharmony_ci * cleaner_mutex
30362306a36Sopenharmony_ci * -------------
30462306a36Sopenharmony_ci * a big lock that is held by the cleaner thread and prevents running subvolume
30562306a36Sopenharmony_ci * cleaning together with relocation or delayed iputs
30662306a36Sopenharmony_ci *
30762306a36Sopenharmony_ci *
30862306a36Sopenharmony_ci * Lock nesting
30962306a36Sopenharmony_ci * ============
31062306a36Sopenharmony_ci *
31162306a36Sopenharmony_ci * uuid_mutex
31262306a36Sopenharmony_ci *   device_list_mutex
31362306a36Sopenharmony_ci *     chunk_mutex
31462306a36Sopenharmony_ci *   balance_mutex
31562306a36Sopenharmony_ci *
31662306a36Sopenharmony_ci *
31762306a36Sopenharmony_ci * Exclusive operations
31862306a36Sopenharmony_ci * ====================
31962306a36Sopenharmony_ci *
32062306a36Sopenharmony_ci * Maintains the exclusivity of the following operations that apply to the
32162306a36Sopenharmony_ci * whole filesystem and cannot run in parallel.
32262306a36Sopenharmony_ci *
32362306a36Sopenharmony_ci * - Balance (*)
32462306a36Sopenharmony_ci * - Device add
32562306a36Sopenharmony_ci * - Device remove
32662306a36Sopenharmony_ci * - Device replace (*)
32762306a36Sopenharmony_ci * - Resize
32862306a36Sopenharmony_ci *
32962306a36Sopenharmony_ci * The device operations (as above) can be in one of the following states:
33062306a36Sopenharmony_ci *
33162306a36Sopenharmony_ci * - Running state
33262306a36Sopenharmony_ci * - Paused state
33362306a36Sopenharmony_ci * - Completed state
33462306a36Sopenharmony_ci *
33562306a36Sopenharmony_ci * Only device operations marked with (*) can go into the Paused state for the
33662306a36Sopenharmony_ci * following reasons:
33762306a36Sopenharmony_ci *
33862306a36Sopenharmony_ci * - ioctl (only Balance can be Paused through ioctl)
33962306a36Sopenharmony_ci * - filesystem remounted as read-only
34062306a36Sopenharmony_ci * - filesystem unmounted and mounted as read-only
34162306a36Sopenharmony_ci * - system power-cycle and filesystem mounted as read-only
34262306a36Sopenharmony_ci * - filesystem or device errors leading to forced read-only
34362306a36Sopenharmony_ci *
34462306a36Sopenharmony_ci * The status of exclusive operation is set and cleared atomically.
34562306a36Sopenharmony_ci * During the course of Paused state, fs_info::exclusive_operation remains set.
34662306a36Sopenharmony_ci * A device operation in Paused or Running state can be canceled or resumed
34762306a36Sopenharmony_ci * either by ioctl (Balance only) or when remounted as read-write.
34862306a36Sopenharmony_ci * The exclusive status is cleared when the device operation is canceled or
34962306a36Sopenharmony_ci * completed.
35062306a36Sopenharmony_ci */
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_ciDEFINE_MUTEX(uuid_mutex);
35362306a36Sopenharmony_cistatic LIST_HEAD(fs_uuids);
35462306a36Sopenharmony_cistruct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
35562306a36Sopenharmony_ci{
35662306a36Sopenharmony_ci	return &fs_uuids;
35762306a36Sopenharmony_ci}
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci/*
36062306a36Sopenharmony_ci * alloc_fs_devices - allocate struct btrfs_fs_devices
36162306a36Sopenharmony_ci * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
36262306a36Sopenharmony_ci * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
36362306a36Sopenharmony_ci *
36462306a36Sopenharmony_ci * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
36562306a36Sopenharmony_ci * The returned struct is not linked onto any lists and can be destroyed with
36662306a36Sopenharmony_ci * kfree() right away.
36762306a36Sopenharmony_ci */
36862306a36Sopenharmony_cistatic struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
36962306a36Sopenharmony_ci						 const u8 *metadata_fsid)
37062306a36Sopenharmony_ci{
37162306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devs;
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci	ASSERT(fsid || !metadata_fsid);
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
37662306a36Sopenharmony_ci	if (!fs_devs)
37762306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_ci	mutex_init(&fs_devs->device_list_mutex);
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci	INIT_LIST_HEAD(&fs_devs->devices);
38262306a36Sopenharmony_ci	INIT_LIST_HEAD(&fs_devs->alloc_list);
38362306a36Sopenharmony_ci	INIT_LIST_HEAD(&fs_devs->fs_list);
38462306a36Sopenharmony_ci	INIT_LIST_HEAD(&fs_devs->seed_list);
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	if (fsid) {
38762306a36Sopenharmony_ci		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
38862306a36Sopenharmony_ci		memcpy(fs_devs->metadata_uuid,
38962306a36Sopenharmony_ci		       metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
39062306a36Sopenharmony_ci	}
39162306a36Sopenharmony_ci
39262306a36Sopenharmony_ci	return fs_devs;
39362306a36Sopenharmony_ci}
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_cistatic void btrfs_free_device(struct btrfs_device *device)
39662306a36Sopenharmony_ci{
39762306a36Sopenharmony_ci	WARN_ON(!list_empty(&device->post_commit_list));
39862306a36Sopenharmony_ci	rcu_string_free(device->name);
39962306a36Sopenharmony_ci	extent_io_tree_release(&device->alloc_state);
40062306a36Sopenharmony_ci	btrfs_destroy_dev_zone_info(device);
40162306a36Sopenharmony_ci	kfree(device);
40262306a36Sopenharmony_ci}
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_cistatic void free_fs_devices(struct btrfs_fs_devices *fs_devices)
40562306a36Sopenharmony_ci{
40662306a36Sopenharmony_ci	struct btrfs_device *device;
40762306a36Sopenharmony_ci
40862306a36Sopenharmony_ci	WARN_ON(fs_devices->opened);
40962306a36Sopenharmony_ci	while (!list_empty(&fs_devices->devices)) {
41062306a36Sopenharmony_ci		device = list_entry(fs_devices->devices.next,
41162306a36Sopenharmony_ci				    struct btrfs_device, dev_list);
41262306a36Sopenharmony_ci		list_del(&device->dev_list);
41362306a36Sopenharmony_ci		btrfs_free_device(device);
41462306a36Sopenharmony_ci	}
41562306a36Sopenharmony_ci	kfree(fs_devices);
41662306a36Sopenharmony_ci}
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_civoid __exit btrfs_cleanup_fs_uuids(void)
41962306a36Sopenharmony_ci{
42062306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
42162306a36Sopenharmony_ci
42262306a36Sopenharmony_ci	while (!list_empty(&fs_uuids)) {
42362306a36Sopenharmony_ci		fs_devices = list_entry(fs_uuids.next,
42462306a36Sopenharmony_ci					struct btrfs_fs_devices, fs_list);
42562306a36Sopenharmony_ci		list_del(&fs_devices->fs_list);
42662306a36Sopenharmony_ci		free_fs_devices(fs_devices);
42762306a36Sopenharmony_ci	}
42862306a36Sopenharmony_ci}
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_cistatic bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
43162306a36Sopenharmony_ci				  const u8 *fsid, const u8 *metadata_fsid)
43262306a36Sopenharmony_ci{
43362306a36Sopenharmony_ci	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
43462306a36Sopenharmony_ci		return false;
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci	if (!metadata_fsid)
43762306a36Sopenharmony_ci		return true;
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci	if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
44062306a36Sopenharmony_ci		return false;
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_ci	return true;
44362306a36Sopenharmony_ci}
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_cistatic noinline struct btrfs_fs_devices *find_fsid(
44662306a36Sopenharmony_ci		const u8 *fsid, const u8 *metadata_fsid)
44762306a36Sopenharmony_ci{
44862306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
44962306a36Sopenharmony_ci
45062306a36Sopenharmony_ci	ASSERT(fsid);
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	/* Handle non-split brain cases */
45362306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
45462306a36Sopenharmony_ci		if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
45562306a36Sopenharmony_ci			return fs_devices;
45662306a36Sopenharmony_ci	}
45762306a36Sopenharmony_ci	return NULL;
45862306a36Sopenharmony_ci}
45962306a36Sopenharmony_ci
46062306a36Sopenharmony_ci/*
46162306a36Sopenharmony_ci * First check if the metadata_uuid is different from the fsid in the given
46262306a36Sopenharmony_ci * fs_devices. Then check if the given fsid is the same as the metadata_uuid
46362306a36Sopenharmony_ci * in the fs_devices. If it is, return true; otherwise, return false.
46462306a36Sopenharmony_ci */
46562306a36Sopenharmony_cistatic inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
46662306a36Sopenharmony_ci				      const u8 *fsid)
46762306a36Sopenharmony_ci{
46862306a36Sopenharmony_ci	return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
46962306a36Sopenharmony_ci		      BTRFS_FSID_SIZE) != 0 &&
47062306a36Sopenharmony_ci	       memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
47162306a36Sopenharmony_ci}
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
47462306a36Sopenharmony_ci				struct btrfs_super_block *disk_super)
47562306a36Sopenharmony_ci{
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci	/*
48062306a36Sopenharmony_ci	 * Handle scanned device having completed its fsid change but
48162306a36Sopenharmony_ci	 * belonging to a fs_devices that was created by first scanning
48262306a36Sopenharmony_ci	 * a device which didn't have its fsid/metadata_uuid changed
48362306a36Sopenharmony_ci	 * at all and the CHANGING_FSID_V2 flag set.
48462306a36Sopenharmony_ci	 */
48562306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
48662306a36Sopenharmony_ci		if (!fs_devices->fsid_change)
48762306a36Sopenharmony_ci			continue;
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci		if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
49062306a36Sopenharmony_ci					  fs_devices->fsid))
49162306a36Sopenharmony_ci			return fs_devices;
49262306a36Sopenharmony_ci	}
49362306a36Sopenharmony_ci
49462306a36Sopenharmony_ci	/*
49562306a36Sopenharmony_ci	 * Handle scanned device having completed its fsid change but
49662306a36Sopenharmony_ci	 * belonging to a fs_devices that was created by a device that
49762306a36Sopenharmony_ci	 * has an outdated pair of fsid/metadata_uuid and
49862306a36Sopenharmony_ci	 * CHANGING_FSID_V2 flag set.
49962306a36Sopenharmony_ci	 */
50062306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
50162306a36Sopenharmony_ci		if (!fs_devices->fsid_change)
50262306a36Sopenharmony_ci			continue;
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
50562306a36Sopenharmony_ci			return fs_devices;
50662306a36Sopenharmony_ci	}
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
50962306a36Sopenharmony_ci}
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_cistatic int
51362306a36Sopenharmony_cibtrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
51462306a36Sopenharmony_ci		      int flush, struct block_device **bdev,
51562306a36Sopenharmony_ci		      struct btrfs_super_block **disk_super)
51662306a36Sopenharmony_ci{
51762306a36Sopenharmony_ci	int ret;
51862306a36Sopenharmony_ci
51962306a36Sopenharmony_ci	*bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
52062306a36Sopenharmony_ci
52162306a36Sopenharmony_ci	if (IS_ERR(*bdev)) {
52262306a36Sopenharmony_ci		ret = PTR_ERR(*bdev);
52362306a36Sopenharmony_ci		goto error;
52462306a36Sopenharmony_ci	}
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci	if (flush)
52762306a36Sopenharmony_ci		sync_blockdev(*bdev);
52862306a36Sopenharmony_ci	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
52962306a36Sopenharmony_ci	if (ret) {
53062306a36Sopenharmony_ci		blkdev_put(*bdev, holder);
53162306a36Sopenharmony_ci		goto error;
53262306a36Sopenharmony_ci	}
53362306a36Sopenharmony_ci	invalidate_bdev(*bdev);
53462306a36Sopenharmony_ci	*disk_super = btrfs_read_dev_super(*bdev);
53562306a36Sopenharmony_ci	if (IS_ERR(*disk_super)) {
53662306a36Sopenharmony_ci		ret = PTR_ERR(*disk_super);
53762306a36Sopenharmony_ci		blkdev_put(*bdev, holder);
53862306a36Sopenharmony_ci		goto error;
53962306a36Sopenharmony_ci	}
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci	return 0;
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_cierror:
54462306a36Sopenharmony_ci	*bdev = NULL;
54562306a36Sopenharmony_ci	return ret;
54662306a36Sopenharmony_ci}
54762306a36Sopenharmony_ci
54862306a36Sopenharmony_ci/*
54962306a36Sopenharmony_ci *  Search and remove all stale devices (which are not mounted).  When both
55062306a36Sopenharmony_ci *  inputs are NULL, it will search and release all stale devices.
55162306a36Sopenharmony_ci *
55262306a36Sopenharmony_ci *  @devt:         Optional. When provided will it release all unmounted devices
55362306a36Sopenharmony_ci *                 matching this devt only.
55462306a36Sopenharmony_ci *  @skip_device:  Optional. Will skip this device when searching for the stale
55562306a36Sopenharmony_ci *                 devices.
55662306a36Sopenharmony_ci *
55762306a36Sopenharmony_ci *  Return:	0 for success or if @devt is 0.
55862306a36Sopenharmony_ci *		-EBUSY if @devt is a mounted device.
55962306a36Sopenharmony_ci *		-ENOENT if @devt does not match any device in the list.
56062306a36Sopenharmony_ci */
56162306a36Sopenharmony_cistatic int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
56262306a36Sopenharmony_ci{
56362306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
56462306a36Sopenharmony_ci	struct btrfs_device *device, *tmp_device;
56562306a36Sopenharmony_ci	int ret = 0;
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
56862306a36Sopenharmony_ci
56962306a36Sopenharmony_ci	if (devt)
57062306a36Sopenharmony_ci		ret = -ENOENT;
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
57362306a36Sopenharmony_ci
57462306a36Sopenharmony_ci		mutex_lock(&fs_devices->device_list_mutex);
57562306a36Sopenharmony_ci		list_for_each_entry_safe(device, tmp_device,
57662306a36Sopenharmony_ci					 &fs_devices->devices, dev_list) {
57762306a36Sopenharmony_ci			if (skip_device && skip_device == device)
57862306a36Sopenharmony_ci				continue;
57962306a36Sopenharmony_ci			if (devt && devt != device->devt)
58062306a36Sopenharmony_ci				continue;
58162306a36Sopenharmony_ci			if (fs_devices->opened) {
58262306a36Sopenharmony_ci				/* for an already deleted device return 0 */
58362306a36Sopenharmony_ci				if (devt && ret != 0)
58462306a36Sopenharmony_ci					ret = -EBUSY;
58562306a36Sopenharmony_ci				break;
58662306a36Sopenharmony_ci			}
58762306a36Sopenharmony_ci
58862306a36Sopenharmony_ci			/* delete the stale device */
58962306a36Sopenharmony_ci			fs_devices->num_devices--;
59062306a36Sopenharmony_ci			list_del(&device->dev_list);
59162306a36Sopenharmony_ci			btrfs_free_device(device);
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ci			ret = 0;
59462306a36Sopenharmony_ci		}
59562306a36Sopenharmony_ci		mutex_unlock(&fs_devices->device_list_mutex);
59662306a36Sopenharmony_ci
59762306a36Sopenharmony_ci		if (fs_devices->num_devices == 0) {
59862306a36Sopenharmony_ci			btrfs_sysfs_remove_fsid(fs_devices);
59962306a36Sopenharmony_ci			list_del(&fs_devices->fs_list);
60062306a36Sopenharmony_ci			free_fs_devices(fs_devices);
60162306a36Sopenharmony_ci		}
60262306a36Sopenharmony_ci	}
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_ci	return ret;
60562306a36Sopenharmony_ci}
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci/*
60862306a36Sopenharmony_ci * This is only used on mount, and we are protected from competing things
60962306a36Sopenharmony_ci * messing with our fs_devices by the uuid_mutex, thus we do not need the
61062306a36Sopenharmony_ci * fs_devices->device_list_mutex here.
61162306a36Sopenharmony_ci */
61262306a36Sopenharmony_cistatic int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
61362306a36Sopenharmony_ci			struct btrfs_device *device, blk_mode_t flags,
61462306a36Sopenharmony_ci			void *holder)
61562306a36Sopenharmony_ci{
61662306a36Sopenharmony_ci	struct block_device *bdev;
61762306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
61862306a36Sopenharmony_ci	u64 devid;
61962306a36Sopenharmony_ci	int ret;
62062306a36Sopenharmony_ci
62162306a36Sopenharmony_ci	if (device->bdev)
62262306a36Sopenharmony_ci		return -EINVAL;
62362306a36Sopenharmony_ci	if (!device->name)
62462306a36Sopenharmony_ci		return -EINVAL;
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
62762306a36Sopenharmony_ci				    &bdev, &disk_super);
62862306a36Sopenharmony_ci	if (ret)
62962306a36Sopenharmony_ci		return ret;
63062306a36Sopenharmony_ci
63162306a36Sopenharmony_ci	devid = btrfs_stack_device_id(&disk_super->dev_item);
63262306a36Sopenharmony_ci	if (devid != device->devid)
63362306a36Sopenharmony_ci		goto error_free_page;
63462306a36Sopenharmony_ci
63562306a36Sopenharmony_ci	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
63662306a36Sopenharmony_ci		goto error_free_page;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	device->generation = btrfs_super_generation(disk_super);
63962306a36Sopenharmony_ci
64062306a36Sopenharmony_ci	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
64162306a36Sopenharmony_ci		if (btrfs_super_incompat_flags(disk_super) &
64262306a36Sopenharmony_ci		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
64362306a36Sopenharmony_ci			pr_err(
64462306a36Sopenharmony_ci		"BTRFS: Invalid seeding and uuid-changed device detected\n");
64562306a36Sopenharmony_ci			goto error_free_page;
64662306a36Sopenharmony_ci		}
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
64962306a36Sopenharmony_ci		fs_devices->seeding = true;
65062306a36Sopenharmony_ci	} else {
65162306a36Sopenharmony_ci		if (bdev_read_only(bdev))
65262306a36Sopenharmony_ci			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
65362306a36Sopenharmony_ci		else
65462306a36Sopenharmony_ci			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
65562306a36Sopenharmony_ci	}
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci	if (!bdev_nonrot(bdev))
65862306a36Sopenharmony_ci		fs_devices->rotating = true;
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_ci	if (bdev_max_discard_sectors(bdev))
66162306a36Sopenharmony_ci		fs_devices->discardable = true;
66262306a36Sopenharmony_ci
66362306a36Sopenharmony_ci	device->bdev = bdev;
66462306a36Sopenharmony_ci	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
66562306a36Sopenharmony_ci	device->holder = holder;
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci	fs_devices->open_devices++;
66862306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
66962306a36Sopenharmony_ci	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
67062306a36Sopenharmony_ci		fs_devices->rw_devices++;
67162306a36Sopenharmony_ci		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
67262306a36Sopenharmony_ci	}
67362306a36Sopenharmony_ci	btrfs_release_disk_super(disk_super);
67462306a36Sopenharmony_ci
67562306a36Sopenharmony_ci	return 0;
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_cierror_free_page:
67862306a36Sopenharmony_ci	btrfs_release_disk_super(disk_super);
67962306a36Sopenharmony_ci	blkdev_put(bdev, holder);
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci	return -EINVAL;
68262306a36Sopenharmony_ci}
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_ciu8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
68562306a36Sopenharmony_ci{
68662306a36Sopenharmony_ci	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
68762306a36Sopenharmony_ci				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
69062306a36Sopenharmony_ci}
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci/*
69362306a36Sopenharmony_ci * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
69462306a36Sopenharmony_ci * being created with a disk that has already completed its fsid change. Such
69562306a36Sopenharmony_ci * disk can belong to an fs which has its FSID changed or to one which doesn't.
69662306a36Sopenharmony_ci * Handle both cases here.
69762306a36Sopenharmony_ci */
69862306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_inprogress(
69962306a36Sopenharmony_ci					struct btrfs_super_block *disk_super)
70062306a36Sopenharmony_ci{
70162306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
70462306a36Sopenharmony_ci		if (fs_devices->fsid_change)
70562306a36Sopenharmony_ci			continue;
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_ci		if (check_fsid_changed(fs_devices,  disk_super->fsid))
70862306a36Sopenharmony_ci			return fs_devices;
70962306a36Sopenharmony_ci	}
71062306a36Sopenharmony_ci
71162306a36Sopenharmony_ci	return find_fsid(disk_super->fsid, NULL);
71262306a36Sopenharmony_ci}
71362306a36Sopenharmony_ci
71462306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_changed(
71562306a36Sopenharmony_ci					struct btrfs_super_block *disk_super)
71662306a36Sopenharmony_ci{
71762306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci	/*
72062306a36Sopenharmony_ci	 * Handles the case where scanned device is part of an fs that had
72162306a36Sopenharmony_ci	 * multiple successful changes of FSID but currently device didn't
72262306a36Sopenharmony_ci	 * observe it. Meaning our fsid will be different than theirs. We need
72362306a36Sopenharmony_ci	 * to handle two subcases :
72462306a36Sopenharmony_ci	 *  1 - The fs still continues to have different METADATA/FSID uuids.
72562306a36Sopenharmony_ci	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
72662306a36Sopenharmony_ci	 *  are equal).
72762306a36Sopenharmony_ci	 */
72862306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
72962306a36Sopenharmony_ci		/* Changed UUIDs */
73062306a36Sopenharmony_ci		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
73162306a36Sopenharmony_ci		    memcmp(fs_devices->fsid, disk_super->fsid,
73262306a36Sopenharmony_ci			   BTRFS_FSID_SIZE) != 0)
73362306a36Sopenharmony_ci			return fs_devices;
73462306a36Sopenharmony_ci
73562306a36Sopenharmony_ci		/* Unchanged UUIDs */
73662306a36Sopenharmony_ci		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
73762306a36Sopenharmony_ci			   BTRFS_FSID_SIZE) == 0 &&
73862306a36Sopenharmony_ci		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
73962306a36Sopenharmony_ci			   BTRFS_FSID_SIZE) == 0)
74062306a36Sopenharmony_ci			return fs_devices;
74162306a36Sopenharmony_ci	}
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	return NULL;
74462306a36Sopenharmony_ci}
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_cistatic struct btrfs_fs_devices *find_fsid_reverted_metadata(
74762306a36Sopenharmony_ci				struct btrfs_super_block *disk_super)
74862306a36Sopenharmony_ci{
74962306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	/*
75262306a36Sopenharmony_ci	 * Handle the case where the scanned device is part of an fs whose last
75362306a36Sopenharmony_ci	 * metadata UUID change reverted it to the original FSID. At the same
75462306a36Sopenharmony_ci	 * time fs_devices was first created by another constituent device
75562306a36Sopenharmony_ci	 * which didn't fully observe the operation. This results in an
75662306a36Sopenharmony_ci	 * btrfs_fs_devices created with metadata/fsid different AND
75762306a36Sopenharmony_ci	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
75862306a36Sopenharmony_ci	 * fs_devices equal to the FSID of the disk.
75962306a36Sopenharmony_ci	 */
76062306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
76162306a36Sopenharmony_ci		if (!fs_devices->fsid_change)
76262306a36Sopenharmony_ci			continue;
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_ci		if (check_fsid_changed(fs_devices, disk_super->fsid))
76562306a36Sopenharmony_ci			return fs_devices;
76662306a36Sopenharmony_ci	}
76762306a36Sopenharmony_ci
76862306a36Sopenharmony_ci	return NULL;
76962306a36Sopenharmony_ci}
/*
 * Add new device to list of registered devices
 *
 * @path:             path of the scanned block device
 * @disk_super:       super block that was read from that device
 * @new_device_added: set to true when a new btrfs_device is allocated and
 *                    linked into the fs_devices; not written otherwise
 *
 * Finds the btrfs_fs_devices matching @disk_super (allocating and
 * registering a new one in fs_uuids when none matches), then finds the
 * btrfs_device for the devid/uuid of the super block's dev item (allocating
 * one when none exists) or updates the path of an existing one.
 *
 * fs_devices->device_list_mutex is taken once the fs_devices is known and is
 * dropped again on every exit path below.
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	/* Resolve @path to a dev_t; it is compared with and stored in devt. */
	error = lookup_bdev(path, &path_devt);
	if (error) {
		btrfs_err(NULL, "failed to lookup block device for path %s: %d",
			  path, error);
		return ERR_PTR(error);
	}

	/*
	 * Choose the lookup helper from the two super block bits read above:
	 * CHANGING_FSID_V2 (fsid change in progress) and the METADATA_UUID
	 * incompat feature.
	 */
	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		/* Nothing matched: create a new fs_devices for this fsid. */
		fs_devices = alloc_fs_devices(disk_super->fsid,
				has_metadata_uuid ? disk_super->metadata_uuid : NULL);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		/* A brand new fs_devices cannot contain the device yet. */
		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
			       btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		/* This devid/uuid is not registered in the fs_devices yet. */
		unsigned int nofs_flag;

		/* Refuse to add a new device while the fs_devices is opened. */
		if (fs_devices->opened) {
			btrfs_err(NULL,
"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
				  path, fs_devices->fsid, current->comm,
				  task_pid_nr(current));
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		/* The allocation runs inside a memalloc_nofs scope. */
		nofs_flag = memalloc_nofs_save();
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid, path);
		memalloc_nofs_restore(nofs_flag);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		/* Identify the fs by label when it has one, else by fsid. */
		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
				  path, found_transid, device->generation);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			/* A different dev_t means this is another device. */
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, btrfs_dev_name(device),
					  path, current->comm,
					  task_pid_nr(current));
		}

		/* Swap in a copy of the new path as the device name. */
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		/* The device has been found, so it is no longer missing. */
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	/* Take the expected device count from the scanned super block. */
	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
99062306a36Sopenharmony_ci
99162306a36Sopenharmony_cistatic struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
99262306a36Sopenharmony_ci{
99362306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
99462306a36Sopenharmony_ci	struct btrfs_device *device;
99562306a36Sopenharmony_ci	struct btrfs_device *orig_dev;
99662306a36Sopenharmony_ci	int ret = 0;
99762306a36Sopenharmony_ci
99862306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_ci	fs_devices = alloc_fs_devices(orig->fsid, NULL);
100162306a36Sopenharmony_ci	if (IS_ERR(fs_devices))
100262306a36Sopenharmony_ci		return fs_devices;
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci	fs_devices->total_devices = orig->total_devices;
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
100762306a36Sopenharmony_ci		const char *dev_path = NULL;
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci		/*
101062306a36Sopenharmony_ci		 * This is ok to do without RCU read locked because we hold the
101162306a36Sopenharmony_ci		 * uuid mutex so nothing we touch in here is going to disappear.
101262306a36Sopenharmony_ci		 */
101362306a36Sopenharmony_ci		if (orig_dev->name)
101462306a36Sopenharmony_ci			dev_path = orig_dev->name->str;
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci		device = btrfs_alloc_device(NULL, &orig_dev->devid,
101762306a36Sopenharmony_ci					    orig_dev->uuid, dev_path);
101862306a36Sopenharmony_ci		if (IS_ERR(device)) {
101962306a36Sopenharmony_ci			ret = PTR_ERR(device);
102062306a36Sopenharmony_ci			goto error;
102162306a36Sopenharmony_ci		}
102262306a36Sopenharmony_ci
102362306a36Sopenharmony_ci		if (orig_dev->zone_info) {
102462306a36Sopenharmony_ci			struct btrfs_zoned_device_info *zone_info;
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_ci			zone_info = btrfs_clone_dev_zone_info(orig_dev);
102762306a36Sopenharmony_ci			if (!zone_info) {
102862306a36Sopenharmony_ci				btrfs_free_device(device);
102962306a36Sopenharmony_ci				ret = -ENOMEM;
103062306a36Sopenharmony_ci				goto error;
103162306a36Sopenharmony_ci			}
103262306a36Sopenharmony_ci			device->zone_info = zone_info;
103362306a36Sopenharmony_ci		}
103462306a36Sopenharmony_ci
103562306a36Sopenharmony_ci		list_add(&device->dev_list, &fs_devices->devices);
103662306a36Sopenharmony_ci		device->fs_devices = fs_devices;
103762306a36Sopenharmony_ci		fs_devices->num_devices++;
103862306a36Sopenharmony_ci	}
103962306a36Sopenharmony_ci	return fs_devices;
104062306a36Sopenharmony_cierror:
104162306a36Sopenharmony_ci	free_fs_devices(fs_devices);
104262306a36Sopenharmony_ci	return ERR_PTR(ret);
104362306a36Sopenharmony_ci}
104462306a36Sopenharmony_ci
104562306a36Sopenharmony_cistatic void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
104662306a36Sopenharmony_ci				      struct btrfs_device **latest_dev)
104762306a36Sopenharmony_ci{
104862306a36Sopenharmony_ci	struct btrfs_device *device, *next;
104962306a36Sopenharmony_ci
105062306a36Sopenharmony_ci	/* This is the initialized path, it is safe to release the devices. */
105162306a36Sopenharmony_ci	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
105262306a36Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
105362306a36Sopenharmony_ci			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
105462306a36Sopenharmony_ci				      &device->dev_state) &&
105562306a36Sopenharmony_ci			    !test_bit(BTRFS_DEV_STATE_MISSING,
105662306a36Sopenharmony_ci				      &device->dev_state) &&
105762306a36Sopenharmony_ci			    (!*latest_dev ||
105862306a36Sopenharmony_ci			     device->generation > (*latest_dev)->generation)) {
105962306a36Sopenharmony_ci				*latest_dev = device;
106062306a36Sopenharmony_ci			}
106162306a36Sopenharmony_ci			continue;
106262306a36Sopenharmony_ci		}
106362306a36Sopenharmony_ci
106462306a36Sopenharmony_ci		/*
106562306a36Sopenharmony_ci		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
106662306a36Sopenharmony_ci		 * in btrfs_init_dev_replace() so just continue.
106762306a36Sopenharmony_ci		 */
106862306a36Sopenharmony_ci		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
106962306a36Sopenharmony_ci			continue;
107062306a36Sopenharmony_ci
107162306a36Sopenharmony_ci		if (device->bdev) {
107262306a36Sopenharmony_ci			blkdev_put(device->bdev, device->holder);
107362306a36Sopenharmony_ci			device->bdev = NULL;
107462306a36Sopenharmony_ci			fs_devices->open_devices--;
107562306a36Sopenharmony_ci		}
107662306a36Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
107762306a36Sopenharmony_ci			list_del_init(&device->dev_alloc_list);
107862306a36Sopenharmony_ci			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
107962306a36Sopenharmony_ci			fs_devices->rw_devices--;
108062306a36Sopenharmony_ci		}
108162306a36Sopenharmony_ci		list_del_init(&device->dev_list);
108262306a36Sopenharmony_ci		fs_devices->num_devices--;
108362306a36Sopenharmony_ci		btrfs_free_device(device);
108462306a36Sopenharmony_ci	}
108562306a36Sopenharmony_ci
108662306a36Sopenharmony_ci}
108762306a36Sopenharmony_ci
108862306a36Sopenharmony_ci/*
108962306a36Sopenharmony_ci * After we have read the system tree and know devids belonging to this
109062306a36Sopenharmony_ci * filesystem, remove the device which does not belong there.
109162306a36Sopenharmony_ci */
109262306a36Sopenharmony_civoid btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
109362306a36Sopenharmony_ci{
109462306a36Sopenharmony_ci	struct btrfs_device *latest_dev = NULL;
109562306a36Sopenharmony_ci	struct btrfs_fs_devices *seed_dev;
109662306a36Sopenharmony_ci
109762306a36Sopenharmony_ci	mutex_lock(&uuid_mutex);
109862306a36Sopenharmony_ci	__btrfs_free_extra_devids(fs_devices, &latest_dev);
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_ci	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
110162306a36Sopenharmony_ci		__btrfs_free_extra_devids(seed_dev, &latest_dev);
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_ci	fs_devices->latest_dev = latest_dev;
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci	mutex_unlock(&uuid_mutex);
110662306a36Sopenharmony_ci}
110762306a36Sopenharmony_ci
110862306a36Sopenharmony_cistatic void btrfs_close_bdev(struct btrfs_device *device)
110962306a36Sopenharmony_ci{
111062306a36Sopenharmony_ci	if (!device->bdev)
111162306a36Sopenharmony_ci		return;
111262306a36Sopenharmony_ci
111362306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
111462306a36Sopenharmony_ci		sync_blockdev(device->bdev);
111562306a36Sopenharmony_ci		invalidate_bdev(device->bdev);
111662306a36Sopenharmony_ci	}
111762306a36Sopenharmony_ci
111862306a36Sopenharmony_ci	blkdev_put(device->bdev, device->holder);
111962306a36Sopenharmony_ci}
112062306a36Sopenharmony_ci
112162306a36Sopenharmony_cistatic void btrfs_close_one_device(struct btrfs_device *device)
112262306a36Sopenharmony_ci{
112362306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = device->fs_devices;
112462306a36Sopenharmony_ci
112562306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
112662306a36Sopenharmony_ci	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
112762306a36Sopenharmony_ci		list_del_init(&device->dev_alloc_list);
112862306a36Sopenharmony_ci		fs_devices->rw_devices--;
112962306a36Sopenharmony_ci	}
113062306a36Sopenharmony_ci
113162306a36Sopenharmony_ci	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
113262306a36Sopenharmony_ci		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
113362306a36Sopenharmony_ci
113462306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
113562306a36Sopenharmony_ci		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
113662306a36Sopenharmony_ci		fs_devices->missing_devices--;
113762306a36Sopenharmony_ci	}
113862306a36Sopenharmony_ci
113962306a36Sopenharmony_ci	btrfs_close_bdev(device);
114062306a36Sopenharmony_ci	if (device->bdev) {
114162306a36Sopenharmony_ci		fs_devices->open_devices--;
114262306a36Sopenharmony_ci		device->bdev = NULL;
114362306a36Sopenharmony_ci	}
114462306a36Sopenharmony_ci	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
114562306a36Sopenharmony_ci	btrfs_destroy_dev_zone_info(device);
114662306a36Sopenharmony_ci
114762306a36Sopenharmony_ci	device->fs_info = NULL;
114862306a36Sopenharmony_ci	atomic_set(&device->dev_stats_ccnt, 0);
114962306a36Sopenharmony_ci	extent_io_tree_release(&device->alloc_state);
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci	/*
115262306a36Sopenharmony_ci	 * Reset the flush error record. We might have a transient flush error
115362306a36Sopenharmony_ci	 * in this mount, and if so we aborted the current transaction and set
115462306a36Sopenharmony_ci	 * the fs to an error state, guaranteeing no super blocks can be further
115562306a36Sopenharmony_ci	 * committed. However that error might be transient and if we unmount the
115662306a36Sopenharmony_ci	 * filesystem and mount it again, we should allow the mount to succeed
115762306a36Sopenharmony_ci	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
115862306a36Sopenharmony_ci	 * filesystem again we still get flush errors, then we will again abort
115962306a36Sopenharmony_ci	 * any transaction and set the error state, guaranteeing no commits of
116062306a36Sopenharmony_ci	 * unsafe super blocks.
116162306a36Sopenharmony_ci	 */
116262306a36Sopenharmony_ci	device->last_flush_error = 0;
116362306a36Sopenharmony_ci
116462306a36Sopenharmony_ci	/* Verify the device is back in a pristine state  */
116562306a36Sopenharmony_ci	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
116662306a36Sopenharmony_ci	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
116762306a36Sopenharmony_ci	WARN_ON(!list_empty(&device->dev_alloc_list));
116862306a36Sopenharmony_ci	WARN_ON(!list_empty(&device->post_commit_list));
116962306a36Sopenharmony_ci}
117062306a36Sopenharmony_ci
117162306a36Sopenharmony_cistatic void close_fs_devices(struct btrfs_fs_devices *fs_devices)
117262306a36Sopenharmony_ci{
117362306a36Sopenharmony_ci	struct btrfs_device *device, *tmp;
117462306a36Sopenharmony_ci
117562306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
117662306a36Sopenharmony_ci
117762306a36Sopenharmony_ci	if (--fs_devices->opened > 0)
117862306a36Sopenharmony_ci		return;
117962306a36Sopenharmony_ci
118062306a36Sopenharmony_ci	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
118162306a36Sopenharmony_ci		btrfs_close_one_device(device);
118262306a36Sopenharmony_ci
118362306a36Sopenharmony_ci	WARN_ON(fs_devices->open_devices);
118462306a36Sopenharmony_ci	WARN_ON(fs_devices->rw_devices);
118562306a36Sopenharmony_ci	fs_devices->opened = 0;
118662306a36Sopenharmony_ci	fs_devices->seeding = false;
118762306a36Sopenharmony_ci	fs_devices->fs_info = NULL;
118862306a36Sopenharmony_ci}
118962306a36Sopenharmony_ci
119062306a36Sopenharmony_civoid btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
119162306a36Sopenharmony_ci{
119262306a36Sopenharmony_ci	LIST_HEAD(list);
119362306a36Sopenharmony_ci	struct btrfs_fs_devices *tmp;
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ci	mutex_lock(&uuid_mutex);
119662306a36Sopenharmony_ci	close_fs_devices(fs_devices);
119762306a36Sopenharmony_ci	if (!fs_devices->opened) {
119862306a36Sopenharmony_ci		list_splice_init(&fs_devices->seed_list, &list);
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci		/*
120162306a36Sopenharmony_ci		 * If the struct btrfs_fs_devices is not assembled with any
120262306a36Sopenharmony_ci		 * other device, it can be re-initialized during the next mount
120362306a36Sopenharmony_ci		 * without the needing device-scan step. Therefore, it can be
120462306a36Sopenharmony_ci		 * fully freed.
120562306a36Sopenharmony_ci		 */
120662306a36Sopenharmony_ci		if (fs_devices->num_devices == 1) {
120762306a36Sopenharmony_ci			list_del(&fs_devices->fs_list);
120862306a36Sopenharmony_ci			free_fs_devices(fs_devices);
120962306a36Sopenharmony_ci		}
121062306a36Sopenharmony_ci	}
121162306a36Sopenharmony_ci
121262306a36Sopenharmony_ci
121362306a36Sopenharmony_ci	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
121462306a36Sopenharmony_ci		close_fs_devices(fs_devices);
121562306a36Sopenharmony_ci		list_del(&fs_devices->seed_list);
121662306a36Sopenharmony_ci		free_fs_devices(fs_devices);
121762306a36Sopenharmony_ci	}
121862306a36Sopenharmony_ci	mutex_unlock(&uuid_mutex);
121962306a36Sopenharmony_ci}
122062306a36Sopenharmony_ci
122162306a36Sopenharmony_cistatic int open_fs_devices(struct btrfs_fs_devices *fs_devices,
122262306a36Sopenharmony_ci				blk_mode_t flags, void *holder)
122362306a36Sopenharmony_ci{
122462306a36Sopenharmony_ci	struct btrfs_device *device;
122562306a36Sopenharmony_ci	struct btrfs_device *latest_dev = NULL;
122662306a36Sopenharmony_ci	struct btrfs_device *tmp_device;
122762306a36Sopenharmony_ci
122862306a36Sopenharmony_ci	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
122962306a36Sopenharmony_ci				 dev_list) {
123062306a36Sopenharmony_ci		int ret;
123162306a36Sopenharmony_ci
123262306a36Sopenharmony_ci		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
123362306a36Sopenharmony_ci		if (ret == 0 &&
123462306a36Sopenharmony_ci		    (!latest_dev || device->generation > latest_dev->generation)) {
123562306a36Sopenharmony_ci			latest_dev = device;
123662306a36Sopenharmony_ci		} else if (ret == -ENODATA) {
123762306a36Sopenharmony_ci			fs_devices->num_devices--;
123862306a36Sopenharmony_ci			list_del(&device->dev_list);
123962306a36Sopenharmony_ci			btrfs_free_device(device);
124062306a36Sopenharmony_ci		}
124162306a36Sopenharmony_ci	}
124262306a36Sopenharmony_ci	if (fs_devices->open_devices == 0)
124362306a36Sopenharmony_ci		return -EINVAL;
124462306a36Sopenharmony_ci
124562306a36Sopenharmony_ci	fs_devices->opened = 1;
124662306a36Sopenharmony_ci	fs_devices->latest_dev = latest_dev;
124762306a36Sopenharmony_ci	fs_devices->total_rw_bytes = 0;
124862306a36Sopenharmony_ci	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
124962306a36Sopenharmony_ci	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
125062306a36Sopenharmony_ci
125162306a36Sopenharmony_ci	return 0;
125262306a36Sopenharmony_ci}
125362306a36Sopenharmony_ci
125462306a36Sopenharmony_cistatic int devid_cmp(void *priv, const struct list_head *a,
125562306a36Sopenharmony_ci		     const struct list_head *b)
125662306a36Sopenharmony_ci{
125762306a36Sopenharmony_ci	const struct btrfs_device *dev1, *dev2;
125862306a36Sopenharmony_ci
125962306a36Sopenharmony_ci	dev1 = list_entry(a, struct btrfs_device, dev_list);
126062306a36Sopenharmony_ci	dev2 = list_entry(b, struct btrfs_device, dev_list);
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci	if (dev1->devid < dev2->devid)
126362306a36Sopenharmony_ci		return -1;
126462306a36Sopenharmony_ci	else if (dev1->devid > dev2->devid)
126562306a36Sopenharmony_ci		return 1;
126662306a36Sopenharmony_ci	return 0;
126762306a36Sopenharmony_ci}
126862306a36Sopenharmony_ci
126962306a36Sopenharmony_ciint btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
127062306a36Sopenharmony_ci		       blk_mode_t flags, void *holder)
127162306a36Sopenharmony_ci{
127262306a36Sopenharmony_ci	int ret;
127362306a36Sopenharmony_ci
127462306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
127562306a36Sopenharmony_ci	/*
127662306a36Sopenharmony_ci	 * The device_list_mutex cannot be taken here in case opening the
127762306a36Sopenharmony_ci	 * underlying device takes further locks like open_mutex.
127862306a36Sopenharmony_ci	 *
127962306a36Sopenharmony_ci	 * We also don't need the lock here as this is called during mount and
128062306a36Sopenharmony_ci	 * exclusion is provided by uuid_mutex
128162306a36Sopenharmony_ci	 */
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci	if (fs_devices->opened) {
128462306a36Sopenharmony_ci		fs_devices->opened++;
128562306a36Sopenharmony_ci		ret = 0;
128662306a36Sopenharmony_ci	} else {
128762306a36Sopenharmony_ci		list_sort(NULL, &fs_devices->devices, devid_cmp);
128862306a36Sopenharmony_ci		ret = open_fs_devices(fs_devices, flags, holder);
128962306a36Sopenharmony_ci	}
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci	return ret;
129262306a36Sopenharmony_ci}
129362306a36Sopenharmony_ci
/*
 * Drop the reference on the page that holds @super.
 *
 * @super:	address inside the page to release
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
130062306a36Sopenharmony_ci
130162306a36Sopenharmony_cistatic struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
130262306a36Sopenharmony_ci						       u64 bytenr, u64 bytenr_orig)
130362306a36Sopenharmony_ci{
130462306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
130562306a36Sopenharmony_ci	struct page *page;
130662306a36Sopenharmony_ci	void *p;
130762306a36Sopenharmony_ci	pgoff_t index;
130862306a36Sopenharmony_ci
130962306a36Sopenharmony_ci	/* make sure our super fits in the device */
131062306a36Sopenharmony_ci	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
131162306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
131262306a36Sopenharmony_ci
131362306a36Sopenharmony_ci	/* make sure our super fits in the page */
131462306a36Sopenharmony_ci	if (sizeof(*disk_super) > PAGE_SIZE)
131562306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci	/* make sure our super doesn't straddle pages on disk */
131862306a36Sopenharmony_ci	index = bytenr >> PAGE_SHIFT;
131962306a36Sopenharmony_ci	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
132062306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
132162306a36Sopenharmony_ci
132262306a36Sopenharmony_ci	/* pull in the page with our super */
132362306a36Sopenharmony_ci	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
132462306a36Sopenharmony_ci
132562306a36Sopenharmony_ci	if (IS_ERR(page))
132662306a36Sopenharmony_ci		return ERR_CAST(page);
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	p = page_address(page);
132962306a36Sopenharmony_ci
133062306a36Sopenharmony_ci	/* align our pointer to the offset of the super block */
133162306a36Sopenharmony_ci	disk_super = p + offset_in_page(bytenr);
133262306a36Sopenharmony_ci
133362306a36Sopenharmony_ci	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
133462306a36Sopenharmony_ci	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
133562306a36Sopenharmony_ci		btrfs_release_disk_super(p);
133662306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
133762306a36Sopenharmony_ci	}
133862306a36Sopenharmony_ci
133962306a36Sopenharmony_ci	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
134062306a36Sopenharmony_ci		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
134162306a36Sopenharmony_ci
134262306a36Sopenharmony_ci	return disk_super;
134362306a36Sopenharmony_ci}
134462306a36Sopenharmony_ci
134562306a36Sopenharmony_ciint btrfs_forget_devices(dev_t devt)
134662306a36Sopenharmony_ci{
134762306a36Sopenharmony_ci	int ret;
134862306a36Sopenharmony_ci
134962306a36Sopenharmony_ci	mutex_lock(&uuid_mutex);
135062306a36Sopenharmony_ci	ret = btrfs_free_stale_devices(devt, NULL);
135162306a36Sopenharmony_ci	mutex_unlock(&uuid_mutex);
135262306a36Sopenharmony_ci
135362306a36Sopenharmony_ci	return ret;
135462306a36Sopenharmony_ci}
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_ci/*
135762306a36Sopenharmony_ci * Look for a btrfs signature on a device. This may be called out of the mount path
135862306a36Sopenharmony_ci * and we are not allowed to call set_blocksize during the scan. The superblock
135962306a36Sopenharmony_ci * is read via pagecache
136062306a36Sopenharmony_ci */
136162306a36Sopenharmony_cistruct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
136262306a36Sopenharmony_ci{
136362306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
136462306a36Sopenharmony_ci	bool new_device_added = false;
136562306a36Sopenharmony_ci	struct btrfs_device *device = NULL;
136662306a36Sopenharmony_ci	struct block_device *bdev;
136762306a36Sopenharmony_ci	u64 bytenr, bytenr_orig;
136862306a36Sopenharmony_ci	int ret;
136962306a36Sopenharmony_ci
137062306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
137162306a36Sopenharmony_ci
137262306a36Sopenharmony_ci	/*
137362306a36Sopenharmony_ci	 * we would like to check all the supers, but that would make
137462306a36Sopenharmony_ci	 * a btrfs mount succeed after a mkfs from a different FS.
137562306a36Sopenharmony_ci	 * So, we need to add a special mount option to scan for
137662306a36Sopenharmony_ci	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
137762306a36Sopenharmony_ci	 */
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci	/*
138062306a36Sopenharmony_ci	 * Avoid an exclusive open here, as the systemd-udev may initiate the
138162306a36Sopenharmony_ci	 * device scan which may race with the user's mount or mkfs command,
138262306a36Sopenharmony_ci	 * resulting in failure.
138362306a36Sopenharmony_ci	 * Since the device scan is solely for reading purposes, there is no
138462306a36Sopenharmony_ci	 * need for an exclusive open. Additionally, the devices are read again
138562306a36Sopenharmony_ci	 * during the mount process. It is ok to get some inconsistent
138662306a36Sopenharmony_ci	 * values temporarily, as the device paths of the fsid are the only
138762306a36Sopenharmony_ci	 * required information for assembling the volume.
138862306a36Sopenharmony_ci	 */
138962306a36Sopenharmony_ci	bdev = blkdev_get_by_path(path, flags, NULL, NULL);
139062306a36Sopenharmony_ci	if (IS_ERR(bdev))
139162306a36Sopenharmony_ci		return ERR_CAST(bdev);
139262306a36Sopenharmony_ci
139362306a36Sopenharmony_ci	bytenr_orig = btrfs_sb_offset(0);
139462306a36Sopenharmony_ci	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
139562306a36Sopenharmony_ci	if (ret) {
139662306a36Sopenharmony_ci		device = ERR_PTR(ret);
139762306a36Sopenharmony_ci		goto error_bdev_put;
139862306a36Sopenharmony_ci	}
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
140162306a36Sopenharmony_ci	if (IS_ERR(disk_super)) {
140262306a36Sopenharmony_ci		device = ERR_CAST(disk_super);
140362306a36Sopenharmony_ci		goto error_bdev_put;
140462306a36Sopenharmony_ci	}
140562306a36Sopenharmony_ci
140662306a36Sopenharmony_ci	device = device_list_add(path, disk_super, &new_device_added);
140762306a36Sopenharmony_ci	if (!IS_ERR(device) && new_device_added)
140862306a36Sopenharmony_ci		btrfs_free_stale_devices(device->devt, device);
140962306a36Sopenharmony_ci
141062306a36Sopenharmony_ci	btrfs_release_disk_super(disk_super);
141162306a36Sopenharmony_ci
141262306a36Sopenharmony_cierror_bdev_put:
141362306a36Sopenharmony_ci	blkdev_put(bdev, NULL);
141462306a36Sopenharmony_ci
141562306a36Sopenharmony_ci	return device;
141662306a36Sopenharmony_ci}
141762306a36Sopenharmony_ci
141862306a36Sopenharmony_ci/*
141962306a36Sopenharmony_ci * Try to find a chunk that intersects [start, start + len] range and when one
142062306a36Sopenharmony_ci * such is found, record the end of it in *start
142162306a36Sopenharmony_ci */
142262306a36Sopenharmony_cistatic bool contains_pending_extent(struct btrfs_device *device, u64 *start,
142362306a36Sopenharmony_ci				    u64 len)
142462306a36Sopenharmony_ci{
142562306a36Sopenharmony_ci	u64 physical_start, physical_end;
142662306a36Sopenharmony_ci
142762306a36Sopenharmony_ci	lockdep_assert_held(&device->fs_info->chunk_mutex);
142862306a36Sopenharmony_ci
142962306a36Sopenharmony_ci	if (find_first_extent_bit(&device->alloc_state, *start,
143062306a36Sopenharmony_ci				  &physical_start, &physical_end,
143162306a36Sopenharmony_ci				  CHUNK_ALLOCATED, NULL)) {
143262306a36Sopenharmony_ci
143362306a36Sopenharmony_ci		if (in_range(physical_start, *start, len) ||
143462306a36Sopenharmony_ci		    in_range(*start, physical_start,
143562306a36Sopenharmony_ci			     physical_end + 1 - physical_start)) {
143662306a36Sopenharmony_ci			*start = physical_end + 1;
143762306a36Sopenharmony_ci			return true;
143862306a36Sopenharmony_ci		}
143962306a36Sopenharmony_ci	}
144062306a36Sopenharmony_ci	return false;
144162306a36Sopenharmony_ci}
144262306a36Sopenharmony_ci
144362306a36Sopenharmony_cistatic u64 dev_extent_search_start(struct btrfs_device *device)
144462306a36Sopenharmony_ci{
144562306a36Sopenharmony_ci	switch (device->fs_devices->chunk_alloc_policy) {
144662306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_REGULAR:
144762306a36Sopenharmony_ci		return BTRFS_DEVICE_RANGE_RESERVED;
144862306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_ZONED:
144962306a36Sopenharmony_ci		/*
145062306a36Sopenharmony_ci		 * We don't care about the starting region like regular
145162306a36Sopenharmony_ci		 * allocator, because we anyway use/reserve the first two zones
145262306a36Sopenharmony_ci		 * for superblock logging.
145362306a36Sopenharmony_ci		 */
145462306a36Sopenharmony_ci		return 0;
145562306a36Sopenharmony_ci	default:
145662306a36Sopenharmony_ci		BUG();
145762306a36Sopenharmony_ci	}
145862306a36Sopenharmony_ci}
145962306a36Sopenharmony_ci
146062306a36Sopenharmony_cistatic bool dev_extent_hole_check_zoned(struct btrfs_device *device,
146162306a36Sopenharmony_ci					u64 *hole_start, u64 *hole_size,
146262306a36Sopenharmony_ci					u64 num_bytes)
146362306a36Sopenharmony_ci{
146462306a36Sopenharmony_ci	u64 zone_size = device->zone_info->zone_size;
146562306a36Sopenharmony_ci	u64 pos;
146662306a36Sopenharmony_ci	int ret;
146762306a36Sopenharmony_ci	bool changed = false;
146862306a36Sopenharmony_ci
146962306a36Sopenharmony_ci	ASSERT(IS_ALIGNED(*hole_start, zone_size));
147062306a36Sopenharmony_ci
147162306a36Sopenharmony_ci	while (*hole_size > 0) {
147262306a36Sopenharmony_ci		pos = btrfs_find_allocatable_zones(device, *hole_start,
147362306a36Sopenharmony_ci						   *hole_start + *hole_size,
147462306a36Sopenharmony_ci						   num_bytes);
147562306a36Sopenharmony_ci		if (pos != *hole_start) {
147662306a36Sopenharmony_ci			*hole_size = *hole_start + *hole_size - pos;
147762306a36Sopenharmony_ci			*hole_start = pos;
147862306a36Sopenharmony_ci			changed = true;
147962306a36Sopenharmony_ci			if (*hole_size < num_bytes)
148062306a36Sopenharmony_ci				break;
148162306a36Sopenharmony_ci		}
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
148462306a36Sopenharmony_ci
148562306a36Sopenharmony_ci		/* Range is ensured to be empty */
148662306a36Sopenharmony_ci		if (!ret)
148762306a36Sopenharmony_ci			return changed;
148862306a36Sopenharmony_ci
148962306a36Sopenharmony_ci		/* Given hole range was invalid (outside of device) */
149062306a36Sopenharmony_ci		if (ret == -ERANGE) {
149162306a36Sopenharmony_ci			*hole_start += *hole_size;
149262306a36Sopenharmony_ci			*hole_size = 0;
149362306a36Sopenharmony_ci			return true;
149462306a36Sopenharmony_ci		}
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_ci		*hole_start += zone_size;
149762306a36Sopenharmony_ci		*hole_size -= zone_size;
149862306a36Sopenharmony_ci		changed = true;
149962306a36Sopenharmony_ci	}
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	return changed;
150262306a36Sopenharmony_ci}
150362306a36Sopenharmony_ci
150462306a36Sopenharmony_ci/*
150562306a36Sopenharmony_ci * Check if specified hole is suitable for allocation.
150662306a36Sopenharmony_ci *
150762306a36Sopenharmony_ci * @device:	the device which we have the hole
150862306a36Sopenharmony_ci * @hole_start: starting position of the hole
150962306a36Sopenharmony_ci * @hole_size:	the size of the hole
151062306a36Sopenharmony_ci * @num_bytes:	the size of the free space that we need
151162306a36Sopenharmony_ci *
151262306a36Sopenharmony_ci * This function may modify @hole_start and @hole_size to reflect the suitable
151362306a36Sopenharmony_ci * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
151462306a36Sopenharmony_ci */
151562306a36Sopenharmony_cistatic bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
151662306a36Sopenharmony_ci				  u64 *hole_size, u64 num_bytes)
151762306a36Sopenharmony_ci{
151862306a36Sopenharmony_ci	bool changed = false;
151962306a36Sopenharmony_ci	u64 hole_end = *hole_start + *hole_size;
152062306a36Sopenharmony_ci
152162306a36Sopenharmony_ci	for (;;) {
152262306a36Sopenharmony_ci		/*
152362306a36Sopenharmony_ci		 * Check before we set max_hole_start, otherwise we could end up
152462306a36Sopenharmony_ci		 * sending back this offset anyway.
152562306a36Sopenharmony_ci		 */
152662306a36Sopenharmony_ci		if (contains_pending_extent(device, hole_start, *hole_size)) {
152762306a36Sopenharmony_ci			if (hole_end >= *hole_start)
152862306a36Sopenharmony_ci				*hole_size = hole_end - *hole_start;
152962306a36Sopenharmony_ci			else
153062306a36Sopenharmony_ci				*hole_size = 0;
153162306a36Sopenharmony_ci			changed = true;
153262306a36Sopenharmony_ci		}
153362306a36Sopenharmony_ci
153462306a36Sopenharmony_ci		switch (device->fs_devices->chunk_alloc_policy) {
153562306a36Sopenharmony_ci		case BTRFS_CHUNK_ALLOC_REGULAR:
153662306a36Sopenharmony_ci			/* No extra check */
153762306a36Sopenharmony_ci			break;
153862306a36Sopenharmony_ci		case BTRFS_CHUNK_ALLOC_ZONED:
153962306a36Sopenharmony_ci			if (dev_extent_hole_check_zoned(device, hole_start,
154062306a36Sopenharmony_ci							hole_size, num_bytes)) {
154162306a36Sopenharmony_ci				changed = true;
154262306a36Sopenharmony_ci				/*
154362306a36Sopenharmony_ci				 * The changed hole can contain pending extent.
154462306a36Sopenharmony_ci				 * Loop again to check that.
154562306a36Sopenharmony_ci				 */
154662306a36Sopenharmony_ci				continue;
154762306a36Sopenharmony_ci			}
154862306a36Sopenharmony_ci			break;
154962306a36Sopenharmony_ci		default:
155062306a36Sopenharmony_ci			BUG();
155162306a36Sopenharmony_ci		}
155262306a36Sopenharmony_ci
155362306a36Sopenharmony_ci		break;
155462306a36Sopenharmony_ci	}
155562306a36Sopenharmony_ci
155662306a36Sopenharmony_ci	return changed;
155762306a36Sopenharmony_ci}
155862306a36Sopenharmony_ci
155962306a36Sopenharmony_ci/*
156062306a36Sopenharmony_ci * Find free space in the specified device.
156162306a36Sopenharmony_ci *
156262306a36Sopenharmony_ci * @device:	  the device which we search the free space in
156362306a36Sopenharmony_ci * @num_bytes:	  the size of the free space that we need
156462306a36Sopenharmony_ci * @search_start: the position from which to begin the search
156562306a36Sopenharmony_ci * @start:	  store the start of the free space.
156662306a36Sopenharmony_ci * @len:	  the size of the free space. that we find, or the size
156762306a36Sopenharmony_ci *		  of the max free space if we don't find suitable free space
156862306a36Sopenharmony_ci *
156962306a36Sopenharmony_ci * This does a pretty simple search, the expectation is that it is called very
157062306a36Sopenharmony_ci * infrequently and that a given device has a small number of extents.
157162306a36Sopenharmony_ci *
157262306a36Sopenharmony_ci * @start is used to store the start of the free space if we find. But if we
157362306a36Sopenharmony_ci * don't find suitable free space, it will be used to store the start position
157462306a36Sopenharmony_ci * of the max free space.
157562306a36Sopenharmony_ci *
157662306a36Sopenharmony_ci * @len is used to store the size of the free space that we find.
157762306a36Sopenharmony_ci * But if we don't find suitable free space, it is used to store the size of
157862306a36Sopenharmony_ci * the max free space.
157962306a36Sopenharmony_ci *
158062306a36Sopenharmony_ci * NOTE: This function will search *commit* root of device tree, and does extra
158162306a36Sopenharmony_ci * check to ensure dev extents are not double allocated.
158262306a36Sopenharmony_ci * This makes the function safe to allocate dev extents but may not report
158362306a36Sopenharmony_ci * correct usable device space, as device extent freed in current transaction
158462306a36Sopenharmony_ci * is not reported as available.
158562306a36Sopenharmony_ci */
158662306a36Sopenharmony_cistatic int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
158762306a36Sopenharmony_ci				u64 *start, u64 *len)
158862306a36Sopenharmony_ci{
158962306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
159062306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
159162306a36Sopenharmony_ci	struct btrfs_key key;
159262306a36Sopenharmony_ci	struct btrfs_dev_extent *dev_extent;
159362306a36Sopenharmony_ci	struct btrfs_path *path;
159462306a36Sopenharmony_ci	u64 search_start;
159562306a36Sopenharmony_ci	u64 hole_size;
159662306a36Sopenharmony_ci	u64 max_hole_start;
159762306a36Sopenharmony_ci	u64 max_hole_size = 0;
159862306a36Sopenharmony_ci	u64 extent_end;
159962306a36Sopenharmony_ci	u64 search_end = device->total_bytes;
160062306a36Sopenharmony_ci	int ret;
160162306a36Sopenharmony_ci	int slot;
160262306a36Sopenharmony_ci	struct extent_buffer *l;
160362306a36Sopenharmony_ci
160462306a36Sopenharmony_ci	search_start = dev_extent_search_start(device);
160562306a36Sopenharmony_ci	max_hole_start = search_start;
160662306a36Sopenharmony_ci
160762306a36Sopenharmony_ci	WARN_ON(device->zone_info &&
160862306a36Sopenharmony_ci		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
160962306a36Sopenharmony_ci
161062306a36Sopenharmony_ci	path = btrfs_alloc_path();
161162306a36Sopenharmony_ci	if (!path) {
161262306a36Sopenharmony_ci		ret = -ENOMEM;
161362306a36Sopenharmony_ci		goto out;
161462306a36Sopenharmony_ci	}
161562306a36Sopenharmony_ciagain:
161662306a36Sopenharmony_ci	if (search_start >= search_end ||
161762306a36Sopenharmony_ci		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
161862306a36Sopenharmony_ci		ret = -ENOSPC;
161962306a36Sopenharmony_ci		goto out;
162062306a36Sopenharmony_ci	}
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci	path->reada = READA_FORWARD;
162362306a36Sopenharmony_ci	path->search_commit_root = 1;
162462306a36Sopenharmony_ci	path->skip_locking = 1;
162562306a36Sopenharmony_ci
162662306a36Sopenharmony_ci	key.objectid = device->devid;
162762306a36Sopenharmony_ci	key.offset = search_start;
162862306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
162962306a36Sopenharmony_ci
163062306a36Sopenharmony_ci	ret = btrfs_search_backwards(root, &key, path);
163162306a36Sopenharmony_ci	if (ret < 0)
163262306a36Sopenharmony_ci		goto out;
163362306a36Sopenharmony_ci
163462306a36Sopenharmony_ci	while (search_start < search_end) {
163562306a36Sopenharmony_ci		l = path->nodes[0];
163662306a36Sopenharmony_ci		slot = path->slots[0];
163762306a36Sopenharmony_ci		if (slot >= btrfs_header_nritems(l)) {
163862306a36Sopenharmony_ci			ret = btrfs_next_leaf(root, path);
163962306a36Sopenharmony_ci			if (ret == 0)
164062306a36Sopenharmony_ci				continue;
164162306a36Sopenharmony_ci			if (ret < 0)
164262306a36Sopenharmony_ci				goto out;
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_ci			break;
164562306a36Sopenharmony_ci		}
164662306a36Sopenharmony_ci		btrfs_item_key_to_cpu(l, &key, slot);
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_ci		if (key.objectid < device->devid)
164962306a36Sopenharmony_ci			goto next;
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci		if (key.objectid > device->devid)
165262306a36Sopenharmony_ci			break;
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci		if (key.type != BTRFS_DEV_EXTENT_KEY)
165562306a36Sopenharmony_ci			goto next;
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci		if (key.offset > search_end)
165862306a36Sopenharmony_ci			break;
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci		if (key.offset > search_start) {
166162306a36Sopenharmony_ci			hole_size = key.offset - search_start;
166262306a36Sopenharmony_ci			dev_extent_hole_check(device, &search_start, &hole_size,
166362306a36Sopenharmony_ci					      num_bytes);
166462306a36Sopenharmony_ci
166562306a36Sopenharmony_ci			if (hole_size > max_hole_size) {
166662306a36Sopenharmony_ci				max_hole_start = search_start;
166762306a36Sopenharmony_ci				max_hole_size = hole_size;
166862306a36Sopenharmony_ci			}
166962306a36Sopenharmony_ci
167062306a36Sopenharmony_ci			/*
167162306a36Sopenharmony_ci			 * If this free space is greater than which we need,
167262306a36Sopenharmony_ci			 * it must be the max free space that we have found
167362306a36Sopenharmony_ci			 * until now, so max_hole_start must point to the start
167462306a36Sopenharmony_ci			 * of this free space and the length of this free space
167562306a36Sopenharmony_ci			 * is stored in max_hole_size. Thus, we return
167662306a36Sopenharmony_ci			 * max_hole_start and max_hole_size and go back to the
167762306a36Sopenharmony_ci			 * caller.
167862306a36Sopenharmony_ci			 */
167962306a36Sopenharmony_ci			if (hole_size >= num_bytes) {
168062306a36Sopenharmony_ci				ret = 0;
168162306a36Sopenharmony_ci				goto out;
168262306a36Sopenharmony_ci			}
168362306a36Sopenharmony_ci		}
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
168662306a36Sopenharmony_ci		extent_end = key.offset + btrfs_dev_extent_length(l,
168762306a36Sopenharmony_ci								  dev_extent);
168862306a36Sopenharmony_ci		if (extent_end > search_start)
168962306a36Sopenharmony_ci			search_start = extent_end;
169062306a36Sopenharmony_cinext:
169162306a36Sopenharmony_ci		path->slots[0]++;
169262306a36Sopenharmony_ci		cond_resched();
169362306a36Sopenharmony_ci	}
169462306a36Sopenharmony_ci
169562306a36Sopenharmony_ci	/*
169662306a36Sopenharmony_ci	 * At this point, search_start should be the end of
169762306a36Sopenharmony_ci	 * allocated dev extents, and when shrinking the device,
169862306a36Sopenharmony_ci	 * search_end may be smaller than search_start.
169962306a36Sopenharmony_ci	 */
170062306a36Sopenharmony_ci	if (search_end > search_start) {
170162306a36Sopenharmony_ci		hole_size = search_end - search_start;
170262306a36Sopenharmony_ci		if (dev_extent_hole_check(device, &search_start, &hole_size,
170362306a36Sopenharmony_ci					  num_bytes)) {
170462306a36Sopenharmony_ci			btrfs_release_path(path);
170562306a36Sopenharmony_ci			goto again;
170662306a36Sopenharmony_ci		}
170762306a36Sopenharmony_ci
170862306a36Sopenharmony_ci		if (hole_size > max_hole_size) {
170962306a36Sopenharmony_ci			max_hole_start = search_start;
171062306a36Sopenharmony_ci			max_hole_size = hole_size;
171162306a36Sopenharmony_ci		}
171262306a36Sopenharmony_ci	}
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci	/* See above. */
171562306a36Sopenharmony_ci	if (max_hole_size < num_bytes)
171662306a36Sopenharmony_ci		ret = -ENOSPC;
171762306a36Sopenharmony_ci	else
171862306a36Sopenharmony_ci		ret = 0;
171962306a36Sopenharmony_ci
172062306a36Sopenharmony_ci	ASSERT(max_hole_start + max_hole_size <= search_end);
172162306a36Sopenharmony_ciout:
172262306a36Sopenharmony_ci	btrfs_free_path(path);
172362306a36Sopenharmony_ci	*start = max_hole_start;
172462306a36Sopenharmony_ci	if (len)
172562306a36Sopenharmony_ci		*len = max_hole_size;
172662306a36Sopenharmony_ci	return ret;
172762306a36Sopenharmony_ci}
172862306a36Sopenharmony_ci
172962306a36Sopenharmony_cistatic int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
173062306a36Sopenharmony_ci			  struct btrfs_device *device,
173162306a36Sopenharmony_ci			  u64 start, u64 *dev_extent_len)
173262306a36Sopenharmony_ci{
173362306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
173462306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
173562306a36Sopenharmony_ci	int ret;
173662306a36Sopenharmony_ci	struct btrfs_path *path;
173762306a36Sopenharmony_ci	struct btrfs_key key;
173862306a36Sopenharmony_ci	struct btrfs_key found_key;
173962306a36Sopenharmony_ci	struct extent_buffer *leaf = NULL;
174062306a36Sopenharmony_ci	struct btrfs_dev_extent *extent = NULL;
174162306a36Sopenharmony_ci
174262306a36Sopenharmony_ci	path = btrfs_alloc_path();
174362306a36Sopenharmony_ci	if (!path)
174462306a36Sopenharmony_ci		return -ENOMEM;
174562306a36Sopenharmony_ci
174662306a36Sopenharmony_ci	key.objectid = device->devid;
174762306a36Sopenharmony_ci	key.offset = start;
174862306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
174962306a36Sopenharmony_ciagain:
175062306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
175162306a36Sopenharmony_ci	if (ret > 0) {
175262306a36Sopenharmony_ci		ret = btrfs_previous_item(root, path, key.objectid,
175362306a36Sopenharmony_ci					  BTRFS_DEV_EXTENT_KEY);
175462306a36Sopenharmony_ci		if (ret)
175562306a36Sopenharmony_ci			goto out;
175662306a36Sopenharmony_ci		leaf = path->nodes[0];
175762306a36Sopenharmony_ci		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
175862306a36Sopenharmony_ci		extent = btrfs_item_ptr(leaf, path->slots[0],
175962306a36Sopenharmony_ci					struct btrfs_dev_extent);
176062306a36Sopenharmony_ci		BUG_ON(found_key.offset > start || found_key.offset +
176162306a36Sopenharmony_ci		       btrfs_dev_extent_length(leaf, extent) < start);
176262306a36Sopenharmony_ci		key = found_key;
176362306a36Sopenharmony_ci		btrfs_release_path(path);
176462306a36Sopenharmony_ci		goto again;
176562306a36Sopenharmony_ci	} else if (ret == 0) {
176662306a36Sopenharmony_ci		leaf = path->nodes[0];
176762306a36Sopenharmony_ci		extent = btrfs_item_ptr(leaf, path->slots[0],
176862306a36Sopenharmony_ci					struct btrfs_dev_extent);
176962306a36Sopenharmony_ci	} else {
177062306a36Sopenharmony_ci		goto out;
177162306a36Sopenharmony_ci	}
177262306a36Sopenharmony_ci
177362306a36Sopenharmony_ci	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
177462306a36Sopenharmony_ci
177562306a36Sopenharmony_ci	ret = btrfs_del_item(trans, root, path);
177662306a36Sopenharmony_ci	if (ret == 0)
177762306a36Sopenharmony_ci		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
177862306a36Sopenharmony_ciout:
177962306a36Sopenharmony_ci	btrfs_free_path(path);
178062306a36Sopenharmony_ci	return ret;
178162306a36Sopenharmony_ci}
178262306a36Sopenharmony_ci
178362306a36Sopenharmony_cistatic u64 find_next_chunk(struct btrfs_fs_info *fs_info)
178462306a36Sopenharmony_ci{
178562306a36Sopenharmony_ci	struct extent_map_tree *em_tree;
178662306a36Sopenharmony_ci	struct extent_map *em;
178762306a36Sopenharmony_ci	struct rb_node *n;
178862306a36Sopenharmony_ci	u64 ret = 0;
178962306a36Sopenharmony_ci
179062306a36Sopenharmony_ci	em_tree = &fs_info->mapping_tree;
179162306a36Sopenharmony_ci	read_lock(&em_tree->lock);
179262306a36Sopenharmony_ci	n = rb_last(&em_tree->map.rb_root);
179362306a36Sopenharmony_ci	if (n) {
179462306a36Sopenharmony_ci		em = rb_entry(n, struct extent_map, rb_node);
179562306a36Sopenharmony_ci		ret = em->start + em->len;
179662306a36Sopenharmony_ci	}
179762306a36Sopenharmony_ci	read_unlock(&em_tree->lock);
179862306a36Sopenharmony_ci
179962306a36Sopenharmony_ci	return ret;
180062306a36Sopenharmony_ci}
180162306a36Sopenharmony_ci
180262306a36Sopenharmony_cistatic noinline int find_next_devid(struct btrfs_fs_info *fs_info,
180362306a36Sopenharmony_ci				    u64 *devid_ret)
180462306a36Sopenharmony_ci{
180562306a36Sopenharmony_ci	int ret;
180662306a36Sopenharmony_ci	struct btrfs_key key;
180762306a36Sopenharmony_ci	struct btrfs_key found_key;
180862306a36Sopenharmony_ci	struct btrfs_path *path;
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci	path = btrfs_alloc_path();
181162306a36Sopenharmony_ci	if (!path)
181262306a36Sopenharmony_ci		return -ENOMEM;
181362306a36Sopenharmony_ci
181462306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
181562306a36Sopenharmony_ci	key.type = BTRFS_DEV_ITEM_KEY;
181662306a36Sopenharmony_ci	key.offset = (u64)-1;
181762306a36Sopenharmony_ci
181862306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
181962306a36Sopenharmony_ci	if (ret < 0)
182062306a36Sopenharmony_ci		goto error;
182162306a36Sopenharmony_ci
182262306a36Sopenharmony_ci	if (ret == 0) {
182362306a36Sopenharmony_ci		/* Corruption */
182462306a36Sopenharmony_ci		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
182562306a36Sopenharmony_ci		ret = -EUCLEAN;
182662306a36Sopenharmony_ci		goto error;
182762306a36Sopenharmony_ci	}
182862306a36Sopenharmony_ci
182962306a36Sopenharmony_ci	ret = btrfs_previous_item(fs_info->chunk_root, path,
183062306a36Sopenharmony_ci				  BTRFS_DEV_ITEMS_OBJECTID,
183162306a36Sopenharmony_ci				  BTRFS_DEV_ITEM_KEY);
183262306a36Sopenharmony_ci	if (ret) {
183362306a36Sopenharmony_ci		*devid_ret = 1;
183462306a36Sopenharmony_ci	} else {
183562306a36Sopenharmony_ci		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
183662306a36Sopenharmony_ci				      path->slots[0]);
183762306a36Sopenharmony_ci		*devid_ret = found_key.offset + 1;
183862306a36Sopenharmony_ci	}
183962306a36Sopenharmony_ci	ret = 0;
184062306a36Sopenharmony_cierror:
184162306a36Sopenharmony_ci	btrfs_free_path(path);
184262306a36Sopenharmony_ci	return ret;
184362306a36Sopenharmony_ci}
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci/*
184662306a36Sopenharmony_ci * the device information is stored in the chunk root
184762306a36Sopenharmony_ci * the btrfs_device struct should be fully filled in
184862306a36Sopenharmony_ci */
184962306a36Sopenharmony_cistatic int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
185062306a36Sopenharmony_ci			    struct btrfs_device *device)
185162306a36Sopenharmony_ci{
185262306a36Sopenharmony_ci	int ret;
185362306a36Sopenharmony_ci	struct btrfs_path *path;
185462306a36Sopenharmony_ci	struct btrfs_dev_item *dev_item;
185562306a36Sopenharmony_ci	struct extent_buffer *leaf;
185662306a36Sopenharmony_ci	struct btrfs_key key;
185762306a36Sopenharmony_ci	unsigned long ptr;
185862306a36Sopenharmony_ci
185962306a36Sopenharmony_ci	path = btrfs_alloc_path();
186062306a36Sopenharmony_ci	if (!path)
186162306a36Sopenharmony_ci		return -ENOMEM;
186262306a36Sopenharmony_ci
186362306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
186462306a36Sopenharmony_ci	key.type = BTRFS_DEV_ITEM_KEY;
186562306a36Sopenharmony_ci	key.offset = device->devid;
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci	btrfs_reserve_chunk_metadata(trans, true);
186862306a36Sopenharmony_ci	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
186962306a36Sopenharmony_ci				      &key, sizeof(*dev_item));
187062306a36Sopenharmony_ci	btrfs_trans_release_chunk_metadata(trans);
187162306a36Sopenharmony_ci	if (ret)
187262306a36Sopenharmony_ci		goto out;
187362306a36Sopenharmony_ci
187462306a36Sopenharmony_ci	leaf = path->nodes[0];
187562306a36Sopenharmony_ci	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
187662306a36Sopenharmony_ci
187762306a36Sopenharmony_ci	btrfs_set_device_id(leaf, dev_item, device->devid);
187862306a36Sopenharmony_ci	btrfs_set_device_generation(leaf, dev_item, 0);
187962306a36Sopenharmony_ci	btrfs_set_device_type(leaf, dev_item, device->type);
188062306a36Sopenharmony_ci	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
188162306a36Sopenharmony_ci	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
188262306a36Sopenharmony_ci	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
188362306a36Sopenharmony_ci	btrfs_set_device_total_bytes(leaf, dev_item,
188462306a36Sopenharmony_ci				     btrfs_device_get_disk_total_bytes(device));
188562306a36Sopenharmony_ci	btrfs_set_device_bytes_used(leaf, dev_item,
188662306a36Sopenharmony_ci				    btrfs_device_get_bytes_used(device));
188762306a36Sopenharmony_ci	btrfs_set_device_group(leaf, dev_item, 0);
188862306a36Sopenharmony_ci	btrfs_set_device_seek_speed(leaf, dev_item, 0);
188962306a36Sopenharmony_ci	btrfs_set_device_bandwidth(leaf, dev_item, 0);
189062306a36Sopenharmony_ci	btrfs_set_device_start_offset(leaf, dev_item, 0);
189162306a36Sopenharmony_ci
189262306a36Sopenharmony_ci	ptr = btrfs_device_uuid(dev_item);
189362306a36Sopenharmony_ci	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
189462306a36Sopenharmony_ci	ptr = btrfs_device_fsid(dev_item);
189562306a36Sopenharmony_ci	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
189662306a36Sopenharmony_ci			    ptr, BTRFS_FSID_SIZE);
189762306a36Sopenharmony_ci	btrfs_mark_buffer_dirty(trans, leaf);
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_ci	ret = 0;
190062306a36Sopenharmony_ciout:
190162306a36Sopenharmony_ci	btrfs_free_path(path);
190262306a36Sopenharmony_ci	return ret;
190362306a36Sopenharmony_ci}
190462306a36Sopenharmony_ci
190562306a36Sopenharmony_ci/*
190662306a36Sopenharmony_ci * Function to update ctime/mtime for a given device path.
190762306a36Sopenharmony_ci * Mainly used for ctime/mtime based probe like libblkid.
190862306a36Sopenharmony_ci *
190962306a36Sopenharmony_ci * We don't care about errors here, this is just to be kind to userspace.
191062306a36Sopenharmony_ci */
191162306a36Sopenharmony_cistatic void update_dev_time(const char *device_path)
191262306a36Sopenharmony_ci{
191362306a36Sopenharmony_ci	struct path path;
191462306a36Sopenharmony_ci	int ret;
191562306a36Sopenharmony_ci
191662306a36Sopenharmony_ci	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
191762306a36Sopenharmony_ci	if (ret)
191862306a36Sopenharmony_ci		return;
191962306a36Sopenharmony_ci
192062306a36Sopenharmony_ci	inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
192162306a36Sopenharmony_ci	path_put(&path);
192262306a36Sopenharmony_ci}
192362306a36Sopenharmony_ci
192462306a36Sopenharmony_cistatic int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
192562306a36Sopenharmony_ci			     struct btrfs_device *device)
192662306a36Sopenharmony_ci{
192762306a36Sopenharmony_ci	struct btrfs_root *root = device->fs_info->chunk_root;
192862306a36Sopenharmony_ci	int ret;
192962306a36Sopenharmony_ci	struct btrfs_path *path;
193062306a36Sopenharmony_ci	struct btrfs_key key;
193162306a36Sopenharmony_ci
193262306a36Sopenharmony_ci	path = btrfs_alloc_path();
193362306a36Sopenharmony_ci	if (!path)
193462306a36Sopenharmony_ci		return -ENOMEM;
193562306a36Sopenharmony_ci
193662306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
193762306a36Sopenharmony_ci	key.type = BTRFS_DEV_ITEM_KEY;
193862306a36Sopenharmony_ci	key.offset = device->devid;
193962306a36Sopenharmony_ci
194062306a36Sopenharmony_ci	btrfs_reserve_chunk_metadata(trans, false);
194162306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
194262306a36Sopenharmony_ci	btrfs_trans_release_chunk_metadata(trans);
194362306a36Sopenharmony_ci	if (ret) {
194462306a36Sopenharmony_ci		if (ret > 0)
194562306a36Sopenharmony_ci			ret = -ENOENT;
194662306a36Sopenharmony_ci		goto out;
194762306a36Sopenharmony_ci	}
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci	ret = btrfs_del_item(trans, root, path);
195062306a36Sopenharmony_ciout:
195162306a36Sopenharmony_ci	btrfs_free_path(path);
195262306a36Sopenharmony_ci	return ret;
195362306a36Sopenharmony_ci}
195462306a36Sopenharmony_ci
195562306a36Sopenharmony_ci/*
195662306a36Sopenharmony_ci * Verify that @num_devices satisfies the RAID profile constraints in the whole
195762306a36Sopenharmony_ci * filesystem. It's up to the caller to adjust that number regarding eg. device
195862306a36Sopenharmony_ci * replace.
195962306a36Sopenharmony_ci */
196062306a36Sopenharmony_cistatic int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
196162306a36Sopenharmony_ci		u64 num_devices)
196262306a36Sopenharmony_ci{
196362306a36Sopenharmony_ci	u64 all_avail;
196462306a36Sopenharmony_ci	unsigned seq;
196562306a36Sopenharmony_ci	int i;
196662306a36Sopenharmony_ci
196762306a36Sopenharmony_ci	do {
196862306a36Sopenharmony_ci		seq = read_seqbegin(&fs_info->profiles_lock);
196962306a36Sopenharmony_ci
197062306a36Sopenharmony_ci		all_avail = fs_info->avail_data_alloc_bits |
197162306a36Sopenharmony_ci			    fs_info->avail_system_alloc_bits |
197262306a36Sopenharmony_ci			    fs_info->avail_metadata_alloc_bits;
197362306a36Sopenharmony_ci	} while (read_seqretry(&fs_info->profiles_lock, seq));
197462306a36Sopenharmony_ci
197562306a36Sopenharmony_ci	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
197662306a36Sopenharmony_ci		if (!(all_avail & btrfs_raid_array[i].bg_flag))
197762306a36Sopenharmony_ci			continue;
197862306a36Sopenharmony_ci
197962306a36Sopenharmony_ci		if (num_devices < btrfs_raid_array[i].devs_min)
198062306a36Sopenharmony_ci			return btrfs_raid_array[i].mindev_error;
198162306a36Sopenharmony_ci	}
198262306a36Sopenharmony_ci
198362306a36Sopenharmony_ci	return 0;
198462306a36Sopenharmony_ci}
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_cistatic struct btrfs_device * btrfs_find_next_active_device(
198762306a36Sopenharmony_ci		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
198862306a36Sopenharmony_ci{
198962306a36Sopenharmony_ci	struct btrfs_device *next_device;
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_ci	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
199262306a36Sopenharmony_ci		if (next_device != device &&
199362306a36Sopenharmony_ci		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
199462306a36Sopenharmony_ci		    && next_device->bdev)
199562306a36Sopenharmony_ci			return next_device;
199662306a36Sopenharmony_ci	}
199762306a36Sopenharmony_ci
199862306a36Sopenharmony_ci	return NULL;
199962306a36Sopenharmony_ci}
200062306a36Sopenharmony_ci
200162306a36Sopenharmony_ci/*
200262306a36Sopenharmony_ci * Helper function to check if the given device is part of s_bdev / latest_dev
200362306a36Sopenharmony_ci * and replace it with the provided or the next active device, in the context
200462306a36Sopenharmony_ci * where this function called, there should be always be another device (or
200562306a36Sopenharmony_ci * this_dev) which is active.
200662306a36Sopenharmony_ci */
200762306a36Sopenharmony_civoid __cold btrfs_assign_next_active_device(struct btrfs_device *device,
200862306a36Sopenharmony_ci					    struct btrfs_device *next_device)
200962306a36Sopenharmony_ci{
201062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
201162306a36Sopenharmony_ci
201262306a36Sopenharmony_ci	if (!next_device)
201362306a36Sopenharmony_ci		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
201462306a36Sopenharmony_ci							    device);
201562306a36Sopenharmony_ci	ASSERT(next_device);
201662306a36Sopenharmony_ci
201762306a36Sopenharmony_ci	if (fs_info->sb->s_bdev &&
201862306a36Sopenharmony_ci			(fs_info->sb->s_bdev == device->bdev))
201962306a36Sopenharmony_ci		fs_info->sb->s_bdev = next_device->bdev;
202062306a36Sopenharmony_ci
202162306a36Sopenharmony_ci	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
202262306a36Sopenharmony_ci		fs_info->fs_devices->latest_dev = next_device;
202362306a36Sopenharmony_ci}
202462306a36Sopenharmony_ci
202562306a36Sopenharmony_ci/*
202662306a36Sopenharmony_ci * Return btrfs_fs_devices::num_devices excluding the device that's being
202762306a36Sopenharmony_ci * currently replaced.
202862306a36Sopenharmony_ci */
202962306a36Sopenharmony_cistatic u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
203062306a36Sopenharmony_ci{
203162306a36Sopenharmony_ci	u64 num_devices = fs_info->fs_devices->num_devices;
203262306a36Sopenharmony_ci
203362306a36Sopenharmony_ci	down_read(&fs_info->dev_replace.rwsem);
203462306a36Sopenharmony_ci	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
203562306a36Sopenharmony_ci		ASSERT(num_devices > 1);
203662306a36Sopenharmony_ci		num_devices--;
203762306a36Sopenharmony_ci	}
203862306a36Sopenharmony_ci	up_read(&fs_info->dev_replace.rwsem);
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci	return num_devices;
204162306a36Sopenharmony_ci}
204262306a36Sopenharmony_ci
204362306a36Sopenharmony_cistatic void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
204462306a36Sopenharmony_ci				     struct block_device *bdev, int copy_num)
204562306a36Sopenharmony_ci{
204662306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
204762306a36Sopenharmony_ci	const size_t len = sizeof(disk_super->magic);
204862306a36Sopenharmony_ci	const u64 bytenr = btrfs_sb_offset(copy_num);
204962306a36Sopenharmony_ci	int ret;
205062306a36Sopenharmony_ci
205162306a36Sopenharmony_ci	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
205262306a36Sopenharmony_ci	if (IS_ERR(disk_super))
205362306a36Sopenharmony_ci		return;
205462306a36Sopenharmony_ci
205562306a36Sopenharmony_ci	memset(&disk_super->magic, 0, len);
205662306a36Sopenharmony_ci	folio_mark_dirty(virt_to_folio(disk_super));
205762306a36Sopenharmony_ci	btrfs_release_disk_super(disk_super);
205862306a36Sopenharmony_ci
205962306a36Sopenharmony_ci	ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
206062306a36Sopenharmony_ci	if (ret)
206162306a36Sopenharmony_ci		btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
206262306a36Sopenharmony_ci			copy_num, ret);
206362306a36Sopenharmony_ci}
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_civoid btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
206662306a36Sopenharmony_ci			       struct block_device *bdev,
206762306a36Sopenharmony_ci			       const char *device_path)
206862306a36Sopenharmony_ci{
206962306a36Sopenharmony_ci	int copy_num;
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci	if (!bdev)
207262306a36Sopenharmony_ci		return;
207362306a36Sopenharmony_ci
207462306a36Sopenharmony_ci	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
207562306a36Sopenharmony_ci		if (bdev_is_zoned(bdev))
207662306a36Sopenharmony_ci			btrfs_reset_sb_log_zones(bdev, copy_num);
207762306a36Sopenharmony_ci		else
207862306a36Sopenharmony_ci			btrfs_scratch_superblock(fs_info, bdev, copy_num);
207962306a36Sopenharmony_ci	}
208062306a36Sopenharmony_ci
208162306a36Sopenharmony_ci	/* Notify udev that device has changed */
208262306a36Sopenharmony_ci	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
208362306a36Sopenharmony_ci
208462306a36Sopenharmony_ci	/* Update ctime/mtime for device path for libblkid */
208562306a36Sopenharmony_ci	update_dev_time(device_path);
208662306a36Sopenharmony_ci}
208762306a36Sopenharmony_ci
208862306a36Sopenharmony_ciint btrfs_rm_device(struct btrfs_fs_info *fs_info,
208962306a36Sopenharmony_ci		    struct btrfs_dev_lookup_args *args,
209062306a36Sopenharmony_ci		    struct block_device **bdev, void **holder)
209162306a36Sopenharmony_ci{
209262306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
209362306a36Sopenharmony_ci	struct btrfs_device *device;
209462306a36Sopenharmony_ci	struct btrfs_fs_devices *cur_devices;
209562306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
209662306a36Sopenharmony_ci	u64 num_devices;
209762306a36Sopenharmony_ci	int ret = 0;
209862306a36Sopenharmony_ci
209962306a36Sopenharmony_ci	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
210062306a36Sopenharmony_ci		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
210162306a36Sopenharmony_ci		return -EINVAL;
210262306a36Sopenharmony_ci	}
210362306a36Sopenharmony_ci
210462306a36Sopenharmony_ci	/*
210562306a36Sopenharmony_ci	 * The device list in fs_devices is accessed without locks (neither
210662306a36Sopenharmony_ci	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
210762306a36Sopenharmony_ci	 * filesystem and another device rm cannot run.
210862306a36Sopenharmony_ci	 */
210962306a36Sopenharmony_ci	num_devices = btrfs_num_devices(fs_info);
211062306a36Sopenharmony_ci
211162306a36Sopenharmony_ci	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
211262306a36Sopenharmony_ci	if (ret)
211362306a36Sopenharmony_ci		return ret;
211462306a36Sopenharmony_ci
211562306a36Sopenharmony_ci	device = btrfs_find_device(fs_info->fs_devices, args);
211662306a36Sopenharmony_ci	if (!device) {
211762306a36Sopenharmony_ci		if (args->missing)
211862306a36Sopenharmony_ci			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
211962306a36Sopenharmony_ci		else
212062306a36Sopenharmony_ci			ret = -ENOENT;
212162306a36Sopenharmony_ci		return ret;
212262306a36Sopenharmony_ci	}
212362306a36Sopenharmony_ci
212462306a36Sopenharmony_ci	if (btrfs_pinned_by_swapfile(fs_info, device)) {
212562306a36Sopenharmony_ci		btrfs_warn_in_rcu(fs_info,
212662306a36Sopenharmony_ci		  "cannot remove device %s (devid %llu) due to active swapfile",
212762306a36Sopenharmony_ci				  btrfs_dev_name(device), device->devid);
212862306a36Sopenharmony_ci		return -ETXTBSY;
212962306a36Sopenharmony_ci	}
213062306a36Sopenharmony_ci
213162306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
213262306a36Sopenharmony_ci		return BTRFS_ERROR_DEV_TGT_REPLACE;
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
213562306a36Sopenharmony_ci	    fs_info->fs_devices->rw_devices == 1)
213662306a36Sopenharmony_ci		return BTRFS_ERROR_DEV_ONLY_WRITABLE;
213762306a36Sopenharmony_ci
213862306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
213962306a36Sopenharmony_ci		mutex_lock(&fs_info->chunk_mutex);
214062306a36Sopenharmony_ci		list_del_init(&device->dev_alloc_list);
214162306a36Sopenharmony_ci		device->fs_devices->rw_devices--;
214262306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
214362306a36Sopenharmony_ci	}
214462306a36Sopenharmony_ci
214562306a36Sopenharmony_ci	ret = btrfs_shrink_device(device, 0);
214662306a36Sopenharmony_ci	if (ret)
214762306a36Sopenharmony_ci		goto error_undo;
214862306a36Sopenharmony_ci
214962306a36Sopenharmony_ci	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
215062306a36Sopenharmony_ci	if (IS_ERR(trans)) {
215162306a36Sopenharmony_ci		ret = PTR_ERR(trans);
215262306a36Sopenharmony_ci		goto error_undo;
215362306a36Sopenharmony_ci	}
215462306a36Sopenharmony_ci
215562306a36Sopenharmony_ci	ret = btrfs_rm_dev_item(trans, device);
215662306a36Sopenharmony_ci	if (ret) {
215762306a36Sopenharmony_ci		/* Any error in dev item removal is critical */
215862306a36Sopenharmony_ci		btrfs_crit(fs_info,
215962306a36Sopenharmony_ci			   "failed to remove device item for devid %llu: %d",
216062306a36Sopenharmony_ci			   device->devid, ret);
216162306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
216262306a36Sopenharmony_ci		btrfs_end_transaction(trans);
216362306a36Sopenharmony_ci		return ret;
216462306a36Sopenharmony_ci	}
216562306a36Sopenharmony_ci
216662306a36Sopenharmony_ci	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
216762306a36Sopenharmony_ci	btrfs_scrub_cancel_dev(device);
216862306a36Sopenharmony_ci
216962306a36Sopenharmony_ci	/*
217062306a36Sopenharmony_ci	 * the device list mutex makes sure that we don't change
217162306a36Sopenharmony_ci	 * the device list while someone else is writing out all
217262306a36Sopenharmony_ci	 * the device supers. Whoever is writing all supers, should
217362306a36Sopenharmony_ci	 * lock the device list mutex before getting the number of
217462306a36Sopenharmony_ci	 * devices in the super block (super_copy). Conversely,
217562306a36Sopenharmony_ci	 * whoever updates the number of devices in the super block
217662306a36Sopenharmony_ci	 * (super_copy) should hold the device list mutex.
217762306a36Sopenharmony_ci	 */
217862306a36Sopenharmony_ci
217962306a36Sopenharmony_ci	/*
218062306a36Sopenharmony_ci	 * In normal cases the cur_devices == fs_devices. But in case
218162306a36Sopenharmony_ci	 * of deleting a seed device, the cur_devices should point to
218262306a36Sopenharmony_ci	 * its own fs_devices listed under the fs_devices->seed_list.
218362306a36Sopenharmony_ci	 */
218462306a36Sopenharmony_ci	cur_devices = device->fs_devices;
218562306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
218662306a36Sopenharmony_ci	list_del_rcu(&device->dev_list);
218762306a36Sopenharmony_ci
218862306a36Sopenharmony_ci	cur_devices->num_devices--;
218962306a36Sopenharmony_ci	cur_devices->total_devices--;
219062306a36Sopenharmony_ci	/* Update total_devices of the parent fs_devices if it's seed */
219162306a36Sopenharmony_ci	if (cur_devices != fs_devices)
219262306a36Sopenharmony_ci		fs_devices->total_devices--;
219362306a36Sopenharmony_ci
219462306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
219562306a36Sopenharmony_ci		cur_devices->missing_devices--;
219662306a36Sopenharmony_ci
219762306a36Sopenharmony_ci	btrfs_assign_next_active_device(device, NULL);
219862306a36Sopenharmony_ci
219962306a36Sopenharmony_ci	if (device->bdev) {
220062306a36Sopenharmony_ci		cur_devices->open_devices--;
220162306a36Sopenharmony_ci		/* remove sysfs entry */
220262306a36Sopenharmony_ci		btrfs_sysfs_remove_device(device);
220362306a36Sopenharmony_ci	}
220462306a36Sopenharmony_ci
220562306a36Sopenharmony_ci	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
220662306a36Sopenharmony_ci	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
220762306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
220862306a36Sopenharmony_ci
220962306a36Sopenharmony_ci	/*
221062306a36Sopenharmony_ci	 * At this point, the device is zero sized and detached from the
221162306a36Sopenharmony_ci	 * devices list.  All that's left is to zero out the old supers and
221262306a36Sopenharmony_ci	 * free the device.
221362306a36Sopenharmony_ci	 *
221462306a36Sopenharmony_ci	 * We cannot call btrfs_close_bdev() here because we're holding the sb
221562306a36Sopenharmony_ci	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
221662306a36Sopenharmony_ci	 * block device and it's dependencies.  Instead just flush the device
221762306a36Sopenharmony_ci	 * and let the caller do the final blkdev_put.
221862306a36Sopenharmony_ci	 */
221962306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
222062306a36Sopenharmony_ci		btrfs_scratch_superblocks(fs_info, device->bdev,
222162306a36Sopenharmony_ci					  device->name->str);
222262306a36Sopenharmony_ci		if (device->bdev) {
222362306a36Sopenharmony_ci			sync_blockdev(device->bdev);
222462306a36Sopenharmony_ci			invalidate_bdev(device->bdev);
222562306a36Sopenharmony_ci		}
222662306a36Sopenharmony_ci	}
222762306a36Sopenharmony_ci
222862306a36Sopenharmony_ci	*bdev = device->bdev;
222962306a36Sopenharmony_ci	*holder = device->holder;
223062306a36Sopenharmony_ci	synchronize_rcu();
223162306a36Sopenharmony_ci	btrfs_free_device(device);
223262306a36Sopenharmony_ci
223362306a36Sopenharmony_ci	/*
223462306a36Sopenharmony_ci	 * This can happen if cur_devices is the private seed devices list.  We
223562306a36Sopenharmony_ci	 * cannot call close_fs_devices() here because it expects the uuid_mutex
223662306a36Sopenharmony_ci	 * to be held, but in fact we don't need that for the private
223762306a36Sopenharmony_ci	 * seed_devices, we can simply decrement cur_devices->opened and then
223862306a36Sopenharmony_ci	 * remove it from our list and free the fs_devices.
223962306a36Sopenharmony_ci	 */
224062306a36Sopenharmony_ci	if (cur_devices->num_devices == 0) {
224162306a36Sopenharmony_ci		list_del_init(&cur_devices->seed_list);
224262306a36Sopenharmony_ci		ASSERT(cur_devices->opened == 1);
224362306a36Sopenharmony_ci		cur_devices->opened--;
224462306a36Sopenharmony_ci		free_fs_devices(cur_devices);
224562306a36Sopenharmony_ci	}
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci	ret = btrfs_commit_transaction(trans);
224862306a36Sopenharmony_ci
224962306a36Sopenharmony_ci	return ret;
225062306a36Sopenharmony_ci
225162306a36Sopenharmony_cierror_undo:
225262306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
225362306a36Sopenharmony_ci		mutex_lock(&fs_info->chunk_mutex);
225462306a36Sopenharmony_ci		list_add(&device->dev_alloc_list,
225562306a36Sopenharmony_ci			 &fs_devices->alloc_list);
225662306a36Sopenharmony_ci		device->fs_devices->rw_devices++;
225762306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
225862306a36Sopenharmony_ci	}
225962306a36Sopenharmony_ci	return ret;
226062306a36Sopenharmony_ci}
226162306a36Sopenharmony_ci
226262306a36Sopenharmony_civoid btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
226362306a36Sopenharmony_ci{
226462306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
226562306a36Sopenharmony_ci
226662306a36Sopenharmony_ci	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
226762306a36Sopenharmony_ci
226862306a36Sopenharmony_ci	/*
226962306a36Sopenharmony_ci	 * in case of fs with no seed, srcdev->fs_devices will point
227062306a36Sopenharmony_ci	 * to fs_devices of fs_info. However when the dev being replaced is
227162306a36Sopenharmony_ci	 * a seed dev it will point to the seed's local fs_devices. In short
227262306a36Sopenharmony_ci	 * srcdev will have its correct fs_devices in both the cases.
227362306a36Sopenharmony_ci	 */
227462306a36Sopenharmony_ci	fs_devices = srcdev->fs_devices;
227562306a36Sopenharmony_ci
227662306a36Sopenharmony_ci	list_del_rcu(&srcdev->dev_list);
227762306a36Sopenharmony_ci	list_del(&srcdev->dev_alloc_list);
227862306a36Sopenharmony_ci	fs_devices->num_devices--;
227962306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
228062306a36Sopenharmony_ci		fs_devices->missing_devices--;
228162306a36Sopenharmony_ci
228262306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
228362306a36Sopenharmony_ci		fs_devices->rw_devices--;
228462306a36Sopenharmony_ci
228562306a36Sopenharmony_ci	if (srcdev->bdev)
228662306a36Sopenharmony_ci		fs_devices->open_devices--;
228762306a36Sopenharmony_ci}
228862306a36Sopenharmony_ci
228962306a36Sopenharmony_civoid btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
229062306a36Sopenharmony_ci{
229162306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
229262306a36Sopenharmony_ci
229362306a36Sopenharmony_ci	mutex_lock(&uuid_mutex);
229462306a36Sopenharmony_ci
229562306a36Sopenharmony_ci	btrfs_close_bdev(srcdev);
229662306a36Sopenharmony_ci	synchronize_rcu();
229762306a36Sopenharmony_ci	btrfs_free_device(srcdev);
229862306a36Sopenharmony_ci
229962306a36Sopenharmony_ci	/* if this is no devs we rather delete the fs_devices */
230062306a36Sopenharmony_ci	if (!fs_devices->num_devices) {
230162306a36Sopenharmony_ci		/*
230262306a36Sopenharmony_ci		 * On a mounted FS, num_devices can't be zero unless it's a
230362306a36Sopenharmony_ci		 * seed. In case of a seed device being replaced, the replace
230462306a36Sopenharmony_ci		 * target added to the sprout FS, so there will be no more
230562306a36Sopenharmony_ci		 * device left under the seed FS.
230662306a36Sopenharmony_ci		 */
230762306a36Sopenharmony_ci		ASSERT(fs_devices->seeding);
230862306a36Sopenharmony_ci
230962306a36Sopenharmony_ci		list_del_init(&fs_devices->seed_list);
231062306a36Sopenharmony_ci		close_fs_devices(fs_devices);
231162306a36Sopenharmony_ci		free_fs_devices(fs_devices);
231262306a36Sopenharmony_ci	}
231362306a36Sopenharmony_ci	mutex_unlock(&uuid_mutex);
231462306a36Sopenharmony_ci}
231562306a36Sopenharmony_ci
231662306a36Sopenharmony_civoid btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
231762306a36Sopenharmony_ci{
231862306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
231962306a36Sopenharmony_ci
232062306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
232162306a36Sopenharmony_ci
232262306a36Sopenharmony_ci	btrfs_sysfs_remove_device(tgtdev);
232362306a36Sopenharmony_ci
232462306a36Sopenharmony_ci	if (tgtdev->bdev)
232562306a36Sopenharmony_ci		fs_devices->open_devices--;
232662306a36Sopenharmony_ci
232762306a36Sopenharmony_ci	fs_devices->num_devices--;
232862306a36Sopenharmony_ci
232962306a36Sopenharmony_ci	btrfs_assign_next_active_device(tgtdev, NULL);
233062306a36Sopenharmony_ci
233162306a36Sopenharmony_ci	list_del_rcu(&tgtdev->dev_list);
233262306a36Sopenharmony_ci
233362306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
233462306a36Sopenharmony_ci
233562306a36Sopenharmony_ci	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
233662306a36Sopenharmony_ci				  tgtdev->name->str);
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci	btrfs_close_bdev(tgtdev);
233962306a36Sopenharmony_ci	synchronize_rcu();
234062306a36Sopenharmony_ci	btrfs_free_device(tgtdev);
234162306a36Sopenharmony_ci}
234262306a36Sopenharmony_ci
234362306a36Sopenharmony_ci/*
234462306a36Sopenharmony_ci * Populate args from device at path.
234562306a36Sopenharmony_ci *
234662306a36Sopenharmony_ci * @fs_info:	the filesystem
234762306a36Sopenharmony_ci * @args:	the args to populate
234862306a36Sopenharmony_ci * @path:	the path to the device
234962306a36Sopenharmony_ci *
235062306a36Sopenharmony_ci * This will read the super block of the device at @path and populate @args with
235162306a36Sopenharmony_ci * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
235262306a36Sopenharmony_ci * lookup a device to operate on, but need to do it before we take any locks.
235362306a36Sopenharmony_ci * This properly handles the special case of "missing" that a user may pass in,
235462306a36Sopenharmony_ci * and does some basic sanity checks.  The caller must make sure that @path is
235562306a36Sopenharmony_ci * properly NUL terminated before calling in, and must call
235662306a36Sopenharmony_ci * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
235762306a36Sopenharmony_ci * uuid buffers.
235862306a36Sopenharmony_ci *
235962306a36Sopenharmony_ci * Return: 0 for success, -errno for failure
236062306a36Sopenharmony_ci */
236162306a36Sopenharmony_ciint btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
236262306a36Sopenharmony_ci				 struct btrfs_dev_lookup_args *args,
236362306a36Sopenharmony_ci				 const char *path)
236462306a36Sopenharmony_ci{
236562306a36Sopenharmony_ci	struct btrfs_super_block *disk_super;
236662306a36Sopenharmony_ci	struct block_device *bdev;
236762306a36Sopenharmony_ci	int ret;
236862306a36Sopenharmony_ci
236962306a36Sopenharmony_ci	if (!path || !path[0])
237062306a36Sopenharmony_ci		return -EINVAL;
237162306a36Sopenharmony_ci	if (!strcmp(path, "missing")) {
237262306a36Sopenharmony_ci		args->missing = true;
237362306a36Sopenharmony_ci		return 0;
237462306a36Sopenharmony_ci	}
237562306a36Sopenharmony_ci
237662306a36Sopenharmony_ci	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
237762306a36Sopenharmony_ci	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
237862306a36Sopenharmony_ci	if (!args->uuid || !args->fsid) {
237962306a36Sopenharmony_ci		btrfs_put_dev_args_from_path(args);
238062306a36Sopenharmony_ci		return -ENOMEM;
238162306a36Sopenharmony_ci	}
238262306a36Sopenharmony_ci
238362306a36Sopenharmony_ci	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
238462306a36Sopenharmony_ci				    &bdev, &disk_super);
238562306a36Sopenharmony_ci	if (ret) {
238662306a36Sopenharmony_ci		btrfs_put_dev_args_from_path(args);
238762306a36Sopenharmony_ci		return ret;
238862306a36Sopenharmony_ci	}
238962306a36Sopenharmony_ci
239062306a36Sopenharmony_ci	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
239162306a36Sopenharmony_ci	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
239262306a36Sopenharmony_ci	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
239362306a36Sopenharmony_ci		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
239462306a36Sopenharmony_ci	else
239562306a36Sopenharmony_ci		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
239662306a36Sopenharmony_ci	btrfs_release_disk_super(disk_super);
239762306a36Sopenharmony_ci	blkdev_put(bdev, NULL);
239862306a36Sopenharmony_ci	return 0;
239962306a36Sopenharmony_ci}
240062306a36Sopenharmony_ci
240162306a36Sopenharmony_ci/*
240262306a36Sopenharmony_ci * Only use this jointly with btrfs_get_dev_args_from_path() because we will
240362306a36Sopenharmony_ci * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
240462306a36Sopenharmony_ci * that don't need to be freed.
240562306a36Sopenharmony_ci */
240662306a36Sopenharmony_civoid btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
240762306a36Sopenharmony_ci{
240862306a36Sopenharmony_ci	kfree(args->uuid);
240962306a36Sopenharmony_ci	kfree(args->fsid);
241062306a36Sopenharmony_ci	args->uuid = NULL;
241162306a36Sopenharmony_ci	args->fsid = NULL;
241262306a36Sopenharmony_ci}
241362306a36Sopenharmony_ci
241462306a36Sopenharmony_cistruct btrfs_device *btrfs_find_device_by_devspec(
241562306a36Sopenharmony_ci		struct btrfs_fs_info *fs_info, u64 devid,
241662306a36Sopenharmony_ci		const char *device_path)
241762306a36Sopenharmony_ci{
241862306a36Sopenharmony_ci	BTRFS_DEV_LOOKUP_ARGS(args);
241962306a36Sopenharmony_ci	struct btrfs_device *device;
242062306a36Sopenharmony_ci	int ret;
242162306a36Sopenharmony_ci
242262306a36Sopenharmony_ci	if (devid) {
242362306a36Sopenharmony_ci		args.devid = devid;
242462306a36Sopenharmony_ci		device = btrfs_find_device(fs_info->fs_devices, &args);
242562306a36Sopenharmony_ci		if (!device)
242662306a36Sopenharmony_ci			return ERR_PTR(-ENOENT);
242762306a36Sopenharmony_ci		return device;
242862306a36Sopenharmony_ci	}
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
243162306a36Sopenharmony_ci	if (ret)
243262306a36Sopenharmony_ci		return ERR_PTR(ret);
243362306a36Sopenharmony_ci	device = btrfs_find_device(fs_info->fs_devices, &args);
243462306a36Sopenharmony_ci	btrfs_put_dev_args_from_path(&args);
243562306a36Sopenharmony_ci	if (!device)
243662306a36Sopenharmony_ci		return ERR_PTR(-ENOENT);
243762306a36Sopenharmony_ci	return device;
243862306a36Sopenharmony_ci}
243962306a36Sopenharmony_ci
244062306a36Sopenharmony_cistatic struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
244162306a36Sopenharmony_ci{
244262306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
244362306a36Sopenharmony_ci	struct btrfs_fs_devices *old_devices;
244462306a36Sopenharmony_ci	struct btrfs_fs_devices *seed_devices;
244562306a36Sopenharmony_ci
244662306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
244762306a36Sopenharmony_ci	if (!fs_devices->seeding)
244862306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
244962306a36Sopenharmony_ci
245062306a36Sopenharmony_ci	/*
245162306a36Sopenharmony_ci	 * Private copy of the seed devices, anchored at
245262306a36Sopenharmony_ci	 * fs_info->fs_devices->seed_list
245362306a36Sopenharmony_ci	 */
245462306a36Sopenharmony_ci	seed_devices = alloc_fs_devices(NULL, NULL);
245562306a36Sopenharmony_ci	if (IS_ERR(seed_devices))
245662306a36Sopenharmony_ci		return seed_devices;
245762306a36Sopenharmony_ci
245862306a36Sopenharmony_ci	/*
245962306a36Sopenharmony_ci	 * It's necessary to retain a copy of the original seed fs_devices in
246062306a36Sopenharmony_ci	 * fs_uuids so that filesystems which have been seeded can successfully
246162306a36Sopenharmony_ci	 * reference the seed device from open_seed_devices. This also supports
246262306a36Sopenharmony_ci	 * multiple fs seed.
246362306a36Sopenharmony_ci	 */
246462306a36Sopenharmony_ci	old_devices = clone_fs_devices(fs_devices);
246562306a36Sopenharmony_ci	if (IS_ERR(old_devices)) {
246662306a36Sopenharmony_ci		kfree(seed_devices);
246762306a36Sopenharmony_ci		return old_devices;
246862306a36Sopenharmony_ci	}
246962306a36Sopenharmony_ci
247062306a36Sopenharmony_ci	list_add(&old_devices->fs_list, &fs_uuids);
247162306a36Sopenharmony_ci
247262306a36Sopenharmony_ci	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
247362306a36Sopenharmony_ci	seed_devices->opened = 1;
247462306a36Sopenharmony_ci	INIT_LIST_HEAD(&seed_devices->devices);
247562306a36Sopenharmony_ci	INIT_LIST_HEAD(&seed_devices->alloc_list);
247662306a36Sopenharmony_ci	mutex_init(&seed_devices->device_list_mutex);
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci	return seed_devices;
247962306a36Sopenharmony_ci}
248062306a36Sopenharmony_ci
248162306a36Sopenharmony_ci/*
248262306a36Sopenharmony_ci * Splice seed devices into the sprout fs_devices.
248362306a36Sopenharmony_ci * Generate a new fsid for the sprouted read-write filesystem.
248462306a36Sopenharmony_ci */
248562306a36Sopenharmony_cistatic void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
248662306a36Sopenharmony_ci			       struct btrfs_fs_devices *seed_devices)
248762306a36Sopenharmony_ci{
248862306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
248962306a36Sopenharmony_ci	struct btrfs_super_block *disk_super = fs_info->super_copy;
249062306a36Sopenharmony_ci	struct btrfs_device *device;
249162306a36Sopenharmony_ci	u64 super_flags;
249262306a36Sopenharmony_ci
249362306a36Sopenharmony_ci	/*
249462306a36Sopenharmony_ci	 * We are updating the fsid, the thread leading to device_list_add()
249562306a36Sopenharmony_ci	 * could race, so uuid_mutex is needed.
249662306a36Sopenharmony_ci	 */
249762306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
249862306a36Sopenharmony_ci
249962306a36Sopenharmony_ci	/*
250062306a36Sopenharmony_ci	 * The threads listed below may traverse dev_list but can do that without
250162306a36Sopenharmony_ci	 * device_list_mutex:
250262306a36Sopenharmony_ci	 * - All device ops and balance - as we are in btrfs_exclop_start.
250362306a36Sopenharmony_ci	 * - Various dev_list readers - are using RCU.
250462306a36Sopenharmony_ci	 * - btrfs_ioctl_fitrim() - is using RCU.
250562306a36Sopenharmony_ci	 *
250662306a36Sopenharmony_ci	 * For-read threads as below are using device_list_mutex:
250762306a36Sopenharmony_ci	 * - Readonly scrub btrfs_scrub_dev()
250862306a36Sopenharmony_ci	 * - Readonly scrub btrfs_scrub_progress()
250962306a36Sopenharmony_ci	 * - btrfs_get_dev_stats()
251062306a36Sopenharmony_ci	 */
251162306a36Sopenharmony_ci	lockdep_assert_held(&fs_devices->device_list_mutex);
251262306a36Sopenharmony_ci
251362306a36Sopenharmony_ci	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
251462306a36Sopenharmony_ci			      synchronize_rcu);
251562306a36Sopenharmony_ci	list_for_each_entry(device, &seed_devices->devices, dev_list)
251662306a36Sopenharmony_ci		device->fs_devices = seed_devices;
251762306a36Sopenharmony_ci
251862306a36Sopenharmony_ci	fs_devices->seeding = false;
251962306a36Sopenharmony_ci	fs_devices->num_devices = 0;
252062306a36Sopenharmony_ci	fs_devices->open_devices = 0;
252162306a36Sopenharmony_ci	fs_devices->missing_devices = 0;
252262306a36Sopenharmony_ci	fs_devices->rotating = false;
252362306a36Sopenharmony_ci	list_add(&seed_devices->seed_list, &fs_devices->seed_list);
252462306a36Sopenharmony_ci
252562306a36Sopenharmony_ci	generate_random_uuid(fs_devices->fsid);
252662306a36Sopenharmony_ci	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
252762306a36Sopenharmony_ci	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
252862306a36Sopenharmony_ci
252962306a36Sopenharmony_ci	super_flags = btrfs_super_flags(disk_super) &
253062306a36Sopenharmony_ci		      ~BTRFS_SUPER_FLAG_SEEDING;
253162306a36Sopenharmony_ci	btrfs_set_super_flags(disk_super, super_flags);
253262306a36Sopenharmony_ci}
253362306a36Sopenharmony_ci
253462306a36Sopenharmony_ci/*
253562306a36Sopenharmony_ci * Store the expected generation for seed devices in device items.
253662306a36Sopenharmony_ci */
253762306a36Sopenharmony_cistatic int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
253862306a36Sopenharmony_ci{
253962306a36Sopenharmony_ci	BTRFS_DEV_LOOKUP_ARGS(args);
254062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
254162306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->chunk_root;
254262306a36Sopenharmony_ci	struct btrfs_path *path;
254362306a36Sopenharmony_ci	struct extent_buffer *leaf;
254462306a36Sopenharmony_ci	struct btrfs_dev_item *dev_item;
254562306a36Sopenharmony_ci	struct btrfs_device *device;
254662306a36Sopenharmony_ci	struct btrfs_key key;
254762306a36Sopenharmony_ci	u8 fs_uuid[BTRFS_FSID_SIZE];
254862306a36Sopenharmony_ci	u8 dev_uuid[BTRFS_UUID_SIZE];
254962306a36Sopenharmony_ci	int ret;
255062306a36Sopenharmony_ci
255162306a36Sopenharmony_ci	path = btrfs_alloc_path();
255262306a36Sopenharmony_ci	if (!path)
255362306a36Sopenharmony_ci		return -ENOMEM;
255462306a36Sopenharmony_ci
255562306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
255662306a36Sopenharmony_ci	key.offset = 0;
255762306a36Sopenharmony_ci	key.type = BTRFS_DEV_ITEM_KEY;
255862306a36Sopenharmony_ci
255962306a36Sopenharmony_ci	while (1) {
256062306a36Sopenharmony_ci		btrfs_reserve_chunk_metadata(trans, false);
256162306a36Sopenharmony_ci		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
256262306a36Sopenharmony_ci		btrfs_trans_release_chunk_metadata(trans);
256362306a36Sopenharmony_ci		if (ret < 0)
256462306a36Sopenharmony_ci			goto error;
256562306a36Sopenharmony_ci
256662306a36Sopenharmony_ci		leaf = path->nodes[0];
256762306a36Sopenharmony_cinext_slot:
256862306a36Sopenharmony_ci		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
256962306a36Sopenharmony_ci			ret = btrfs_next_leaf(root, path);
257062306a36Sopenharmony_ci			if (ret > 0)
257162306a36Sopenharmony_ci				break;
257262306a36Sopenharmony_ci			if (ret < 0)
257362306a36Sopenharmony_ci				goto error;
257462306a36Sopenharmony_ci			leaf = path->nodes[0];
257562306a36Sopenharmony_ci			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
257662306a36Sopenharmony_ci			btrfs_release_path(path);
257762306a36Sopenharmony_ci			continue;
257862306a36Sopenharmony_ci		}
257962306a36Sopenharmony_ci
258062306a36Sopenharmony_ci		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
258162306a36Sopenharmony_ci		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
258262306a36Sopenharmony_ci		    key.type != BTRFS_DEV_ITEM_KEY)
258362306a36Sopenharmony_ci			break;
258462306a36Sopenharmony_ci
258562306a36Sopenharmony_ci		dev_item = btrfs_item_ptr(leaf, path->slots[0],
258662306a36Sopenharmony_ci					  struct btrfs_dev_item);
258762306a36Sopenharmony_ci		args.devid = btrfs_device_id(leaf, dev_item);
258862306a36Sopenharmony_ci		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
258962306a36Sopenharmony_ci				   BTRFS_UUID_SIZE);
259062306a36Sopenharmony_ci		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
259162306a36Sopenharmony_ci				   BTRFS_FSID_SIZE);
259262306a36Sopenharmony_ci		args.uuid = dev_uuid;
259362306a36Sopenharmony_ci		args.fsid = fs_uuid;
259462306a36Sopenharmony_ci		device = btrfs_find_device(fs_info->fs_devices, &args);
259562306a36Sopenharmony_ci		BUG_ON(!device); /* Logic error */
259662306a36Sopenharmony_ci
259762306a36Sopenharmony_ci		if (device->fs_devices->seeding) {
259862306a36Sopenharmony_ci			btrfs_set_device_generation(leaf, dev_item,
259962306a36Sopenharmony_ci						    device->generation);
260062306a36Sopenharmony_ci			btrfs_mark_buffer_dirty(trans, leaf);
260162306a36Sopenharmony_ci		}
260262306a36Sopenharmony_ci
260362306a36Sopenharmony_ci		path->slots[0]++;
260462306a36Sopenharmony_ci		goto next_slot;
260562306a36Sopenharmony_ci	}
260662306a36Sopenharmony_ci	ret = 0;
260762306a36Sopenharmony_cierror:
260862306a36Sopenharmony_ci	btrfs_free_path(path);
260962306a36Sopenharmony_ci	return ret;
261062306a36Sopenharmony_ci}
261162306a36Sopenharmony_ci
/*
 * Add the block device at @device_path to the mounted filesystem.
 *
 * @fs_info:	 the filesystem to extend
 * @device_path: path of the block device to add
 *
 * The device is opened for writing, set up as a writeable btrfs_device, linked
 * into fs_info->fs_devices and recorded with btrfs_add_dev_item() inside a
 * transaction that is committed before a successful return.  total_bytes and
 * num_devices in the super block copy are updated to account for it.
 *
 * When fs_devices->seeding is set, the filesystem is sprouted as part of the
 * add: s_umount and uuid_mutex are held until the first commit, the read-only
 * flag of the super block is cleared, and after that commit the system chunks
 * are relocated and a second commit is attempted.
 *
 * Return: 0 on success or a negative errno; among others -EROFS for a
 * read-only filesystem that is not a seed, -EINVAL when
 * btrfs_check_device_zone_type() rejects the device and -EEXIST when the block
 * device is already on this filesystem's device list.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *seed_devices = NULL;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int ret = 0;
	bool seeding_dev = false;
	bool locked = false;

	/* A read-only filesystem only accepts a new device when it is a seed. */
	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
				  fs_info->bdev_holder, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	/*
	 * Sprouting: btrfs_init_sprout() and btrfs_setup_sprout() both assert
	 * that uuid_mutex is held.  'locked' tells the error path whether
	 * these two locks still have to be dropped.
	 */
	if (fs_devices->seeding) {
		seeding_dev = true;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	/* Refuse a block device that is already on this filesystem's list. */
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	device->fs_info = fs_info;
	device->bdev = bdev;
	ret = lookup_bdev(device_path, &device->devt);
	if (ret)
		goto error_free_device;

	ret = btrfs_get_dev_zone_info(device, false);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	/* Fill in the in-memory device; sizes are in units of the sectorsize. */
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes =
		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->holder = fs_info->bdev_holder;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* Restored by the error_trans path if anything below fails. */
		btrfs_clear_sb_rdonly(sb);

		/* GFP_KERNEL allocation must not be under device_list_mutex */
		seed_devices = btrfs_init_sprout(fs_info);
		if (IS_ERR(seed_devices)) {
			ret = PTR_ERR(seed_devices);
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	mutex_lock(&fs_devices->device_list_mutex);
	if (seeding_dev) {
		/* Moves the existing devices to seed_devices, changes the fsid. */
		btrfs_setup_sprout(fs_info, seed_devices);
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
	}

	device->fs_devices = fs_devices;

	/*
	 * Link the device in and account for it.  Everything done in this
	 * section except the rotating flag and the cleared "full" state is
	 * undone under error_sysfs.
	 */
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	/* Keep the original super block values for the error path. */
	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_setup_sprout().
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		/* Both locks are released; the error path must not unlock again. */
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		/* A relocation failure is reported but does not stop here. */
		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			/* -ENOENT is treated as success: nothing to commit. */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			/* Keeps error_trans from ending an ERR_PTR handle. */
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device->devt);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

	/*
	 * Error unwinding.  Each label undoes one stage and falls through to
	 * the labels of the earlier stages.
	 */
error_sysfs:
	/* Unlink the device and roll back the counters and super block copy. */
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	/* Put back the read-only flag that was cleared for sprouting. */
	if (seeding_dev)
		btrfs_set_sb_rdonly(sb);
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, fs_info->bdev_holder);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
285862306a36Sopenharmony_ci
285962306a36Sopenharmony_cistatic noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
286062306a36Sopenharmony_ci					struct btrfs_device *device)
286162306a36Sopenharmony_ci{
286262306a36Sopenharmony_ci	int ret;
286362306a36Sopenharmony_ci	struct btrfs_path *path;
286462306a36Sopenharmony_ci	struct btrfs_root *root = device->fs_info->chunk_root;
286562306a36Sopenharmony_ci	struct btrfs_dev_item *dev_item;
286662306a36Sopenharmony_ci	struct extent_buffer *leaf;
286762306a36Sopenharmony_ci	struct btrfs_key key;
286862306a36Sopenharmony_ci
286962306a36Sopenharmony_ci	path = btrfs_alloc_path();
287062306a36Sopenharmony_ci	if (!path)
287162306a36Sopenharmony_ci		return -ENOMEM;
287262306a36Sopenharmony_ci
287362306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
287462306a36Sopenharmony_ci	key.type = BTRFS_DEV_ITEM_KEY;
287562306a36Sopenharmony_ci	key.offset = device->devid;
287662306a36Sopenharmony_ci
287762306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
287862306a36Sopenharmony_ci	if (ret < 0)
287962306a36Sopenharmony_ci		goto out;
288062306a36Sopenharmony_ci
288162306a36Sopenharmony_ci	if (ret > 0) {
288262306a36Sopenharmony_ci		ret = -ENOENT;
288362306a36Sopenharmony_ci		goto out;
288462306a36Sopenharmony_ci	}
288562306a36Sopenharmony_ci
288662306a36Sopenharmony_ci	leaf = path->nodes[0];
288762306a36Sopenharmony_ci	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
288862306a36Sopenharmony_ci
288962306a36Sopenharmony_ci	btrfs_set_device_id(leaf, dev_item, device->devid);
289062306a36Sopenharmony_ci	btrfs_set_device_type(leaf, dev_item, device->type);
289162306a36Sopenharmony_ci	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
289262306a36Sopenharmony_ci	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
289362306a36Sopenharmony_ci	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
289462306a36Sopenharmony_ci	btrfs_set_device_total_bytes(leaf, dev_item,
289562306a36Sopenharmony_ci				     btrfs_device_get_disk_total_bytes(device));
289662306a36Sopenharmony_ci	btrfs_set_device_bytes_used(leaf, dev_item,
289762306a36Sopenharmony_ci				    btrfs_device_get_bytes_used(device));
289862306a36Sopenharmony_ci	btrfs_mark_buffer_dirty(trans, leaf);
289962306a36Sopenharmony_ci
290062306a36Sopenharmony_ciout:
290162306a36Sopenharmony_ci	btrfs_free_path(path);
290262306a36Sopenharmony_ci	return ret;
290362306a36Sopenharmony_ci}
290462306a36Sopenharmony_ci
290562306a36Sopenharmony_ciint btrfs_grow_device(struct btrfs_trans_handle *trans,
290662306a36Sopenharmony_ci		      struct btrfs_device *device, u64 new_size)
290762306a36Sopenharmony_ci{
290862306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
290962306a36Sopenharmony_ci	struct btrfs_super_block *super_copy = fs_info->super_copy;
291062306a36Sopenharmony_ci	u64 old_total;
291162306a36Sopenharmony_ci	u64 diff;
291262306a36Sopenharmony_ci	int ret;
291362306a36Sopenharmony_ci
291462306a36Sopenharmony_ci	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
291562306a36Sopenharmony_ci		return -EACCES;
291662306a36Sopenharmony_ci
291762306a36Sopenharmony_ci	new_size = round_down(new_size, fs_info->sectorsize);
291862306a36Sopenharmony_ci
291962306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
292062306a36Sopenharmony_ci	old_total = btrfs_super_total_bytes(super_copy);
292162306a36Sopenharmony_ci	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
292262306a36Sopenharmony_ci
292362306a36Sopenharmony_ci	if (new_size <= device->total_bytes ||
292462306a36Sopenharmony_ci	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
292562306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
292662306a36Sopenharmony_ci		return -EINVAL;
292762306a36Sopenharmony_ci	}
292862306a36Sopenharmony_ci
292962306a36Sopenharmony_ci	btrfs_set_super_total_bytes(super_copy,
293062306a36Sopenharmony_ci			round_down(old_total + diff, fs_info->sectorsize));
293162306a36Sopenharmony_ci	device->fs_devices->total_rw_bytes += diff;
293262306a36Sopenharmony_ci
293362306a36Sopenharmony_ci	btrfs_device_set_total_bytes(device, new_size);
293462306a36Sopenharmony_ci	btrfs_device_set_disk_total_bytes(device, new_size);
293562306a36Sopenharmony_ci	btrfs_clear_space_info_full(device->fs_info);
293662306a36Sopenharmony_ci	if (list_empty(&device->post_commit_list))
293762306a36Sopenharmony_ci		list_add_tail(&device->post_commit_list,
293862306a36Sopenharmony_ci			      &trans->transaction->dev_update_list);
293962306a36Sopenharmony_ci	mutex_unlock(&fs_info->chunk_mutex);
294062306a36Sopenharmony_ci
294162306a36Sopenharmony_ci	btrfs_reserve_chunk_metadata(trans, false);
294262306a36Sopenharmony_ci	ret = btrfs_update_device(trans, device);
294362306a36Sopenharmony_ci	btrfs_trans_release_chunk_metadata(trans);
294462306a36Sopenharmony_ci
294562306a36Sopenharmony_ci	return ret;
294662306a36Sopenharmony_ci}
294762306a36Sopenharmony_ci
294862306a36Sopenharmony_cistatic int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
294962306a36Sopenharmony_ci{
295062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
295162306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->chunk_root;
295262306a36Sopenharmony_ci	int ret;
295362306a36Sopenharmony_ci	struct btrfs_path *path;
295462306a36Sopenharmony_ci	struct btrfs_key key;
295562306a36Sopenharmony_ci
295662306a36Sopenharmony_ci	path = btrfs_alloc_path();
295762306a36Sopenharmony_ci	if (!path)
295862306a36Sopenharmony_ci		return -ENOMEM;
295962306a36Sopenharmony_ci
296062306a36Sopenharmony_ci	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
296162306a36Sopenharmony_ci	key.offset = chunk_offset;
296262306a36Sopenharmony_ci	key.type = BTRFS_CHUNK_ITEM_KEY;
296362306a36Sopenharmony_ci
296462306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
296562306a36Sopenharmony_ci	if (ret < 0)
296662306a36Sopenharmony_ci		goto out;
296762306a36Sopenharmony_ci	else if (ret > 0) { /* Logic error or corruption */
296862306a36Sopenharmony_ci		btrfs_handle_fs_error(fs_info, -ENOENT,
296962306a36Sopenharmony_ci				      "Failed lookup while freeing chunk.");
297062306a36Sopenharmony_ci		ret = -ENOENT;
297162306a36Sopenharmony_ci		goto out;
297262306a36Sopenharmony_ci	}
297362306a36Sopenharmony_ci
297462306a36Sopenharmony_ci	ret = btrfs_del_item(trans, root, path);
297562306a36Sopenharmony_ci	if (ret < 0)
297662306a36Sopenharmony_ci		btrfs_handle_fs_error(fs_info, ret,
297762306a36Sopenharmony_ci				      "Failed to delete chunk item.");
297862306a36Sopenharmony_ciout:
297962306a36Sopenharmony_ci	btrfs_free_path(path);
298062306a36Sopenharmony_ci	return ret;
298162306a36Sopenharmony_ci}
298262306a36Sopenharmony_ci
298362306a36Sopenharmony_cistatic int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
298462306a36Sopenharmony_ci{
298562306a36Sopenharmony_ci	struct btrfs_super_block *super_copy = fs_info->super_copy;
298662306a36Sopenharmony_ci	struct btrfs_disk_key *disk_key;
298762306a36Sopenharmony_ci	struct btrfs_chunk *chunk;
298862306a36Sopenharmony_ci	u8 *ptr;
298962306a36Sopenharmony_ci	int ret = 0;
299062306a36Sopenharmony_ci	u32 num_stripes;
299162306a36Sopenharmony_ci	u32 array_size;
299262306a36Sopenharmony_ci	u32 len = 0;
299362306a36Sopenharmony_ci	u32 cur;
299462306a36Sopenharmony_ci	struct btrfs_key key;
299562306a36Sopenharmony_ci
299662306a36Sopenharmony_ci	lockdep_assert_held(&fs_info->chunk_mutex);
299762306a36Sopenharmony_ci	array_size = btrfs_super_sys_array_size(super_copy);
299862306a36Sopenharmony_ci
299962306a36Sopenharmony_ci	ptr = super_copy->sys_chunk_array;
300062306a36Sopenharmony_ci	cur = 0;
300162306a36Sopenharmony_ci
300262306a36Sopenharmony_ci	while (cur < array_size) {
300362306a36Sopenharmony_ci		disk_key = (struct btrfs_disk_key *)ptr;
300462306a36Sopenharmony_ci		btrfs_disk_key_to_cpu(&key, disk_key);
300562306a36Sopenharmony_ci
300662306a36Sopenharmony_ci		len = sizeof(*disk_key);
300762306a36Sopenharmony_ci
300862306a36Sopenharmony_ci		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
300962306a36Sopenharmony_ci			chunk = (struct btrfs_chunk *)(ptr + len);
301062306a36Sopenharmony_ci			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
301162306a36Sopenharmony_ci			len += btrfs_chunk_item_size(num_stripes);
301262306a36Sopenharmony_ci		} else {
301362306a36Sopenharmony_ci			ret = -EIO;
301462306a36Sopenharmony_ci			break;
301562306a36Sopenharmony_ci		}
301662306a36Sopenharmony_ci		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
301762306a36Sopenharmony_ci		    key.offset == chunk_offset) {
301862306a36Sopenharmony_ci			memmove(ptr, ptr + len, array_size - (cur + len));
301962306a36Sopenharmony_ci			array_size -= len;
302062306a36Sopenharmony_ci			btrfs_set_super_sys_array_size(super_copy, array_size);
302162306a36Sopenharmony_ci		} else {
302262306a36Sopenharmony_ci			ptr += len;
302362306a36Sopenharmony_ci			cur += len;
302462306a36Sopenharmony_ci		}
302562306a36Sopenharmony_ci	}
302662306a36Sopenharmony_ci	return ret;
302762306a36Sopenharmony_ci}
302862306a36Sopenharmony_ci
302962306a36Sopenharmony_ci/*
303062306a36Sopenharmony_ci * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
303162306a36Sopenharmony_ci * @logical: Logical block offset in bytes.
303262306a36Sopenharmony_ci * @length: Length of extent in bytes.
303362306a36Sopenharmony_ci *
303462306a36Sopenharmony_ci * Return: Chunk mapping or ERR_PTR.
303562306a36Sopenharmony_ci */
303662306a36Sopenharmony_cistruct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
303762306a36Sopenharmony_ci				       u64 logical, u64 length)
303862306a36Sopenharmony_ci{
303962306a36Sopenharmony_ci	struct extent_map_tree *em_tree;
304062306a36Sopenharmony_ci	struct extent_map *em;
304162306a36Sopenharmony_ci
304262306a36Sopenharmony_ci	em_tree = &fs_info->mapping_tree;
304362306a36Sopenharmony_ci	read_lock(&em_tree->lock);
304462306a36Sopenharmony_ci	em = lookup_extent_mapping(em_tree, logical, length);
304562306a36Sopenharmony_ci	read_unlock(&em_tree->lock);
304662306a36Sopenharmony_ci
304762306a36Sopenharmony_ci	if (!em) {
304862306a36Sopenharmony_ci		btrfs_crit(fs_info,
304962306a36Sopenharmony_ci			   "unable to find chunk map for logical %llu length %llu",
305062306a36Sopenharmony_ci			   logical, length);
305162306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
305262306a36Sopenharmony_ci	}
305362306a36Sopenharmony_ci
305462306a36Sopenharmony_ci	if (em->start > logical || em->start + em->len <= logical) {
305562306a36Sopenharmony_ci		btrfs_crit(fs_info,
305662306a36Sopenharmony_ci			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
305762306a36Sopenharmony_ci			   logical, logical + length, em->start, em->start + em->len);
305862306a36Sopenharmony_ci		free_extent_map(em);
305962306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
306062306a36Sopenharmony_ci	}
306162306a36Sopenharmony_ci
306262306a36Sopenharmony_ci	/* callers are responsible for dropping em's ref. */
306362306a36Sopenharmony_ci	return em;
306462306a36Sopenharmony_ci}
306562306a36Sopenharmony_ci
306662306a36Sopenharmony_cistatic int remove_chunk_item(struct btrfs_trans_handle *trans,
306762306a36Sopenharmony_ci			     struct map_lookup *map, u64 chunk_offset)
306862306a36Sopenharmony_ci{
306962306a36Sopenharmony_ci	int i;
307062306a36Sopenharmony_ci
307162306a36Sopenharmony_ci	/*
307262306a36Sopenharmony_ci	 * Removing chunk items and updating the device items in the chunks btree
307362306a36Sopenharmony_ci	 * requires holding the chunk_mutex.
307462306a36Sopenharmony_ci	 * See the comment at btrfs_chunk_alloc() for the details.
307562306a36Sopenharmony_ci	 */
307662306a36Sopenharmony_ci	lockdep_assert_held(&trans->fs_info->chunk_mutex);
307762306a36Sopenharmony_ci
307862306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
307962306a36Sopenharmony_ci		int ret;
308062306a36Sopenharmony_ci
308162306a36Sopenharmony_ci		ret = btrfs_update_device(trans, map->stripes[i].dev);
308262306a36Sopenharmony_ci		if (ret)
308362306a36Sopenharmony_ci			return ret;
308462306a36Sopenharmony_ci	}
308562306a36Sopenharmony_ci
308662306a36Sopenharmony_ci	return btrfs_free_chunk(trans, chunk_offset);
308762306a36Sopenharmony_ci}
308862306a36Sopenharmony_ci
308962306a36Sopenharmony_ciint btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
309062306a36Sopenharmony_ci{
309162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
309262306a36Sopenharmony_ci	struct extent_map *em;
309362306a36Sopenharmony_ci	struct map_lookup *map;
309462306a36Sopenharmony_ci	u64 dev_extent_len = 0;
309562306a36Sopenharmony_ci	int i, ret = 0;
309662306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
309762306a36Sopenharmony_ci
309862306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
309962306a36Sopenharmony_ci	if (IS_ERR(em)) {
310062306a36Sopenharmony_ci		/*
310162306a36Sopenharmony_ci		 * This is a logic error, but we don't want to just rely on the
310262306a36Sopenharmony_ci		 * user having built with ASSERT enabled, so if ASSERT doesn't
310362306a36Sopenharmony_ci		 * do anything we still error out.
310462306a36Sopenharmony_ci		 */
310562306a36Sopenharmony_ci		ASSERT(0);
310662306a36Sopenharmony_ci		return PTR_ERR(em);
310762306a36Sopenharmony_ci	}
310862306a36Sopenharmony_ci	map = em->map_lookup;
310962306a36Sopenharmony_ci
311062306a36Sopenharmony_ci	/*
311162306a36Sopenharmony_ci	 * First delete the device extent items from the devices btree.
311262306a36Sopenharmony_ci	 * We take the device_list_mutex to avoid racing with the finishing phase
311362306a36Sopenharmony_ci	 * of a device replace operation. See the comment below before acquiring
311462306a36Sopenharmony_ci	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
311562306a36Sopenharmony_ci	 * because that can result in a deadlock when deleting the device extent
311662306a36Sopenharmony_ci	 * items from the devices btree - COWing an extent buffer from the btree
311762306a36Sopenharmony_ci	 * may result in allocating a new metadata chunk, which would attempt to
311862306a36Sopenharmony_ci	 * lock again fs_info->chunk_mutex.
311962306a36Sopenharmony_ci	 */
312062306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
312162306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
312262306a36Sopenharmony_ci		struct btrfs_device *device = map->stripes[i].dev;
312362306a36Sopenharmony_ci		ret = btrfs_free_dev_extent(trans, device,
312462306a36Sopenharmony_ci					    map->stripes[i].physical,
312562306a36Sopenharmony_ci					    &dev_extent_len);
312662306a36Sopenharmony_ci		if (ret) {
312762306a36Sopenharmony_ci			mutex_unlock(&fs_devices->device_list_mutex);
312862306a36Sopenharmony_ci			btrfs_abort_transaction(trans, ret);
312962306a36Sopenharmony_ci			goto out;
313062306a36Sopenharmony_ci		}
313162306a36Sopenharmony_ci
313262306a36Sopenharmony_ci		if (device->bytes_used > 0) {
313362306a36Sopenharmony_ci			mutex_lock(&fs_info->chunk_mutex);
313462306a36Sopenharmony_ci			btrfs_device_set_bytes_used(device,
313562306a36Sopenharmony_ci					device->bytes_used - dev_extent_len);
313662306a36Sopenharmony_ci			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
313762306a36Sopenharmony_ci			btrfs_clear_space_info_full(fs_info);
313862306a36Sopenharmony_ci			mutex_unlock(&fs_info->chunk_mutex);
313962306a36Sopenharmony_ci		}
314062306a36Sopenharmony_ci	}
314162306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
314262306a36Sopenharmony_ci
314362306a36Sopenharmony_ci	/*
314462306a36Sopenharmony_ci	 * We acquire fs_info->chunk_mutex for 2 reasons:
314562306a36Sopenharmony_ci	 *
314662306a36Sopenharmony_ci	 * 1) Just like with the first phase of the chunk allocation, we must
314762306a36Sopenharmony_ci	 *    reserve system space, do all chunk btree updates and deletions, and
314862306a36Sopenharmony_ci	 *    update the system chunk array in the superblock while holding this
314962306a36Sopenharmony_ci	 *    mutex. This is for similar reasons as explained on the comment at
315062306a36Sopenharmony_ci	 *    the top of btrfs_chunk_alloc();
315162306a36Sopenharmony_ci	 *
315262306a36Sopenharmony_ci	 * 2) Prevent races with the final phase of a device replace operation
315362306a36Sopenharmony_ci	 *    that replaces the device object associated with the map's stripes,
315462306a36Sopenharmony_ci	 *    because the device object's id can change at any time during that
315562306a36Sopenharmony_ci	 *    final phase of the device replace operation
315662306a36Sopenharmony_ci	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
315762306a36Sopenharmony_ci	 *    replaced device and then see it with an ID of
315862306a36Sopenharmony_ci	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
315962306a36Sopenharmony_ci	 *    the device item, which does not exists on the chunk btree.
316062306a36Sopenharmony_ci	 *    The finishing phase of device replace acquires both the
316162306a36Sopenharmony_ci	 *    device_list_mutex and the chunk_mutex, in that order, so we are
316262306a36Sopenharmony_ci	 *    safe by just acquiring the chunk_mutex.
316362306a36Sopenharmony_ci	 */
316462306a36Sopenharmony_ci	trans->removing_chunk = true;
316562306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
316662306a36Sopenharmony_ci
316762306a36Sopenharmony_ci	check_system_chunk(trans, map->type);
316862306a36Sopenharmony_ci
316962306a36Sopenharmony_ci	ret = remove_chunk_item(trans, map, chunk_offset);
317062306a36Sopenharmony_ci	/*
317162306a36Sopenharmony_ci	 * Normally we should not get -ENOSPC since we reserved space before
317262306a36Sopenharmony_ci	 * through the call to check_system_chunk().
317362306a36Sopenharmony_ci	 *
317462306a36Sopenharmony_ci	 * Despite our system space_info having enough free space, we may not
317562306a36Sopenharmony_ci	 * be able to allocate extents from its block groups, because all have
317662306a36Sopenharmony_ci	 * an incompatible profile, which will force us to allocate a new system
317762306a36Sopenharmony_ci	 * block group with the right profile, or right after we called
317862306a36Sopenharmony_ci	 * check_system_space() above, a scrub turned the only system block group
317962306a36Sopenharmony_ci	 * with enough free space into RO mode.
318062306a36Sopenharmony_ci	 * This is explained with more detail at do_chunk_alloc().
318162306a36Sopenharmony_ci	 *
318262306a36Sopenharmony_ci	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
318362306a36Sopenharmony_ci	 */
318462306a36Sopenharmony_ci	if (ret == -ENOSPC) {
318562306a36Sopenharmony_ci		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
318662306a36Sopenharmony_ci		struct btrfs_block_group *sys_bg;
318762306a36Sopenharmony_ci
318862306a36Sopenharmony_ci		sys_bg = btrfs_create_chunk(trans, sys_flags);
318962306a36Sopenharmony_ci		if (IS_ERR(sys_bg)) {
319062306a36Sopenharmony_ci			ret = PTR_ERR(sys_bg);
319162306a36Sopenharmony_ci			btrfs_abort_transaction(trans, ret);
319262306a36Sopenharmony_ci			goto out;
319362306a36Sopenharmony_ci		}
319462306a36Sopenharmony_ci
319562306a36Sopenharmony_ci		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
319662306a36Sopenharmony_ci		if (ret) {
319762306a36Sopenharmony_ci			btrfs_abort_transaction(trans, ret);
319862306a36Sopenharmony_ci			goto out;
319962306a36Sopenharmony_ci		}
320062306a36Sopenharmony_ci
320162306a36Sopenharmony_ci		ret = remove_chunk_item(trans, map, chunk_offset);
320262306a36Sopenharmony_ci		if (ret) {
320362306a36Sopenharmony_ci			btrfs_abort_transaction(trans, ret);
320462306a36Sopenharmony_ci			goto out;
320562306a36Sopenharmony_ci		}
320662306a36Sopenharmony_ci	} else if (ret) {
320762306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
320862306a36Sopenharmony_ci		goto out;
320962306a36Sopenharmony_ci	}
321062306a36Sopenharmony_ci
321162306a36Sopenharmony_ci	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
321262306a36Sopenharmony_ci
321362306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
321462306a36Sopenharmony_ci		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
321562306a36Sopenharmony_ci		if (ret) {
321662306a36Sopenharmony_ci			btrfs_abort_transaction(trans, ret);
321762306a36Sopenharmony_ci			goto out;
321862306a36Sopenharmony_ci		}
321962306a36Sopenharmony_ci	}
322062306a36Sopenharmony_ci
322162306a36Sopenharmony_ci	mutex_unlock(&fs_info->chunk_mutex);
322262306a36Sopenharmony_ci	trans->removing_chunk = false;
322362306a36Sopenharmony_ci
322462306a36Sopenharmony_ci	/*
322562306a36Sopenharmony_ci	 * We are done with chunk btree updates and deletions, so release the
322662306a36Sopenharmony_ci	 * system space we previously reserved (with check_system_chunk()).
322762306a36Sopenharmony_ci	 */
322862306a36Sopenharmony_ci	btrfs_trans_release_chunk_metadata(trans);
322962306a36Sopenharmony_ci
323062306a36Sopenharmony_ci	ret = btrfs_remove_block_group(trans, chunk_offset, em);
323162306a36Sopenharmony_ci	if (ret) {
323262306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
323362306a36Sopenharmony_ci		goto out;
323462306a36Sopenharmony_ci	}
323562306a36Sopenharmony_ci
323662306a36Sopenharmony_ciout:
323762306a36Sopenharmony_ci	if (trans->removing_chunk) {
323862306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
323962306a36Sopenharmony_ci		trans->removing_chunk = false;
324062306a36Sopenharmony_ci	}
324162306a36Sopenharmony_ci	/* once for us */
324262306a36Sopenharmony_ci	free_extent_map(em);
324362306a36Sopenharmony_ci	return ret;
324462306a36Sopenharmony_ci}
324562306a36Sopenharmony_ci
324662306a36Sopenharmony_ciint btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
324762306a36Sopenharmony_ci{
324862306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->chunk_root;
324962306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
325062306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
325162306a36Sopenharmony_ci	u64 length;
325262306a36Sopenharmony_ci	int ret;
325362306a36Sopenharmony_ci
325462306a36Sopenharmony_ci	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
325562306a36Sopenharmony_ci		btrfs_err(fs_info,
325662306a36Sopenharmony_ci			  "relocate: not supported on extent tree v2 yet");
325762306a36Sopenharmony_ci		return -EINVAL;
325862306a36Sopenharmony_ci	}
325962306a36Sopenharmony_ci
326062306a36Sopenharmony_ci	/*
326162306a36Sopenharmony_ci	 * Prevent races with automatic removal of unused block groups.
326262306a36Sopenharmony_ci	 * After we relocate and before we remove the chunk with offset
326362306a36Sopenharmony_ci	 * chunk_offset, automatic removal of the block group can kick in,
326462306a36Sopenharmony_ci	 * resulting in a failure when calling btrfs_remove_chunk() below.
326562306a36Sopenharmony_ci	 *
326662306a36Sopenharmony_ci	 * Make sure to acquire this mutex before doing a tree search (dev
326762306a36Sopenharmony_ci	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
326862306a36Sopenharmony_ci	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
326962306a36Sopenharmony_ci	 * we release the path used to search the chunk/dev tree and before
327062306a36Sopenharmony_ci	 * the current task acquires this mutex and calls us.
327162306a36Sopenharmony_ci	 */
327262306a36Sopenharmony_ci	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
327362306a36Sopenharmony_ci
327462306a36Sopenharmony_ci	/* step one, relocate all the extents inside this chunk */
327562306a36Sopenharmony_ci	btrfs_scrub_pause(fs_info);
327662306a36Sopenharmony_ci	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
327762306a36Sopenharmony_ci	btrfs_scrub_continue(fs_info);
327862306a36Sopenharmony_ci	if (ret) {
327962306a36Sopenharmony_ci		/*
328062306a36Sopenharmony_ci		 * If we had a transaction abort, stop all running scrubs.
328162306a36Sopenharmony_ci		 * See transaction.c:cleanup_transaction() why we do it here.
328262306a36Sopenharmony_ci		 */
328362306a36Sopenharmony_ci		if (BTRFS_FS_ERROR(fs_info))
328462306a36Sopenharmony_ci			btrfs_scrub_cancel(fs_info);
328562306a36Sopenharmony_ci		return ret;
328662306a36Sopenharmony_ci	}
328762306a36Sopenharmony_ci
328862306a36Sopenharmony_ci	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
328962306a36Sopenharmony_ci	if (!block_group)
329062306a36Sopenharmony_ci		return -ENOENT;
329162306a36Sopenharmony_ci	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
329262306a36Sopenharmony_ci	length = block_group->length;
329362306a36Sopenharmony_ci	btrfs_put_block_group(block_group);
329462306a36Sopenharmony_ci
329562306a36Sopenharmony_ci	/*
329662306a36Sopenharmony_ci	 * On a zoned file system, discard the whole block group, this will
329762306a36Sopenharmony_ci	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
329862306a36Sopenharmony_ci	 * resetting the zone fails, don't treat it as a fatal problem from the
329962306a36Sopenharmony_ci	 * filesystem's point of view.
330062306a36Sopenharmony_ci	 */
330162306a36Sopenharmony_ci	if (btrfs_is_zoned(fs_info)) {
330262306a36Sopenharmony_ci		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
330362306a36Sopenharmony_ci		if (ret)
330462306a36Sopenharmony_ci			btrfs_info(fs_info,
330562306a36Sopenharmony_ci				"failed to reset zone %llu after relocation",
330662306a36Sopenharmony_ci				chunk_offset);
330762306a36Sopenharmony_ci	}
330862306a36Sopenharmony_ci
330962306a36Sopenharmony_ci	trans = btrfs_start_trans_remove_block_group(root->fs_info,
331062306a36Sopenharmony_ci						     chunk_offset);
331162306a36Sopenharmony_ci	if (IS_ERR(trans)) {
331262306a36Sopenharmony_ci		ret = PTR_ERR(trans);
331362306a36Sopenharmony_ci		btrfs_handle_fs_error(root->fs_info, ret, NULL);
331462306a36Sopenharmony_ci		return ret;
331562306a36Sopenharmony_ci	}
331662306a36Sopenharmony_ci
331762306a36Sopenharmony_ci	/*
331862306a36Sopenharmony_ci	 * step two, delete the device extents and the
331962306a36Sopenharmony_ci	 * chunk tree entries
332062306a36Sopenharmony_ci	 */
332162306a36Sopenharmony_ci	ret = btrfs_remove_chunk(trans, chunk_offset);
332262306a36Sopenharmony_ci	btrfs_end_transaction(trans);
332362306a36Sopenharmony_ci	return ret;
332462306a36Sopenharmony_ci}
332562306a36Sopenharmony_ci
332662306a36Sopenharmony_cistatic int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
332762306a36Sopenharmony_ci{
332862306a36Sopenharmony_ci	struct btrfs_root *chunk_root = fs_info->chunk_root;
332962306a36Sopenharmony_ci	struct btrfs_path *path;
333062306a36Sopenharmony_ci	struct extent_buffer *leaf;
333162306a36Sopenharmony_ci	struct btrfs_chunk *chunk;
333262306a36Sopenharmony_ci	struct btrfs_key key;
333362306a36Sopenharmony_ci	struct btrfs_key found_key;
333462306a36Sopenharmony_ci	u64 chunk_type;
333562306a36Sopenharmony_ci	bool retried = false;
333662306a36Sopenharmony_ci	int failed = 0;
333762306a36Sopenharmony_ci	int ret;
333862306a36Sopenharmony_ci
333962306a36Sopenharmony_ci	path = btrfs_alloc_path();
334062306a36Sopenharmony_ci	if (!path)
334162306a36Sopenharmony_ci		return -ENOMEM;
334262306a36Sopenharmony_ci
334362306a36Sopenharmony_ciagain:
334462306a36Sopenharmony_ci	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
334562306a36Sopenharmony_ci	key.offset = (u64)-1;
334662306a36Sopenharmony_ci	key.type = BTRFS_CHUNK_ITEM_KEY;
334762306a36Sopenharmony_ci
334862306a36Sopenharmony_ci	while (1) {
334962306a36Sopenharmony_ci		mutex_lock(&fs_info->reclaim_bgs_lock);
335062306a36Sopenharmony_ci		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
335162306a36Sopenharmony_ci		if (ret < 0) {
335262306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
335362306a36Sopenharmony_ci			goto error;
335462306a36Sopenharmony_ci		}
335562306a36Sopenharmony_ci		BUG_ON(ret == 0); /* Corruption */
335662306a36Sopenharmony_ci
335762306a36Sopenharmony_ci		ret = btrfs_previous_item(chunk_root, path, key.objectid,
335862306a36Sopenharmony_ci					  key.type);
335962306a36Sopenharmony_ci		if (ret)
336062306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
336162306a36Sopenharmony_ci		if (ret < 0)
336262306a36Sopenharmony_ci			goto error;
336362306a36Sopenharmony_ci		if (ret > 0)
336462306a36Sopenharmony_ci			break;
336562306a36Sopenharmony_ci
336662306a36Sopenharmony_ci		leaf = path->nodes[0];
336762306a36Sopenharmony_ci		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
336862306a36Sopenharmony_ci
336962306a36Sopenharmony_ci		chunk = btrfs_item_ptr(leaf, path->slots[0],
337062306a36Sopenharmony_ci				       struct btrfs_chunk);
337162306a36Sopenharmony_ci		chunk_type = btrfs_chunk_type(leaf, chunk);
337262306a36Sopenharmony_ci		btrfs_release_path(path);
337362306a36Sopenharmony_ci
337462306a36Sopenharmony_ci		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
337562306a36Sopenharmony_ci			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
337662306a36Sopenharmony_ci			if (ret == -ENOSPC)
337762306a36Sopenharmony_ci				failed++;
337862306a36Sopenharmony_ci			else
337962306a36Sopenharmony_ci				BUG_ON(ret);
338062306a36Sopenharmony_ci		}
338162306a36Sopenharmony_ci		mutex_unlock(&fs_info->reclaim_bgs_lock);
338262306a36Sopenharmony_ci
338362306a36Sopenharmony_ci		if (found_key.offset == 0)
338462306a36Sopenharmony_ci			break;
338562306a36Sopenharmony_ci		key.offset = found_key.offset - 1;
338662306a36Sopenharmony_ci	}
338762306a36Sopenharmony_ci	ret = 0;
338862306a36Sopenharmony_ci	if (failed && !retried) {
338962306a36Sopenharmony_ci		failed = 0;
339062306a36Sopenharmony_ci		retried = true;
339162306a36Sopenharmony_ci		goto again;
339262306a36Sopenharmony_ci	} else if (WARN_ON(failed && retried)) {
339362306a36Sopenharmony_ci		ret = -ENOSPC;
339462306a36Sopenharmony_ci	}
339562306a36Sopenharmony_cierror:
339662306a36Sopenharmony_ci	btrfs_free_path(path);
339762306a36Sopenharmony_ci	return ret;
339862306a36Sopenharmony_ci}
339962306a36Sopenharmony_ci
340062306a36Sopenharmony_ci/*
340162306a36Sopenharmony_ci * return 1 : allocate a data chunk successfully,
340262306a36Sopenharmony_ci * return <0: errors during allocating a data chunk,
340362306a36Sopenharmony_ci * return 0 : no need to allocate a data chunk.
340462306a36Sopenharmony_ci */
340562306a36Sopenharmony_cistatic int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
340662306a36Sopenharmony_ci				      u64 chunk_offset)
340762306a36Sopenharmony_ci{
340862306a36Sopenharmony_ci	struct btrfs_block_group *cache;
340962306a36Sopenharmony_ci	u64 bytes_used;
341062306a36Sopenharmony_ci	u64 chunk_type;
341162306a36Sopenharmony_ci
341262306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
341362306a36Sopenharmony_ci	ASSERT(cache);
341462306a36Sopenharmony_ci	chunk_type = cache->flags;
341562306a36Sopenharmony_ci	btrfs_put_block_group(cache);
341662306a36Sopenharmony_ci
341762306a36Sopenharmony_ci	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
341862306a36Sopenharmony_ci		return 0;
341962306a36Sopenharmony_ci
342062306a36Sopenharmony_ci	spin_lock(&fs_info->data_sinfo->lock);
342162306a36Sopenharmony_ci	bytes_used = fs_info->data_sinfo->bytes_used;
342262306a36Sopenharmony_ci	spin_unlock(&fs_info->data_sinfo->lock);
342362306a36Sopenharmony_ci
342462306a36Sopenharmony_ci	if (!bytes_used) {
342562306a36Sopenharmony_ci		struct btrfs_trans_handle *trans;
342662306a36Sopenharmony_ci		int ret;
342762306a36Sopenharmony_ci
342862306a36Sopenharmony_ci		trans =	btrfs_join_transaction(fs_info->tree_root);
342962306a36Sopenharmony_ci		if (IS_ERR(trans))
343062306a36Sopenharmony_ci			return PTR_ERR(trans);
343162306a36Sopenharmony_ci
343262306a36Sopenharmony_ci		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
343362306a36Sopenharmony_ci		btrfs_end_transaction(trans);
343462306a36Sopenharmony_ci		if (ret < 0)
343562306a36Sopenharmony_ci			return ret;
343662306a36Sopenharmony_ci		return 1;
343762306a36Sopenharmony_ci	}
343862306a36Sopenharmony_ci
343962306a36Sopenharmony_ci	return 0;
344062306a36Sopenharmony_ci}
344162306a36Sopenharmony_ci
344262306a36Sopenharmony_cistatic int insert_balance_item(struct btrfs_fs_info *fs_info,
344362306a36Sopenharmony_ci			       struct btrfs_balance_control *bctl)
344462306a36Sopenharmony_ci{
344562306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->tree_root;
344662306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
344762306a36Sopenharmony_ci	struct btrfs_balance_item *item;
344862306a36Sopenharmony_ci	struct btrfs_disk_balance_args disk_bargs;
344962306a36Sopenharmony_ci	struct btrfs_path *path;
345062306a36Sopenharmony_ci	struct extent_buffer *leaf;
345162306a36Sopenharmony_ci	struct btrfs_key key;
345262306a36Sopenharmony_ci	int ret, err;
345362306a36Sopenharmony_ci
345462306a36Sopenharmony_ci	path = btrfs_alloc_path();
345562306a36Sopenharmony_ci	if (!path)
345662306a36Sopenharmony_ci		return -ENOMEM;
345762306a36Sopenharmony_ci
345862306a36Sopenharmony_ci	trans = btrfs_start_transaction(root, 0);
345962306a36Sopenharmony_ci	if (IS_ERR(trans)) {
346062306a36Sopenharmony_ci		btrfs_free_path(path);
346162306a36Sopenharmony_ci		return PTR_ERR(trans);
346262306a36Sopenharmony_ci	}
346362306a36Sopenharmony_ci
346462306a36Sopenharmony_ci	key.objectid = BTRFS_BALANCE_OBJECTID;
346562306a36Sopenharmony_ci	key.type = BTRFS_TEMPORARY_ITEM_KEY;
346662306a36Sopenharmony_ci	key.offset = 0;
346762306a36Sopenharmony_ci
346862306a36Sopenharmony_ci	ret = btrfs_insert_empty_item(trans, root, path, &key,
346962306a36Sopenharmony_ci				      sizeof(*item));
347062306a36Sopenharmony_ci	if (ret)
347162306a36Sopenharmony_ci		goto out;
347262306a36Sopenharmony_ci
347362306a36Sopenharmony_ci	leaf = path->nodes[0];
347462306a36Sopenharmony_ci	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
347562306a36Sopenharmony_ci
347662306a36Sopenharmony_ci	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
347762306a36Sopenharmony_ci
347862306a36Sopenharmony_ci	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
347962306a36Sopenharmony_ci	btrfs_set_balance_data(leaf, item, &disk_bargs);
348062306a36Sopenharmony_ci	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
348162306a36Sopenharmony_ci	btrfs_set_balance_meta(leaf, item, &disk_bargs);
348262306a36Sopenharmony_ci	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
348362306a36Sopenharmony_ci	btrfs_set_balance_sys(leaf, item, &disk_bargs);
348462306a36Sopenharmony_ci
348562306a36Sopenharmony_ci	btrfs_set_balance_flags(leaf, item, bctl->flags);
348662306a36Sopenharmony_ci
348762306a36Sopenharmony_ci	btrfs_mark_buffer_dirty(trans, leaf);
348862306a36Sopenharmony_ciout:
348962306a36Sopenharmony_ci	btrfs_free_path(path);
349062306a36Sopenharmony_ci	err = btrfs_commit_transaction(trans);
349162306a36Sopenharmony_ci	if (err && !ret)
349262306a36Sopenharmony_ci		ret = err;
349362306a36Sopenharmony_ci	return ret;
349462306a36Sopenharmony_ci}
349562306a36Sopenharmony_ci
349662306a36Sopenharmony_cistatic int del_balance_item(struct btrfs_fs_info *fs_info)
349762306a36Sopenharmony_ci{
349862306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->tree_root;
349962306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
350062306a36Sopenharmony_ci	struct btrfs_path *path;
350162306a36Sopenharmony_ci	struct btrfs_key key;
350262306a36Sopenharmony_ci	int ret, err;
350362306a36Sopenharmony_ci
350462306a36Sopenharmony_ci	path = btrfs_alloc_path();
350562306a36Sopenharmony_ci	if (!path)
350662306a36Sopenharmony_ci		return -ENOMEM;
350762306a36Sopenharmony_ci
350862306a36Sopenharmony_ci	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
350962306a36Sopenharmony_ci	if (IS_ERR(trans)) {
351062306a36Sopenharmony_ci		btrfs_free_path(path);
351162306a36Sopenharmony_ci		return PTR_ERR(trans);
351262306a36Sopenharmony_ci	}
351362306a36Sopenharmony_ci
351462306a36Sopenharmony_ci	key.objectid = BTRFS_BALANCE_OBJECTID;
351562306a36Sopenharmony_ci	key.type = BTRFS_TEMPORARY_ITEM_KEY;
351662306a36Sopenharmony_ci	key.offset = 0;
351762306a36Sopenharmony_ci
351862306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
351962306a36Sopenharmony_ci	if (ret < 0)
352062306a36Sopenharmony_ci		goto out;
352162306a36Sopenharmony_ci	if (ret > 0) {
352262306a36Sopenharmony_ci		ret = -ENOENT;
352362306a36Sopenharmony_ci		goto out;
352462306a36Sopenharmony_ci	}
352562306a36Sopenharmony_ci
352662306a36Sopenharmony_ci	ret = btrfs_del_item(trans, root, path);
352762306a36Sopenharmony_ciout:
352862306a36Sopenharmony_ci	btrfs_free_path(path);
352962306a36Sopenharmony_ci	err = btrfs_commit_transaction(trans);
353062306a36Sopenharmony_ci	if (err && !ret)
353162306a36Sopenharmony_ci		ret = err;
353262306a36Sopenharmony_ci	return ret;
353362306a36Sopenharmony_ci}
353462306a36Sopenharmony_ci
353562306a36Sopenharmony_ci/*
353662306a36Sopenharmony_ci * This is a heuristic used to reduce the number of chunks balanced on
353762306a36Sopenharmony_ci * resume after balance was interrupted.
353862306a36Sopenharmony_ci */
353962306a36Sopenharmony_cistatic void update_balance_args(struct btrfs_balance_control *bctl)
354062306a36Sopenharmony_ci{
354162306a36Sopenharmony_ci	/*
354262306a36Sopenharmony_ci	 * Turn on soft mode for chunk types that were being converted.
354362306a36Sopenharmony_ci	 */
354462306a36Sopenharmony_ci	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
354562306a36Sopenharmony_ci		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
354662306a36Sopenharmony_ci	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
354762306a36Sopenharmony_ci		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
354862306a36Sopenharmony_ci	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
354962306a36Sopenharmony_ci		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
355062306a36Sopenharmony_ci
355162306a36Sopenharmony_ci	/*
355262306a36Sopenharmony_ci	 * Turn on usage filter if is not already used.  The idea is
355362306a36Sopenharmony_ci	 * that chunks that we have already balanced should be
355462306a36Sopenharmony_ci	 * reasonably full.  Don't do it for chunks that are being
355562306a36Sopenharmony_ci	 * converted - that will keep us from relocating unconverted
355662306a36Sopenharmony_ci	 * (albeit full) chunks.
355762306a36Sopenharmony_ci	 */
355862306a36Sopenharmony_ci	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
355962306a36Sopenharmony_ci	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
356062306a36Sopenharmony_ci	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
356162306a36Sopenharmony_ci		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
356262306a36Sopenharmony_ci		bctl->data.usage = 90;
356362306a36Sopenharmony_ci	}
356462306a36Sopenharmony_ci	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
356562306a36Sopenharmony_ci	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
356662306a36Sopenharmony_ci	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
356762306a36Sopenharmony_ci		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
356862306a36Sopenharmony_ci		bctl->sys.usage = 90;
356962306a36Sopenharmony_ci	}
357062306a36Sopenharmony_ci	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
357162306a36Sopenharmony_ci	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
357262306a36Sopenharmony_ci	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
357362306a36Sopenharmony_ci		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
357462306a36Sopenharmony_ci		bctl->meta.usage = 90;
357562306a36Sopenharmony_ci	}
357662306a36Sopenharmony_ci}
357762306a36Sopenharmony_ci
357862306a36Sopenharmony_ci/*
357962306a36Sopenharmony_ci * Clear the balance status in fs_info and delete the balance item from disk.
358062306a36Sopenharmony_ci */
358162306a36Sopenharmony_cistatic void reset_balance_state(struct btrfs_fs_info *fs_info)
358262306a36Sopenharmony_ci{
358362306a36Sopenharmony_ci	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
358462306a36Sopenharmony_ci	int ret;
358562306a36Sopenharmony_ci
358662306a36Sopenharmony_ci	BUG_ON(!fs_info->balance_ctl);
358762306a36Sopenharmony_ci
358862306a36Sopenharmony_ci	spin_lock(&fs_info->balance_lock);
358962306a36Sopenharmony_ci	fs_info->balance_ctl = NULL;
359062306a36Sopenharmony_ci	spin_unlock(&fs_info->balance_lock);
359162306a36Sopenharmony_ci
359262306a36Sopenharmony_ci	kfree(bctl);
359362306a36Sopenharmony_ci	ret = del_balance_item(fs_info);
359462306a36Sopenharmony_ci	if (ret)
359562306a36Sopenharmony_ci		btrfs_handle_fs_error(fs_info, ret, NULL);
359662306a36Sopenharmony_ci}
359762306a36Sopenharmony_ci
359862306a36Sopenharmony_ci/*
359962306a36Sopenharmony_ci * Balance filters.  Return 1 if chunk should be filtered out
360062306a36Sopenharmony_ci * (should not be balanced).
360162306a36Sopenharmony_ci */
360262306a36Sopenharmony_cistatic int chunk_profiles_filter(u64 chunk_type,
360362306a36Sopenharmony_ci				 struct btrfs_balance_args *bargs)
360462306a36Sopenharmony_ci{
360562306a36Sopenharmony_ci	chunk_type = chunk_to_extended(chunk_type) &
360662306a36Sopenharmony_ci				BTRFS_EXTENDED_PROFILE_MASK;
360762306a36Sopenharmony_ci
360862306a36Sopenharmony_ci	if (bargs->profiles & chunk_type)
360962306a36Sopenharmony_ci		return 0;
361062306a36Sopenharmony_ci
361162306a36Sopenharmony_ci	return 1;
361262306a36Sopenharmony_ci}
361362306a36Sopenharmony_ci
361462306a36Sopenharmony_cistatic int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
361562306a36Sopenharmony_ci			      struct btrfs_balance_args *bargs)
361662306a36Sopenharmony_ci{
361762306a36Sopenharmony_ci	struct btrfs_block_group *cache;
361862306a36Sopenharmony_ci	u64 chunk_used;
361962306a36Sopenharmony_ci	u64 user_thresh_min;
362062306a36Sopenharmony_ci	u64 user_thresh_max;
362162306a36Sopenharmony_ci	int ret = 1;
362262306a36Sopenharmony_ci
362362306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
362462306a36Sopenharmony_ci	chunk_used = cache->used;
362562306a36Sopenharmony_ci
362662306a36Sopenharmony_ci	if (bargs->usage_min == 0)
362762306a36Sopenharmony_ci		user_thresh_min = 0;
362862306a36Sopenharmony_ci	else
362962306a36Sopenharmony_ci		user_thresh_min = mult_perc(cache->length, bargs->usage_min);
363062306a36Sopenharmony_ci
363162306a36Sopenharmony_ci	if (bargs->usage_max == 0)
363262306a36Sopenharmony_ci		user_thresh_max = 1;
363362306a36Sopenharmony_ci	else if (bargs->usage_max > 100)
363462306a36Sopenharmony_ci		user_thresh_max = cache->length;
363562306a36Sopenharmony_ci	else
363662306a36Sopenharmony_ci		user_thresh_max = mult_perc(cache->length, bargs->usage_max);
363762306a36Sopenharmony_ci
363862306a36Sopenharmony_ci	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
363962306a36Sopenharmony_ci		ret = 0;
364062306a36Sopenharmony_ci
364162306a36Sopenharmony_ci	btrfs_put_block_group(cache);
364262306a36Sopenharmony_ci	return ret;
364362306a36Sopenharmony_ci}
364462306a36Sopenharmony_ci
364562306a36Sopenharmony_cistatic int chunk_usage_filter(struct btrfs_fs_info *fs_info,
364662306a36Sopenharmony_ci		u64 chunk_offset, struct btrfs_balance_args *bargs)
364762306a36Sopenharmony_ci{
364862306a36Sopenharmony_ci	struct btrfs_block_group *cache;
364962306a36Sopenharmony_ci	u64 chunk_used, user_thresh;
365062306a36Sopenharmony_ci	int ret = 1;
365162306a36Sopenharmony_ci
365262306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
365362306a36Sopenharmony_ci	chunk_used = cache->used;
365462306a36Sopenharmony_ci
365562306a36Sopenharmony_ci	if (bargs->usage_min == 0)
365662306a36Sopenharmony_ci		user_thresh = 1;
365762306a36Sopenharmony_ci	else if (bargs->usage > 100)
365862306a36Sopenharmony_ci		user_thresh = cache->length;
365962306a36Sopenharmony_ci	else
366062306a36Sopenharmony_ci		user_thresh = mult_perc(cache->length, bargs->usage);
366162306a36Sopenharmony_ci
366262306a36Sopenharmony_ci	if (chunk_used < user_thresh)
366362306a36Sopenharmony_ci		ret = 0;
366462306a36Sopenharmony_ci
366562306a36Sopenharmony_ci	btrfs_put_block_group(cache);
366662306a36Sopenharmony_ci	return ret;
366762306a36Sopenharmony_ci}
366862306a36Sopenharmony_ci
366962306a36Sopenharmony_cistatic int chunk_devid_filter(struct extent_buffer *leaf,
367062306a36Sopenharmony_ci			      struct btrfs_chunk *chunk,
367162306a36Sopenharmony_ci			      struct btrfs_balance_args *bargs)
367262306a36Sopenharmony_ci{
367362306a36Sopenharmony_ci	struct btrfs_stripe *stripe;
367462306a36Sopenharmony_ci	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
367562306a36Sopenharmony_ci	int i;
367662306a36Sopenharmony_ci
367762306a36Sopenharmony_ci	for (i = 0; i < num_stripes; i++) {
367862306a36Sopenharmony_ci		stripe = btrfs_stripe_nr(chunk, i);
367962306a36Sopenharmony_ci		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
368062306a36Sopenharmony_ci			return 0;
368162306a36Sopenharmony_ci	}
368262306a36Sopenharmony_ci
368362306a36Sopenharmony_ci	return 1;
368462306a36Sopenharmony_ci}
368562306a36Sopenharmony_ci
368662306a36Sopenharmony_cistatic u64 calc_data_stripes(u64 type, int num_stripes)
368762306a36Sopenharmony_ci{
368862306a36Sopenharmony_ci	const int index = btrfs_bg_flags_to_raid_index(type);
368962306a36Sopenharmony_ci	const int ncopies = btrfs_raid_array[index].ncopies;
369062306a36Sopenharmony_ci	const int nparity = btrfs_raid_array[index].nparity;
369162306a36Sopenharmony_ci
369262306a36Sopenharmony_ci	return (num_stripes - nparity) / ncopies;
369362306a36Sopenharmony_ci}
369462306a36Sopenharmony_ci
369562306a36Sopenharmony_ci/* [pstart, pend) */
369662306a36Sopenharmony_cistatic int chunk_drange_filter(struct extent_buffer *leaf,
369762306a36Sopenharmony_ci			       struct btrfs_chunk *chunk,
369862306a36Sopenharmony_ci			       struct btrfs_balance_args *bargs)
369962306a36Sopenharmony_ci{
370062306a36Sopenharmony_ci	struct btrfs_stripe *stripe;
370162306a36Sopenharmony_ci	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
370262306a36Sopenharmony_ci	u64 stripe_offset;
370362306a36Sopenharmony_ci	u64 stripe_length;
370462306a36Sopenharmony_ci	u64 type;
370562306a36Sopenharmony_ci	int factor;
370662306a36Sopenharmony_ci	int i;
370762306a36Sopenharmony_ci
370862306a36Sopenharmony_ci	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
370962306a36Sopenharmony_ci		return 0;
371062306a36Sopenharmony_ci
371162306a36Sopenharmony_ci	type = btrfs_chunk_type(leaf, chunk);
371262306a36Sopenharmony_ci	factor = calc_data_stripes(type, num_stripes);
371362306a36Sopenharmony_ci
371462306a36Sopenharmony_ci	for (i = 0; i < num_stripes; i++) {
371562306a36Sopenharmony_ci		stripe = btrfs_stripe_nr(chunk, i);
371662306a36Sopenharmony_ci		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
371762306a36Sopenharmony_ci			continue;
371862306a36Sopenharmony_ci
371962306a36Sopenharmony_ci		stripe_offset = btrfs_stripe_offset(leaf, stripe);
372062306a36Sopenharmony_ci		stripe_length = btrfs_chunk_length(leaf, chunk);
372162306a36Sopenharmony_ci		stripe_length = div_u64(stripe_length, factor);
372262306a36Sopenharmony_ci
372362306a36Sopenharmony_ci		if (stripe_offset < bargs->pend &&
372462306a36Sopenharmony_ci		    stripe_offset + stripe_length > bargs->pstart)
372562306a36Sopenharmony_ci			return 0;
372662306a36Sopenharmony_ci	}
372762306a36Sopenharmony_ci
372862306a36Sopenharmony_ci	return 1;
372962306a36Sopenharmony_ci}
373062306a36Sopenharmony_ci
373162306a36Sopenharmony_ci/* [vstart, vend) */
373262306a36Sopenharmony_cistatic int chunk_vrange_filter(struct extent_buffer *leaf,
373362306a36Sopenharmony_ci			       struct btrfs_chunk *chunk,
373462306a36Sopenharmony_ci			       u64 chunk_offset,
373562306a36Sopenharmony_ci			       struct btrfs_balance_args *bargs)
373662306a36Sopenharmony_ci{
373762306a36Sopenharmony_ci	if (chunk_offset < bargs->vend &&
373862306a36Sopenharmony_ci	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
373962306a36Sopenharmony_ci		/* at least part of the chunk is inside this vrange */
374062306a36Sopenharmony_ci		return 0;
374162306a36Sopenharmony_ci
374262306a36Sopenharmony_ci	return 1;
374362306a36Sopenharmony_ci}
374462306a36Sopenharmony_ci
374562306a36Sopenharmony_cistatic int chunk_stripes_range_filter(struct extent_buffer *leaf,
374662306a36Sopenharmony_ci			       struct btrfs_chunk *chunk,
374762306a36Sopenharmony_ci			       struct btrfs_balance_args *bargs)
374862306a36Sopenharmony_ci{
374962306a36Sopenharmony_ci	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
375062306a36Sopenharmony_ci
375162306a36Sopenharmony_ci	if (bargs->stripes_min <= num_stripes
375262306a36Sopenharmony_ci			&& num_stripes <= bargs->stripes_max)
375362306a36Sopenharmony_ci		return 0;
375462306a36Sopenharmony_ci
375562306a36Sopenharmony_ci	return 1;
375662306a36Sopenharmony_ci}
375762306a36Sopenharmony_ci
375862306a36Sopenharmony_cistatic int chunk_soft_convert_filter(u64 chunk_type,
375962306a36Sopenharmony_ci				     struct btrfs_balance_args *bargs)
376062306a36Sopenharmony_ci{
376162306a36Sopenharmony_ci	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
376262306a36Sopenharmony_ci		return 0;
376362306a36Sopenharmony_ci
376462306a36Sopenharmony_ci	chunk_type = chunk_to_extended(chunk_type) &
376562306a36Sopenharmony_ci				BTRFS_EXTENDED_PROFILE_MASK;
376662306a36Sopenharmony_ci
376762306a36Sopenharmony_ci	if (bargs->target == chunk_type)
376862306a36Sopenharmony_ci		return 1;
376962306a36Sopenharmony_ci
377062306a36Sopenharmony_ci	return 0;
377162306a36Sopenharmony_ci}
377262306a36Sopenharmony_ci
377362306a36Sopenharmony_cistatic int should_balance_chunk(struct extent_buffer *leaf,
377462306a36Sopenharmony_ci				struct btrfs_chunk *chunk, u64 chunk_offset)
377562306a36Sopenharmony_ci{
377662306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = leaf->fs_info;
377762306a36Sopenharmony_ci	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
377862306a36Sopenharmony_ci	struct btrfs_balance_args *bargs = NULL;
377962306a36Sopenharmony_ci	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
378062306a36Sopenharmony_ci
378162306a36Sopenharmony_ci	/* type filter */
378262306a36Sopenharmony_ci	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
378362306a36Sopenharmony_ci	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
378462306a36Sopenharmony_ci		return 0;
378562306a36Sopenharmony_ci	}
378662306a36Sopenharmony_ci
378762306a36Sopenharmony_ci	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
378862306a36Sopenharmony_ci		bargs = &bctl->data;
378962306a36Sopenharmony_ci	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
379062306a36Sopenharmony_ci		bargs = &bctl->sys;
379162306a36Sopenharmony_ci	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
379262306a36Sopenharmony_ci		bargs = &bctl->meta;
379362306a36Sopenharmony_ci
379462306a36Sopenharmony_ci	/* profiles filter */
379562306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
379662306a36Sopenharmony_ci	    chunk_profiles_filter(chunk_type, bargs)) {
379762306a36Sopenharmony_ci		return 0;
379862306a36Sopenharmony_ci	}
379962306a36Sopenharmony_ci
380062306a36Sopenharmony_ci	/* usage filter */
380162306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
380262306a36Sopenharmony_ci	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
380362306a36Sopenharmony_ci		return 0;
380462306a36Sopenharmony_ci	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
380562306a36Sopenharmony_ci	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
380662306a36Sopenharmony_ci		return 0;
380762306a36Sopenharmony_ci	}
380862306a36Sopenharmony_ci
380962306a36Sopenharmony_ci	/* devid filter */
381062306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
381162306a36Sopenharmony_ci	    chunk_devid_filter(leaf, chunk, bargs)) {
381262306a36Sopenharmony_ci		return 0;
381362306a36Sopenharmony_ci	}
381462306a36Sopenharmony_ci
381562306a36Sopenharmony_ci	/* drange filter, makes sense only with devid filter */
381662306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
381762306a36Sopenharmony_ci	    chunk_drange_filter(leaf, chunk, bargs)) {
381862306a36Sopenharmony_ci		return 0;
381962306a36Sopenharmony_ci	}
382062306a36Sopenharmony_ci
382162306a36Sopenharmony_ci	/* vrange filter */
382262306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
382362306a36Sopenharmony_ci	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
382462306a36Sopenharmony_ci		return 0;
382562306a36Sopenharmony_ci	}
382662306a36Sopenharmony_ci
382762306a36Sopenharmony_ci	/* stripes filter */
382862306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
382962306a36Sopenharmony_ci	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
383062306a36Sopenharmony_ci		return 0;
383162306a36Sopenharmony_ci	}
383262306a36Sopenharmony_ci
383362306a36Sopenharmony_ci	/* soft profile changing mode */
383462306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
383562306a36Sopenharmony_ci	    chunk_soft_convert_filter(chunk_type, bargs)) {
383662306a36Sopenharmony_ci		return 0;
383762306a36Sopenharmony_ci	}
383862306a36Sopenharmony_ci
383962306a36Sopenharmony_ci	/*
384062306a36Sopenharmony_ci	 * limited by count, must be the last filter
384162306a36Sopenharmony_ci	 */
384262306a36Sopenharmony_ci	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
384362306a36Sopenharmony_ci		if (bargs->limit == 0)
384462306a36Sopenharmony_ci			return 0;
384562306a36Sopenharmony_ci		else
384662306a36Sopenharmony_ci			bargs->limit--;
384762306a36Sopenharmony_ci	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
384862306a36Sopenharmony_ci		/*
384962306a36Sopenharmony_ci		 * Same logic as the 'limit' filter; the minimum cannot be
385062306a36Sopenharmony_ci		 * determined here because we do not have the global information
385162306a36Sopenharmony_ci		 * about the count of all chunks that satisfy the filters.
385262306a36Sopenharmony_ci		 */
385362306a36Sopenharmony_ci		if (bargs->limit_max == 0)
385462306a36Sopenharmony_ci			return 0;
385562306a36Sopenharmony_ci		else
385662306a36Sopenharmony_ci			bargs->limit_max--;
385762306a36Sopenharmony_ci	}
385862306a36Sopenharmony_ci
385962306a36Sopenharmony_ci	return 1;
386062306a36Sopenharmony_ci}
386162306a36Sopenharmony_ci
386262306a36Sopenharmony_cistatic int __btrfs_balance(struct btrfs_fs_info *fs_info)
386362306a36Sopenharmony_ci{
386462306a36Sopenharmony_ci	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
386562306a36Sopenharmony_ci	struct btrfs_root *chunk_root = fs_info->chunk_root;
386662306a36Sopenharmony_ci	u64 chunk_type;
386762306a36Sopenharmony_ci	struct btrfs_chunk *chunk;
386862306a36Sopenharmony_ci	struct btrfs_path *path = NULL;
386962306a36Sopenharmony_ci	struct btrfs_key key;
387062306a36Sopenharmony_ci	struct btrfs_key found_key;
387162306a36Sopenharmony_ci	struct extent_buffer *leaf;
387262306a36Sopenharmony_ci	int slot;
387362306a36Sopenharmony_ci	int ret;
387462306a36Sopenharmony_ci	int enospc_errors = 0;
387562306a36Sopenharmony_ci	bool counting = true;
387662306a36Sopenharmony_ci	/* The single value limit and min/max limits use the same bytes in the */
387762306a36Sopenharmony_ci	u64 limit_data = bctl->data.limit;
387862306a36Sopenharmony_ci	u64 limit_meta = bctl->meta.limit;
387962306a36Sopenharmony_ci	u64 limit_sys = bctl->sys.limit;
388062306a36Sopenharmony_ci	u32 count_data = 0;
388162306a36Sopenharmony_ci	u32 count_meta = 0;
388262306a36Sopenharmony_ci	u32 count_sys = 0;
388362306a36Sopenharmony_ci	int chunk_reserved = 0;
388462306a36Sopenharmony_ci
388562306a36Sopenharmony_ci	path = btrfs_alloc_path();
388662306a36Sopenharmony_ci	if (!path) {
388762306a36Sopenharmony_ci		ret = -ENOMEM;
388862306a36Sopenharmony_ci		goto error;
388962306a36Sopenharmony_ci	}
389062306a36Sopenharmony_ci
389162306a36Sopenharmony_ci	/* zero out stat counters */
389262306a36Sopenharmony_ci	spin_lock(&fs_info->balance_lock);
389362306a36Sopenharmony_ci	memset(&bctl->stat, 0, sizeof(bctl->stat));
389462306a36Sopenharmony_ci	spin_unlock(&fs_info->balance_lock);
389562306a36Sopenharmony_ciagain:
389662306a36Sopenharmony_ci	if (!counting) {
389762306a36Sopenharmony_ci		/*
389862306a36Sopenharmony_ci		 * The single value limit and min/max limits use the same bytes
389962306a36Sopenharmony_ci		 * in the
390062306a36Sopenharmony_ci		 */
390162306a36Sopenharmony_ci		bctl->data.limit = limit_data;
390262306a36Sopenharmony_ci		bctl->meta.limit = limit_meta;
390362306a36Sopenharmony_ci		bctl->sys.limit = limit_sys;
390462306a36Sopenharmony_ci	}
390562306a36Sopenharmony_ci	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
390662306a36Sopenharmony_ci	key.offset = (u64)-1;
390762306a36Sopenharmony_ci	key.type = BTRFS_CHUNK_ITEM_KEY;
390862306a36Sopenharmony_ci
390962306a36Sopenharmony_ci	while (1) {
391062306a36Sopenharmony_ci		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
391162306a36Sopenharmony_ci		    atomic_read(&fs_info->balance_cancel_req)) {
391262306a36Sopenharmony_ci			ret = -ECANCELED;
391362306a36Sopenharmony_ci			goto error;
391462306a36Sopenharmony_ci		}
391562306a36Sopenharmony_ci
391662306a36Sopenharmony_ci		mutex_lock(&fs_info->reclaim_bgs_lock);
391762306a36Sopenharmony_ci		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
391862306a36Sopenharmony_ci		if (ret < 0) {
391962306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
392062306a36Sopenharmony_ci			goto error;
392162306a36Sopenharmony_ci		}
392262306a36Sopenharmony_ci
392362306a36Sopenharmony_ci		/*
392462306a36Sopenharmony_ci		 * this shouldn't happen, it means the last relocate
392562306a36Sopenharmony_ci		 * failed
392662306a36Sopenharmony_ci		 */
392762306a36Sopenharmony_ci		if (ret == 0)
392862306a36Sopenharmony_ci			BUG(); /* FIXME break ? */
392962306a36Sopenharmony_ci
393062306a36Sopenharmony_ci		ret = btrfs_previous_item(chunk_root, path, 0,
393162306a36Sopenharmony_ci					  BTRFS_CHUNK_ITEM_KEY);
393262306a36Sopenharmony_ci		if (ret) {
393362306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
393462306a36Sopenharmony_ci			ret = 0;
393562306a36Sopenharmony_ci			break;
393662306a36Sopenharmony_ci		}
393762306a36Sopenharmony_ci
393862306a36Sopenharmony_ci		leaf = path->nodes[0];
393962306a36Sopenharmony_ci		slot = path->slots[0];
394062306a36Sopenharmony_ci		btrfs_item_key_to_cpu(leaf, &found_key, slot);
394162306a36Sopenharmony_ci
394262306a36Sopenharmony_ci		if (found_key.objectid != key.objectid) {
394362306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
394462306a36Sopenharmony_ci			break;
394562306a36Sopenharmony_ci		}
394662306a36Sopenharmony_ci
394762306a36Sopenharmony_ci		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
394862306a36Sopenharmony_ci		chunk_type = btrfs_chunk_type(leaf, chunk);
394962306a36Sopenharmony_ci
395062306a36Sopenharmony_ci		if (!counting) {
395162306a36Sopenharmony_ci			spin_lock(&fs_info->balance_lock);
395262306a36Sopenharmony_ci			bctl->stat.considered++;
395362306a36Sopenharmony_ci			spin_unlock(&fs_info->balance_lock);
395462306a36Sopenharmony_ci		}
395562306a36Sopenharmony_ci
395662306a36Sopenharmony_ci		ret = should_balance_chunk(leaf, chunk, found_key.offset);
395762306a36Sopenharmony_ci
395862306a36Sopenharmony_ci		btrfs_release_path(path);
395962306a36Sopenharmony_ci		if (!ret) {
396062306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
396162306a36Sopenharmony_ci			goto loop;
396262306a36Sopenharmony_ci		}
396362306a36Sopenharmony_ci
396462306a36Sopenharmony_ci		if (counting) {
396562306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
396662306a36Sopenharmony_ci			spin_lock(&fs_info->balance_lock);
396762306a36Sopenharmony_ci			bctl->stat.expected++;
396862306a36Sopenharmony_ci			spin_unlock(&fs_info->balance_lock);
396962306a36Sopenharmony_ci
397062306a36Sopenharmony_ci			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
397162306a36Sopenharmony_ci				count_data++;
397262306a36Sopenharmony_ci			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
397362306a36Sopenharmony_ci				count_sys++;
397462306a36Sopenharmony_ci			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
397562306a36Sopenharmony_ci				count_meta++;
397662306a36Sopenharmony_ci
397762306a36Sopenharmony_ci			goto loop;
397862306a36Sopenharmony_ci		}
397962306a36Sopenharmony_ci
398062306a36Sopenharmony_ci		/*
398162306a36Sopenharmony_ci		 * Apply limit_min filter, no need to check if the LIMITS
398262306a36Sopenharmony_ci		 * filter is used, limit_min is 0 by default
398362306a36Sopenharmony_ci		 */
398462306a36Sopenharmony_ci		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
398562306a36Sopenharmony_ci					count_data < bctl->data.limit_min)
398662306a36Sopenharmony_ci				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
398762306a36Sopenharmony_ci					count_meta < bctl->meta.limit_min)
398862306a36Sopenharmony_ci				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
398962306a36Sopenharmony_ci					count_sys < bctl->sys.limit_min)) {
399062306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
399162306a36Sopenharmony_ci			goto loop;
399262306a36Sopenharmony_ci		}
399362306a36Sopenharmony_ci
399462306a36Sopenharmony_ci		if (!chunk_reserved) {
399562306a36Sopenharmony_ci			/*
399662306a36Sopenharmony_ci			 * We may be relocating the only data chunk we have,
399762306a36Sopenharmony_ci			 * which could potentially end up with losing data's
399862306a36Sopenharmony_ci			 * raid profile, so lets allocate an empty one in
399962306a36Sopenharmony_ci			 * advance.
400062306a36Sopenharmony_ci			 */
400162306a36Sopenharmony_ci			ret = btrfs_may_alloc_data_chunk(fs_info,
400262306a36Sopenharmony_ci							 found_key.offset);
400362306a36Sopenharmony_ci			if (ret < 0) {
400462306a36Sopenharmony_ci				mutex_unlock(&fs_info->reclaim_bgs_lock);
400562306a36Sopenharmony_ci				goto error;
400662306a36Sopenharmony_ci			} else if (ret == 1) {
400762306a36Sopenharmony_ci				chunk_reserved = 1;
400862306a36Sopenharmony_ci			}
400962306a36Sopenharmony_ci		}
401062306a36Sopenharmony_ci
401162306a36Sopenharmony_ci		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
401262306a36Sopenharmony_ci		mutex_unlock(&fs_info->reclaim_bgs_lock);
401362306a36Sopenharmony_ci		if (ret == -ENOSPC) {
401462306a36Sopenharmony_ci			enospc_errors++;
401562306a36Sopenharmony_ci		} else if (ret == -ETXTBSY) {
401662306a36Sopenharmony_ci			btrfs_info(fs_info,
401762306a36Sopenharmony_ci	   "skipping relocation of block group %llu due to active swapfile",
401862306a36Sopenharmony_ci				   found_key.offset);
401962306a36Sopenharmony_ci			ret = 0;
402062306a36Sopenharmony_ci		} else if (ret) {
402162306a36Sopenharmony_ci			goto error;
402262306a36Sopenharmony_ci		} else {
402362306a36Sopenharmony_ci			spin_lock(&fs_info->balance_lock);
402462306a36Sopenharmony_ci			bctl->stat.completed++;
402562306a36Sopenharmony_ci			spin_unlock(&fs_info->balance_lock);
402662306a36Sopenharmony_ci		}
402762306a36Sopenharmony_ciloop:
402862306a36Sopenharmony_ci		if (found_key.offset == 0)
402962306a36Sopenharmony_ci			break;
403062306a36Sopenharmony_ci		key.offset = found_key.offset - 1;
403162306a36Sopenharmony_ci	}
403262306a36Sopenharmony_ci
403362306a36Sopenharmony_ci	if (counting) {
403462306a36Sopenharmony_ci		btrfs_release_path(path);
403562306a36Sopenharmony_ci		counting = false;
403662306a36Sopenharmony_ci		goto again;
403762306a36Sopenharmony_ci	}
403862306a36Sopenharmony_cierror:
403962306a36Sopenharmony_ci	btrfs_free_path(path);
404062306a36Sopenharmony_ci	if (enospc_errors) {
404162306a36Sopenharmony_ci		btrfs_info(fs_info, "%d enospc errors during balance",
404262306a36Sopenharmony_ci			   enospc_errors);
404362306a36Sopenharmony_ci		if (!ret)
404462306a36Sopenharmony_ci			ret = -ENOSPC;
404562306a36Sopenharmony_ci	}
404662306a36Sopenharmony_ci
404762306a36Sopenharmony_ci	return ret;
404862306a36Sopenharmony_ci}
404962306a36Sopenharmony_ci
405062306a36Sopenharmony_ci/*
405162306a36Sopenharmony_ci * See if a given profile is valid and reduced.
405262306a36Sopenharmony_ci *
405362306a36Sopenharmony_ci * @flags:     profile to validate
405462306a36Sopenharmony_ci * @extended:  if true @flags is treated as an extended profile
405562306a36Sopenharmony_ci */
405662306a36Sopenharmony_cistatic int alloc_profile_is_valid(u64 flags, int extended)
405762306a36Sopenharmony_ci{
405862306a36Sopenharmony_ci	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
405962306a36Sopenharmony_ci			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
406062306a36Sopenharmony_ci
406162306a36Sopenharmony_ci	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
406262306a36Sopenharmony_ci
406362306a36Sopenharmony_ci	/* 1) check that all other bits are zeroed */
406462306a36Sopenharmony_ci	if (flags & ~mask)
406562306a36Sopenharmony_ci		return 0;
406662306a36Sopenharmony_ci
406762306a36Sopenharmony_ci	/* 2) see if profile is reduced */
406862306a36Sopenharmony_ci	if (flags == 0)
406962306a36Sopenharmony_ci		return !extended; /* "0" is valid for usual profiles */
407062306a36Sopenharmony_ci
407162306a36Sopenharmony_ci	return has_single_bit_set(flags);
407262306a36Sopenharmony_ci}
407362306a36Sopenharmony_ci
407462306a36Sopenharmony_ci/*
407562306a36Sopenharmony_ci * Validate target profile against allowed profiles and return true if it's OK.
407662306a36Sopenharmony_ci * Otherwise print the error message and return false.
407762306a36Sopenharmony_ci */
407862306a36Sopenharmony_cistatic inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
407962306a36Sopenharmony_ci		const struct btrfs_balance_args *bargs,
408062306a36Sopenharmony_ci		u64 allowed, const char *type)
408162306a36Sopenharmony_ci{
408262306a36Sopenharmony_ci	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
408362306a36Sopenharmony_ci		return true;
408462306a36Sopenharmony_ci
408562306a36Sopenharmony_ci	/* Profile is valid and does not have bits outside of the allowed set */
408662306a36Sopenharmony_ci	if (alloc_profile_is_valid(bargs->target, 1) &&
408762306a36Sopenharmony_ci	    (bargs->target & ~allowed) == 0)
408862306a36Sopenharmony_ci		return true;
408962306a36Sopenharmony_ci
409062306a36Sopenharmony_ci	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
409162306a36Sopenharmony_ci			type, btrfs_bg_type_to_raid_name(bargs->target));
409262306a36Sopenharmony_ci	return false;
409362306a36Sopenharmony_ci}
409462306a36Sopenharmony_ci
409562306a36Sopenharmony_ci/*
409662306a36Sopenharmony_ci * Fill @buf with textual description of balance filter flags @bargs, up to
409762306a36Sopenharmony_ci * @size_buf including the terminating null. The output may be trimmed if it
409862306a36Sopenharmony_ci * does not fit into the provided buffer.
409962306a36Sopenharmony_ci */
410062306a36Sopenharmony_cistatic void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
410162306a36Sopenharmony_ci				 u32 size_buf)
410262306a36Sopenharmony_ci{
410362306a36Sopenharmony_ci	int ret;
410462306a36Sopenharmony_ci	u32 size_bp = size_buf;
410562306a36Sopenharmony_ci	char *bp = buf;
410662306a36Sopenharmony_ci	u64 flags = bargs->flags;
410762306a36Sopenharmony_ci	char tmp_buf[128] = {'\0'};
410862306a36Sopenharmony_ci
410962306a36Sopenharmony_ci	if (!flags)
411062306a36Sopenharmony_ci		return;
411162306a36Sopenharmony_ci
411262306a36Sopenharmony_ci#define CHECK_APPEND_NOARG(a)						\
411362306a36Sopenharmony_ci	do {								\
411462306a36Sopenharmony_ci		ret = snprintf(bp, size_bp, (a));			\
411562306a36Sopenharmony_ci		if (ret < 0 || ret >= size_bp)				\
411662306a36Sopenharmony_ci			goto out_overflow;				\
411762306a36Sopenharmony_ci		size_bp -= ret;						\
411862306a36Sopenharmony_ci		bp += ret;						\
411962306a36Sopenharmony_ci	} while (0)
412062306a36Sopenharmony_ci
412162306a36Sopenharmony_ci#define CHECK_APPEND_1ARG(a, v1)					\
412262306a36Sopenharmony_ci	do {								\
412362306a36Sopenharmony_ci		ret = snprintf(bp, size_bp, (a), (v1));			\
412462306a36Sopenharmony_ci		if (ret < 0 || ret >= size_bp)				\
412562306a36Sopenharmony_ci			goto out_overflow;				\
412662306a36Sopenharmony_ci		size_bp -= ret;						\
412762306a36Sopenharmony_ci		bp += ret;						\
412862306a36Sopenharmony_ci	} while (0)
412962306a36Sopenharmony_ci
413062306a36Sopenharmony_ci#define CHECK_APPEND_2ARG(a, v1, v2)					\
413162306a36Sopenharmony_ci	do {								\
413262306a36Sopenharmony_ci		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
413362306a36Sopenharmony_ci		if (ret < 0 || ret >= size_bp)				\
413462306a36Sopenharmony_ci			goto out_overflow;				\
413562306a36Sopenharmony_ci		size_bp -= ret;						\
413662306a36Sopenharmony_ci		bp += ret;						\
413762306a36Sopenharmony_ci	} while (0)
413862306a36Sopenharmony_ci
413962306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
414062306a36Sopenharmony_ci		CHECK_APPEND_1ARG("convert=%s,",
414162306a36Sopenharmony_ci				  btrfs_bg_type_to_raid_name(bargs->target));
414262306a36Sopenharmony_ci
414362306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_SOFT)
414462306a36Sopenharmony_ci		CHECK_APPEND_NOARG("soft,");
414562306a36Sopenharmony_ci
414662306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
414762306a36Sopenharmony_ci		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
414862306a36Sopenharmony_ci					    sizeof(tmp_buf));
414962306a36Sopenharmony_ci		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
415062306a36Sopenharmony_ci	}
415162306a36Sopenharmony_ci
415262306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_USAGE)
415362306a36Sopenharmony_ci		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
415462306a36Sopenharmony_ci
415562306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
415662306a36Sopenharmony_ci		CHECK_APPEND_2ARG("usage=%u..%u,",
415762306a36Sopenharmony_ci				  bargs->usage_min, bargs->usage_max);
415862306a36Sopenharmony_ci
415962306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_DEVID)
416062306a36Sopenharmony_ci		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
416162306a36Sopenharmony_ci
416262306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
416362306a36Sopenharmony_ci		CHECK_APPEND_2ARG("drange=%llu..%llu,",
416462306a36Sopenharmony_ci				  bargs->pstart, bargs->pend);
416562306a36Sopenharmony_ci
416662306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
416762306a36Sopenharmony_ci		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
416862306a36Sopenharmony_ci				  bargs->vstart, bargs->vend);
416962306a36Sopenharmony_ci
417062306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
417162306a36Sopenharmony_ci		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
417262306a36Sopenharmony_ci
417362306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
417462306a36Sopenharmony_ci		CHECK_APPEND_2ARG("limit=%u..%u,",
417562306a36Sopenharmony_ci				bargs->limit_min, bargs->limit_max);
417662306a36Sopenharmony_ci
417762306a36Sopenharmony_ci	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
417862306a36Sopenharmony_ci		CHECK_APPEND_2ARG("stripes=%u..%u,",
417962306a36Sopenharmony_ci				  bargs->stripes_min, bargs->stripes_max);
418062306a36Sopenharmony_ci
418162306a36Sopenharmony_ci#undef CHECK_APPEND_2ARG
418262306a36Sopenharmony_ci#undef CHECK_APPEND_1ARG
418362306a36Sopenharmony_ci#undef CHECK_APPEND_NOARG
418462306a36Sopenharmony_ci
418562306a36Sopenharmony_ciout_overflow:
418662306a36Sopenharmony_ci
418762306a36Sopenharmony_ci	if (size_bp < size_buf)
418862306a36Sopenharmony_ci		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
418962306a36Sopenharmony_ci	else
419062306a36Sopenharmony_ci		buf[0] = '\0';
419162306a36Sopenharmony_ci}
419262306a36Sopenharmony_ci
419362306a36Sopenharmony_cistatic void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
419462306a36Sopenharmony_ci{
419562306a36Sopenharmony_ci	u32 size_buf = 1024;
419662306a36Sopenharmony_ci	char tmp_buf[192] = {'\0'};
419762306a36Sopenharmony_ci	char *buf;
419862306a36Sopenharmony_ci	char *bp;
419962306a36Sopenharmony_ci	u32 size_bp = size_buf;
420062306a36Sopenharmony_ci	int ret;
420162306a36Sopenharmony_ci	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
420262306a36Sopenharmony_ci
420362306a36Sopenharmony_ci	buf = kzalloc(size_buf, GFP_KERNEL);
420462306a36Sopenharmony_ci	if (!buf)
420562306a36Sopenharmony_ci		return;
420662306a36Sopenharmony_ci
420762306a36Sopenharmony_ci	bp = buf;
420862306a36Sopenharmony_ci
420962306a36Sopenharmony_ci#define CHECK_APPEND_1ARG(a, v1)					\
421062306a36Sopenharmony_ci	do {								\
421162306a36Sopenharmony_ci		ret = snprintf(bp, size_bp, (a), (v1));			\
421262306a36Sopenharmony_ci		if (ret < 0 || ret >= size_bp)				\
421362306a36Sopenharmony_ci			goto out_overflow;				\
421462306a36Sopenharmony_ci		size_bp -= ret;						\
421562306a36Sopenharmony_ci		bp += ret;						\
421662306a36Sopenharmony_ci	} while (0)
421762306a36Sopenharmony_ci
421862306a36Sopenharmony_ci	if (bctl->flags & BTRFS_BALANCE_FORCE)
421962306a36Sopenharmony_ci		CHECK_APPEND_1ARG("%s", "-f ");
422062306a36Sopenharmony_ci
422162306a36Sopenharmony_ci	if (bctl->flags & BTRFS_BALANCE_DATA) {
422262306a36Sopenharmony_ci		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
422362306a36Sopenharmony_ci		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
422462306a36Sopenharmony_ci	}
422562306a36Sopenharmony_ci
422662306a36Sopenharmony_ci	if (bctl->flags & BTRFS_BALANCE_METADATA) {
422762306a36Sopenharmony_ci		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
422862306a36Sopenharmony_ci		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
422962306a36Sopenharmony_ci	}
423062306a36Sopenharmony_ci
423162306a36Sopenharmony_ci	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
423262306a36Sopenharmony_ci		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
423362306a36Sopenharmony_ci		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
423462306a36Sopenharmony_ci	}
423562306a36Sopenharmony_ci
423662306a36Sopenharmony_ci#undef CHECK_APPEND_1ARG
423762306a36Sopenharmony_ci
423862306a36Sopenharmony_ciout_overflow:
423962306a36Sopenharmony_ci
424062306a36Sopenharmony_ci	if (size_bp < size_buf)
424162306a36Sopenharmony_ci		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
424262306a36Sopenharmony_ci	btrfs_info(fs_info, "balance: %s %s",
424362306a36Sopenharmony_ci		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
424462306a36Sopenharmony_ci		   "resume" : "start", buf);
424562306a36Sopenharmony_ci
424662306a36Sopenharmony_ci	kfree(buf);
424762306a36Sopenharmony_ci}
424862306a36Sopenharmony_ci
424962306a36Sopenharmony_ci/*
425062306a36Sopenharmony_ci * Should be called with balance mutexe held
425162306a36Sopenharmony_ci */
425262306a36Sopenharmony_ciint btrfs_balance(struct btrfs_fs_info *fs_info,
425362306a36Sopenharmony_ci		  struct btrfs_balance_control *bctl,
425462306a36Sopenharmony_ci		  struct btrfs_ioctl_balance_args *bargs)
425562306a36Sopenharmony_ci{
425662306a36Sopenharmony_ci	u64 meta_target, data_target;
425762306a36Sopenharmony_ci	u64 allowed;
425862306a36Sopenharmony_ci	int mixed = 0;
425962306a36Sopenharmony_ci	int ret;
426062306a36Sopenharmony_ci	u64 num_devices;
426162306a36Sopenharmony_ci	unsigned seq;
426262306a36Sopenharmony_ci	bool reducing_redundancy;
426362306a36Sopenharmony_ci	bool paused = false;
426462306a36Sopenharmony_ci	int i;
426562306a36Sopenharmony_ci
426662306a36Sopenharmony_ci	if (btrfs_fs_closing(fs_info) ||
426762306a36Sopenharmony_ci	    atomic_read(&fs_info->balance_pause_req) ||
426862306a36Sopenharmony_ci	    btrfs_should_cancel_balance(fs_info)) {
426962306a36Sopenharmony_ci		ret = -EINVAL;
427062306a36Sopenharmony_ci		goto out;
427162306a36Sopenharmony_ci	}
427262306a36Sopenharmony_ci
427362306a36Sopenharmony_ci	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
427462306a36Sopenharmony_ci	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
427562306a36Sopenharmony_ci		mixed = 1;
427662306a36Sopenharmony_ci
427762306a36Sopenharmony_ci	/*
427862306a36Sopenharmony_ci	 * In case of mixed groups both data and meta should be picked,
427962306a36Sopenharmony_ci	 * and identical options should be given for both of them.
428062306a36Sopenharmony_ci	 */
428162306a36Sopenharmony_ci	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
428262306a36Sopenharmony_ci	if (mixed && (bctl->flags & allowed)) {
428362306a36Sopenharmony_ci		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
428462306a36Sopenharmony_ci		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
428562306a36Sopenharmony_ci		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
428662306a36Sopenharmony_ci			btrfs_err(fs_info,
428762306a36Sopenharmony_ci	  "balance: mixed groups data and metadata options must be the same");
428862306a36Sopenharmony_ci			ret = -EINVAL;
428962306a36Sopenharmony_ci			goto out;
429062306a36Sopenharmony_ci		}
429162306a36Sopenharmony_ci	}
429262306a36Sopenharmony_ci
429362306a36Sopenharmony_ci	/*
429462306a36Sopenharmony_ci	 * rw_devices will not change at the moment, device add/delete/replace
429562306a36Sopenharmony_ci	 * are exclusive
429662306a36Sopenharmony_ci	 */
429762306a36Sopenharmony_ci	num_devices = fs_info->fs_devices->rw_devices;
429862306a36Sopenharmony_ci
429962306a36Sopenharmony_ci	/*
430062306a36Sopenharmony_ci	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
430162306a36Sopenharmony_ci	 * special bit for it, to make it easier to distinguish.  Thus we need
430262306a36Sopenharmony_ci	 * to set it manually, or balance would refuse the profile.
430362306a36Sopenharmony_ci	 */
430462306a36Sopenharmony_ci	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
430562306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
430662306a36Sopenharmony_ci		if (num_devices >= btrfs_raid_array[i].devs_min)
430762306a36Sopenharmony_ci			allowed |= btrfs_raid_array[i].bg_flag;
430862306a36Sopenharmony_ci
430962306a36Sopenharmony_ci	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
431062306a36Sopenharmony_ci	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
431162306a36Sopenharmony_ci	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
431262306a36Sopenharmony_ci		ret = -EINVAL;
431362306a36Sopenharmony_ci		goto out;
431462306a36Sopenharmony_ci	}
431562306a36Sopenharmony_ci
431662306a36Sopenharmony_ci	/*
431762306a36Sopenharmony_ci	 * Allow to reduce metadata or system integrity only if force set for
431862306a36Sopenharmony_ci	 * profiles with redundancy (copies, parity)
431962306a36Sopenharmony_ci	 */
432062306a36Sopenharmony_ci	allowed = 0;
432162306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
432262306a36Sopenharmony_ci		if (btrfs_raid_array[i].ncopies >= 2 ||
432362306a36Sopenharmony_ci		    btrfs_raid_array[i].tolerated_failures >= 1)
432462306a36Sopenharmony_ci			allowed |= btrfs_raid_array[i].bg_flag;
432562306a36Sopenharmony_ci	}
432662306a36Sopenharmony_ci	do {
432762306a36Sopenharmony_ci		seq = read_seqbegin(&fs_info->profiles_lock);
432862306a36Sopenharmony_ci
432962306a36Sopenharmony_ci		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
433062306a36Sopenharmony_ci		     (fs_info->avail_system_alloc_bits & allowed) &&
433162306a36Sopenharmony_ci		     !(bctl->sys.target & allowed)) ||
433262306a36Sopenharmony_ci		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
433362306a36Sopenharmony_ci		     (fs_info->avail_metadata_alloc_bits & allowed) &&
433462306a36Sopenharmony_ci		     !(bctl->meta.target & allowed)))
433562306a36Sopenharmony_ci			reducing_redundancy = true;
433662306a36Sopenharmony_ci		else
433762306a36Sopenharmony_ci			reducing_redundancy = false;
433862306a36Sopenharmony_ci
433962306a36Sopenharmony_ci		/* if we're not converting, the target field is uninitialized */
434062306a36Sopenharmony_ci		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
434162306a36Sopenharmony_ci			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
434262306a36Sopenharmony_ci		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
434362306a36Sopenharmony_ci			bctl->data.target : fs_info->avail_data_alloc_bits;
434462306a36Sopenharmony_ci	} while (read_seqretry(&fs_info->profiles_lock, seq));
434562306a36Sopenharmony_ci
434662306a36Sopenharmony_ci	if (reducing_redundancy) {
434762306a36Sopenharmony_ci		if (bctl->flags & BTRFS_BALANCE_FORCE) {
434862306a36Sopenharmony_ci			btrfs_info(fs_info,
434962306a36Sopenharmony_ci			   "balance: force reducing metadata redundancy");
435062306a36Sopenharmony_ci		} else {
435162306a36Sopenharmony_ci			btrfs_err(fs_info,
435262306a36Sopenharmony_ci	"balance: reduces metadata redundancy, use --force if you want this");
435362306a36Sopenharmony_ci			ret = -EINVAL;
435462306a36Sopenharmony_ci			goto out;
435562306a36Sopenharmony_ci		}
435662306a36Sopenharmony_ci	}
435762306a36Sopenharmony_ci
435862306a36Sopenharmony_ci	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
435962306a36Sopenharmony_ci		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
436062306a36Sopenharmony_ci		btrfs_warn(fs_info,
436162306a36Sopenharmony_ci	"balance: metadata profile %s has lower redundancy than data profile %s",
436262306a36Sopenharmony_ci				btrfs_bg_type_to_raid_name(meta_target),
436362306a36Sopenharmony_ci				btrfs_bg_type_to_raid_name(data_target));
436462306a36Sopenharmony_ci	}
436562306a36Sopenharmony_ci
436662306a36Sopenharmony_ci	ret = insert_balance_item(fs_info, bctl);
436762306a36Sopenharmony_ci	if (ret && ret != -EEXIST)
436862306a36Sopenharmony_ci		goto out;
436962306a36Sopenharmony_ci
437062306a36Sopenharmony_ci	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
437162306a36Sopenharmony_ci		BUG_ON(ret == -EEXIST);
437262306a36Sopenharmony_ci		BUG_ON(fs_info->balance_ctl);
437362306a36Sopenharmony_ci		spin_lock(&fs_info->balance_lock);
437462306a36Sopenharmony_ci		fs_info->balance_ctl = bctl;
437562306a36Sopenharmony_ci		spin_unlock(&fs_info->balance_lock);
437662306a36Sopenharmony_ci	} else {
437762306a36Sopenharmony_ci		BUG_ON(ret != -EEXIST);
437862306a36Sopenharmony_ci		spin_lock(&fs_info->balance_lock);
437962306a36Sopenharmony_ci		update_balance_args(bctl);
438062306a36Sopenharmony_ci		spin_unlock(&fs_info->balance_lock);
438162306a36Sopenharmony_ci	}
438262306a36Sopenharmony_ci
438362306a36Sopenharmony_ci	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
438462306a36Sopenharmony_ci	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
438562306a36Sopenharmony_ci	describe_balance_start_or_resume(fs_info);
438662306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
438762306a36Sopenharmony_ci
438862306a36Sopenharmony_ci	ret = __btrfs_balance(fs_info);
438962306a36Sopenharmony_ci
439062306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
439162306a36Sopenharmony_ci	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
439262306a36Sopenharmony_ci		btrfs_info(fs_info, "balance: paused");
439362306a36Sopenharmony_ci		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
439462306a36Sopenharmony_ci		paused = true;
439562306a36Sopenharmony_ci	}
439662306a36Sopenharmony_ci	/*
439762306a36Sopenharmony_ci	 * Balance can be canceled by:
439862306a36Sopenharmony_ci	 *
439962306a36Sopenharmony_ci	 * - Regular cancel request
440062306a36Sopenharmony_ci	 *   Then ret == -ECANCELED and balance_cancel_req > 0
440162306a36Sopenharmony_ci	 *
440262306a36Sopenharmony_ci	 * - Fatal signal to "btrfs" process
440362306a36Sopenharmony_ci	 *   Either the signal caught by wait_reserve_ticket() and callers
440462306a36Sopenharmony_ci	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
440562306a36Sopenharmony_ci	 *   got -ECANCELED.
440662306a36Sopenharmony_ci	 *   Either way, in this case balance_cancel_req = 0, and
440762306a36Sopenharmony_ci	 *   ret == -EINTR or ret == -ECANCELED.
440862306a36Sopenharmony_ci	 *
440962306a36Sopenharmony_ci	 * So here we only check the return value to catch canceled balance.
441062306a36Sopenharmony_ci	 */
441162306a36Sopenharmony_ci	else if (ret == -ECANCELED || ret == -EINTR)
441262306a36Sopenharmony_ci		btrfs_info(fs_info, "balance: canceled");
441362306a36Sopenharmony_ci	else
441462306a36Sopenharmony_ci		btrfs_info(fs_info, "balance: ended with status: %d", ret);
441562306a36Sopenharmony_ci
441662306a36Sopenharmony_ci	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
441762306a36Sopenharmony_ci
441862306a36Sopenharmony_ci	if (bargs) {
441962306a36Sopenharmony_ci		memset(bargs, 0, sizeof(*bargs));
442062306a36Sopenharmony_ci		btrfs_update_ioctl_balance_args(fs_info, bargs);
442162306a36Sopenharmony_ci	}
442262306a36Sopenharmony_ci
442362306a36Sopenharmony_ci	/* We didn't pause, we can clean everything up. */
442462306a36Sopenharmony_ci	if (!paused) {
442562306a36Sopenharmony_ci		reset_balance_state(fs_info);
442662306a36Sopenharmony_ci		btrfs_exclop_finish(fs_info);
442762306a36Sopenharmony_ci	}
442862306a36Sopenharmony_ci
442962306a36Sopenharmony_ci	wake_up(&fs_info->balance_wait_q);
443062306a36Sopenharmony_ci
443162306a36Sopenharmony_ci	return ret;
443262306a36Sopenharmony_ciout:
443362306a36Sopenharmony_ci	if (bctl->flags & BTRFS_BALANCE_RESUME)
443462306a36Sopenharmony_ci		reset_balance_state(fs_info);
443562306a36Sopenharmony_ci	else
443662306a36Sopenharmony_ci		kfree(bctl);
443762306a36Sopenharmony_ci	btrfs_exclop_finish(fs_info);
443862306a36Sopenharmony_ci
443962306a36Sopenharmony_ci	return ret;
444062306a36Sopenharmony_ci}
444162306a36Sopenharmony_ci
444262306a36Sopenharmony_cistatic int balance_kthread(void *data)
444362306a36Sopenharmony_ci{
444462306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = data;
444562306a36Sopenharmony_ci	int ret = 0;
444662306a36Sopenharmony_ci
444762306a36Sopenharmony_ci	sb_start_write(fs_info->sb);
444862306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
444962306a36Sopenharmony_ci	if (fs_info->balance_ctl)
445062306a36Sopenharmony_ci		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
445162306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
445262306a36Sopenharmony_ci	sb_end_write(fs_info->sb);
445362306a36Sopenharmony_ci
445462306a36Sopenharmony_ci	return ret;
445562306a36Sopenharmony_ci}
445662306a36Sopenharmony_ci
445762306a36Sopenharmony_ciint btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
445862306a36Sopenharmony_ci{
445962306a36Sopenharmony_ci	struct task_struct *tsk;
446062306a36Sopenharmony_ci
446162306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
446262306a36Sopenharmony_ci	if (!fs_info->balance_ctl) {
446362306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
446462306a36Sopenharmony_ci		return 0;
446562306a36Sopenharmony_ci	}
446662306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
446762306a36Sopenharmony_ci
446862306a36Sopenharmony_ci	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
446962306a36Sopenharmony_ci		btrfs_info(fs_info, "balance: resume skipped");
447062306a36Sopenharmony_ci		return 0;
447162306a36Sopenharmony_ci	}
447262306a36Sopenharmony_ci
447362306a36Sopenharmony_ci	spin_lock(&fs_info->super_lock);
447462306a36Sopenharmony_ci	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
447562306a36Sopenharmony_ci	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
447662306a36Sopenharmony_ci	spin_unlock(&fs_info->super_lock);
447762306a36Sopenharmony_ci	/*
447862306a36Sopenharmony_ci	 * A ro->rw remount sequence should continue with the paused balance
447962306a36Sopenharmony_ci	 * regardless of who pauses it, system or the user as of now, so set
448062306a36Sopenharmony_ci	 * the resume flag.
448162306a36Sopenharmony_ci	 */
448262306a36Sopenharmony_ci	spin_lock(&fs_info->balance_lock);
448362306a36Sopenharmony_ci	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
448462306a36Sopenharmony_ci	spin_unlock(&fs_info->balance_lock);
448562306a36Sopenharmony_ci
448662306a36Sopenharmony_ci	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
448762306a36Sopenharmony_ci	return PTR_ERR_OR_ZERO(tsk);
448862306a36Sopenharmony_ci}
448962306a36Sopenharmony_ci
449062306a36Sopenharmony_ciint btrfs_recover_balance(struct btrfs_fs_info *fs_info)
449162306a36Sopenharmony_ci{
449262306a36Sopenharmony_ci	struct btrfs_balance_control *bctl;
449362306a36Sopenharmony_ci	struct btrfs_balance_item *item;
449462306a36Sopenharmony_ci	struct btrfs_disk_balance_args disk_bargs;
449562306a36Sopenharmony_ci	struct btrfs_path *path;
449662306a36Sopenharmony_ci	struct extent_buffer *leaf;
449762306a36Sopenharmony_ci	struct btrfs_key key;
449862306a36Sopenharmony_ci	int ret;
449962306a36Sopenharmony_ci
450062306a36Sopenharmony_ci	path = btrfs_alloc_path();
450162306a36Sopenharmony_ci	if (!path)
450262306a36Sopenharmony_ci		return -ENOMEM;
450362306a36Sopenharmony_ci
450462306a36Sopenharmony_ci	key.objectid = BTRFS_BALANCE_OBJECTID;
450562306a36Sopenharmony_ci	key.type = BTRFS_TEMPORARY_ITEM_KEY;
450662306a36Sopenharmony_ci	key.offset = 0;
450762306a36Sopenharmony_ci
450862306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
450962306a36Sopenharmony_ci	if (ret < 0)
451062306a36Sopenharmony_ci		goto out;
451162306a36Sopenharmony_ci	if (ret > 0) { /* ret = -ENOENT; */
451262306a36Sopenharmony_ci		ret = 0;
451362306a36Sopenharmony_ci		goto out;
451462306a36Sopenharmony_ci	}
451562306a36Sopenharmony_ci
451662306a36Sopenharmony_ci	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
451762306a36Sopenharmony_ci	if (!bctl) {
451862306a36Sopenharmony_ci		ret = -ENOMEM;
451962306a36Sopenharmony_ci		goto out;
452062306a36Sopenharmony_ci	}
452162306a36Sopenharmony_ci
452262306a36Sopenharmony_ci	leaf = path->nodes[0];
452362306a36Sopenharmony_ci	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
452462306a36Sopenharmony_ci
452562306a36Sopenharmony_ci	bctl->flags = btrfs_balance_flags(leaf, item);
452662306a36Sopenharmony_ci	bctl->flags |= BTRFS_BALANCE_RESUME;
452762306a36Sopenharmony_ci
452862306a36Sopenharmony_ci	btrfs_balance_data(leaf, item, &disk_bargs);
452962306a36Sopenharmony_ci	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
453062306a36Sopenharmony_ci	btrfs_balance_meta(leaf, item, &disk_bargs);
453162306a36Sopenharmony_ci	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
453262306a36Sopenharmony_ci	btrfs_balance_sys(leaf, item, &disk_bargs);
453362306a36Sopenharmony_ci	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
453462306a36Sopenharmony_ci
453562306a36Sopenharmony_ci	/*
453662306a36Sopenharmony_ci	 * This should never happen, as the paused balance state is recovered
453762306a36Sopenharmony_ci	 * during mount without any chance of other exclusive ops to collide.
453862306a36Sopenharmony_ci	 *
453962306a36Sopenharmony_ci	 * This gives the exclusive op status to balance and keeps in paused
454062306a36Sopenharmony_ci	 * state until user intervention (cancel or umount). If the ownership
454162306a36Sopenharmony_ci	 * cannot be assigned, show a message but do not fail. The balance
454262306a36Sopenharmony_ci	 * is in a paused state and must have fs_info::balance_ctl properly
454362306a36Sopenharmony_ci	 * set up.
454462306a36Sopenharmony_ci	 */
454562306a36Sopenharmony_ci	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
454662306a36Sopenharmony_ci		btrfs_warn(fs_info,
454762306a36Sopenharmony_ci	"balance: cannot set exclusive op status, resume manually");
454862306a36Sopenharmony_ci
454962306a36Sopenharmony_ci	btrfs_release_path(path);
455062306a36Sopenharmony_ci
455162306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
455262306a36Sopenharmony_ci	BUG_ON(fs_info->balance_ctl);
455362306a36Sopenharmony_ci	spin_lock(&fs_info->balance_lock);
455462306a36Sopenharmony_ci	fs_info->balance_ctl = bctl;
455562306a36Sopenharmony_ci	spin_unlock(&fs_info->balance_lock);
455662306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
455762306a36Sopenharmony_ciout:
455862306a36Sopenharmony_ci	btrfs_free_path(path);
455962306a36Sopenharmony_ci	return ret;
456062306a36Sopenharmony_ci}
456162306a36Sopenharmony_ci
456262306a36Sopenharmony_ciint btrfs_pause_balance(struct btrfs_fs_info *fs_info)
456362306a36Sopenharmony_ci{
456462306a36Sopenharmony_ci	int ret = 0;
456562306a36Sopenharmony_ci
456662306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
456762306a36Sopenharmony_ci	if (!fs_info->balance_ctl) {
456862306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
456962306a36Sopenharmony_ci		return -ENOTCONN;
457062306a36Sopenharmony_ci	}
457162306a36Sopenharmony_ci
457262306a36Sopenharmony_ci	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
457362306a36Sopenharmony_ci		atomic_inc(&fs_info->balance_pause_req);
457462306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
457562306a36Sopenharmony_ci
457662306a36Sopenharmony_ci		wait_event(fs_info->balance_wait_q,
457762306a36Sopenharmony_ci			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
457862306a36Sopenharmony_ci
457962306a36Sopenharmony_ci		mutex_lock(&fs_info->balance_mutex);
458062306a36Sopenharmony_ci		/* we are good with balance_ctl ripped off from under us */
458162306a36Sopenharmony_ci		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
458262306a36Sopenharmony_ci		atomic_dec(&fs_info->balance_pause_req);
458362306a36Sopenharmony_ci	} else {
458462306a36Sopenharmony_ci		ret = -ENOTCONN;
458562306a36Sopenharmony_ci	}
458662306a36Sopenharmony_ci
458762306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
458862306a36Sopenharmony_ci	return ret;
458962306a36Sopenharmony_ci}
459062306a36Sopenharmony_ci
459162306a36Sopenharmony_ciint btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
459262306a36Sopenharmony_ci{
459362306a36Sopenharmony_ci	mutex_lock(&fs_info->balance_mutex);
459462306a36Sopenharmony_ci	if (!fs_info->balance_ctl) {
459562306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
459662306a36Sopenharmony_ci		return -ENOTCONN;
459762306a36Sopenharmony_ci	}
459862306a36Sopenharmony_ci
459962306a36Sopenharmony_ci	/*
460062306a36Sopenharmony_ci	 * A paused balance with the item stored on disk can be resumed at
460162306a36Sopenharmony_ci	 * mount time if the mount is read-write. Otherwise it's still paused
460262306a36Sopenharmony_ci	 * and we must not allow cancelling as it deletes the item.
460362306a36Sopenharmony_ci	 */
460462306a36Sopenharmony_ci	if (sb_rdonly(fs_info->sb)) {
460562306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
460662306a36Sopenharmony_ci		return -EROFS;
460762306a36Sopenharmony_ci	}
460862306a36Sopenharmony_ci
460962306a36Sopenharmony_ci	atomic_inc(&fs_info->balance_cancel_req);
461062306a36Sopenharmony_ci	/*
461162306a36Sopenharmony_ci	 * if we are running just wait and return, balance item is
461262306a36Sopenharmony_ci	 * deleted in btrfs_balance in this case
461362306a36Sopenharmony_ci	 */
461462306a36Sopenharmony_ci	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
461562306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
461662306a36Sopenharmony_ci		wait_event(fs_info->balance_wait_q,
461762306a36Sopenharmony_ci			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
461862306a36Sopenharmony_ci		mutex_lock(&fs_info->balance_mutex);
461962306a36Sopenharmony_ci	} else {
462062306a36Sopenharmony_ci		mutex_unlock(&fs_info->balance_mutex);
462162306a36Sopenharmony_ci		/*
462262306a36Sopenharmony_ci		 * Lock released to allow other waiters to continue, we'll
462362306a36Sopenharmony_ci		 * reexamine the status again.
462462306a36Sopenharmony_ci		 */
462562306a36Sopenharmony_ci		mutex_lock(&fs_info->balance_mutex);
462662306a36Sopenharmony_ci
462762306a36Sopenharmony_ci		if (fs_info->balance_ctl) {
462862306a36Sopenharmony_ci			reset_balance_state(fs_info);
462962306a36Sopenharmony_ci			btrfs_exclop_finish(fs_info);
463062306a36Sopenharmony_ci			btrfs_info(fs_info, "balance: canceled");
463162306a36Sopenharmony_ci		}
463262306a36Sopenharmony_ci	}
463362306a36Sopenharmony_ci
463462306a36Sopenharmony_ci	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
463562306a36Sopenharmony_ci	atomic_dec(&fs_info->balance_cancel_req);
463662306a36Sopenharmony_ci	mutex_unlock(&fs_info->balance_mutex);
463762306a36Sopenharmony_ci	return 0;
463862306a36Sopenharmony_ci}
463962306a36Sopenharmony_ci
464062306a36Sopenharmony_ciint btrfs_uuid_scan_kthread(void *data)
464162306a36Sopenharmony_ci{
464262306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = data;
464362306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->tree_root;
464462306a36Sopenharmony_ci	struct btrfs_key key;
464562306a36Sopenharmony_ci	struct btrfs_path *path = NULL;
464662306a36Sopenharmony_ci	int ret = 0;
464762306a36Sopenharmony_ci	struct extent_buffer *eb;
464862306a36Sopenharmony_ci	int slot;
464962306a36Sopenharmony_ci	struct btrfs_root_item root_item;
465062306a36Sopenharmony_ci	u32 item_size;
465162306a36Sopenharmony_ci	struct btrfs_trans_handle *trans = NULL;
465262306a36Sopenharmony_ci	bool closing = false;
465362306a36Sopenharmony_ci
465462306a36Sopenharmony_ci	path = btrfs_alloc_path();
465562306a36Sopenharmony_ci	if (!path) {
465662306a36Sopenharmony_ci		ret = -ENOMEM;
465762306a36Sopenharmony_ci		goto out;
465862306a36Sopenharmony_ci	}
465962306a36Sopenharmony_ci
466062306a36Sopenharmony_ci	key.objectid = 0;
466162306a36Sopenharmony_ci	key.type = BTRFS_ROOT_ITEM_KEY;
466262306a36Sopenharmony_ci	key.offset = 0;
466362306a36Sopenharmony_ci
466462306a36Sopenharmony_ci	while (1) {
466562306a36Sopenharmony_ci		if (btrfs_fs_closing(fs_info)) {
466662306a36Sopenharmony_ci			closing = true;
466762306a36Sopenharmony_ci			break;
466862306a36Sopenharmony_ci		}
466962306a36Sopenharmony_ci		ret = btrfs_search_forward(root, &key, path,
467062306a36Sopenharmony_ci				BTRFS_OLDEST_GENERATION);
467162306a36Sopenharmony_ci		if (ret) {
467262306a36Sopenharmony_ci			if (ret > 0)
467362306a36Sopenharmony_ci				ret = 0;
467462306a36Sopenharmony_ci			break;
467562306a36Sopenharmony_ci		}
467662306a36Sopenharmony_ci
467762306a36Sopenharmony_ci		if (key.type != BTRFS_ROOT_ITEM_KEY ||
467862306a36Sopenharmony_ci		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
467962306a36Sopenharmony_ci		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
468062306a36Sopenharmony_ci		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
468162306a36Sopenharmony_ci			goto skip;
468262306a36Sopenharmony_ci
468362306a36Sopenharmony_ci		eb = path->nodes[0];
468462306a36Sopenharmony_ci		slot = path->slots[0];
468562306a36Sopenharmony_ci		item_size = btrfs_item_size(eb, slot);
468662306a36Sopenharmony_ci		if (item_size < sizeof(root_item))
468762306a36Sopenharmony_ci			goto skip;
468862306a36Sopenharmony_ci
468962306a36Sopenharmony_ci		read_extent_buffer(eb, &root_item,
469062306a36Sopenharmony_ci				   btrfs_item_ptr_offset(eb, slot),
469162306a36Sopenharmony_ci				   (int)sizeof(root_item));
469262306a36Sopenharmony_ci		if (btrfs_root_refs(&root_item) == 0)
469362306a36Sopenharmony_ci			goto skip;
469462306a36Sopenharmony_ci
469562306a36Sopenharmony_ci		if (!btrfs_is_empty_uuid(root_item.uuid) ||
469662306a36Sopenharmony_ci		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
469762306a36Sopenharmony_ci			if (trans)
469862306a36Sopenharmony_ci				goto update_tree;
469962306a36Sopenharmony_ci
470062306a36Sopenharmony_ci			btrfs_release_path(path);
470162306a36Sopenharmony_ci			/*
470262306a36Sopenharmony_ci			 * 1 - subvol uuid item
470362306a36Sopenharmony_ci			 * 1 - received_subvol uuid item
470462306a36Sopenharmony_ci			 */
470562306a36Sopenharmony_ci			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
470662306a36Sopenharmony_ci			if (IS_ERR(trans)) {
470762306a36Sopenharmony_ci				ret = PTR_ERR(trans);
470862306a36Sopenharmony_ci				break;
470962306a36Sopenharmony_ci			}
471062306a36Sopenharmony_ci			continue;
471162306a36Sopenharmony_ci		} else {
471262306a36Sopenharmony_ci			goto skip;
471362306a36Sopenharmony_ci		}
471462306a36Sopenharmony_ciupdate_tree:
471562306a36Sopenharmony_ci		btrfs_release_path(path);
471662306a36Sopenharmony_ci		if (!btrfs_is_empty_uuid(root_item.uuid)) {
471762306a36Sopenharmony_ci			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
471862306a36Sopenharmony_ci						  BTRFS_UUID_KEY_SUBVOL,
471962306a36Sopenharmony_ci						  key.objectid);
472062306a36Sopenharmony_ci			if (ret < 0) {
472162306a36Sopenharmony_ci				btrfs_warn(fs_info, "uuid_tree_add failed %d",
472262306a36Sopenharmony_ci					ret);
472362306a36Sopenharmony_ci				break;
472462306a36Sopenharmony_ci			}
472562306a36Sopenharmony_ci		}
472662306a36Sopenharmony_ci
472762306a36Sopenharmony_ci		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
472862306a36Sopenharmony_ci			ret = btrfs_uuid_tree_add(trans,
472962306a36Sopenharmony_ci						  root_item.received_uuid,
473062306a36Sopenharmony_ci						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
473162306a36Sopenharmony_ci						  key.objectid);
473262306a36Sopenharmony_ci			if (ret < 0) {
473362306a36Sopenharmony_ci				btrfs_warn(fs_info, "uuid_tree_add failed %d",
473462306a36Sopenharmony_ci					ret);
473562306a36Sopenharmony_ci				break;
473662306a36Sopenharmony_ci			}
473762306a36Sopenharmony_ci		}
473862306a36Sopenharmony_ci
473962306a36Sopenharmony_ciskip:
474062306a36Sopenharmony_ci		btrfs_release_path(path);
474162306a36Sopenharmony_ci		if (trans) {
474262306a36Sopenharmony_ci			ret = btrfs_end_transaction(trans);
474362306a36Sopenharmony_ci			trans = NULL;
474462306a36Sopenharmony_ci			if (ret)
474562306a36Sopenharmony_ci				break;
474662306a36Sopenharmony_ci		}
474762306a36Sopenharmony_ci
474862306a36Sopenharmony_ci		if (key.offset < (u64)-1) {
474962306a36Sopenharmony_ci			key.offset++;
475062306a36Sopenharmony_ci		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
475162306a36Sopenharmony_ci			key.offset = 0;
475262306a36Sopenharmony_ci			key.type = BTRFS_ROOT_ITEM_KEY;
475362306a36Sopenharmony_ci		} else if (key.objectid < (u64)-1) {
475462306a36Sopenharmony_ci			key.offset = 0;
475562306a36Sopenharmony_ci			key.type = BTRFS_ROOT_ITEM_KEY;
475662306a36Sopenharmony_ci			key.objectid++;
475762306a36Sopenharmony_ci		} else {
475862306a36Sopenharmony_ci			break;
475962306a36Sopenharmony_ci		}
476062306a36Sopenharmony_ci		cond_resched();
476162306a36Sopenharmony_ci	}
476262306a36Sopenharmony_ci
476362306a36Sopenharmony_ciout:
476462306a36Sopenharmony_ci	btrfs_free_path(path);
476562306a36Sopenharmony_ci	if (trans && !IS_ERR(trans))
476662306a36Sopenharmony_ci		btrfs_end_transaction(trans);
476762306a36Sopenharmony_ci	if (ret)
476862306a36Sopenharmony_ci		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
476962306a36Sopenharmony_ci	else if (!closing)
477062306a36Sopenharmony_ci		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
477162306a36Sopenharmony_ci	up(&fs_info->uuid_tree_rescan_sem);
477262306a36Sopenharmony_ci	return 0;
477362306a36Sopenharmony_ci}
477462306a36Sopenharmony_ci
477562306a36Sopenharmony_ciint btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
477662306a36Sopenharmony_ci{
477762306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
477862306a36Sopenharmony_ci	struct btrfs_root *tree_root = fs_info->tree_root;
477962306a36Sopenharmony_ci	struct btrfs_root *uuid_root;
478062306a36Sopenharmony_ci	struct task_struct *task;
478162306a36Sopenharmony_ci	int ret;
478262306a36Sopenharmony_ci
478362306a36Sopenharmony_ci	/*
478462306a36Sopenharmony_ci	 * 1 - root node
478562306a36Sopenharmony_ci	 * 1 - root item
478662306a36Sopenharmony_ci	 */
478762306a36Sopenharmony_ci	trans = btrfs_start_transaction(tree_root, 2);
478862306a36Sopenharmony_ci	if (IS_ERR(trans))
478962306a36Sopenharmony_ci		return PTR_ERR(trans);
479062306a36Sopenharmony_ci
479162306a36Sopenharmony_ci	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
479262306a36Sopenharmony_ci	if (IS_ERR(uuid_root)) {
479362306a36Sopenharmony_ci		ret = PTR_ERR(uuid_root);
479462306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
479562306a36Sopenharmony_ci		btrfs_end_transaction(trans);
479662306a36Sopenharmony_ci		return ret;
479762306a36Sopenharmony_ci	}
479862306a36Sopenharmony_ci
479962306a36Sopenharmony_ci	fs_info->uuid_root = uuid_root;
480062306a36Sopenharmony_ci
480162306a36Sopenharmony_ci	ret = btrfs_commit_transaction(trans);
480262306a36Sopenharmony_ci	if (ret)
480362306a36Sopenharmony_ci		return ret;
480462306a36Sopenharmony_ci
480562306a36Sopenharmony_ci	down(&fs_info->uuid_tree_rescan_sem);
480662306a36Sopenharmony_ci	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
480762306a36Sopenharmony_ci	if (IS_ERR(task)) {
480862306a36Sopenharmony_ci		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
480962306a36Sopenharmony_ci		btrfs_warn(fs_info, "failed to start uuid_scan task");
481062306a36Sopenharmony_ci		up(&fs_info->uuid_tree_rescan_sem);
481162306a36Sopenharmony_ci		return PTR_ERR(task);
481262306a36Sopenharmony_ci	}
481362306a36Sopenharmony_ci
481462306a36Sopenharmony_ci	return 0;
481562306a36Sopenharmony_ci}
481662306a36Sopenharmony_ci
481762306a36Sopenharmony_ci/*
481862306a36Sopenharmony_ci * shrinking a device means finding all of the device extents past
481962306a36Sopenharmony_ci * the new size, and then following the back refs to the chunks.
482062306a36Sopenharmony_ci * The chunk relocation code actually frees the device extent
482162306a36Sopenharmony_ci */
482262306a36Sopenharmony_ciint btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
482362306a36Sopenharmony_ci{
482462306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
482562306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
482662306a36Sopenharmony_ci	struct btrfs_trans_handle *trans;
482762306a36Sopenharmony_ci	struct btrfs_dev_extent *dev_extent = NULL;
482862306a36Sopenharmony_ci	struct btrfs_path *path;
482962306a36Sopenharmony_ci	u64 length;
483062306a36Sopenharmony_ci	u64 chunk_offset;
483162306a36Sopenharmony_ci	int ret;
483262306a36Sopenharmony_ci	int slot;
483362306a36Sopenharmony_ci	int failed = 0;
483462306a36Sopenharmony_ci	bool retried = false;
483562306a36Sopenharmony_ci	struct extent_buffer *l;
483662306a36Sopenharmony_ci	struct btrfs_key key;
483762306a36Sopenharmony_ci	struct btrfs_super_block *super_copy = fs_info->super_copy;
483862306a36Sopenharmony_ci	u64 old_total = btrfs_super_total_bytes(super_copy);
483962306a36Sopenharmony_ci	u64 old_size = btrfs_device_get_total_bytes(device);
484062306a36Sopenharmony_ci	u64 diff;
484162306a36Sopenharmony_ci	u64 start;
484262306a36Sopenharmony_ci
484362306a36Sopenharmony_ci	new_size = round_down(new_size, fs_info->sectorsize);
484462306a36Sopenharmony_ci	start = new_size;
484562306a36Sopenharmony_ci	diff = round_down(old_size - new_size, fs_info->sectorsize);
484662306a36Sopenharmony_ci
484762306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
484862306a36Sopenharmony_ci		return -EINVAL;
484962306a36Sopenharmony_ci
485062306a36Sopenharmony_ci	path = btrfs_alloc_path();
485162306a36Sopenharmony_ci	if (!path)
485262306a36Sopenharmony_ci		return -ENOMEM;
485362306a36Sopenharmony_ci
485462306a36Sopenharmony_ci	path->reada = READA_BACK;
485562306a36Sopenharmony_ci
485662306a36Sopenharmony_ci	trans = btrfs_start_transaction(root, 0);
485762306a36Sopenharmony_ci	if (IS_ERR(trans)) {
485862306a36Sopenharmony_ci		btrfs_free_path(path);
485962306a36Sopenharmony_ci		return PTR_ERR(trans);
486062306a36Sopenharmony_ci	}
486162306a36Sopenharmony_ci
486262306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
486362306a36Sopenharmony_ci
486462306a36Sopenharmony_ci	btrfs_device_set_total_bytes(device, new_size);
486562306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
486662306a36Sopenharmony_ci		device->fs_devices->total_rw_bytes -= diff;
486762306a36Sopenharmony_ci		atomic64_sub(diff, &fs_info->free_chunk_space);
486862306a36Sopenharmony_ci	}
486962306a36Sopenharmony_ci
487062306a36Sopenharmony_ci	/*
487162306a36Sopenharmony_ci	 * Once the device's size has been set to the new size, ensure all
487262306a36Sopenharmony_ci	 * in-memory chunks are synced to disk so that the loop below sees them
487362306a36Sopenharmony_ci	 * and relocates them accordingly.
487462306a36Sopenharmony_ci	 */
487562306a36Sopenharmony_ci	if (contains_pending_extent(device, &start, diff)) {
487662306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
487762306a36Sopenharmony_ci		ret = btrfs_commit_transaction(trans);
487862306a36Sopenharmony_ci		if (ret)
487962306a36Sopenharmony_ci			goto done;
488062306a36Sopenharmony_ci	} else {
488162306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
488262306a36Sopenharmony_ci		btrfs_end_transaction(trans);
488362306a36Sopenharmony_ci	}
488462306a36Sopenharmony_ci
488562306a36Sopenharmony_ciagain:
488662306a36Sopenharmony_ci	key.objectid = device->devid;
488762306a36Sopenharmony_ci	key.offset = (u64)-1;
488862306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
488962306a36Sopenharmony_ci
489062306a36Sopenharmony_ci	do {
489162306a36Sopenharmony_ci		mutex_lock(&fs_info->reclaim_bgs_lock);
489262306a36Sopenharmony_ci		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
489362306a36Sopenharmony_ci		if (ret < 0) {
489462306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
489562306a36Sopenharmony_ci			goto done;
489662306a36Sopenharmony_ci		}
489762306a36Sopenharmony_ci
489862306a36Sopenharmony_ci		ret = btrfs_previous_item(root, path, 0, key.type);
489962306a36Sopenharmony_ci		if (ret) {
490062306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
490162306a36Sopenharmony_ci			if (ret < 0)
490262306a36Sopenharmony_ci				goto done;
490362306a36Sopenharmony_ci			ret = 0;
490462306a36Sopenharmony_ci			btrfs_release_path(path);
490562306a36Sopenharmony_ci			break;
490662306a36Sopenharmony_ci		}
490762306a36Sopenharmony_ci
490862306a36Sopenharmony_ci		l = path->nodes[0];
490962306a36Sopenharmony_ci		slot = path->slots[0];
491062306a36Sopenharmony_ci		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
491162306a36Sopenharmony_ci
491262306a36Sopenharmony_ci		if (key.objectid != device->devid) {
491362306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
491462306a36Sopenharmony_ci			btrfs_release_path(path);
491562306a36Sopenharmony_ci			break;
491662306a36Sopenharmony_ci		}
491762306a36Sopenharmony_ci
491862306a36Sopenharmony_ci		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
491962306a36Sopenharmony_ci		length = btrfs_dev_extent_length(l, dev_extent);
492062306a36Sopenharmony_ci
492162306a36Sopenharmony_ci		if (key.offset + length <= new_size) {
492262306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
492362306a36Sopenharmony_ci			btrfs_release_path(path);
492462306a36Sopenharmony_ci			break;
492562306a36Sopenharmony_ci		}
492662306a36Sopenharmony_ci
492762306a36Sopenharmony_ci		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
492862306a36Sopenharmony_ci		btrfs_release_path(path);
492962306a36Sopenharmony_ci
493062306a36Sopenharmony_ci		/*
493162306a36Sopenharmony_ci		 * We may be relocating the only data chunk we have,
493262306a36Sopenharmony_ci		 * which could potentially end up with losing data's
493362306a36Sopenharmony_ci		 * raid profile, so lets allocate an empty one in
493462306a36Sopenharmony_ci		 * advance.
493562306a36Sopenharmony_ci		 */
493662306a36Sopenharmony_ci		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
493762306a36Sopenharmony_ci		if (ret < 0) {
493862306a36Sopenharmony_ci			mutex_unlock(&fs_info->reclaim_bgs_lock);
493962306a36Sopenharmony_ci			goto done;
494062306a36Sopenharmony_ci		}
494162306a36Sopenharmony_ci
494262306a36Sopenharmony_ci		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
494362306a36Sopenharmony_ci		mutex_unlock(&fs_info->reclaim_bgs_lock);
494462306a36Sopenharmony_ci		if (ret == -ENOSPC) {
494562306a36Sopenharmony_ci			failed++;
494662306a36Sopenharmony_ci		} else if (ret) {
494762306a36Sopenharmony_ci			if (ret == -ETXTBSY) {
494862306a36Sopenharmony_ci				btrfs_warn(fs_info,
494962306a36Sopenharmony_ci		   "could not shrink block group %llu due to active swapfile",
495062306a36Sopenharmony_ci					   chunk_offset);
495162306a36Sopenharmony_ci			}
495262306a36Sopenharmony_ci			goto done;
495362306a36Sopenharmony_ci		}
495462306a36Sopenharmony_ci	} while (key.offset-- > 0);
495562306a36Sopenharmony_ci
495662306a36Sopenharmony_ci	if (failed && !retried) {
495762306a36Sopenharmony_ci		failed = 0;
495862306a36Sopenharmony_ci		retried = true;
495962306a36Sopenharmony_ci		goto again;
496062306a36Sopenharmony_ci	} else if (failed && retried) {
496162306a36Sopenharmony_ci		ret = -ENOSPC;
496262306a36Sopenharmony_ci		goto done;
496362306a36Sopenharmony_ci	}
496462306a36Sopenharmony_ci
496562306a36Sopenharmony_ci	/* Shrinking succeeded, else we would be at "done". */
496662306a36Sopenharmony_ci	trans = btrfs_start_transaction(root, 0);
496762306a36Sopenharmony_ci	if (IS_ERR(trans)) {
496862306a36Sopenharmony_ci		ret = PTR_ERR(trans);
496962306a36Sopenharmony_ci		goto done;
497062306a36Sopenharmony_ci	}
497162306a36Sopenharmony_ci
497262306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
497362306a36Sopenharmony_ci	/* Clear all state bits beyond the shrunk device size */
497462306a36Sopenharmony_ci	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
497562306a36Sopenharmony_ci			  CHUNK_STATE_MASK);
497662306a36Sopenharmony_ci
497762306a36Sopenharmony_ci	btrfs_device_set_disk_total_bytes(device, new_size);
497862306a36Sopenharmony_ci	if (list_empty(&device->post_commit_list))
497962306a36Sopenharmony_ci		list_add_tail(&device->post_commit_list,
498062306a36Sopenharmony_ci			      &trans->transaction->dev_update_list);
498162306a36Sopenharmony_ci
498262306a36Sopenharmony_ci	WARN_ON(diff > old_total);
498362306a36Sopenharmony_ci	btrfs_set_super_total_bytes(super_copy,
498462306a36Sopenharmony_ci			round_down(old_total - diff, fs_info->sectorsize));
498562306a36Sopenharmony_ci	mutex_unlock(&fs_info->chunk_mutex);
498662306a36Sopenharmony_ci
498762306a36Sopenharmony_ci	btrfs_reserve_chunk_metadata(trans, false);
498862306a36Sopenharmony_ci	/* Now btrfs_update_device() will change the on-disk size. */
498962306a36Sopenharmony_ci	ret = btrfs_update_device(trans, device);
499062306a36Sopenharmony_ci	btrfs_trans_release_chunk_metadata(trans);
499162306a36Sopenharmony_ci	if (ret < 0) {
499262306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
499362306a36Sopenharmony_ci		btrfs_end_transaction(trans);
499462306a36Sopenharmony_ci	} else {
499562306a36Sopenharmony_ci		ret = btrfs_commit_transaction(trans);
499662306a36Sopenharmony_ci	}
499762306a36Sopenharmony_cidone:
499862306a36Sopenharmony_ci	btrfs_free_path(path);
499962306a36Sopenharmony_ci	if (ret) {
500062306a36Sopenharmony_ci		mutex_lock(&fs_info->chunk_mutex);
500162306a36Sopenharmony_ci		btrfs_device_set_total_bytes(device, old_size);
500262306a36Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
500362306a36Sopenharmony_ci			device->fs_devices->total_rw_bytes += diff;
500462306a36Sopenharmony_ci		atomic64_add(diff, &fs_info->free_chunk_space);
500562306a36Sopenharmony_ci		mutex_unlock(&fs_info->chunk_mutex);
500662306a36Sopenharmony_ci	}
500762306a36Sopenharmony_ci	return ret;
500862306a36Sopenharmony_ci}
500962306a36Sopenharmony_ci
501062306a36Sopenharmony_cistatic int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
501162306a36Sopenharmony_ci			   struct btrfs_key *key,
501262306a36Sopenharmony_ci			   struct btrfs_chunk *chunk, int item_size)
501362306a36Sopenharmony_ci{
501462306a36Sopenharmony_ci	struct btrfs_super_block *super_copy = fs_info->super_copy;
501562306a36Sopenharmony_ci	struct btrfs_disk_key disk_key;
501662306a36Sopenharmony_ci	u32 array_size;
501762306a36Sopenharmony_ci	u8 *ptr;
501862306a36Sopenharmony_ci
501962306a36Sopenharmony_ci	lockdep_assert_held(&fs_info->chunk_mutex);
502062306a36Sopenharmony_ci
502162306a36Sopenharmony_ci	array_size = btrfs_super_sys_array_size(super_copy);
502262306a36Sopenharmony_ci	if (array_size + item_size + sizeof(disk_key)
502362306a36Sopenharmony_ci			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
502462306a36Sopenharmony_ci		return -EFBIG;
502562306a36Sopenharmony_ci
502662306a36Sopenharmony_ci	ptr = super_copy->sys_chunk_array + array_size;
502762306a36Sopenharmony_ci	btrfs_cpu_key_to_disk(&disk_key, key);
502862306a36Sopenharmony_ci	memcpy(ptr, &disk_key, sizeof(disk_key));
502962306a36Sopenharmony_ci	ptr += sizeof(disk_key);
503062306a36Sopenharmony_ci	memcpy(ptr, chunk, item_size);
503162306a36Sopenharmony_ci	item_size += sizeof(disk_key);
503262306a36Sopenharmony_ci	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
503362306a36Sopenharmony_ci
503462306a36Sopenharmony_ci	return 0;
503562306a36Sopenharmony_ci}
503662306a36Sopenharmony_ci
503762306a36Sopenharmony_ci/*
503862306a36Sopenharmony_ci * sort the devices in descending order by max_avail, total_avail
503962306a36Sopenharmony_ci */
504062306a36Sopenharmony_cistatic int btrfs_cmp_device_info(const void *a, const void *b)
504162306a36Sopenharmony_ci{
504262306a36Sopenharmony_ci	const struct btrfs_device_info *di_a = a;
504362306a36Sopenharmony_ci	const struct btrfs_device_info *di_b = b;
504462306a36Sopenharmony_ci
504562306a36Sopenharmony_ci	if (di_a->max_avail > di_b->max_avail)
504662306a36Sopenharmony_ci		return -1;
504762306a36Sopenharmony_ci	if (di_a->max_avail < di_b->max_avail)
504862306a36Sopenharmony_ci		return 1;
504962306a36Sopenharmony_ci	if (di_a->total_avail > di_b->total_avail)
505062306a36Sopenharmony_ci		return -1;
505162306a36Sopenharmony_ci	if (di_a->total_avail < di_b->total_avail)
505262306a36Sopenharmony_ci		return 1;
505362306a36Sopenharmony_ci	return 0;
505462306a36Sopenharmony_ci}
505562306a36Sopenharmony_ci
505662306a36Sopenharmony_cistatic void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
505762306a36Sopenharmony_ci{
505862306a36Sopenharmony_ci	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
505962306a36Sopenharmony_ci		return;
506062306a36Sopenharmony_ci
506162306a36Sopenharmony_ci	btrfs_set_fs_incompat(info, RAID56);
506262306a36Sopenharmony_ci}
506362306a36Sopenharmony_ci
506462306a36Sopenharmony_cistatic void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
506562306a36Sopenharmony_ci{
506662306a36Sopenharmony_ci	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
506762306a36Sopenharmony_ci		return;
506862306a36Sopenharmony_ci
506962306a36Sopenharmony_ci	btrfs_set_fs_incompat(info, RAID1C34);
507062306a36Sopenharmony_ci}
507162306a36Sopenharmony_ci
507262306a36Sopenharmony_ci/*
507362306a36Sopenharmony_ci * Structure used internally for btrfs_create_chunk() function.
507462306a36Sopenharmony_ci * Wraps needed parameters.
507562306a36Sopenharmony_ci */
507662306a36Sopenharmony_cistruct alloc_chunk_ctl {
507762306a36Sopenharmony_ci	u64 start;
507862306a36Sopenharmony_ci	u64 type;
507962306a36Sopenharmony_ci	/* Total number of stripes to allocate */
508062306a36Sopenharmony_ci	int num_stripes;
508162306a36Sopenharmony_ci	/* sub_stripes info for map */
508262306a36Sopenharmony_ci	int sub_stripes;
508362306a36Sopenharmony_ci	/* Stripes per device */
508462306a36Sopenharmony_ci	int dev_stripes;
508562306a36Sopenharmony_ci	/* Maximum number of devices to use */
508662306a36Sopenharmony_ci	int devs_max;
508762306a36Sopenharmony_ci	/* Minimum number of devices to use */
508862306a36Sopenharmony_ci	int devs_min;
508962306a36Sopenharmony_ci	/* ndevs has to be a multiple of this */
509062306a36Sopenharmony_ci	int devs_increment;
509162306a36Sopenharmony_ci	/* Number of copies */
509262306a36Sopenharmony_ci	int ncopies;
509362306a36Sopenharmony_ci	/* Number of stripes worth of bytes to store parity information */
509462306a36Sopenharmony_ci	int nparity;
509562306a36Sopenharmony_ci	u64 max_stripe_size;
509662306a36Sopenharmony_ci	u64 max_chunk_size;
509762306a36Sopenharmony_ci	u64 dev_extent_min;
509862306a36Sopenharmony_ci	u64 stripe_size;
509962306a36Sopenharmony_ci	u64 chunk_size;
510062306a36Sopenharmony_ci	int ndevs;
510162306a36Sopenharmony_ci};
510262306a36Sopenharmony_ci
510362306a36Sopenharmony_cistatic void init_alloc_chunk_ctl_policy_regular(
510462306a36Sopenharmony_ci				struct btrfs_fs_devices *fs_devices,
510562306a36Sopenharmony_ci				struct alloc_chunk_ctl *ctl)
510662306a36Sopenharmony_ci{
510762306a36Sopenharmony_ci	struct btrfs_space_info *space_info;
510862306a36Sopenharmony_ci
510962306a36Sopenharmony_ci	space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
511062306a36Sopenharmony_ci	ASSERT(space_info);
511162306a36Sopenharmony_ci
511262306a36Sopenharmony_ci	ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
511362306a36Sopenharmony_ci	ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
511462306a36Sopenharmony_ci
511562306a36Sopenharmony_ci	if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
511662306a36Sopenharmony_ci		ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
511762306a36Sopenharmony_ci
511862306a36Sopenharmony_ci	/* We don't want a chunk larger than 10% of writable space */
511962306a36Sopenharmony_ci	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
512062306a36Sopenharmony_ci				  ctl->max_chunk_size);
512162306a36Sopenharmony_ci	ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
512262306a36Sopenharmony_ci}
512362306a36Sopenharmony_ci
512462306a36Sopenharmony_cistatic void init_alloc_chunk_ctl_policy_zoned(
512562306a36Sopenharmony_ci				      struct btrfs_fs_devices *fs_devices,
512662306a36Sopenharmony_ci				      struct alloc_chunk_ctl *ctl)
512762306a36Sopenharmony_ci{
512862306a36Sopenharmony_ci	u64 zone_size = fs_devices->fs_info->zone_size;
512962306a36Sopenharmony_ci	u64 limit;
513062306a36Sopenharmony_ci	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
513162306a36Sopenharmony_ci	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
513262306a36Sopenharmony_ci	u64 min_chunk_size = min_data_stripes * zone_size;
513362306a36Sopenharmony_ci	u64 type = ctl->type;
513462306a36Sopenharmony_ci
513562306a36Sopenharmony_ci	ctl->max_stripe_size = zone_size;
513662306a36Sopenharmony_ci	if (type & BTRFS_BLOCK_GROUP_DATA) {
513762306a36Sopenharmony_ci		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
513862306a36Sopenharmony_ci						 zone_size);
513962306a36Sopenharmony_ci	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
514062306a36Sopenharmony_ci		ctl->max_chunk_size = ctl->max_stripe_size;
514162306a36Sopenharmony_ci	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
514262306a36Sopenharmony_ci		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
514362306a36Sopenharmony_ci		ctl->devs_max = min_t(int, ctl->devs_max,
514462306a36Sopenharmony_ci				      BTRFS_MAX_DEVS_SYS_CHUNK);
514562306a36Sopenharmony_ci	} else {
514662306a36Sopenharmony_ci		BUG();
514762306a36Sopenharmony_ci	}
514862306a36Sopenharmony_ci
514962306a36Sopenharmony_ci	/* We don't want a chunk larger than 10% of writable space */
515062306a36Sopenharmony_ci	limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
515162306a36Sopenharmony_ci			       zone_size),
515262306a36Sopenharmony_ci		    min_chunk_size);
515362306a36Sopenharmony_ci	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
515462306a36Sopenharmony_ci	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
515562306a36Sopenharmony_ci}
515662306a36Sopenharmony_ci
515762306a36Sopenharmony_cistatic void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
515862306a36Sopenharmony_ci				 struct alloc_chunk_ctl *ctl)
515962306a36Sopenharmony_ci{
516062306a36Sopenharmony_ci	int index = btrfs_bg_flags_to_raid_index(ctl->type);
516162306a36Sopenharmony_ci
516262306a36Sopenharmony_ci	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
516362306a36Sopenharmony_ci	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
516462306a36Sopenharmony_ci	ctl->devs_max = btrfs_raid_array[index].devs_max;
516562306a36Sopenharmony_ci	if (!ctl->devs_max)
516662306a36Sopenharmony_ci		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
516762306a36Sopenharmony_ci	ctl->devs_min = btrfs_raid_array[index].devs_min;
516862306a36Sopenharmony_ci	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
516962306a36Sopenharmony_ci	ctl->ncopies = btrfs_raid_array[index].ncopies;
517062306a36Sopenharmony_ci	ctl->nparity = btrfs_raid_array[index].nparity;
517162306a36Sopenharmony_ci	ctl->ndevs = 0;
517262306a36Sopenharmony_ci
517362306a36Sopenharmony_ci	switch (fs_devices->chunk_alloc_policy) {
517462306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_REGULAR:
517562306a36Sopenharmony_ci		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
517662306a36Sopenharmony_ci		break;
517762306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_ZONED:
517862306a36Sopenharmony_ci		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
517962306a36Sopenharmony_ci		break;
518062306a36Sopenharmony_ci	default:
518162306a36Sopenharmony_ci		BUG();
518262306a36Sopenharmony_ci	}
518362306a36Sopenharmony_ci}
518462306a36Sopenharmony_ci
518562306a36Sopenharmony_cistatic int gather_device_info(struct btrfs_fs_devices *fs_devices,
518662306a36Sopenharmony_ci			      struct alloc_chunk_ctl *ctl,
518762306a36Sopenharmony_ci			      struct btrfs_device_info *devices_info)
518862306a36Sopenharmony_ci{
518962306a36Sopenharmony_ci	struct btrfs_fs_info *info = fs_devices->fs_info;
519062306a36Sopenharmony_ci	struct btrfs_device *device;
519162306a36Sopenharmony_ci	u64 total_avail;
519262306a36Sopenharmony_ci	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
519362306a36Sopenharmony_ci	int ret;
519462306a36Sopenharmony_ci	int ndevs = 0;
519562306a36Sopenharmony_ci	u64 max_avail;
519662306a36Sopenharmony_ci	u64 dev_offset;
519762306a36Sopenharmony_ci
519862306a36Sopenharmony_ci	/*
519962306a36Sopenharmony_ci	 * in the first pass through the devices list, we gather information
520062306a36Sopenharmony_ci	 * about the available holes on each device.
520162306a36Sopenharmony_ci	 */
520262306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
520362306a36Sopenharmony_ci		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
520462306a36Sopenharmony_ci			WARN(1, KERN_ERR
520562306a36Sopenharmony_ci			       "BTRFS: read-only device in alloc_list\n");
520662306a36Sopenharmony_ci			continue;
520762306a36Sopenharmony_ci		}
520862306a36Sopenharmony_ci
520962306a36Sopenharmony_ci		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
521062306a36Sopenharmony_ci					&device->dev_state) ||
521162306a36Sopenharmony_ci		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
521262306a36Sopenharmony_ci			continue;
521362306a36Sopenharmony_ci
521462306a36Sopenharmony_ci		if (device->total_bytes > device->bytes_used)
521562306a36Sopenharmony_ci			total_avail = device->total_bytes - device->bytes_used;
521662306a36Sopenharmony_ci		else
521762306a36Sopenharmony_ci			total_avail = 0;
521862306a36Sopenharmony_ci
521962306a36Sopenharmony_ci		/* If there is no space on this device, skip it. */
522062306a36Sopenharmony_ci		if (total_avail < ctl->dev_extent_min)
522162306a36Sopenharmony_ci			continue;
522262306a36Sopenharmony_ci
522362306a36Sopenharmony_ci		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
522462306a36Sopenharmony_ci					   &max_avail);
522562306a36Sopenharmony_ci		if (ret && ret != -ENOSPC)
522662306a36Sopenharmony_ci			return ret;
522762306a36Sopenharmony_ci
522862306a36Sopenharmony_ci		if (ret == 0)
522962306a36Sopenharmony_ci			max_avail = dev_extent_want;
523062306a36Sopenharmony_ci
523162306a36Sopenharmony_ci		if (max_avail < ctl->dev_extent_min) {
523262306a36Sopenharmony_ci			if (btrfs_test_opt(info, ENOSPC_DEBUG))
523362306a36Sopenharmony_ci				btrfs_debug(info,
523462306a36Sopenharmony_ci			"%s: devid %llu has no free space, have=%llu want=%llu",
523562306a36Sopenharmony_ci					    __func__, device->devid, max_avail,
523662306a36Sopenharmony_ci					    ctl->dev_extent_min);
523762306a36Sopenharmony_ci			continue;
523862306a36Sopenharmony_ci		}
523962306a36Sopenharmony_ci
524062306a36Sopenharmony_ci		if (ndevs == fs_devices->rw_devices) {
524162306a36Sopenharmony_ci			WARN(1, "%s: found more than %llu devices\n",
524262306a36Sopenharmony_ci			     __func__, fs_devices->rw_devices);
524362306a36Sopenharmony_ci			break;
524462306a36Sopenharmony_ci		}
524562306a36Sopenharmony_ci		devices_info[ndevs].dev_offset = dev_offset;
524662306a36Sopenharmony_ci		devices_info[ndevs].max_avail = max_avail;
524762306a36Sopenharmony_ci		devices_info[ndevs].total_avail = total_avail;
524862306a36Sopenharmony_ci		devices_info[ndevs].dev = device;
524962306a36Sopenharmony_ci		++ndevs;
525062306a36Sopenharmony_ci	}
525162306a36Sopenharmony_ci	ctl->ndevs = ndevs;
525262306a36Sopenharmony_ci
525362306a36Sopenharmony_ci	/*
525462306a36Sopenharmony_ci	 * now sort the devices by hole size / available space
525562306a36Sopenharmony_ci	 */
525662306a36Sopenharmony_ci	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
525762306a36Sopenharmony_ci	     btrfs_cmp_device_info, NULL);
525862306a36Sopenharmony_ci
525962306a36Sopenharmony_ci	return 0;
526062306a36Sopenharmony_ci}
526162306a36Sopenharmony_ci
526262306a36Sopenharmony_cistatic int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
526362306a36Sopenharmony_ci				      struct btrfs_device_info *devices_info)
526462306a36Sopenharmony_ci{
526562306a36Sopenharmony_ci	/* Number of stripes that count for block group size */
526662306a36Sopenharmony_ci	int data_stripes;
526762306a36Sopenharmony_ci
526862306a36Sopenharmony_ci	/*
526962306a36Sopenharmony_ci	 * The primary goal is to maximize the number of stripes, so use as
527062306a36Sopenharmony_ci	 * many devices as possible, even if the stripes are not maximum sized.
527162306a36Sopenharmony_ci	 *
527262306a36Sopenharmony_ci	 * The DUP profile stores more than one stripe per device, the
527362306a36Sopenharmony_ci	 * max_avail is the total size so we have to adjust.
527462306a36Sopenharmony_ci	 */
527562306a36Sopenharmony_ci	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
527662306a36Sopenharmony_ci				   ctl->dev_stripes);
527762306a36Sopenharmony_ci	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
527862306a36Sopenharmony_ci
527962306a36Sopenharmony_ci	/* This will have to be fixed for RAID1 and RAID10 over more drives */
528062306a36Sopenharmony_ci	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
528162306a36Sopenharmony_ci
528262306a36Sopenharmony_ci	/*
528362306a36Sopenharmony_ci	 * Use the number of data stripes to figure out how big this chunk is
528462306a36Sopenharmony_ci	 * really going to be in terms of logical address space, and compare
528562306a36Sopenharmony_ci	 * that answer with the max chunk size. If it's higher, we try to
528662306a36Sopenharmony_ci	 * reduce stripe_size.
528762306a36Sopenharmony_ci	 */
528862306a36Sopenharmony_ci	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
528962306a36Sopenharmony_ci		/*
529062306a36Sopenharmony_ci		 * Reduce stripe_size, round it up to a 16MB boundary again and
529162306a36Sopenharmony_ci		 * then use it, unless it ends up being even bigger than the
529262306a36Sopenharmony_ci		 * previous value we had already.
529362306a36Sopenharmony_ci		 */
529462306a36Sopenharmony_ci		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
529562306a36Sopenharmony_ci							data_stripes), SZ_16M),
529662306a36Sopenharmony_ci				       ctl->stripe_size);
529762306a36Sopenharmony_ci	}
529862306a36Sopenharmony_ci
529962306a36Sopenharmony_ci	/* Stripe size should not go beyond 1G. */
530062306a36Sopenharmony_ci	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
530162306a36Sopenharmony_ci
530262306a36Sopenharmony_ci	/* Align to BTRFS_STRIPE_LEN */
530362306a36Sopenharmony_ci	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
530462306a36Sopenharmony_ci	ctl->chunk_size = ctl->stripe_size * data_stripes;
530562306a36Sopenharmony_ci
530662306a36Sopenharmony_ci	return 0;
530762306a36Sopenharmony_ci}
530862306a36Sopenharmony_ci
530962306a36Sopenharmony_cistatic int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
531062306a36Sopenharmony_ci				    struct btrfs_device_info *devices_info)
531162306a36Sopenharmony_ci{
531262306a36Sopenharmony_ci	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
531362306a36Sopenharmony_ci	/* Number of stripes that count for block group size */
531462306a36Sopenharmony_ci	int data_stripes;
531562306a36Sopenharmony_ci
531662306a36Sopenharmony_ci	/*
531762306a36Sopenharmony_ci	 * It should hold because:
531862306a36Sopenharmony_ci	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
531962306a36Sopenharmony_ci	 */
532062306a36Sopenharmony_ci	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
532162306a36Sopenharmony_ci
532262306a36Sopenharmony_ci	ctl->stripe_size = zone_size;
532362306a36Sopenharmony_ci	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
532462306a36Sopenharmony_ci	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
532562306a36Sopenharmony_ci
532662306a36Sopenharmony_ci	/* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
532762306a36Sopenharmony_ci	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
532862306a36Sopenharmony_ci		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
532962306a36Sopenharmony_ci					     ctl->stripe_size) + ctl->nparity,
533062306a36Sopenharmony_ci				     ctl->dev_stripes);
533162306a36Sopenharmony_ci		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
533262306a36Sopenharmony_ci		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
533362306a36Sopenharmony_ci		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
533462306a36Sopenharmony_ci	}
533562306a36Sopenharmony_ci
533662306a36Sopenharmony_ci	ctl->chunk_size = ctl->stripe_size * data_stripes;
533762306a36Sopenharmony_ci
533862306a36Sopenharmony_ci	return 0;
533962306a36Sopenharmony_ci}
534062306a36Sopenharmony_ci
534162306a36Sopenharmony_cistatic int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
534262306a36Sopenharmony_ci			      struct alloc_chunk_ctl *ctl,
534362306a36Sopenharmony_ci			      struct btrfs_device_info *devices_info)
534462306a36Sopenharmony_ci{
534562306a36Sopenharmony_ci	struct btrfs_fs_info *info = fs_devices->fs_info;
534662306a36Sopenharmony_ci
534762306a36Sopenharmony_ci	/*
534862306a36Sopenharmony_ci	 * Round down to number of usable stripes, devs_increment can be any
534962306a36Sopenharmony_ci	 * number so we can't use round_down() that requires power of 2, while
535062306a36Sopenharmony_ci	 * rounddown is safe.
535162306a36Sopenharmony_ci	 */
535262306a36Sopenharmony_ci	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
535362306a36Sopenharmony_ci
535462306a36Sopenharmony_ci	if (ctl->ndevs < ctl->devs_min) {
535562306a36Sopenharmony_ci		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
535662306a36Sopenharmony_ci			btrfs_debug(info,
535762306a36Sopenharmony_ci	"%s: not enough devices with free space: have=%d minimum required=%d",
535862306a36Sopenharmony_ci				    __func__, ctl->ndevs, ctl->devs_min);
535962306a36Sopenharmony_ci		}
536062306a36Sopenharmony_ci		return -ENOSPC;
536162306a36Sopenharmony_ci	}
536262306a36Sopenharmony_ci
536362306a36Sopenharmony_ci	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
536462306a36Sopenharmony_ci
536562306a36Sopenharmony_ci	switch (fs_devices->chunk_alloc_policy) {
536662306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_REGULAR:
536762306a36Sopenharmony_ci		return decide_stripe_size_regular(ctl, devices_info);
536862306a36Sopenharmony_ci	case BTRFS_CHUNK_ALLOC_ZONED:
536962306a36Sopenharmony_ci		return decide_stripe_size_zoned(ctl, devices_info);
537062306a36Sopenharmony_ci	default:
537162306a36Sopenharmony_ci		BUG();
537262306a36Sopenharmony_ci	}
537362306a36Sopenharmony_ci}
537462306a36Sopenharmony_ci
537562306a36Sopenharmony_cistatic struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
537662306a36Sopenharmony_ci			struct alloc_chunk_ctl *ctl,
537762306a36Sopenharmony_ci			struct btrfs_device_info *devices_info)
537862306a36Sopenharmony_ci{
537962306a36Sopenharmony_ci	struct btrfs_fs_info *info = trans->fs_info;
538062306a36Sopenharmony_ci	struct map_lookup *map = NULL;
538162306a36Sopenharmony_ci	struct extent_map_tree *em_tree;
538262306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
538362306a36Sopenharmony_ci	struct extent_map *em;
538462306a36Sopenharmony_ci	u64 start = ctl->start;
538562306a36Sopenharmony_ci	u64 type = ctl->type;
538662306a36Sopenharmony_ci	int ret;
538762306a36Sopenharmony_ci	int i;
538862306a36Sopenharmony_ci	int j;
538962306a36Sopenharmony_ci
539062306a36Sopenharmony_ci	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
539162306a36Sopenharmony_ci	if (!map)
539262306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
539362306a36Sopenharmony_ci	map->num_stripes = ctl->num_stripes;
539462306a36Sopenharmony_ci
539562306a36Sopenharmony_ci	for (i = 0; i < ctl->ndevs; ++i) {
539662306a36Sopenharmony_ci		for (j = 0; j < ctl->dev_stripes; ++j) {
539762306a36Sopenharmony_ci			int s = i * ctl->dev_stripes + j;
539862306a36Sopenharmony_ci			map->stripes[s].dev = devices_info[i].dev;
539962306a36Sopenharmony_ci			map->stripes[s].physical = devices_info[i].dev_offset +
540062306a36Sopenharmony_ci						   j * ctl->stripe_size;
540162306a36Sopenharmony_ci		}
540262306a36Sopenharmony_ci	}
540362306a36Sopenharmony_ci	map->io_align = BTRFS_STRIPE_LEN;
540462306a36Sopenharmony_ci	map->io_width = BTRFS_STRIPE_LEN;
540562306a36Sopenharmony_ci	map->type = type;
540662306a36Sopenharmony_ci	map->sub_stripes = ctl->sub_stripes;
540762306a36Sopenharmony_ci
540862306a36Sopenharmony_ci	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
540962306a36Sopenharmony_ci
541062306a36Sopenharmony_ci	em = alloc_extent_map();
541162306a36Sopenharmony_ci	if (!em) {
541262306a36Sopenharmony_ci		kfree(map);
541362306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
541462306a36Sopenharmony_ci	}
541562306a36Sopenharmony_ci	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
541662306a36Sopenharmony_ci	em->map_lookup = map;
541762306a36Sopenharmony_ci	em->start = start;
541862306a36Sopenharmony_ci	em->len = ctl->chunk_size;
541962306a36Sopenharmony_ci	em->block_start = 0;
542062306a36Sopenharmony_ci	em->block_len = em->len;
542162306a36Sopenharmony_ci	em->orig_block_len = ctl->stripe_size;
542262306a36Sopenharmony_ci
542362306a36Sopenharmony_ci	em_tree = &info->mapping_tree;
542462306a36Sopenharmony_ci	write_lock(&em_tree->lock);
542562306a36Sopenharmony_ci	ret = add_extent_mapping(em_tree, em, 0);
542662306a36Sopenharmony_ci	if (ret) {
542762306a36Sopenharmony_ci		write_unlock(&em_tree->lock);
542862306a36Sopenharmony_ci		free_extent_map(em);
542962306a36Sopenharmony_ci		return ERR_PTR(ret);
543062306a36Sopenharmony_ci	}
543162306a36Sopenharmony_ci	write_unlock(&em_tree->lock);
543262306a36Sopenharmony_ci
543362306a36Sopenharmony_ci	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
543462306a36Sopenharmony_ci	if (IS_ERR(block_group))
543562306a36Sopenharmony_ci		goto error_del_extent;
543662306a36Sopenharmony_ci
543762306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
543862306a36Sopenharmony_ci		struct btrfs_device *dev = map->stripes[i].dev;
543962306a36Sopenharmony_ci
544062306a36Sopenharmony_ci		btrfs_device_set_bytes_used(dev,
544162306a36Sopenharmony_ci					    dev->bytes_used + ctl->stripe_size);
544262306a36Sopenharmony_ci		if (list_empty(&dev->post_commit_list))
544362306a36Sopenharmony_ci			list_add_tail(&dev->post_commit_list,
544462306a36Sopenharmony_ci				      &trans->transaction->dev_update_list);
544562306a36Sopenharmony_ci	}
544662306a36Sopenharmony_ci
544762306a36Sopenharmony_ci	atomic64_sub(ctl->stripe_size * map->num_stripes,
544862306a36Sopenharmony_ci		     &info->free_chunk_space);
544962306a36Sopenharmony_ci
545062306a36Sopenharmony_ci	free_extent_map(em);
545162306a36Sopenharmony_ci	check_raid56_incompat_flag(info, type);
545262306a36Sopenharmony_ci	check_raid1c34_incompat_flag(info, type);
545362306a36Sopenharmony_ci
545462306a36Sopenharmony_ci	return block_group;
545562306a36Sopenharmony_ci
545662306a36Sopenharmony_cierror_del_extent:
545762306a36Sopenharmony_ci	write_lock(&em_tree->lock);
545862306a36Sopenharmony_ci	remove_extent_mapping(em_tree, em);
545962306a36Sopenharmony_ci	write_unlock(&em_tree->lock);
546062306a36Sopenharmony_ci
546162306a36Sopenharmony_ci	/* One for our allocation */
546262306a36Sopenharmony_ci	free_extent_map(em);
546362306a36Sopenharmony_ci	/* One for the tree reference */
546462306a36Sopenharmony_ci	free_extent_map(em);
546562306a36Sopenharmony_ci
546662306a36Sopenharmony_ci	return block_group;
546762306a36Sopenharmony_ci}
546862306a36Sopenharmony_ci
546962306a36Sopenharmony_cistruct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
547062306a36Sopenharmony_ci					    u64 type)
547162306a36Sopenharmony_ci{
547262306a36Sopenharmony_ci	struct btrfs_fs_info *info = trans->fs_info;
547362306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = info->fs_devices;
547462306a36Sopenharmony_ci	struct btrfs_device_info *devices_info = NULL;
547562306a36Sopenharmony_ci	struct alloc_chunk_ctl ctl;
547662306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
547762306a36Sopenharmony_ci	int ret;
547862306a36Sopenharmony_ci
547962306a36Sopenharmony_ci	lockdep_assert_held(&info->chunk_mutex);
548062306a36Sopenharmony_ci
548162306a36Sopenharmony_ci	if (!alloc_profile_is_valid(type, 0)) {
548262306a36Sopenharmony_ci		ASSERT(0);
548362306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
548462306a36Sopenharmony_ci	}
548562306a36Sopenharmony_ci
548662306a36Sopenharmony_ci	if (list_empty(&fs_devices->alloc_list)) {
548762306a36Sopenharmony_ci		if (btrfs_test_opt(info, ENOSPC_DEBUG))
548862306a36Sopenharmony_ci			btrfs_debug(info, "%s: no writable device", __func__);
548962306a36Sopenharmony_ci		return ERR_PTR(-ENOSPC);
549062306a36Sopenharmony_ci	}
549162306a36Sopenharmony_ci
549262306a36Sopenharmony_ci	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
549362306a36Sopenharmony_ci		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
549462306a36Sopenharmony_ci		ASSERT(0);
549562306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
549662306a36Sopenharmony_ci	}
549762306a36Sopenharmony_ci
549862306a36Sopenharmony_ci	ctl.start = find_next_chunk(info);
549962306a36Sopenharmony_ci	ctl.type = type;
550062306a36Sopenharmony_ci	init_alloc_chunk_ctl(fs_devices, &ctl);
550162306a36Sopenharmony_ci
550262306a36Sopenharmony_ci	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
550362306a36Sopenharmony_ci			       GFP_NOFS);
550462306a36Sopenharmony_ci	if (!devices_info)
550562306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
550662306a36Sopenharmony_ci
550762306a36Sopenharmony_ci	ret = gather_device_info(fs_devices, &ctl, devices_info);
550862306a36Sopenharmony_ci	if (ret < 0) {
550962306a36Sopenharmony_ci		block_group = ERR_PTR(ret);
551062306a36Sopenharmony_ci		goto out;
551162306a36Sopenharmony_ci	}
551262306a36Sopenharmony_ci
551362306a36Sopenharmony_ci	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
551462306a36Sopenharmony_ci	if (ret < 0) {
551562306a36Sopenharmony_ci		block_group = ERR_PTR(ret);
551662306a36Sopenharmony_ci		goto out;
551762306a36Sopenharmony_ci	}
551862306a36Sopenharmony_ci
551962306a36Sopenharmony_ci	block_group = create_chunk(trans, &ctl, devices_info);
552062306a36Sopenharmony_ci
552162306a36Sopenharmony_ciout:
552262306a36Sopenharmony_ci	kfree(devices_info);
552362306a36Sopenharmony_ci	return block_group;
552462306a36Sopenharmony_ci}
552562306a36Sopenharmony_ci
552662306a36Sopenharmony_ci/*
552762306a36Sopenharmony_ci * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
552862306a36Sopenharmony_ci * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
552962306a36Sopenharmony_ci * chunks.
553062306a36Sopenharmony_ci *
553162306a36Sopenharmony_ci * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
553262306a36Sopenharmony_ci * phases.
553362306a36Sopenharmony_ci */
553462306a36Sopenharmony_ciint btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
553562306a36Sopenharmony_ci				     struct btrfs_block_group *bg)
553662306a36Sopenharmony_ci{
553762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
553862306a36Sopenharmony_ci	struct btrfs_root *chunk_root = fs_info->chunk_root;
553962306a36Sopenharmony_ci	struct btrfs_key key;
554062306a36Sopenharmony_ci	struct btrfs_chunk *chunk;
554162306a36Sopenharmony_ci	struct btrfs_stripe *stripe;
554262306a36Sopenharmony_ci	struct extent_map *em;
554362306a36Sopenharmony_ci	struct map_lookup *map;
554462306a36Sopenharmony_ci	size_t item_size;
554562306a36Sopenharmony_ci	int i;
554662306a36Sopenharmony_ci	int ret;
554762306a36Sopenharmony_ci
554862306a36Sopenharmony_ci	/*
554962306a36Sopenharmony_ci	 * We take the chunk_mutex for 2 reasons:
555062306a36Sopenharmony_ci	 *
555162306a36Sopenharmony_ci	 * 1) Updates and insertions in the chunk btree must be done while holding
555262306a36Sopenharmony_ci	 *    the chunk_mutex, as well as updating the system chunk array in the
555362306a36Sopenharmony_ci	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
555462306a36Sopenharmony_ci	 *    details;
555562306a36Sopenharmony_ci	 *
555662306a36Sopenharmony_ci	 * 2) To prevent races with the final phase of a device replace operation
555762306a36Sopenharmony_ci	 *    that replaces the device object associated with the map's stripes,
555862306a36Sopenharmony_ci	 *    because the device object's id can change at any time during that
555962306a36Sopenharmony_ci	 *    final phase of the device replace operation
556062306a36Sopenharmony_ci	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
556162306a36Sopenharmony_ci	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
556262306a36Sopenharmony_ci	 *    which would cause a failure when updating the device item, which does
556362306a36Sopenharmony_ci	 *    not exists, or persisting a stripe of the chunk item with such ID.
556462306a36Sopenharmony_ci	 *    Here we can't use the device_list_mutex because our caller already
556562306a36Sopenharmony_ci	 *    has locked the chunk_mutex, and the final phase of device replace
556662306a36Sopenharmony_ci	 *    acquires both mutexes - first the device_list_mutex and then the
556762306a36Sopenharmony_ci	 *    chunk_mutex. Using any of those two mutexes protects us from a
556862306a36Sopenharmony_ci	 *    concurrent device replace.
556962306a36Sopenharmony_ci	 */
557062306a36Sopenharmony_ci	lockdep_assert_held(&fs_info->chunk_mutex);
557162306a36Sopenharmony_ci
557262306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
557362306a36Sopenharmony_ci	if (IS_ERR(em)) {
557462306a36Sopenharmony_ci		ret = PTR_ERR(em);
557562306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
557662306a36Sopenharmony_ci		return ret;
557762306a36Sopenharmony_ci	}
557862306a36Sopenharmony_ci
557962306a36Sopenharmony_ci	map = em->map_lookup;
558062306a36Sopenharmony_ci	item_size = btrfs_chunk_item_size(map->num_stripes);
558162306a36Sopenharmony_ci
558262306a36Sopenharmony_ci	chunk = kzalloc(item_size, GFP_NOFS);
558362306a36Sopenharmony_ci	if (!chunk) {
558462306a36Sopenharmony_ci		ret = -ENOMEM;
558562306a36Sopenharmony_ci		btrfs_abort_transaction(trans, ret);
558662306a36Sopenharmony_ci		goto out;
558762306a36Sopenharmony_ci	}
558862306a36Sopenharmony_ci
558962306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
559062306a36Sopenharmony_ci		struct btrfs_device *device = map->stripes[i].dev;
559162306a36Sopenharmony_ci
559262306a36Sopenharmony_ci		ret = btrfs_update_device(trans, device);
559362306a36Sopenharmony_ci		if (ret)
559462306a36Sopenharmony_ci			goto out;
559562306a36Sopenharmony_ci	}
559662306a36Sopenharmony_ci
559762306a36Sopenharmony_ci	stripe = &chunk->stripe;
559862306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
559962306a36Sopenharmony_ci		struct btrfs_device *device = map->stripes[i].dev;
560062306a36Sopenharmony_ci		const u64 dev_offset = map->stripes[i].physical;
560162306a36Sopenharmony_ci
560262306a36Sopenharmony_ci		btrfs_set_stack_stripe_devid(stripe, device->devid);
560362306a36Sopenharmony_ci		btrfs_set_stack_stripe_offset(stripe, dev_offset);
560462306a36Sopenharmony_ci		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
560562306a36Sopenharmony_ci		stripe++;
560662306a36Sopenharmony_ci	}
560762306a36Sopenharmony_ci
560862306a36Sopenharmony_ci	btrfs_set_stack_chunk_length(chunk, bg->length);
560962306a36Sopenharmony_ci	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
561062306a36Sopenharmony_ci	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
561162306a36Sopenharmony_ci	btrfs_set_stack_chunk_type(chunk, map->type);
561262306a36Sopenharmony_ci	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
561362306a36Sopenharmony_ci	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
561462306a36Sopenharmony_ci	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
561562306a36Sopenharmony_ci	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
561662306a36Sopenharmony_ci	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
561762306a36Sopenharmony_ci
561862306a36Sopenharmony_ci	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
561962306a36Sopenharmony_ci	key.type = BTRFS_CHUNK_ITEM_KEY;
562062306a36Sopenharmony_ci	key.offset = bg->start;
562162306a36Sopenharmony_ci
562262306a36Sopenharmony_ci	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
562362306a36Sopenharmony_ci	if (ret)
562462306a36Sopenharmony_ci		goto out;
562562306a36Sopenharmony_ci
562662306a36Sopenharmony_ci	set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
562762306a36Sopenharmony_ci
562862306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
562962306a36Sopenharmony_ci		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
563062306a36Sopenharmony_ci		if (ret)
563162306a36Sopenharmony_ci			goto out;
563262306a36Sopenharmony_ci	}
563362306a36Sopenharmony_ci
563462306a36Sopenharmony_ciout:
563562306a36Sopenharmony_ci	kfree(chunk);
563662306a36Sopenharmony_ci	free_extent_map(em);
563762306a36Sopenharmony_ci	return ret;
563862306a36Sopenharmony_ci}
563962306a36Sopenharmony_ci
564062306a36Sopenharmony_cistatic noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
564162306a36Sopenharmony_ci{
564262306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
564362306a36Sopenharmony_ci	u64 alloc_profile;
564462306a36Sopenharmony_ci	struct btrfs_block_group *meta_bg;
564562306a36Sopenharmony_ci	struct btrfs_block_group *sys_bg;
564662306a36Sopenharmony_ci
564762306a36Sopenharmony_ci	/*
564862306a36Sopenharmony_ci	 * When adding a new device for sprouting, the seed device is read-only
564962306a36Sopenharmony_ci	 * so we must first allocate a metadata and a system chunk. But before
565062306a36Sopenharmony_ci	 * adding the block group items to the extent, device and chunk btrees,
565162306a36Sopenharmony_ci	 * we must first:
565262306a36Sopenharmony_ci	 *
565362306a36Sopenharmony_ci	 * 1) Create both chunks without doing any changes to the btrees, as
565462306a36Sopenharmony_ci	 *    otherwise we would get -ENOSPC since the block groups from the
565562306a36Sopenharmony_ci	 *    seed device are read-only;
565662306a36Sopenharmony_ci	 *
565762306a36Sopenharmony_ci	 * 2) Add the device item for the new sprout device - finishing the setup
565862306a36Sopenharmony_ci	 *    of a new block group requires updating the device item in the chunk
565962306a36Sopenharmony_ci	 *    btree, so it must exist when we attempt to do it. The previous step
566062306a36Sopenharmony_ci	 *    ensures this does not fail with -ENOSPC.
566162306a36Sopenharmony_ci	 *
566262306a36Sopenharmony_ci	 * After that we can add the block group items to their btrees:
566362306a36Sopenharmony_ci	 * update existing device item in the chunk btree, add a new block group
566462306a36Sopenharmony_ci	 * item to the extent btree, add a new chunk item to the chunk btree and
566562306a36Sopenharmony_ci	 * finally add the new device extent items to the devices btree.
566662306a36Sopenharmony_ci	 */
566762306a36Sopenharmony_ci
566862306a36Sopenharmony_ci	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
566962306a36Sopenharmony_ci	meta_bg = btrfs_create_chunk(trans, alloc_profile);
567062306a36Sopenharmony_ci	if (IS_ERR(meta_bg))
567162306a36Sopenharmony_ci		return PTR_ERR(meta_bg);
567262306a36Sopenharmony_ci
567362306a36Sopenharmony_ci	alloc_profile = btrfs_system_alloc_profile(fs_info);
567462306a36Sopenharmony_ci	sys_bg = btrfs_create_chunk(trans, alloc_profile);
567562306a36Sopenharmony_ci	if (IS_ERR(sys_bg))
567662306a36Sopenharmony_ci		return PTR_ERR(sys_bg);
567762306a36Sopenharmony_ci
567862306a36Sopenharmony_ci	return 0;
567962306a36Sopenharmony_ci}
568062306a36Sopenharmony_ci
568162306a36Sopenharmony_cistatic inline int btrfs_chunk_max_errors(struct map_lookup *map)
568262306a36Sopenharmony_ci{
568362306a36Sopenharmony_ci	const int index = btrfs_bg_flags_to_raid_index(map->type);
568462306a36Sopenharmony_ci
568562306a36Sopenharmony_ci	return btrfs_raid_array[index].tolerated_failures;
568662306a36Sopenharmony_ci}
568762306a36Sopenharmony_ci
568862306a36Sopenharmony_cibool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
568962306a36Sopenharmony_ci{
569062306a36Sopenharmony_ci	struct extent_map *em;
569162306a36Sopenharmony_ci	struct map_lookup *map;
569262306a36Sopenharmony_ci	int miss_ndevs = 0;
569362306a36Sopenharmony_ci	int i;
569462306a36Sopenharmony_ci	bool ret = true;
569562306a36Sopenharmony_ci
569662306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
569762306a36Sopenharmony_ci	if (IS_ERR(em))
569862306a36Sopenharmony_ci		return false;
569962306a36Sopenharmony_ci
570062306a36Sopenharmony_ci	map = em->map_lookup;
570162306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
570262306a36Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_MISSING,
570362306a36Sopenharmony_ci					&map->stripes[i].dev->dev_state)) {
570462306a36Sopenharmony_ci			miss_ndevs++;
570562306a36Sopenharmony_ci			continue;
570662306a36Sopenharmony_ci		}
570762306a36Sopenharmony_ci		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
570862306a36Sopenharmony_ci					&map->stripes[i].dev->dev_state)) {
570962306a36Sopenharmony_ci			ret = false;
571062306a36Sopenharmony_ci			goto end;
571162306a36Sopenharmony_ci		}
571262306a36Sopenharmony_ci	}
571362306a36Sopenharmony_ci
571462306a36Sopenharmony_ci	/*
571562306a36Sopenharmony_ci	 * If the number of missing devices is larger than max errors, we can
571662306a36Sopenharmony_ci	 * not write the data into that chunk successfully.
571762306a36Sopenharmony_ci	 */
571862306a36Sopenharmony_ci	if (miss_ndevs > btrfs_chunk_max_errors(map))
571962306a36Sopenharmony_ci		ret = false;
572062306a36Sopenharmony_ciend:
572162306a36Sopenharmony_ci	free_extent_map(em);
572262306a36Sopenharmony_ci	return ret;
572362306a36Sopenharmony_ci}
572462306a36Sopenharmony_ci
572562306a36Sopenharmony_civoid btrfs_mapping_tree_free(struct extent_map_tree *tree)
572662306a36Sopenharmony_ci{
572762306a36Sopenharmony_ci	struct extent_map *em;
572862306a36Sopenharmony_ci
572962306a36Sopenharmony_ci	while (1) {
573062306a36Sopenharmony_ci		write_lock(&tree->lock);
573162306a36Sopenharmony_ci		em = lookup_extent_mapping(tree, 0, (u64)-1);
573262306a36Sopenharmony_ci		if (em)
573362306a36Sopenharmony_ci			remove_extent_mapping(tree, em);
573462306a36Sopenharmony_ci		write_unlock(&tree->lock);
573562306a36Sopenharmony_ci		if (!em)
573662306a36Sopenharmony_ci			break;
573762306a36Sopenharmony_ci		/* once for us */
573862306a36Sopenharmony_ci		free_extent_map(em);
573962306a36Sopenharmony_ci		/* once for the tree */
574062306a36Sopenharmony_ci		free_extent_map(em);
574162306a36Sopenharmony_ci	}
574262306a36Sopenharmony_ci}
574362306a36Sopenharmony_ci
574462306a36Sopenharmony_ciint btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
574562306a36Sopenharmony_ci{
574662306a36Sopenharmony_ci	struct extent_map *em;
574762306a36Sopenharmony_ci	struct map_lookup *map;
574862306a36Sopenharmony_ci	enum btrfs_raid_types index;
574962306a36Sopenharmony_ci	int ret = 1;
575062306a36Sopenharmony_ci
575162306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, logical, len);
575262306a36Sopenharmony_ci	if (IS_ERR(em))
575362306a36Sopenharmony_ci		/*
575462306a36Sopenharmony_ci		 * We could return errors for these cases, but that could get
575562306a36Sopenharmony_ci		 * ugly and we'd probably do the same thing which is just not do
575662306a36Sopenharmony_ci		 * anything else and exit, so return 1 so the callers don't try
575762306a36Sopenharmony_ci		 * to use other copies.
575862306a36Sopenharmony_ci		 */
575962306a36Sopenharmony_ci		return 1;
576062306a36Sopenharmony_ci
576162306a36Sopenharmony_ci	map = em->map_lookup;
576262306a36Sopenharmony_ci	index = btrfs_bg_flags_to_raid_index(map->type);
576362306a36Sopenharmony_ci
576462306a36Sopenharmony_ci	/* Non-RAID56, use their ncopies from btrfs_raid_array. */
576562306a36Sopenharmony_ci	if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
576662306a36Sopenharmony_ci		ret = btrfs_raid_array[index].ncopies;
576762306a36Sopenharmony_ci	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
576862306a36Sopenharmony_ci		ret = 2;
576962306a36Sopenharmony_ci	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
577062306a36Sopenharmony_ci		/*
577162306a36Sopenharmony_ci		 * There could be two corrupted data stripes, we need
577262306a36Sopenharmony_ci		 * to loop retry in order to rebuild the correct data.
577362306a36Sopenharmony_ci		 *
577462306a36Sopenharmony_ci		 * Fail a stripe at a time on every retry except the
577562306a36Sopenharmony_ci		 * stripe under reconstruction.
577662306a36Sopenharmony_ci		 */
577762306a36Sopenharmony_ci		ret = map->num_stripes;
577862306a36Sopenharmony_ci	free_extent_map(em);
577962306a36Sopenharmony_ci	return ret;
578062306a36Sopenharmony_ci}
578162306a36Sopenharmony_ci
578262306a36Sopenharmony_ciunsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
578362306a36Sopenharmony_ci				    u64 logical)
578462306a36Sopenharmony_ci{
578562306a36Sopenharmony_ci	struct extent_map *em;
578662306a36Sopenharmony_ci	struct map_lookup *map;
578762306a36Sopenharmony_ci	unsigned long len = fs_info->sectorsize;
578862306a36Sopenharmony_ci
578962306a36Sopenharmony_ci	if (!btrfs_fs_incompat(fs_info, RAID56))
579062306a36Sopenharmony_ci		return len;
579162306a36Sopenharmony_ci
579262306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, logical, len);
579362306a36Sopenharmony_ci
579462306a36Sopenharmony_ci	if (!WARN_ON(IS_ERR(em))) {
579562306a36Sopenharmony_ci		map = em->map_lookup;
579662306a36Sopenharmony_ci		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
579762306a36Sopenharmony_ci			len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
579862306a36Sopenharmony_ci		free_extent_map(em);
579962306a36Sopenharmony_ci	}
580062306a36Sopenharmony_ci	return len;
580162306a36Sopenharmony_ci}
580262306a36Sopenharmony_ci
580362306a36Sopenharmony_ciint btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
580462306a36Sopenharmony_ci{
580562306a36Sopenharmony_ci	struct extent_map *em;
580662306a36Sopenharmony_ci	struct map_lookup *map;
580762306a36Sopenharmony_ci	int ret = 0;
580862306a36Sopenharmony_ci
580962306a36Sopenharmony_ci	if (!btrfs_fs_incompat(fs_info, RAID56))
581062306a36Sopenharmony_ci		return 0;
581162306a36Sopenharmony_ci
581262306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, logical, len);
581362306a36Sopenharmony_ci
581462306a36Sopenharmony_ci	if(!WARN_ON(IS_ERR(em))) {
581562306a36Sopenharmony_ci		map = em->map_lookup;
581662306a36Sopenharmony_ci		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
581762306a36Sopenharmony_ci			ret = 1;
581862306a36Sopenharmony_ci		free_extent_map(em);
581962306a36Sopenharmony_ci	}
582062306a36Sopenharmony_ci	return ret;
582162306a36Sopenharmony_ci}
582262306a36Sopenharmony_ci
582362306a36Sopenharmony_cistatic int find_live_mirror(struct btrfs_fs_info *fs_info,
582462306a36Sopenharmony_ci			    struct map_lookup *map, int first,
582562306a36Sopenharmony_ci			    int dev_replace_is_ongoing)
582662306a36Sopenharmony_ci{
582762306a36Sopenharmony_ci	int i;
582862306a36Sopenharmony_ci	int num_stripes;
582962306a36Sopenharmony_ci	int preferred_mirror;
583062306a36Sopenharmony_ci	int tolerance;
583162306a36Sopenharmony_ci	struct btrfs_device *srcdev;
583262306a36Sopenharmony_ci
583362306a36Sopenharmony_ci	ASSERT((map->type &
583462306a36Sopenharmony_ci		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
583562306a36Sopenharmony_ci
583662306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
583762306a36Sopenharmony_ci		num_stripes = map->sub_stripes;
583862306a36Sopenharmony_ci	else
583962306a36Sopenharmony_ci		num_stripes = map->num_stripes;
584062306a36Sopenharmony_ci
584162306a36Sopenharmony_ci	switch (fs_info->fs_devices->read_policy) {
584262306a36Sopenharmony_ci	default:
584362306a36Sopenharmony_ci		/* Shouldn't happen, just warn and use pid instead of failing */
584462306a36Sopenharmony_ci		btrfs_warn_rl(fs_info,
584562306a36Sopenharmony_ci			      "unknown read_policy type %u, reset to pid",
584662306a36Sopenharmony_ci			      fs_info->fs_devices->read_policy);
584762306a36Sopenharmony_ci		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
584862306a36Sopenharmony_ci		fallthrough;
584962306a36Sopenharmony_ci	case BTRFS_READ_POLICY_PID:
585062306a36Sopenharmony_ci		preferred_mirror = first + (current->pid % num_stripes);
585162306a36Sopenharmony_ci		break;
585262306a36Sopenharmony_ci	}
585362306a36Sopenharmony_ci
585462306a36Sopenharmony_ci	if (dev_replace_is_ongoing &&
585562306a36Sopenharmony_ci	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
585662306a36Sopenharmony_ci	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
585762306a36Sopenharmony_ci		srcdev = fs_info->dev_replace.srcdev;
585862306a36Sopenharmony_ci	else
585962306a36Sopenharmony_ci		srcdev = NULL;
586062306a36Sopenharmony_ci
586162306a36Sopenharmony_ci	/*
586262306a36Sopenharmony_ci	 * try to avoid the drive that is the source drive for a
586362306a36Sopenharmony_ci	 * dev-replace procedure, only choose it if no other non-missing
586462306a36Sopenharmony_ci	 * mirror is available
586562306a36Sopenharmony_ci	 */
586662306a36Sopenharmony_ci	for (tolerance = 0; tolerance < 2; tolerance++) {
586762306a36Sopenharmony_ci		if (map->stripes[preferred_mirror].dev->bdev &&
586862306a36Sopenharmony_ci		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
586962306a36Sopenharmony_ci			return preferred_mirror;
587062306a36Sopenharmony_ci		for (i = first; i < first + num_stripes; i++) {
587162306a36Sopenharmony_ci			if (map->stripes[i].dev->bdev &&
587262306a36Sopenharmony_ci			    (tolerance || map->stripes[i].dev != srcdev))
587362306a36Sopenharmony_ci				return i;
587462306a36Sopenharmony_ci		}
587562306a36Sopenharmony_ci	}
587662306a36Sopenharmony_ci
587762306a36Sopenharmony_ci	/* we couldn't find one that doesn't fail.  Just return something
587862306a36Sopenharmony_ci	 * and the io error handling code will clean up eventually
587962306a36Sopenharmony_ci	 */
588062306a36Sopenharmony_ci	return preferred_mirror;
588162306a36Sopenharmony_ci}
588262306a36Sopenharmony_ci
588362306a36Sopenharmony_cistatic struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
588462306a36Sopenharmony_ci						       u16 total_stripes)
588562306a36Sopenharmony_ci{
588662306a36Sopenharmony_ci	struct btrfs_io_context *bioc;
588762306a36Sopenharmony_ci
588862306a36Sopenharmony_ci	bioc = kzalloc(
588962306a36Sopenharmony_ci		 /* The size of btrfs_io_context */
589062306a36Sopenharmony_ci		sizeof(struct btrfs_io_context) +
589162306a36Sopenharmony_ci		/* Plus the variable array for the stripes */
589262306a36Sopenharmony_ci		sizeof(struct btrfs_io_stripe) * (total_stripes),
589362306a36Sopenharmony_ci		GFP_NOFS);
589462306a36Sopenharmony_ci
589562306a36Sopenharmony_ci	if (!bioc)
589662306a36Sopenharmony_ci		return NULL;
589762306a36Sopenharmony_ci
589862306a36Sopenharmony_ci	refcount_set(&bioc->refs, 1);
589962306a36Sopenharmony_ci
590062306a36Sopenharmony_ci	bioc->fs_info = fs_info;
590162306a36Sopenharmony_ci	bioc->replace_stripe_src = -1;
590262306a36Sopenharmony_ci	bioc->full_stripe_logical = (u64)-1;
590362306a36Sopenharmony_ci
590462306a36Sopenharmony_ci	return bioc;
590562306a36Sopenharmony_ci}
590662306a36Sopenharmony_ci
590762306a36Sopenharmony_civoid btrfs_get_bioc(struct btrfs_io_context *bioc)
590862306a36Sopenharmony_ci{
590962306a36Sopenharmony_ci	WARN_ON(!refcount_read(&bioc->refs));
591062306a36Sopenharmony_ci	refcount_inc(&bioc->refs);
591162306a36Sopenharmony_ci}
591262306a36Sopenharmony_ci
591362306a36Sopenharmony_civoid btrfs_put_bioc(struct btrfs_io_context *bioc)
591462306a36Sopenharmony_ci{
591562306a36Sopenharmony_ci	if (!bioc)
591662306a36Sopenharmony_ci		return;
591762306a36Sopenharmony_ci	if (refcount_dec_and_test(&bioc->refs))
591862306a36Sopenharmony_ci		kfree(bioc);
591962306a36Sopenharmony_ci}
592062306a36Sopenharmony_ci
592162306a36Sopenharmony_ci/*
592262306a36Sopenharmony_ci * Please note that, discard won't be sent to target device of device
592362306a36Sopenharmony_ci * replace.
592462306a36Sopenharmony_ci */
592562306a36Sopenharmony_cistruct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
592662306a36Sopenharmony_ci					       u64 logical, u64 *length_ret,
592762306a36Sopenharmony_ci					       u32 *num_stripes)
592862306a36Sopenharmony_ci{
592962306a36Sopenharmony_ci	struct extent_map *em;
593062306a36Sopenharmony_ci	struct map_lookup *map;
593162306a36Sopenharmony_ci	struct btrfs_discard_stripe *stripes;
593262306a36Sopenharmony_ci	u64 length = *length_ret;
593362306a36Sopenharmony_ci	u64 offset;
593462306a36Sopenharmony_ci	u32 stripe_nr;
593562306a36Sopenharmony_ci	u32 stripe_nr_end;
593662306a36Sopenharmony_ci	u32 stripe_cnt;
593762306a36Sopenharmony_ci	u64 stripe_end_offset;
593862306a36Sopenharmony_ci	u64 stripe_offset;
593962306a36Sopenharmony_ci	u32 stripe_index;
594062306a36Sopenharmony_ci	u32 factor = 0;
594162306a36Sopenharmony_ci	u32 sub_stripes = 0;
594262306a36Sopenharmony_ci	u32 stripes_per_dev = 0;
594362306a36Sopenharmony_ci	u32 remaining_stripes = 0;
594462306a36Sopenharmony_ci	u32 last_stripe = 0;
594562306a36Sopenharmony_ci	int ret;
594662306a36Sopenharmony_ci	int i;
594762306a36Sopenharmony_ci
594862306a36Sopenharmony_ci	em = btrfs_get_chunk_map(fs_info, logical, length);
594962306a36Sopenharmony_ci	if (IS_ERR(em))
595062306a36Sopenharmony_ci		return ERR_CAST(em);
595162306a36Sopenharmony_ci
595262306a36Sopenharmony_ci	map = em->map_lookup;
595362306a36Sopenharmony_ci
595462306a36Sopenharmony_ci	/* we don't discard raid56 yet */
595562306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
595662306a36Sopenharmony_ci		ret = -EOPNOTSUPP;
595762306a36Sopenharmony_ci		goto out_free_map;
595862306a36Sopenharmony_ci	}
595962306a36Sopenharmony_ci
596062306a36Sopenharmony_ci	offset = logical - em->start;
596162306a36Sopenharmony_ci	length = min_t(u64, em->start + em->len - logical, length);
596262306a36Sopenharmony_ci	*length_ret = length;
596362306a36Sopenharmony_ci
596462306a36Sopenharmony_ci	/*
596562306a36Sopenharmony_ci	 * stripe_nr counts the total number of stripes we have to stride
596662306a36Sopenharmony_ci	 * to get to this block
596762306a36Sopenharmony_ci	 */
596862306a36Sopenharmony_ci	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
596962306a36Sopenharmony_ci
597062306a36Sopenharmony_ci	/* stripe_offset is the offset of this block in its stripe */
597162306a36Sopenharmony_ci	stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
597262306a36Sopenharmony_ci
597362306a36Sopenharmony_ci	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
597462306a36Sopenharmony_ci			BTRFS_STRIPE_LEN_SHIFT;
597562306a36Sopenharmony_ci	stripe_cnt = stripe_nr_end - stripe_nr;
597662306a36Sopenharmony_ci	stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
597762306a36Sopenharmony_ci			    (offset + length);
597862306a36Sopenharmony_ci	/*
597962306a36Sopenharmony_ci	 * after this, stripe_nr is the number of stripes on this
598062306a36Sopenharmony_ci	 * device we have to walk to find the data, and stripe_index is
598162306a36Sopenharmony_ci	 * the number of our device in the stripe array
598262306a36Sopenharmony_ci	 */
598362306a36Sopenharmony_ci	*num_stripes = 1;
598462306a36Sopenharmony_ci	stripe_index = 0;
598562306a36Sopenharmony_ci	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
598662306a36Sopenharmony_ci			 BTRFS_BLOCK_GROUP_RAID10)) {
598762306a36Sopenharmony_ci		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
598862306a36Sopenharmony_ci			sub_stripes = 1;
598962306a36Sopenharmony_ci		else
599062306a36Sopenharmony_ci			sub_stripes = map->sub_stripes;
599162306a36Sopenharmony_ci
599262306a36Sopenharmony_ci		factor = map->num_stripes / sub_stripes;
599362306a36Sopenharmony_ci		*num_stripes = min_t(u64, map->num_stripes,
599462306a36Sopenharmony_ci				    sub_stripes * stripe_cnt);
599562306a36Sopenharmony_ci		stripe_index = stripe_nr % factor;
599662306a36Sopenharmony_ci		stripe_nr /= factor;
599762306a36Sopenharmony_ci		stripe_index *= sub_stripes;
599862306a36Sopenharmony_ci
599962306a36Sopenharmony_ci		remaining_stripes = stripe_cnt % factor;
600062306a36Sopenharmony_ci		stripes_per_dev = stripe_cnt / factor;
600162306a36Sopenharmony_ci		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
600262306a36Sopenharmony_ci	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
600362306a36Sopenharmony_ci				BTRFS_BLOCK_GROUP_DUP)) {
600462306a36Sopenharmony_ci		*num_stripes = map->num_stripes;
600562306a36Sopenharmony_ci	} else {
600662306a36Sopenharmony_ci		stripe_index = stripe_nr % map->num_stripes;
600762306a36Sopenharmony_ci		stripe_nr /= map->num_stripes;
600862306a36Sopenharmony_ci	}
600962306a36Sopenharmony_ci
601062306a36Sopenharmony_ci	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
601162306a36Sopenharmony_ci	if (!stripes) {
601262306a36Sopenharmony_ci		ret = -ENOMEM;
601362306a36Sopenharmony_ci		goto out_free_map;
601462306a36Sopenharmony_ci	}
601562306a36Sopenharmony_ci
601662306a36Sopenharmony_ci	for (i = 0; i < *num_stripes; i++) {
601762306a36Sopenharmony_ci		stripes[i].physical =
601862306a36Sopenharmony_ci			map->stripes[stripe_index].physical +
601962306a36Sopenharmony_ci			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
602062306a36Sopenharmony_ci		stripes[i].dev = map->stripes[stripe_index].dev;
602162306a36Sopenharmony_ci
602262306a36Sopenharmony_ci		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
602362306a36Sopenharmony_ci				 BTRFS_BLOCK_GROUP_RAID10)) {
602462306a36Sopenharmony_ci			stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
602562306a36Sopenharmony_ci
602662306a36Sopenharmony_ci			if (i / sub_stripes < remaining_stripes)
602762306a36Sopenharmony_ci				stripes[i].length += BTRFS_STRIPE_LEN;
602862306a36Sopenharmony_ci
602962306a36Sopenharmony_ci			/*
603062306a36Sopenharmony_ci			 * Special for the first stripe and
603162306a36Sopenharmony_ci			 * the last stripe:
603262306a36Sopenharmony_ci			 *
603362306a36Sopenharmony_ci			 * |-------|...|-------|
603462306a36Sopenharmony_ci			 *     |----------|
603562306a36Sopenharmony_ci			 *    off     end_off
603662306a36Sopenharmony_ci			 */
603762306a36Sopenharmony_ci			if (i < sub_stripes)
603862306a36Sopenharmony_ci				stripes[i].length -= stripe_offset;
603962306a36Sopenharmony_ci
604062306a36Sopenharmony_ci			if (stripe_index >= last_stripe &&
604162306a36Sopenharmony_ci			    stripe_index <= (last_stripe +
604262306a36Sopenharmony_ci					     sub_stripes - 1))
604362306a36Sopenharmony_ci				stripes[i].length -= stripe_end_offset;
604462306a36Sopenharmony_ci
604562306a36Sopenharmony_ci			if (i == sub_stripes - 1)
604662306a36Sopenharmony_ci				stripe_offset = 0;
604762306a36Sopenharmony_ci		} else {
604862306a36Sopenharmony_ci			stripes[i].length = length;
604962306a36Sopenharmony_ci		}
605062306a36Sopenharmony_ci
605162306a36Sopenharmony_ci		stripe_index++;
605262306a36Sopenharmony_ci		if (stripe_index == map->num_stripes) {
605362306a36Sopenharmony_ci			stripe_index = 0;
605462306a36Sopenharmony_ci			stripe_nr++;
605562306a36Sopenharmony_ci		}
605662306a36Sopenharmony_ci	}
605762306a36Sopenharmony_ci
605862306a36Sopenharmony_ci	free_extent_map(em);
605962306a36Sopenharmony_ci	return stripes;
606062306a36Sopenharmony_ciout_free_map:
606162306a36Sopenharmony_ci	free_extent_map(em);
606262306a36Sopenharmony_ci	return ERR_PTR(ret);
606362306a36Sopenharmony_ci}
606462306a36Sopenharmony_ci
606562306a36Sopenharmony_cistatic bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
606662306a36Sopenharmony_ci{
606762306a36Sopenharmony_ci	struct btrfs_block_group *cache;
606862306a36Sopenharmony_ci	bool ret;
606962306a36Sopenharmony_ci
607062306a36Sopenharmony_ci	/* Non zoned filesystem does not use "to_copy" flag */
607162306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
607262306a36Sopenharmony_ci		return false;
607362306a36Sopenharmony_ci
607462306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, logical);
607562306a36Sopenharmony_ci
607662306a36Sopenharmony_ci	ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
607762306a36Sopenharmony_ci
607862306a36Sopenharmony_ci	btrfs_put_block_group(cache);
607962306a36Sopenharmony_ci	return ret;
608062306a36Sopenharmony_ci}
608162306a36Sopenharmony_ci
608262306a36Sopenharmony_cistatic void handle_ops_on_dev_replace(enum btrfs_map_op op,
608362306a36Sopenharmony_ci				      struct btrfs_io_context *bioc,
608462306a36Sopenharmony_ci				      struct btrfs_dev_replace *dev_replace,
608562306a36Sopenharmony_ci				      u64 logical,
608662306a36Sopenharmony_ci				      int *num_stripes_ret, int *max_errors_ret)
608762306a36Sopenharmony_ci{
608862306a36Sopenharmony_ci	u64 srcdev_devid = dev_replace->srcdev->devid;
608962306a36Sopenharmony_ci	/*
609062306a36Sopenharmony_ci	 * At this stage, num_stripes is still the real number of stripes,
609162306a36Sopenharmony_ci	 * excluding the duplicated stripes.
609262306a36Sopenharmony_ci	 */
609362306a36Sopenharmony_ci	int num_stripes = *num_stripes_ret;
609462306a36Sopenharmony_ci	int nr_extra_stripes = 0;
609562306a36Sopenharmony_ci	int max_errors = *max_errors_ret;
609662306a36Sopenharmony_ci	int i;
609762306a36Sopenharmony_ci
609862306a36Sopenharmony_ci	/*
609962306a36Sopenharmony_ci	 * A block group which has "to_copy" set will eventually be copied by
610062306a36Sopenharmony_ci	 * the dev-replace process. We can avoid cloning IO here.
610162306a36Sopenharmony_ci	 */
610262306a36Sopenharmony_ci	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
610362306a36Sopenharmony_ci		return;
610462306a36Sopenharmony_ci
610562306a36Sopenharmony_ci	/*
610662306a36Sopenharmony_ci	 * Duplicate the write operations while the dev-replace procedure is
610762306a36Sopenharmony_ci	 * running. Since the copying of the old disk to the new disk takes
610862306a36Sopenharmony_ci	 * place at run time while the filesystem is mounted writable, the
610962306a36Sopenharmony_ci	 * regular write operations to the old disk have to be duplicated to go
611062306a36Sopenharmony_ci	 * to the new disk as well.
611162306a36Sopenharmony_ci	 *
611262306a36Sopenharmony_ci	 * Note that device->missing is handled by the caller, and that the
611362306a36Sopenharmony_ci	 * write to the old disk is already set up in the stripes array.
611462306a36Sopenharmony_ci	 */
611562306a36Sopenharmony_ci	for (i = 0; i < num_stripes; i++) {
611662306a36Sopenharmony_ci		struct btrfs_io_stripe *old = &bioc->stripes[i];
611762306a36Sopenharmony_ci		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
611862306a36Sopenharmony_ci
611962306a36Sopenharmony_ci		if (old->dev->devid != srcdev_devid)
612062306a36Sopenharmony_ci			continue;
612162306a36Sopenharmony_ci
612262306a36Sopenharmony_ci		new->physical = old->physical;
612362306a36Sopenharmony_ci		new->dev = dev_replace->tgtdev;
612462306a36Sopenharmony_ci		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
612562306a36Sopenharmony_ci			bioc->replace_stripe_src = i;
612662306a36Sopenharmony_ci		nr_extra_stripes++;
612762306a36Sopenharmony_ci	}
612862306a36Sopenharmony_ci
612962306a36Sopenharmony_ci	/* We can only have at most 2 extra nr_stripes (for DUP). */
613062306a36Sopenharmony_ci	ASSERT(nr_extra_stripes <= 2);
613162306a36Sopenharmony_ci	/*
613262306a36Sopenharmony_ci	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
613362306a36Sopenharmony_ci	 * replace.
613462306a36Sopenharmony_ci	 * If we have 2 extra stripes, only choose the one with smaller physical.
613562306a36Sopenharmony_ci	 */
613662306a36Sopenharmony_ci	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
613762306a36Sopenharmony_ci		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
613862306a36Sopenharmony_ci		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
613962306a36Sopenharmony_ci
614062306a36Sopenharmony_ci		/* Only DUP can have two extra stripes. */
614162306a36Sopenharmony_ci		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
614262306a36Sopenharmony_ci
614362306a36Sopenharmony_ci		/*
614462306a36Sopenharmony_ci		 * Swap the last stripe stripes and reduce @nr_extra_stripes.
614562306a36Sopenharmony_ci		 * The extra stripe would still be there, but won't be accessed.
614662306a36Sopenharmony_ci		 */
614762306a36Sopenharmony_ci		if (first->physical > second->physical) {
614862306a36Sopenharmony_ci			swap(second->physical, first->physical);
614962306a36Sopenharmony_ci			swap(second->dev, first->dev);
615062306a36Sopenharmony_ci			nr_extra_stripes--;
615162306a36Sopenharmony_ci		}
615262306a36Sopenharmony_ci	}
615362306a36Sopenharmony_ci
615462306a36Sopenharmony_ci	*num_stripes_ret = num_stripes + nr_extra_stripes;
615562306a36Sopenharmony_ci	*max_errors_ret = max_errors + nr_extra_stripes;
615662306a36Sopenharmony_ci	bioc->replace_nr_stripes = nr_extra_stripes;
615762306a36Sopenharmony_ci}
615862306a36Sopenharmony_ci
615962306a36Sopenharmony_cistatic u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
616062306a36Sopenharmony_ci			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
616162306a36Sopenharmony_ci			    u64 *full_stripe_start)
616262306a36Sopenharmony_ci{
616362306a36Sopenharmony_ci	/*
616462306a36Sopenharmony_ci	 * Stripe_nr is the stripe where this block falls.  stripe_offset is
616562306a36Sopenharmony_ci	 * the offset of this block in its stripe.
616662306a36Sopenharmony_ci	 */
616762306a36Sopenharmony_ci	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
616862306a36Sopenharmony_ci	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
616962306a36Sopenharmony_ci	ASSERT(*stripe_offset < U32_MAX);
617062306a36Sopenharmony_ci
617162306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
617262306a36Sopenharmony_ci		unsigned long full_stripe_len =
617362306a36Sopenharmony_ci			btrfs_stripe_nr_to_offset(nr_data_stripes(map));
617462306a36Sopenharmony_ci
617562306a36Sopenharmony_ci		/*
617662306a36Sopenharmony_ci		 * For full stripe start, we use previously calculated
617762306a36Sopenharmony_ci		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
617862306a36Sopenharmony_ci		 * STRIPE_LEN.
617962306a36Sopenharmony_ci		 *
618062306a36Sopenharmony_ci		 * By this we can avoid u64 division completely.  And we have
618162306a36Sopenharmony_ci		 * to go rounddown(), not round_down(), as nr_data_stripes is
618262306a36Sopenharmony_ci		 * not ensured to be power of 2.
618362306a36Sopenharmony_ci		 */
618462306a36Sopenharmony_ci		*full_stripe_start =
618562306a36Sopenharmony_ci			btrfs_stripe_nr_to_offset(
618662306a36Sopenharmony_ci				rounddown(*stripe_nr, nr_data_stripes(map)));
618762306a36Sopenharmony_ci
618862306a36Sopenharmony_ci		ASSERT(*full_stripe_start + full_stripe_len > offset);
618962306a36Sopenharmony_ci		ASSERT(*full_stripe_start <= offset);
619062306a36Sopenharmony_ci		/*
619162306a36Sopenharmony_ci		 * For writes to RAID56, allow to write a full stripe set, but
619262306a36Sopenharmony_ci		 * no straddling of stripe sets.
619362306a36Sopenharmony_ci		 */
619462306a36Sopenharmony_ci		if (op == BTRFS_MAP_WRITE)
619562306a36Sopenharmony_ci			return full_stripe_len - (offset - *full_stripe_start);
619662306a36Sopenharmony_ci	}
619762306a36Sopenharmony_ci
619862306a36Sopenharmony_ci	/*
619962306a36Sopenharmony_ci	 * For other RAID types and for RAID56 reads, allow a single stripe (on
620062306a36Sopenharmony_ci	 * a single disk).
620162306a36Sopenharmony_ci	 */
620262306a36Sopenharmony_ci	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
620362306a36Sopenharmony_ci		return BTRFS_STRIPE_LEN - *stripe_offset;
620462306a36Sopenharmony_ci	return U64_MAX;
620562306a36Sopenharmony_ci}
620662306a36Sopenharmony_ci
620762306a36Sopenharmony_cistatic void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
620862306a36Sopenharmony_ci			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
620962306a36Sopenharmony_ci{
621062306a36Sopenharmony_ci	dst->dev = map->stripes[stripe_index].dev;
621162306a36Sopenharmony_ci	dst->physical = map->stripes[stripe_index].physical +
621262306a36Sopenharmony_ci			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
621362306a36Sopenharmony_ci}
621462306a36Sopenharmony_ci
/*
 * Map one logical range to one or more physical ranges.
 *
 * @fs_info:		(Mandatory) the filesystem the range belongs to.
 *
 * @op:			(Mandatory) the kind of mapping requested.
 *			BTRFS_MAP_READ maps a single stripe; for the other
 *			operations all stripes that hold the range (and, during
 *			dev-replace, the target device) are returned.
 *
 * @logical:		(Mandatory) logical start address of the range.
 *
 * @length:		(Mandatory) mapped length of this run.
 *			One logical range can be split into different segments
 *			due to factors like zones and RAID0/5/6/10 stripe
 *			boundaries.
 *			On input the requested length, on output the length
 *			that was actually mapped.  It is updated before the
 *			later failure checks, so it may change even when an
 *			error is returned.
 *
 * @bioc_ret:		(Mandatory) returned btrfs_io_context structure.
 *			which has one or more physical ranges (btrfs_io_stripe)
 *			recorded inside.
 *			Caller should call btrfs_put_bioc() to free it after use.
 *
 * @smap:		(Optional) single physical range optimization.
 *			If the map request can be fulfilled by one single
 *			physical range, and this parameter is not NULL,
 *			then @bioc_ret would be NULL, and @smap would be
 *			updated.
 *
 * @mirror_num_ret:	(Mandatory) returned mirror number if the original
 *			value is 0.
 *
 *			Mirror number 0 means to choose any live mirrors.
 *
 *			For non-RAID56 profiles, non-zero mirror_num means
 *			the Nth mirror. (e.g. mirror_num 1 means the first
 *			copy).
 *
 *			For RAID56 profile, mirror 1 means rebuild from P and
 *			the remaining data stripes.
 *
 *			For RAID6 profile, mirror > 2 means mark another
 *			data/P stripe error and rebuild from the remaining
 *			stripes.
 *
 * @need_raid_map:	(Used only for integrity checker) whether the map wants
 *                      a full stripe map (including all data and P/Q stripes)
 *                      for RAID56. Should always be 1 except integrity checker.
 *
 * Return 0 on success.  Return -EINVAL if the requested mirror number is
 * larger than the number of copies or the computed stripe index is out of
 * range, the error of the chunk map lookup if that fails, or -ENOMEM if the
 * io context cannot be allocated.
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		    u64 logical, u64 *length,
		    struct btrfs_io_context **bioc_ret,
		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
		    int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 map_offset;
	u64 stripe_offset;
	u32 stripe_nr;
	u32 stripe_index;
	int data_stripes;
	int i;
	int ret = 0;
	/* A NULL @mirror_num_ret is treated like mirror number 0. */
	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
	int num_stripes;
	int num_copies;
	int max_errors = 0;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	u16 num_alloc_stripes;
	u64 raid56_full_stripe_start = (u64)-1;
	u64 max_len;

	ASSERT(bioc_ret);

	/* Reject mirror numbers beyond the number of copies of this range. */
	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
	if (mirror_num > num_copies)
		return -EINVAL;

	em = btrfs_get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	data_stripes = nr_data_stripes(map);

	/*
	 * Split the offset inside the chunk into stripe number and offset
	 * within the stripe, then clamp the mapped length to both the chunk
	 * end and the per-profile limit.
	 */
	map_offset = logical - em->start;
	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
				   &stripe_offset, &raid56_full_stripe_start);
	*length = min_t(u64, em->len - map_offset, max_len);

	down_read(&dev_replace->rwsem);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 *
	 * When no replace is running the semaphore is released right away;
	 * otherwise it is released at the "out" label.
	 */
	if (!dev_replace_is_ongoing)
		up_read(&dev_replace->rwsem);

	/*
	 * Per profile, work out the first stripe to return (stripe_index),
	 * how many stripes to return (num_stripes) and the stripe number on
	 * the device (stripe_nr).
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		/* Reads from RAID0 always report mirror 1. */
		if (op == BTRFS_MAP_READ)
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
		if (op != BTRFS_MAP_READ) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			/* No mirror requested, let find_live_mirror() pick one. */
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (op != BTRFS_MAP_READ) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		/* Number of groups of sub stripes the data is striped over. */
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_index = (stripe_nr % factor) * map->sub_stripes;
		stripe_nr /= factor;

		if (op != BTRFS_MAP_READ)
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			/* The mirror number is relative to the group start. */
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
			/*
			 * Push stripe_nr back to the start of the full stripe
			 * For those cases needing a full stripe, @stripe_nr
			 * is the full stripe number.
			 *
			 * Originally we go raid56_full_stripe_start / full_stripe_len,
			 * but that can be expensive.  Here we just divide
			 * @stripe_nr with @data_stripes.
			 */
			stripe_nr /= data_stripes;

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = btrfs_chunk_max_errors(map);

			/* Return the length to the full stripe end */
			*length = min(logical + *length,
				      raid56_full_stripe_start + em->start +
				      btrfs_stripe_nr_to_offset(data_stripes)) -
				  logical;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_index = stripe_nr % data_stripes;
			stripe_nr /= data_stripes;
			if (mirror_num > 1)
				stripe_index = data_stripes + mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
			if (op == BTRFS_MAP_READ && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * After this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		mirror_num = stripe_index + 1;
	}
	/* Sanity check before map->stripes[] is indexed with stripe_index. */
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ)
		/*
		 * For replace case, we need to add extra stripes for extra
		 * duplicated stripes.
		 *
		 * For both WRITE and GET_READ_MIRRORS, we may have at most
		 * 2 more stripes (DUP types, otherwise 1).
		 */
		num_alloc_stripes += 2;

	/*
	 * If this I/O maps to a single device, try to return the device and
	 * physical block information on the stack instead of allocating an
	 * I/O context structure.
	 *
	 * RAID56 requests with a mirror number above 1 never take this
	 * shortcut.
	 */
	if (smap && num_alloc_stripes == 1 &&
	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
		if (mirror_num_ret)
			*mirror_num_ret = mirror_num;
		*bioc_ret = NULL;
		ret = 0;
		goto out;
	}

	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
	if (!bioc) {
		ret = -ENOMEM;
		goto out;
	}
	bioc->map_type = map->type;

	/*
	 * For RAID56 full map, we need to make sure the stripes[] follows the
	 * rule that data stripes are all ordered, then followed with P and Q
	 * (if we have).
	 *
	 * It's still mostly the same as other profiles, just with extra rotation.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
		/*
		 * For RAID56 @stripe_nr is already the number of full stripes
		 * before us, which is also the rotation value (needs to modulo
		 * with num_stripes).
		 *
		 * In this case, we just add @stripe_nr with @i, then do the
		 * modulo, to reduce one modulo call.
		 */
		bioc->full_stripe_logical = em->start +
			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
		for (i = 0; i < num_stripes; i++)
			set_io_stripe(&bioc->stripes[i], map,
				      (i + stripe_nr) % num_stripes,
				      stripe_offset, stripe_nr);
	} else {
		/*
		 * For all other non-RAID56 profiles, just copy the target
		 * stripe into the bioc.
		 */
		for (i = 0; i < num_stripes; i++) {
			set_io_stripe(&bioc->stripes[i], map, stripe_index,
				      stripe_offset, stripe_nr);
			stripe_index++;
		}
	}

	if (op != BTRFS_MAP_READ)
		max_errors = btrfs_chunk_max_errors(map);

	/*
	 * While a replace is running, non-read mappings also get the stripes
	 * on the target device; this may raise num_stripes and max_errors.
	 */
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ) {
		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
					  &num_stripes, &max_errors);
	}

	*bioc_ret = bioc;
	bioc->num_stripes = num_stripes;
	bioc->max_errors = max_errors;
	bioc->mirror_num = mirror_num;

out:
	if (dev_replace_is_ongoing) {
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
		up_read(&dev_replace->rwsem);
	}
	free_extent_map(em);
	return ret;
}
650262306a36Sopenharmony_ci
650362306a36Sopenharmony_cistatic bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
650462306a36Sopenharmony_ci				      const struct btrfs_fs_devices *fs_devices)
650562306a36Sopenharmony_ci{
650662306a36Sopenharmony_ci	if (args->fsid == NULL)
650762306a36Sopenharmony_ci		return true;
650862306a36Sopenharmony_ci	if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
650962306a36Sopenharmony_ci		return true;
651062306a36Sopenharmony_ci	return false;
651162306a36Sopenharmony_ci}
651262306a36Sopenharmony_ci
651362306a36Sopenharmony_cistatic bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
651462306a36Sopenharmony_ci				  const struct btrfs_device *device)
651562306a36Sopenharmony_ci{
651662306a36Sopenharmony_ci	if (args->missing) {
651762306a36Sopenharmony_ci		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
651862306a36Sopenharmony_ci		    !device->bdev)
651962306a36Sopenharmony_ci			return true;
652062306a36Sopenharmony_ci		return false;
652162306a36Sopenharmony_ci	}
652262306a36Sopenharmony_ci
652362306a36Sopenharmony_ci	if (device->devid != args->devid)
652462306a36Sopenharmony_ci		return false;
652562306a36Sopenharmony_ci	if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
652662306a36Sopenharmony_ci		return false;
652762306a36Sopenharmony_ci	return true;
652862306a36Sopenharmony_ci}
652962306a36Sopenharmony_ci
653062306a36Sopenharmony_ci/*
653162306a36Sopenharmony_ci * Find a device specified by @devid or @uuid in the list of @fs_devices, or
653262306a36Sopenharmony_ci * return NULL.
653362306a36Sopenharmony_ci *
653462306a36Sopenharmony_ci * If devid and uuid are both specified, the match must be exact, otherwise
653562306a36Sopenharmony_ci * only devid is used.
653662306a36Sopenharmony_ci */
653762306a36Sopenharmony_cistruct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
653862306a36Sopenharmony_ci				       const struct btrfs_dev_lookup_args *args)
653962306a36Sopenharmony_ci{
654062306a36Sopenharmony_ci	struct btrfs_device *device;
654162306a36Sopenharmony_ci	struct btrfs_fs_devices *seed_devs;
654262306a36Sopenharmony_ci
654362306a36Sopenharmony_ci	if (dev_args_match_fs_devices(args, fs_devices)) {
654462306a36Sopenharmony_ci		list_for_each_entry(device, &fs_devices->devices, dev_list) {
654562306a36Sopenharmony_ci			if (dev_args_match_device(args, device))
654662306a36Sopenharmony_ci				return device;
654762306a36Sopenharmony_ci		}
654862306a36Sopenharmony_ci	}
654962306a36Sopenharmony_ci
655062306a36Sopenharmony_ci	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
655162306a36Sopenharmony_ci		if (!dev_args_match_fs_devices(args, seed_devs))
655262306a36Sopenharmony_ci			continue;
655362306a36Sopenharmony_ci		list_for_each_entry(device, &seed_devs->devices, dev_list) {
655462306a36Sopenharmony_ci			if (dev_args_match_device(args, device))
655562306a36Sopenharmony_ci				return device;
655662306a36Sopenharmony_ci		}
655762306a36Sopenharmony_ci	}
655862306a36Sopenharmony_ci
655962306a36Sopenharmony_ci	return NULL;
656062306a36Sopenharmony_ci}
656162306a36Sopenharmony_ci
656262306a36Sopenharmony_cistatic struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
656362306a36Sopenharmony_ci					    u64 devid, u8 *dev_uuid)
656462306a36Sopenharmony_ci{
656562306a36Sopenharmony_ci	struct btrfs_device *device;
656662306a36Sopenharmony_ci	unsigned int nofs_flag;
656762306a36Sopenharmony_ci
656862306a36Sopenharmony_ci	/*
656962306a36Sopenharmony_ci	 * We call this under the chunk_mutex, so we want to use NOFS for this
657062306a36Sopenharmony_ci	 * allocation, however we don't want to change btrfs_alloc_device() to
657162306a36Sopenharmony_ci	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
657262306a36Sopenharmony_ci	 * places.
657362306a36Sopenharmony_ci	 */
657462306a36Sopenharmony_ci
657562306a36Sopenharmony_ci	nofs_flag = memalloc_nofs_save();
657662306a36Sopenharmony_ci	device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
657762306a36Sopenharmony_ci	memalloc_nofs_restore(nofs_flag);
657862306a36Sopenharmony_ci	if (IS_ERR(device))
657962306a36Sopenharmony_ci		return device;
658062306a36Sopenharmony_ci
658162306a36Sopenharmony_ci	list_add(&device->dev_list, &fs_devices->devices);
658262306a36Sopenharmony_ci	device->fs_devices = fs_devices;
658362306a36Sopenharmony_ci	fs_devices->num_devices++;
658462306a36Sopenharmony_ci
658562306a36Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
658662306a36Sopenharmony_ci	fs_devices->missing_devices++;
658762306a36Sopenharmony_ci
658862306a36Sopenharmony_ci	return device;
658962306a36Sopenharmony_ci}
659062306a36Sopenharmony_ci
659162306a36Sopenharmony_ci/*
659262306a36Sopenharmony_ci * Allocate new device struct, set up devid and UUID.
659362306a36Sopenharmony_ci *
659462306a36Sopenharmony_ci * @fs_info:	used only for generating a new devid, can be NULL if
659562306a36Sopenharmony_ci *		devid is provided (i.e. @devid != NULL).
659662306a36Sopenharmony_ci * @devid:	a pointer to devid for this device.  If NULL a new devid
659762306a36Sopenharmony_ci *		is generated.
659862306a36Sopenharmony_ci * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
659962306a36Sopenharmony_ci *		is generated.
660062306a36Sopenharmony_ci * @path:	a pointer to device path if available, NULL otherwise.
660162306a36Sopenharmony_ci *
660262306a36Sopenharmony_ci * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
660362306a36Sopenharmony_ci * on error.  Returned struct is not linked onto any lists and must be
660462306a36Sopenharmony_ci * destroyed with btrfs_free_device.
660562306a36Sopenharmony_ci */
660662306a36Sopenharmony_cistruct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
660762306a36Sopenharmony_ci					const u64 *devid, const u8 *uuid,
660862306a36Sopenharmony_ci					const char *path)
660962306a36Sopenharmony_ci{
661062306a36Sopenharmony_ci	struct btrfs_device *dev;
661162306a36Sopenharmony_ci	u64 tmp;
661262306a36Sopenharmony_ci
661362306a36Sopenharmony_ci	if (WARN_ON(!devid && !fs_info))
661462306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
661562306a36Sopenharmony_ci
661662306a36Sopenharmony_ci	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
661762306a36Sopenharmony_ci	if (!dev)
661862306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
661962306a36Sopenharmony_ci
662062306a36Sopenharmony_ci	INIT_LIST_HEAD(&dev->dev_list);
662162306a36Sopenharmony_ci	INIT_LIST_HEAD(&dev->dev_alloc_list);
662262306a36Sopenharmony_ci	INIT_LIST_HEAD(&dev->post_commit_list);
662362306a36Sopenharmony_ci
662462306a36Sopenharmony_ci	atomic_set(&dev->dev_stats_ccnt, 0);
662562306a36Sopenharmony_ci	btrfs_device_data_ordered_init(dev);
662662306a36Sopenharmony_ci	extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
662762306a36Sopenharmony_ci
662862306a36Sopenharmony_ci	if (devid)
662962306a36Sopenharmony_ci		tmp = *devid;
663062306a36Sopenharmony_ci	else {
663162306a36Sopenharmony_ci		int ret;
663262306a36Sopenharmony_ci
663362306a36Sopenharmony_ci		ret = find_next_devid(fs_info, &tmp);
663462306a36Sopenharmony_ci		if (ret) {
663562306a36Sopenharmony_ci			btrfs_free_device(dev);
663662306a36Sopenharmony_ci			return ERR_PTR(ret);
663762306a36Sopenharmony_ci		}
663862306a36Sopenharmony_ci	}
663962306a36Sopenharmony_ci	dev->devid = tmp;
664062306a36Sopenharmony_ci
664162306a36Sopenharmony_ci	if (uuid)
664262306a36Sopenharmony_ci		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
664362306a36Sopenharmony_ci	else
664462306a36Sopenharmony_ci		generate_random_uuid(dev->uuid);
664562306a36Sopenharmony_ci
664662306a36Sopenharmony_ci	if (path) {
664762306a36Sopenharmony_ci		struct rcu_string *name;
664862306a36Sopenharmony_ci
664962306a36Sopenharmony_ci		name = rcu_string_strdup(path, GFP_KERNEL);
665062306a36Sopenharmony_ci		if (!name) {
665162306a36Sopenharmony_ci			btrfs_free_device(dev);
665262306a36Sopenharmony_ci			return ERR_PTR(-ENOMEM);
665362306a36Sopenharmony_ci		}
665462306a36Sopenharmony_ci		rcu_assign_pointer(dev->name, name);
665562306a36Sopenharmony_ci	}
665662306a36Sopenharmony_ci
665762306a36Sopenharmony_ci	return dev;
665862306a36Sopenharmony_ci}
665962306a36Sopenharmony_ci
666062306a36Sopenharmony_cistatic void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
666162306a36Sopenharmony_ci					u64 devid, u8 *uuid, bool error)
666262306a36Sopenharmony_ci{
666362306a36Sopenharmony_ci	if (error)
666462306a36Sopenharmony_ci		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
666562306a36Sopenharmony_ci			      devid, uuid);
666662306a36Sopenharmony_ci	else
666762306a36Sopenharmony_ci		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
666862306a36Sopenharmony_ci			      devid, uuid);
666962306a36Sopenharmony_ci}
667062306a36Sopenharmony_ci
667162306a36Sopenharmony_ciu64 btrfs_calc_stripe_length(const struct extent_map *em)
667262306a36Sopenharmony_ci{
667362306a36Sopenharmony_ci	const struct map_lookup *map = em->map_lookup;
667462306a36Sopenharmony_ci	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
667562306a36Sopenharmony_ci
667662306a36Sopenharmony_ci	return div_u64(em->len, data_stripes);
667762306a36Sopenharmony_ci}
667862306a36Sopenharmony_ci
667962306a36Sopenharmony_ci#if BITS_PER_LONG == 32
668062306a36Sopenharmony_ci/*
668162306a36Sopenharmony_ci * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
668262306a36Sopenharmony_ci * can't be accessed on 32bit systems.
668362306a36Sopenharmony_ci *
668462306a36Sopenharmony_ci * This function do mount time check to reject the fs if it already has
668562306a36Sopenharmony_ci * metadata chunk beyond that limit.
668662306a36Sopenharmony_ci */
668762306a36Sopenharmony_cistatic int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
668862306a36Sopenharmony_ci				  u64 logical, u64 length, u64 type)
668962306a36Sopenharmony_ci{
669062306a36Sopenharmony_ci	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
669162306a36Sopenharmony_ci		return 0;
669262306a36Sopenharmony_ci
669362306a36Sopenharmony_ci	if (logical + length < MAX_LFS_FILESIZE)
669462306a36Sopenharmony_ci		return 0;
669562306a36Sopenharmony_ci
669662306a36Sopenharmony_ci	btrfs_err_32bit_limit(fs_info);
669762306a36Sopenharmony_ci	return -EOVERFLOW;
669862306a36Sopenharmony_ci}
669962306a36Sopenharmony_ci
670062306a36Sopenharmony_ci/*
670162306a36Sopenharmony_ci * This is to give early warning for any metadata chunk reaching
670262306a36Sopenharmony_ci * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
670362306a36Sopenharmony_ci * Although we can still access the metadata, it's not going to be possible
670462306a36Sopenharmony_ci * once the limit is reached.
670562306a36Sopenharmony_ci */
670662306a36Sopenharmony_cistatic void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
670762306a36Sopenharmony_ci				  u64 logical, u64 length, u64 type)
670862306a36Sopenharmony_ci{
670962306a36Sopenharmony_ci	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
671062306a36Sopenharmony_ci		return;
671162306a36Sopenharmony_ci
671262306a36Sopenharmony_ci	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
671362306a36Sopenharmony_ci		return;
671462306a36Sopenharmony_ci
671562306a36Sopenharmony_ci	btrfs_warn_32bit_limit(fs_info);
671662306a36Sopenharmony_ci}
671762306a36Sopenharmony_ci#endif
671862306a36Sopenharmony_ci
671962306a36Sopenharmony_cistatic struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
672062306a36Sopenharmony_ci						  u64 devid, u8 *uuid)
672162306a36Sopenharmony_ci{
672262306a36Sopenharmony_ci	struct btrfs_device *dev;
672362306a36Sopenharmony_ci
672462306a36Sopenharmony_ci	if (!btrfs_test_opt(fs_info, DEGRADED)) {
672562306a36Sopenharmony_ci		btrfs_report_missing_device(fs_info, devid, uuid, true);
672662306a36Sopenharmony_ci		return ERR_PTR(-ENOENT);
672762306a36Sopenharmony_ci	}
672862306a36Sopenharmony_ci
672962306a36Sopenharmony_ci	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
673062306a36Sopenharmony_ci	if (IS_ERR(dev)) {
673162306a36Sopenharmony_ci		btrfs_err(fs_info, "failed to init missing device %llu: %ld",
673262306a36Sopenharmony_ci			  devid, PTR_ERR(dev));
673362306a36Sopenharmony_ci		return dev;
673462306a36Sopenharmony_ci	}
673562306a36Sopenharmony_ci	btrfs_report_missing_device(fs_info, devid, uuid, false);
673662306a36Sopenharmony_ci
673762306a36Sopenharmony_ci	return dev;
673862306a36Sopenharmony_ci}
673962306a36Sopenharmony_ci
674062306a36Sopenharmony_cistatic int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
674162306a36Sopenharmony_ci			  struct btrfs_chunk *chunk)
674262306a36Sopenharmony_ci{
674362306a36Sopenharmony_ci	BTRFS_DEV_LOOKUP_ARGS(args);
674462306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = leaf->fs_info;
674562306a36Sopenharmony_ci	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
674662306a36Sopenharmony_ci	struct map_lookup *map;
674762306a36Sopenharmony_ci	struct extent_map *em;
674862306a36Sopenharmony_ci	u64 logical;
674962306a36Sopenharmony_ci	u64 length;
675062306a36Sopenharmony_ci	u64 devid;
675162306a36Sopenharmony_ci	u64 type;
675262306a36Sopenharmony_ci	u8 uuid[BTRFS_UUID_SIZE];
675362306a36Sopenharmony_ci	int index;
675462306a36Sopenharmony_ci	int num_stripes;
675562306a36Sopenharmony_ci	int ret;
675662306a36Sopenharmony_ci	int i;
675762306a36Sopenharmony_ci
675862306a36Sopenharmony_ci	logical = key->offset;
675962306a36Sopenharmony_ci	length = btrfs_chunk_length(leaf, chunk);
676062306a36Sopenharmony_ci	type = btrfs_chunk_type(leaf, chunk);
676162306a36Sopenharmony_ci	index = btrfs_bg_flags_to_raid_index(type);
676262306a36Sopenharmony_ci	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
676362306a36Sopenharmony_ci
676462306a36Sopenharmony_ci#if BITS_PER_LONG == 32
676562306a36Sopenharmony_ci	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
676662306a36Sopenharmony_ci	if (ret < 0)
676762306a36Sopenharmony_ci		return ret;
676862306a36Sopenharmony_ci	warn_32bit_meta_chunk(fs_info, logical, length, type);
676962306a36Sopenharmony_ci#endif
677062306a36Sopenharmony_ci
677162306a36Sopenharmony_ci	/*
677262306a36Sopenharmony_ci	 * Only need to verify chunk item if we're reading from sys chunk array,
677362306a36Sopenharmony_ci	 * as chunk item in tree block is already verified by tree-checker.
677462306a36Sopenharmony_ci	 */
677562306a36Sopenharmony_ci	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
677662306a36Sopenharmony_ci		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
677762306a36Sopenharmony_ci		if (ret)
677862306a36Sopenharmony_ci			return ret;
677962306a36Sopenharmony_ci	}
678062306a36Sopenharmony_ci
678162306a36Sopenharmony_ci	read_lock(&map_tree->lock);
678262306a36Sopenharmony_ci	em = lookup_extent_mapping(map_tree, logical, 1);
678362306a36Sopenharmony_ci	read_unlock(&map_tree->lock);
678462306a36Sopenharmony_ci
678562306a36Sopenharmony_ci	/* already mapped? */
678662306a36Sopenharmony_ci	if (em && em->start <= logical && em->start + em->len > logical) {
678762306a36Sopenharmony_ci		free_extent_map(em);
678862306a36Sopenharmony_ci		return 0;
678962306a36Sopenharmony_ci	} else if (em) {
679062306a36Sopenharmony_ci		free_extent_map(em);
679162306a36Sopenharmony_ci	}
679262306a36Sopenharmony_ci
679362306a36Sopenharmony_ci	em = alloc_extent_map();
679462306a36Sopenharmony_ci	if (!em)
679562306a36Sopenharmony_ci		return -ENOMEM;
679662306a36Sopenharmony_ci	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
679762306a36Sopenharmony_ci	if (!map) {
679862306a36Sopenharmony_ci		free_extent_map(em);
679962306a36Sopenharmony_ci		return -ENOMEM;
680062306a36Sopenharmony_ci	}
680162306a36Sopenharmony_ci
680262306a36Sopenharmony_ci	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
680362306a36Sopenharmony_ci	em->map_lookup = map;
680462306a36Sopenharmony_ci	em->start = logical;
680562306a36Sopenharmony_ci	em->len = length;
680662306a36Sopenharmony_ci	em->orig_start = 0;
680762306a36Sopenharmony_ci	em->block_start = 0;
680862306a36Sopenharmony_ci	em->block_len = em->len;
680962306a36Sopenharmony_ci
681062306a36Sopenharmony_ci	map->num_stripes = num_stripes;
681162306a36Sopenharmony_ci	map->io_width = btrfs_chunk_io_width(leaf, chunk);
681262306a36Sopenharmony_ci	map->io_align = btrfs_chunk_io_align(leaf, chunk);
681362306a36Sopenharmony_ci	map->type = type;
681462306a36Sopenharmony_ci	/*
681562306a36Sopenharmony_ci	 * We can't use the sub_stripes value, as for profiles other than
681662306a36Sopenharmony_ci	 * RAID10, they may have 0 as sub_stripes for filesystems created by
681762306a36Sopenharmony_ci	 * older mkfs (<v5.4).
681862306a36Sopenharmony_ci	 * In that case, it can cause divide-by-zero errors later.
681962306a36Sopenharmony_ci	 * Since currently sub_stripes is fixed for each profile, let's
682062306a36Sopenharmony_ci	 * use the trusted value instead.
682162306a36Sopenharmony_ci	 */
682262306a36Sopenharmony_ci	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
682362306a36Sopenharmony_ci	map->verified_stripes = 0;
682462306a36Sopenharmony_ci	em->orig_block_len = btrfs_calc_stripe_length(em);
682562306a36Sopenharmony_ci	for (i = 0; i < num_stripes; i++) {
682662306a36Sopenharmony_ci		map->stripes[i].physical =
682762306a36Sopenharmony_ci			btrfs_stripe_offset_nr(leaf, chunk, i);
682862306a36Sopenharmony_ci		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
682962306a36Sopenharmony_ci		args.devid = devid;
683062306a36Sopenharmony_ci		read_extent_buffer(leaf, uuid, (unsigned long)
683162306a36Sopenharmony_ci				   btrfs_stripe_dev_uuid_nr(chunk, i),
683262306a36Sopenharmony_ci				   BTRFS_UUID_SIZE);
683362306a36Sopenharmony_ci		args.uuid = uuid;
683462306a36Sopenharmony_ci		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
683562306a36Sopenharmony_ci		if (!map->stripes[i].dev) {
683662306a36Sopenharmony_ci			map->stripes[i].dev = handle_missing_device(fs_info,
683762306a36Sopenharmony_ci								    devid, uuid);
683862306a36Sopenharmony_ci			if (IS_ERR(map->stripes[i].dev)) {
683962306a36Sopenharmony_ci				ret = PTR_ERR(map->stripes[i].dev);
684062306a36Sopenharmony_ci				free_extent_map(em);
684162306a36Sopenharmony_ci				return ret;
684262306a36Sopenharmony_ci			}
684362306a36Sopenharmony_ci		}
684462306a36Sopenharmony_ci
684562306a36Sopenharmony_ci		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
684662306a36Sopenharmony_ci				&(map->stripes[i].dev->dev_state));
684762306a36Sopenharmony_ci	}
684862306a36Sopenharmony_ci
684962306a36Sopenharmony_ci	write_lock(&map_tree->lock);
685062306a36Sopenharmony_ci	ret = add_extent_mapping(map_tree, em, 0);
685162306a36Sopenharmony_ci	write_unlock(&map_tree->lock);
685262306a36Sopenharmony_ci	if (ret < 0) {
685362306a36Sopenharmony_ci		btrfs_err(fs_info,
685462306a36Sopenharmony_ci			  "failed to add chunk map, start=%llu len=%llu: %d",
685562306a36Sopenharmony_ci			  em->start, em->len, ret);
685662306a36Sopenharmony_ci	}
685762306a36Sopenharmony_ci	free_extent_map(em);
685862306a36Sopenharmony_ci
685962306a36Sopenharmony_ci	return ret;
686062306a36Sopenharmony_ci}
686162306a36Sopenharmony_ci
686262306a36Sopenharmony_cistatic void fill_device_from_item(struct extent_buffer *leaf,
686362306a36Sopenharmony_ci				 struct btrfs_dev_item *dev_item,
686462306a36Sopenharmony_ci				 struct btrfs_device *device)
686562306a36Sopenharmony_ci{
686662306a36Sopenharmony_ci	unsigned long ptr;
686762306a36Sopenharmony_ci
686862306a36Sopenharmony_ci	device->devid = btrfs_device_id(leaf, dev_item);
686962306a36Sopenharmony_ci	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
687062306a36Sopenharmony_ci	device->total_bytes = device->disk_total_bytes;
687162306a36Sopenharmony_ci	device->commit_total_bytes = device->disk_total_bytes;
687262306a36Sopenharmony_ci	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
687362306a36Sopenharmony_ci	device->commit_bytes_used = device->bytes_used;
687462306a36Sopenharmony_ci	device->type = btrfs_device_type(leaf, dev_item);
687562306a36Sopenharmony_ci	device->io_align = btrfs_device_io_align(leaf, dev_item);
687662306a36Sopenharmony_ci	device->io_width = btrfs_device_io_width(leaf, dev_item);
687762306a36Sopenharmony_ci	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
687862306a36Sopenharmony_ci	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
687962306a36Sopenharmony_ci	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
688062306a36Sopenharmony_ci
688162306a36Sopenharmony_ci	ptr = btrfs_device_uuid(dev_item);
688262306a36Sopenharmony_ci	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
688362306a36Sopenharmony_ci}
688462306a36Sopenharmony_ci
688562306a36Sopenharmony_cistatic struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
688662306a36Sopenharmony_ci						  u8 *fsid)
688762306a36Sopenharmony_ci{
688862306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices;
688962306a36Sopenharmony_ci	int ret;
689062306a36Sopenharmony_ci
689162306a36Sopenharmony_ci	lockdep_assert_held(&uuid_mutex);
689262306a36Sopenharmony_ci	ASSERT(fsid);
689362306a36Sopenharmony_ci
689462306a36Sopenharmony_ci	/* This will match only for multi-device seed fs */
689562306a36Sopenharmony_ci	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
689662306a36Sopenharmony_ci		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
689762306a36Sopenharmony_ci			return fs_devices;
689862306a36Sopenharmony_ci
689962306a36Sopenharmony_ci
690062306a36Sopenharmony_ci	fs_devices = find_fsid(fsid, NULL);
690162306a36Sopenharmony_ci	if (!fs_devices) {
690262306a36Sopenharmony_ci		if (!btrfs_test_opt(fs_info, DEGRADED))
690362306a36Sopenharmony_ci			return ERR_PTR(-ENOENT);
690462306a36Sopenharmony_ci
690562306a36Sopenharmony_ci		fs_devices = alloc_fs_devices(fsid, NULL);
690662306a36Sopenharmony_ci		if (IS_ERR(fs_devices))
690762306a36Sopenharmony_ci			return fs_devices;
690862306a36Sopenharmony_ci
690962306a36Sopenharmony_ci		fs_devices->seeding = true;
691062306a36Sopenharmony_ci		fs_devices->opened = 1;
691162306a36Sopenharmony_ci		return fs_devices;
691262306a36Sopenharmony_ci	}
691362306a36Sopenharmony_ci
691462306a36Sopenharmony_ci	/*
691562306a36Sopenharmony_ci	 * Upon first call for a seed fs fsid, just create a private copy of the
691662306a36Sopenharmony_ci	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
691762306a36Sopenharmony_ci	 */
691862306a36Sopenharmony_ci	fs_devices = clone_fs_devices(fs_devices);
691962306a36Sopenharmony_ci	if (IS_ERR(fs_devices))
692062306a36Sopenharmony_ci		return fs_devices;
692162306a36Sopenharmony_ci
692262306a36Sopenharmony_ci	ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
692362306a36Sopenharmony_ci	if (ret) {
692462306a36Sopenharmony_ci		free_fs_devices(fs_devices);
692562306a36Sopenharmony_ci		return ERR_PTR(ret);
692662306a36Sopenharmony_ci	}
692762306a36Sopenharmony_ci
692862306a36Sopenharmony_ci	if (!fs_devices->seeding) {
692962306a36Sopenharmony_ci		close_fs_devices(fs_devices);
693062306a36Sopenharmony_ci		free_fs_devices(fs_devices);
693162306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
693262306a36Sopenharmony_ci	}
693362306a36Sopenharmony_ci
693462306a36Sopenharmony_ci	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
693562306a36Sopenharmony_ci
693662306a36Sopenharmony_ci	return fs_devices;
693762306a36Sopenharmony_ci}
693862306a36Sopenharmony_ci
693962306a36Sopenharmony_cistatic int read_one_dev(struct extent_buffer *leaf,
694062306a36Sopenharmony_ci			struct btrfs_dev_item *dev_item)
694162306a36Sopenharmony_ci{
694262306a36Sopenharmony_ci	BTRFS_DEV_LOOKUP_ARGS(args);
694362306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = leaf->fs_info;
694462306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
694562306a36Sopenharmony_ci	struct btrfs_device *device;
694662306a36Sopenharmony_ci	u64 devid;
694762306a36Sopenharmony_ci	int ret;
694862306a36Sopenharmony_ci	u8 fs_uuid[BTRFS_FSID_SIZE];
694962306a36Sopenharmony_ci	u8 dev_uuid[BTRFS_UUID_SIZE];
695062306a36Sopenharmony_ci
695162306a36Sopenharmony_ci	devid = btrfs_device_id(leaf, dev_item);
695262306a36Sopenharmony_ci	args.devid = devid;
695362306a36Sopenharmony_ci	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
695462306a36Sopenharmony_ci			   BTRFS_UUID_SIZE);
695562306a36Sopenharmony_ci	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
695662306a36Sopenharmony_ci			   BTRFS_FSID_SIZE);
695762306a36Sopenharmony_ci	args.uuid = dev_uuid;
695862306a36Sopenharmony_ci	args.fsid = fs_uuid;
695962306a36Sopenharmony_ci
696062306a36Sopenharmony_ci	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
696162306a36Sopenharmony_ci		fs_devices = open_seed_devices(fs_info, fs_uuid);
696262306a36Sopenharmony_ci		if (IS_ERR(fs_devices))
696362306a36Sopenharmony_ci			return PTR_ERR(fs_devices);
696462306a36Sopenharmony_ci	}
696562306a36Sopenharmony_ci
696662306a36Sopenharmony_ci	device = btrfs_find_device(fs_info->fs_devices, &args);
696762306a36Sopenharmony_ci	if (!device) {
696862306a36Sopenharmony_ci		if (!btrfs_test_opt(fs_info, DEGRADED)) {
696962306a36Sopenharmony_ci			btrfs_report_missing_device(fs_info, devid,
697062306a36Sopenharmony_ci							dev_uuid, true);
697162306a36Sopenharmony_ci			return -ENOENT;
697262306a36Sopenharmony_ci		}
697362306a36Sopenharmony_ci
697462306a36Sopenharmony_ci		device = add_missing_dev(fs_devices, devid, dev_uuid);
697562306a36Sopenharmony_ci		if (IS_ERR(device)) {
697662306a36Sopenharmony_ci			btrfs_err(fs_info,
697762306a36Sopenharmony_ci				"failed to add missing dev %llu: %ld",
697862306a36Sopenharmony_ci				devid, PTR_ERR(device));
697962306a36Sopenharmony_ci			return PTR_ERR(device);
698062306a36Sopenharmony_ci		}
698162306a36Sopenharmony_ci		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
698262306a36Sopenharmony_ci	} else {
698362306a36Sopenharmony_ci		if (!device->bdev) {
698462306a36Sopenharmony_ci			if (!btrfs_test_opt(fs_info, DEGRADED)) {
698562306a36Sopenharmony_ci				btrfs_report_missing_device(fs_info,
698662306a36Sopenharmony_ci						devid, dev_uuid, true);
698762306a36Sopenharmony_ci				return -ENOENT;
698862306a36Sopenharmony_ci			}
698962306a36Sopenharmony_ci			btrfs_report_missing_device(fs_info, devid,
699062306a36Sopenharmony_ci							dev_uuid, false);
699162306a36Sopenharmony_ci		}
699262306a36Sopenharmony_ci
699362306a36Sopenharmony_ci		if (!device->bdev &&
699462306a36Sopenharmony_ci		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
699562306a36Sopenharmony_ci			/*
699662306a36Sopenharmony_ci			 * this happens when a device that was properly setup
699762306a36Sopenharmony_ci			 * in the device info lists suddenly goes bad.
699862306a36Sopenharmony_ci			 * device->bdev is NULL, and so we have to set
699962306a36Sopenharmony_ci			 * device->missing to one here
700062306a36Sopenharmony_ci			 */
700162306a36Sopenharmony_ci			device->fs_devices->missing_devices++;
700262306a36Sopenharmony_ci			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
700362306a36Sopenharmony_ci		}
700462306a36Sopenharmony_ci
700562306a36Sopenharmony_ci		/* Move the device to its own fs_devices */
700662306a36Sopenharmony_ci		if (device->fs_devices != fs_devices) {
700762306a36Sopenharmony_ci			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
700862306a36Sopenharmony_ci							&device->dev_state));
700962306a36Sopenharmony_ci
701062306a36Sopenharmony_ci			list_move(&device->dev_list, &fs_devices->devices);
701162306a36Sopenharmony_ci			device->fs_devices->num_devices--;
701262306a36Sopenharmony_ci			fs_devices->num_devices++;
701362306a36Sopenharmony_ci
701462306a36Sopenharmony_ci			device->fs_devices->missing_devices--;
701562306a36Sopenharmony_ci			fs_devices->missing_devices++;
701662306a36Sopenharmony_ci
701762306a36Sopenharmony_ci			device->fs_devices = fs_devices;
701862306a36Sopenharmony_ci		}
701962306a36Sopenharmony_ci	}
702062306a36Sopenharmony_ci
702162306a36Sopenharmony_ci	if (device->fs_devices != fs_info->fs_devices) {
702262306a36Sopenharmony_ci		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
702362306a36Sopenharmony_ci		if (device->generation !=
702462306a36Sopenharmony_ci		    btrfs_device_generation(leaf, dev_item))
702562306a36Sopenharmony_ci			return -EINVAL;
702662306a36Sopenharmony_ci	}
702762306a36Sopenharmony_ci
702862306a36Sopenharmony_ci	fill_device_from_item(leaf, dev_item, device);
702962306a36Sopenharmony_ci	if (device->bdev) {
703062306a36Sopenharmony_ci		u64 max_total_bytes = bdev_nr_bytes(device->bdev);
703162306a36Sopenharmony_ci
703262306a36Sopenharmony_ci		if (device->total_bytes > max_total_bytes) {
703362306a36Sopenharmony_ci			btrfs_err(fs_info,
703462306a36Sopenharmony_ci			"device total_bytes should be at most %llu but found %llu",
703562306a36Sopenharmony_ci				  max_total_bytes, device->total_bytes);
703662306a36Sopenharmony_ci			return -EINVAL;
703762306a36Sopenharmony_ci		}
703862306a36Sopenharmony_ci	}
703962306a36Sopenharmony_ci	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
704062306a36Sopenharmony_ci	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
704162306a36Sopenharmony_ci	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
704262306a36Sopenharmony_ci		device->fs_devices->total_rw_bytes += device->total_bytes;
704362306a36Sopenharmony_ci		atomic64_add(device->total_bytes - device->bytes_used,
704462306a36Sopenharmony_ci				&fs_info->free_chunk_space);
704562306a36Sopenharmony_ci	}
704662306a36Sopenharmony_ci	ret = 0;
704762306a36Sopenharmony_ci	return ret;
704862306a36Sopenharmony_ci}
704962306a36Sopenharmony_ci
705062306a36Sopenharmony_ciint btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
705162306a36Sopenharmony_ci{
705262306a36Sopenharmony_ci	struct btrfs_super_block *super_copy = fs_info->super_copy;
705362306a36Sopenharmony_ci	struct extent_buffer *sb;
705462306a36Sopenharmony_ci	struct btrfs_disk_key *disk_key;
705562306a36Sopenharmony_ci	struct btrfs_chunk *chunk;
705662306a36Sopenharmony_ci	u8 *array_ptr;
705762306a36Sopenharmony_ci	unsigned long sb_array_offset;
705862306a36Sopenharmony_ci	int ret = 0;
705962306a36Sopenharmony_ci	u32 num_stripes;
706062306a36Sopenharmony_ci	u32 array_size;
706162306a36Sopenharmony_ci	u32 len = 0;
706262306a36Sopenharmony_ci	u32 cur_offset;
706362306a36Sopenharmony_ci	u64 type;
706462306a36Sopenharmony_ci	struct btrfs_key key;
706562306a36Sopenharmony_ci
706662306a36Sopenharmony_ci	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
706762306a36Sopenharmony_ci
706862306a36Sopenharmony_ci	/*
706962306a36Sopenharmony_ci	 * We allocated a dummy extent, just to use extent buffer accessors.
707062306a36Sopenharmony_ci	 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
707162306a36Sopenharmony_ci	 * that's fine, we will not go beyond system chunk array anyway.
707262306a36Sopenharmony_ci	 */
707362306a36Sopenharmony_ci	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
707462306a36Sopenharmony_ci	if (!sb)
707562306a36Sopenharmony_ci		return -ENOMEM;
707662306a36Sopenharmony_ci	set_extent_buffer_uptodate(sb);
707762306a36Sopenharmony_ci
707862306a36Sopenharmony_ci	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
707962306a36Sopenharmony_ci	array_size = btrfs_super_sys_array_size(super_copy);
708062306a36Sopenharmony_ci
708162306a36Sopenharmony_ci	array_ptr = super_copy->sys_chunk_array;
708262306a36Sopenharmony_ci	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
708362306a36Sopenharmony_ci	cur_offset = 0;
708462306a36Sopenharmony_ci
708562306a36Sopenharmony_ci	while (cur_offset < array_size) {
708662306a36Sopenharmony_ci		disk_key = (struct btrfs_disk_key *)array_ptr;
708762306a36Sopenharmony_ci		len = sizeof(*disk_key);
708862306a36Sopenharmony_ci		if (cur_offset + len > array_size)
708962306a36Sopenharmony_ci			goto out_short_read;
709062306a36Sopenharmony_ci
709162306a36Sopenharmony_ci		btrfs_disk_key_to_cpu(&key, disk_key);
709262306a36Sopenharmony_ci
709362306a36Sopenharmony_ci		array_ptr += len;
709462306a36Sopenharmony_ci		sb_array_offset += len;
709562306a36Sopenharmony_ci		cur_offset += len;
709662306a36Sopenharmony_ci
709762306a36Sopenharmony_ci		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
709862306a36Sopenharmony_ci			btrfs_err(fs_info,
709962306a36Sopenharmony_ci			    "unexpected item type %u in sys_array at offset %u",
710062306a36Sopenharmony_ci				  (u32)key.type, cur_offset);
710162306a36Sopenharmony_ci			ret = -EIO;
710262306a36Sopenharmony_ci			break;
710362306a36Sopenharmony_ci		}
710462306a36Sopenharmony_ci
710562306a36Sopenharmony_ci		chunk = (struct btrfs_chunk *)sb_array_offset;
710662306a36Sopenharmony_ci		/*
710762306a36Sopenharmony_ci		 * At least one btrfs_chunk with one stripe must be present,
710862306a36Sopenharmony_ci		 * exact stripe count check comes afterwards
710962306a36Sopenharmony_ci		 */
711062306a36Sopenharmony_ci		len = btrfs_chunk_item_size(1);
711162306a36Sopenharmony_ci		if (cur_offset + len > array_size)
711262306a36Sopenharmony_ci			goto out_short_read;
711362306a36Sopenharmony_ci
711462306a36Sopenharmony_ci		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
711562306a36Sopenharmony_ci		if (!num_stripes) {
711662306a36Sopenharmony_ci			btrfs_err(fs_info,
711762306a36Sopenharmony_ci			"invalid number of stripes %u in sys_array at offset %u",
711862306a36Sopenharmony_ci				  num_stripes, cur_offset);
711962306a36Sopenharmony_ci			ret = -EIO;
712062306a36Sopenharmony_ci			break;
712162306a36Sopenharmony_ci		}
712262306a36Sopenharmony_ci
712362306a36Sopenharmony_ci		type = btrfs_chunk_type(sb, chunk);
712462306a36Sopenharmony_ci		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
712562306a36Sopenharmony_ci			btrfs_err(fs_info,
712662306a36Sopenharmony_ci			"invalid chunk type %llu in sys_array at offset %u",
712762306a36Sopenharmony_ci				  type, cur_offset);
712862306a36Sopenharmony_ci			ret = -EIO;
712962306a36Sopenharmony_ci			break;
713062306a36Sopenharmony_ci		}
713162306a36Sopenharmony_ci
713262306a36Sopenharmony_ci		len = btrfs_chunk_item_size(num_stripes);
713362306a36Sopenharmony_ci		if (cur_offset + len > array_size)
713462306a36Sopenharmony_ci			goto out_short_read;
713562306a36Sopenharmony_ci
713662306a36Sopenharmony_ci		ret = read_one_chunk(&key, sb, chunk);
713762306a36Sopenharmony_ci		if (ret)
713862306a36Sopenharmony_ci			break;
713962306a36Sopenharmony_ci
714062306a36Sopenharmony_ci		array_ptr += len;
714162306a36Sopenharmony_ci		sb_array_offset += len;
714262306a36Sopenharmony_ci		cur_offset += len;
714362306a36Sopenharmony_ci	}
714462306a36Sopenharmony_ci	clear_extent_buffer_uptodate(sb);
714562306a36Sopenharmony_ci	free_extent_buffer_stale(sb);
714662306a36Sopenharmony_ci	return ret;
714762306a36Sopenharmony_ci
714862306a36Sopenharmony_ciout_short_read:
714962306a36Sopenharmony_ci	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
715062306a36Sopenharmony_ci			len, cur_offset);
715162306a36Sopenharmony_ci	clear_extent_buffer_uptodate(sb);
715262306a36Sopenharmony_ci	free_extent_buffer_stale(sb);
715362306a36Sopenharmony_ci	return -EIO;
715462306a36Sopenharmony_ci}
715562306a36Sopenharmony_ci
715662306a36Sopenharmony_ci/*
715762306a36Sopenharmony_ci * Check if all chunks in the fs are OK for read-write degraded mount
715862306a36Sopenharmony_ci *
715962306a36Sopenharmony_ci * If the @failing_dev is specified, it's accounted as missing.
716062306a36Sopenharmony_ci *
716162306a36Sopenharmony_ci * Return true if all chunks meet the minimal RW mount requirements.
716262306a36Sopenharmony_ci * Return false if any chunk doesn't meet the minimal RW mount requirements.
716362306a36Sopenharmony_ci */
716462306a36Sopenharmony_cibool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
716562306a36Sopenharmony_ci					struct btrfs_device *failing_dev)
716662306a36Sopenharmony_ci{
716762306a36Sopenharmony_ci	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
716862306a36Sopenharmony_ci	struct extent_map *em;
716962306a36Sopenharmony_ci	u64 next_start = 0;
717062306a36Sopenharmony_ci	bool ret = true;
717162306a36Sopenharmony_ci
717262306a36Sopenharmony_ci	read_lock(&map_tree->lock);
717362306a36Sopenharmony_ci	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
717462306a36Sopenharmony_ci	read_unlock(&map_tree->lock);
717562306a36Sopenharmony_ci	/* No chunk at all? Return false anyway */
717662306a36Sopenharmony_ci	if (!em) {
717762306a36Sopenharmony_ci		ret = false;
717862306a36Sopenharmony_ci		goto out;
717962306a36Sopenharmony_ci	}
718062306a36Sopenharmony_ci	while (em) {
718162306a36Sopenharmony_ci		struct map_lookup *map;
718262306a36Sopenharmony_ci		int missing = 0;
718362306a36Sopenharmony_ci		int max_tolerated;
718462306a36Sopenharmony_ci		int i;
718562306a36Sopenharmony_ci
718662306a36Sopenharmony_ci		map = em->map_lookup;
718762306a36Sopenharmony_ci		max_tolerated =
718862306a36Sopenharmony_ci			btrfs_get_num_tolerated_disk_barrier_failures(
718962306a36Sopenharmony_ci					map->type);
719062306a36Sopenharmony_ci		for (i = 0; i < map->num_stripes; i++) {
719162306a36Sopenharmony_ci			struct btrfs_device *dev = map->stripes[i].dev;
719262306a36Sopenharmony_ci
719362306a36Sopenharmony_ci			if (!dev || !dev->bdev ||
719462306a36Sopenharmony_ci			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
719562306a36Sopenharmony_ci			    dev->last_flush_error)
719662306a36Sopenharmony_ci				missing++;
719762306a36Sopenharmony_ci			else if (failing_dev && failing_dev == dev)
719862306a36Sopenharmony_ci				missing++;
719962306a36Sopenharmony_ci		}
720062306a36Sopenharmony_ci		if (missing > max_tolerated) {
720162306a36Sopenharmony_ci			if (!failing_dev)
720262306a36Sopenharmony_ci				btrfs_warn(fs_info,
720362306a36Sopenharmony_ci	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
720462306a36Sopenharmony_ci				   em->start, missing, max_tolerated);
720562306a36Sopenharmony_ci			free_extent_map(em);
720662306a36Sopenharmony_ci			ret = false;
720762306a36Sopenharmony_ci			goto out;
720862306a36Sopenharmony_ci		}
720962306a36Sopenharmony_ci		next_start = extent_map_end(em);
721062306a36Sopenharmony_ci		free_extent_map(em);
721162306a36Sopenharmony_ci
721262306a36Sopenharmony_ci		read_lock(&map_tree->lock);
721362306a36Sopenharmony_ci		em = lookup_extent_mapping(map_tree, next_start,
721462306a36Sopenharmony_ci					   (u64)(-1) - next_start);
721562306a36Sopenharmony_ci		read_unlock(&map_tree->lock);
721662306a36Sopenharmony_ci	}
721762306a36Sopenharmony_ciout:
721862306a36Sopenharmony_ci	return ret;
721962306a36Sopenharmony_ci}
722062306a36Sopenharmony_ci
/* Call btrfs_readahead_node_child() for every item slot of @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_items = btrfs_header_nritems(node);
	int slot = 0;

	while (slot < nr_items) {
		btrfs_readahead_node_child(node, slot);
		slot++;
	}
}
722962306a36Sopenharmony_ci
723062306a36Sopenharmony_ciint btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
723162306a36Sopenharmony_ci{
723262306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->chunk_root;
723362306a36Sopenharmony_ci	struct btrfs_path *path;
723462306a36Sopenharmony_ci	struct extent_buffer *leaf;
723562306a36Sopenharmony_ci	struct btrfs_key key;
723662306a36Sopenharmony_ci	struct btrfs_key found_key;
723762306a36Sopenharmony_ci	int ret;
723862306a36Sopenharmony_ci	int slot;
723962306a36Sopenharmony_ci	int iter_ret = 0;
724062306a36Sopenharmony_ci	u64 total_dev = 0;
724162306a36Sopenharmony_ci	u64 last_ra_node = 0;
724262306a36Sopenharmony_ci
724362306a36Sopenharmony_ci	path = btrfs_alloc_path();
724462306a36Sopenharmony_ci	if (!path)
724562306a36Sopenharmony_ci		return -ENOMEM;
724662306a36Sopenharmony_ci
724762306a36Sopenharmony_ci	/*
724862306a36Sopenharmony_ci	 * uuid_mutex is needed only if we are mounting a sprout FS
724962306a36Sopenharmony_ci	 * otherwise we don't need it.
725062306a36Sopenharmony_ci	 */
725162306a36Sopenharmony_ci	mutex_lock(&uuid_mutex);
725262306a36Sopenharmony_ci
725362306a36Sopenharmony_ci	/*
725462306a36Sopenharmony_ci	 * It is possible for mount and umount to race in such a way that
725562306a36Sopenharmony_ci	 * we execute this code path, but open_fs_devices failed to clear
725662306a36Sopenharmony_ci	 * total_rw_bytes. We certainly want it cleared before reading the
725762306a36Sopenharmony_ci	 * device items, so clear it here.
725862306a36Sopenharmony_ci	 */
725962306a36Sopenharmony_ci	fs_info->fs_devices->total_rw_bytes = 0;
726062306a36Sopenharmony_ci
726162306a36Sopenharmony_ci	/*
726262306a36Sopenharmony_ci	 * Lockdep complains about possible circular locking dependency between
726362306a36Sopenharmony_ci	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
726462306a36Sopenharmony_ci	 * used for freeze procection of a fs (struct super_block.s_writers),
726562306a36Sopenharmony_ci	 * which we take when starting a transaction, and extent buffers of the
726662306a36Sopenharmony_ci	 * chunk tree if we call read_one_dev() while holding a lock on an
726762306a36Sopenharmony_ci	 * extent buffer of the chunk tree. Since we are mounting the filesystem
726862306a36Sopenharmony_ci	 * and at this point there can't be any concurrent task modifying the
726962306a36Sopenharmony_ci	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
727062306a36Sopenharmony_ci	 */
727162306a36Sopenharmony_ci	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
727262306a36Sopenharmony_ci	path->skip_locking = 1;
727362306a36Sopenharmony_ci
727462306a36Sopenharmony_ci	/*
727562306a36Sopenharmony_ci	 * Read all device items, and then all the chunk items. All
727662306a36Sopenharmony_ci	 * device items are found before any chunk item (their object id
727762306a36Sopenharmony_ci	 * is smaller than the lowest possible object id for a chunk
727862306a36Sopenharmony_ci	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
727962306a36Sopenharmony_ci	 */
728062306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
728162306a36Sopenharmony_ci	key.offset = 0;
728262306a36Sopenharmony_ci	key.type = 0;
728362306a36Sopenharmony_ci	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
728462306a36Sopenharmony_ci		struct extent_buffer *node = path->nodes[1];
728562306a36Sopenharmony_ci
728662306a36Sopenharmony_ci		leaf = path->nodes[0];
728762306a36Sopenharmony_ci		slot = path->slots[0];
728862306a36Sopenharmony_ci
728962306a36Sopenharmony_ci		if (node) {
729062306a36Sopenharmony_ci			if (last_ra_node != node->start) {
729162306a36Sopenharmony_ci				readahead_tree_node_children(node);
729262306a36Sopenharmony_ci				last_ra_node = node->start;
729362306a36Sopenharmony_ci			}
729462306a36Sopenharmony_ci		}
729562306a36Sopenharmony_ci		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
729662306a36Sopenharmony_ci			struct btrfs_dev_item *dev_item;
729762306a36Sopenharmony_ci			dev_item = btrfs_item_ptr(leaf, slot,
729862306a36Sopenharmony_ci						  struct btrfs_dev_item);
729962306a36Sopenharmony_ci			ret = read_one_dev(leaf, dev_item);
730062306a36Sopenharmony_ci			if (ret)
730162306a36Sopenharmony_ci				goto error;
730262306a36Sopenharmony_ci			total_dev++;
730362306a36Sopenharmony_ci		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
730462306a36Sopenharmony_ci			struct btrfs_chunk *chunk;
730562306a36Sopenharmony_ci
730662306a36Sopenharmony_ci			/*
730762306a36Sopenharmony_ci			 * We are only called at mount time, so no need to take
730862306a36Sopenharmony_ci			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
730962306a36Sopenharmony_ci			 * we always lock first fs_info->chunk_mutex before
731062306a36Sopenharmony_ci			 * acquiring any locks on the chunk tree. This is a
731162306a36Sopenharmony_ci			 * requirement for chunk allocation, see the comment on
731262306a36Sopenharmony_ci			 * top of btrfs_chunk_alloc() for details.
731362306a36Sopenharmony_ci			 */
731462306a36Sopenharmony_ci			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
731562306a36Sopenharmony_ci			ret = read_one_chunk(&found_key, leaf, chunk);
731662306a36Sopenharmony_ci			if (ret)
731762306a36Sopenharmony_ci				goto error;
731862306a36Sopenharmony_ci		}
731962306a36Sopenharmony_ci	}
732062306a36Sopenharmony_ci	/* Catch error found during iteration */
732162306a36Sopenharmony_ci	if (iter_ret < 0) {
732262306a36Sopenharmony_ci		ret = iter_ret;
732362306a36Sopenharmony_ci		goto error;
732462306a36Sopenharmony_ci	}
732562306a36Sopenharmony_ci
732662306a36Sopenharmony_ci	/*
732762306a36Sopenharmony_ci	 * After loading chunk tree, we've got all device information,
732862306a36Sopenharmony_ci	 * do another round of validation checks.
732962306a36Sopenharmony_ci	 */
733062306a36Sopenharmony_ci	if (total_dev != fs_info->fs_devices->total_devices) {
733162306a36Sopenharmony_ci		btrfs_warn(fs_info,
733262306a36Sopenharmony_ci"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
733362306a36Sopenharmony_ci			  btrfs_super_num_devices(fs_info->super_copy),
733462306a36Sopenharmony_ci			  total_dev);
733562306a36Sopenharmony_ci		fs_info->fs_devices->total_devices = total_dev;
733662306a36Sopenharmony_ci		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
733762306a36Sopenharmony_ci	}
733862306a36Sopenharmony_ci	if (btrfs_super_total_bytes(fs_info->super_copy) <
733962306a36Sopenharmony_ci	    fs_info->fs_devices->total_rw_bytes) {
734062306a36Sopenharmony_ci		btrfs_err(fs_info,
734162306a36Sopenharmony_ci	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
734262306a36Sopenharmony_ci			  btrfs_super_total_bytes(fs_info->super_copy),
734362306a36Sopenharmony_ci			  fs_info->fs_devices->total_rw_bytes);
734462306a36Sopenharmony_ci		ret = -EINVAL;
734562306a36Sopenharmony_ci		goto error;
734662306a36Sopenharmony_ci	}
734762306a36Sopenharmony_ci	ret = 0;
734862306a36Sopenharmony_cierror:
734962306a36Sopenharmony_ci	mutex_unlock(&uuid_mutex);
735062306a36Sopenharmony_ci
735162306a36Sopenharmony_ci	btrfs_free_path(path);
735262306a36Sopenharmony_ci	return ret;
735362306a36Sopenharmony_ci}
735462306a36Sopenharmony_ci
735562306a36Sopenharmony_ciint btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
735662306a36Sopenharmony_ci{
735762306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
735862306a36Sopenharmony_ci	struct btrfs_device *device;
735962306a36Sopenharmony_ci	int ret = 0;
736062306a36Sopenharmony_ci
736162306a36Sopenharmony_ci	fs_devices->fs_info = fs_info;
736262306a36Sopenharmony_ci
736362306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
736462306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list)
736562306a36Sopenharmony_ci		device->fs_info = fs_info;
736662306a36Sopenharmony_ci
736762306a36Sopenharmony_ci	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
736862306a36Sopenharmony_ci		list_for_each_entry(device, &seed_devs->devices, dev_list) {
736962306a36Sopenharmony_ci			device->fs_info = fs_info;
737062306a36Sopenharmony_ci			ret = btrfs_get_dev_zone_info(device, false);
737162306a36Sopenharmony_ci			if (ret)
737262306a36Sopenharmony_ci				break;
737362306a36Sopenharmony_ci		}
737462306a36Sopenharmony_ci
737562306a36Sopenharmony_ci		seed_devs->fs_info = fs_info;
737662306a36Sopenharmony_ci	}
737762306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
737862306a36Sopenharmony_ci
737962306a36Sopenharmony_ci	return ret;
738062306a36Sopenharmony_ci}
738162306a36Sopenharmony_ci
738262306a36Sopenharmony_cistatic u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
738362306a36Sopenharmony_ci				 const struct btrfs_dev_stats_item *ptr,
738462306a36Sopenharmony_ci				 int index)
738562306a36Sopenharmony_ci{
738662306a36Sopenharmony_ci	u64 val;
738762306a36Sopenharmony_ci
738862306a36Sopenharmony_ci	read_extent_buffer(eb, &val,
738962306a36Sopenharmony_ci			   offsetof(struct btrfs_dev_stats_item, values) +
739062306a36Sopenharmony_ci			    ((unsigned long)ptr) + (index * sizeof(u64)),
739162306a36Sopenharmony_ci			   sizeof(val));
739262306a36Sopenharmony_ci	return val;
739362306a36Sopenharmony_ci}
739462306a36Sopenharmony_ci
739562306a36Sopenharmony_cistatic void btrfs_set_dev_stats_value(struct extent_buffer *eb,
739662306a36Sopenharmony_ci				      struct btrfs_dev_stats_item *ptr,
739762306a36Sopenharmony_ci				      int index, u64 val)
739862306a36Sopenharmony_ci{
739962306a36Sopenharmony_ci	write_extent_buffer(eb, &val,
740062306a36Sopenharmony_ci			    offsetof(struct btrfs_dev_stats_item, values) +
740162306a36Sopenharmony_ci			     ((unsigned long)ptr) + (index * sizeof(u64)),
740262306a36Sopenharmony_ci			    sizeof(val));
740362306a36Sopenharmony_ci}
740462306a36Sopenharmony_ci
740562306a36Sopenharmony_cistatic int btrfs_device_init_dev_stats(struct btrfs_device *device,
740662306a36Sopenharmony_ci				       struct btrfs_path *path)
740762306a36Sopenharmony_ci{
740862306a36Sopenharmony_ci	struct btrfs_dev_stats_item *ptr;
740962306a36Sopenharmony_ci	struct extent_buffer *eb;
741062306a36Sopenharmony_ci	struct btrfs_key key;
741162306a36Sopenharmony_ci	int item_size;
741262306a36Sopenharmony_ci	int i, ret, slot;
741362306a36Sopenharmony_ci
741462306a36Sopenharmony_ci	if (!device->fs_info->dev_root)
741562306a36Sopenharmony_ci		return 0;
741662306a36Sopenharmony_ci
741762306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_STATS_OBJECTID;
741862306a36Sopenharmony_ci	key.type = BTRFS_PERSISTENT_ITEM_KEY;
741962306a36Sopenharmony_ci	key.offset = device->devid;
742062306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
742162306a36Sopenharmony_ci	if (ret) {
742262306a36Sopenharmony_ci		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
742362306a36Sopenharmony_ci			btrfs_dev_stat_set(device, i, 0);
742462306a36Sopenharmony_ci		device->dev_stats_valid = 1;
742562306a36Sopenharmony_ci		btrfs_release_path(path);
742662306a36Sopenharmony_ci		return ret < 0 ? ret : 0;
742762306a36Sopenharmony_ci	}
742862306a36Sopenharmony_ci	slot = path->slots[0];
742962306a36Sopenharmony_ci	eb = path->nodes[0];
743062306a36Sopenharmony_ci	item_size = btrfs_item_size(eb, slot);
743162306a36Sopenharmony_ci
743262306a36Sopenharmony_ci	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
743362306a36Sopenharmony_ci
743462306a36Sopenharmony_ci	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
743562306a36Sopenharmony_ci		if (item_size >= (1 + i) * sizeof(__le64))
743662306a36Sopenharmony_ci			btrfs_dev_stat_set(device, i,
743762306a36Sopenharmony_ci					   btrfs_dev_stats_value(eb, ptr, i));
743862306a36Sopenharmony_ci		else
743962306a36Sopenharmony_ci			btrfs_dev_stat_set(device, i, 0);
744062306a36Sopenharmony_ci	}
744162306a36Sopenharmony_ci
744262306a36Sopenharmony_ci	device->dev_stats_valid = 1;
744362306a36Sopenharmony_ci	btrfs_dev_stat_print_on_load(device);
744462306a36Sopenharmony_ci	btrfs_release_path(path);
744562306a36Sopenharmony_ci
744662306a36Sopenharmony_ci	return 0;
744762306a36Sopenharmony_ci}
744862306a36Sopenharmony_ci
744962306a36Sopenharmony_ciint btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
745062306a36Sopenharmony_ci{
745162306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
745262306a36Sopenharmony_ci	struct btrfs_device *device;
745362306a36Sopenharmony_ci	struct btrfs_path *path = NULL;
745462306a36Sopenharmony_ci	int ret = 0;
745562306a36Sopenharmony_ci
745662306a36Sopenharmony_ci	path = btrfs_alloc_path();
745762306a36Sopenharmony_ci	if (!path)
745862306a36Sopenharmony_ci		return -ENOMEM;
745962306a36Sopenharmony_ci
746062306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
746162306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
746262306a36Sopenharmony_ci		ret = btrfs_device_init_dev_stats(device, path);
746362306a36Sopenharmony_ci		if (ret)
746462306a36Sopenharmony_ci			goto out;
746562306a36Sopenharmony_ci	}
746662306a36Sopenharmony_ci	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
746762306a36Sopenharmony_ci		list_for_each_entry(device, &seed_devs->devices, dev_list) {
746862306a36Sopenharmony_ci			ret = btrfs_device_init_dev_stats(device, path);
746962306a36Sopenharmony_ci			if (ret)
747062306a36Sopenharmony_ci				goto out;
747162306a36Sopenharmony_ci		}
747262306a36Sopenharmony_ci	}
747362306a36Sopenharmony_ciout:
747462306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
747562306a36Sopenharmony_ci
747662306a36Sopenharmony_ci	btrfs_free_path(path);
747762306a36Sopenharmony_ci	return ret;
747862306a36Sopenharmony_ci}
747962306a36Sopenharmony_ci
748062306a36Sopenharmony_cistatic int update_dev_stat_item(struct btrfs_trans_handle *trans,
748162306a36Sopenharmony_ci				struct btrfs_device *device)
748262306a36Sopenharmony_ci{
748362306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
748462306a36Sopenharmony_ci	struct btrfs_root *dev_root = fs_info->dev_root;
748562306a36Sopenharmony_ci	struct btrfs_path *path;
748662306a36Sopenharmony_ci	struct btrfs_key key;
748762306a36Sopenharmony_ci	struct extent_buffer *eb;
748862306a36Sopenharmony_ci	struct btrfs_dev_stats_item *ptr;
748962306a36Sopenharmony_ci	int ret;
749062306a36Sopenharmony_ci	int i;
749162306a36Sopenharmony_ci
749262306a36Sopenharmony_ci	key.objectid = BTRFS_DEV_STATS_OBJECTID;
749362306a36Sopenharmony_ci	key.type = BTRFS_PERSISTENT_ITEM_KEY;
749462306a36Sopenharmony_ci	key.offset = device->devid;
749562306a36Sopenharmony_ci
749662306a36Sopenharmony_ci	path = btrfs_alloc_path();
749762306a36Sopenharmony_ci	if (!path)
749862306a36Sopenharmony_ci		return -ENOMEM;
749962306a36Sopenharmony_ci	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
750062306a36Sopenharmony_ci	if (ret < 0) {
750162306a36Sopenharmony_ci		btrfs_warn_in_rcu(fs_info,
750262306a36Sopenharmony_ci			"error %d while searching for dev_stats item for device %s",
750362306a36Sopenharmony_ci				  ret, btrfs_dev_name(device));
750462306a36Sopenharmony_ci		goto out;
750562306a36Sopenharmony_ci	}
750662306a36Sopenharmony_ci
750762306a36Sopenharmony_ci	if (ret == 0 &&
750862306a36Sopenharmony_ci	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
750962306a36Sopenharmony_ci		/* need to delete old one and insert a new one */
751062306a36Sopenharmony_ci		ret = btrfs_del_item(trans, dev_root, path);
751162306a36Sopenharmony_ci		if (ret != 0) {
751262306a36Sopenharmony_ci			btrfs_warn_in_rcu(fs_info,
751362306a36Sopenharmony_ci				"delete too small dev_stats item for device %s failed %d",
751462306a36Sopenharmony_ci					  btrfs_dev_name(device), ret);
751562306a36Sopenharmony_ci			goto out;
751662306a36Sopenharmony_ci		}
751762306a36Sopenharmony_ci		ret = 1;
751862306a36Sopenharmony_ci	}
751962306a36Sopenharmony_ci
752062306a36Sopenharmony_ci	if (ret == 1) {
752162306a36Sopenharmony_ci		/* need to insert a new item */
752262306a36Sopenharmony_ci		btrfs_release_path(path);
752362306a36Sopenharmony_ci		ret = btrfs_insert_empty_item(trans, dev_root, path,
752462306a36Sopenharmony_ci					      &key, sizeof(*ptr));
752562306a36Sopenharmony_ci		if (ret < 0) {
752662306a36Sopenharmony_ci			btrfs_warn_in_rcu(fs_info,
752762306a36Sopenharmony_ci				"insert dev_stats item for device %s failed %d",
752862306a36Sopenharmony_ci				btrfs_dev_name(device), ret);
752962306a36Sopenharmony_ci			goto out;
753062306a36Sopenharmony_ci		}
753162306a36Sopenharmony_ci	}
753262306a36Sopenharmony_ci
753362306a36Sopenharmony_ci	eb = path->nodes[0];
753462306a36Sopenharmony_ci	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
753562306a36Sopenharmony_ci	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
753662306a36Sopenharmony_ci		btrfs_set_dev_stats_value(eb, ptr, i,
753762306a36Sopenharmony_ci					  btrfs_dev_stat_read(device, i));
753862306a36Sopenharmony_ci	btrfs_mark_buffer_dirty(trans, eb);
753962306a36Sopenharmony_ci
754062306a36Sopenharmony_ciout:
754162306a36Sopenharmony_ci	btrfs_free_path(path);
754262306a36Sopenharmony_ci	return ret;
754362306a36Sopenharmony_ci}
754462306a36Sopenharmony_ci
754562306a36Sopenharmony_ci/*
754662306a36Sopenharmony_ci * called from commit_transaction. Writes all changed device stats to disk.
754762306a36Sopenharmony_ci */
754862306a36Sopenharmony_ciint btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
754962306a36Sopenharmony_ci{
755062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = trans->fs_info;
755162306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
755262306a36Sopenharmony_ci	struct btrfs_device *device;
755362306a36Sopenharmony_ci	int stats_cnt;
755462306a36Sopenharmony_ci	int ret = 0;
755562306a36Sopenharmony_ci
755662306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
755762306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
755862306a36Sopenharmony_ci		stats_cnt = atomic_read(&device->dev_stats_ccnt);
755962306a36Sopenharmony_ci		if (!device->dev_stats_valid || stats_cnt == 0)
756062306a36Sopenharmony_ci			continue;
756162306a36Sopenharmony_ci
756262306a36Sopenharmony_ci
756362306a36Sopenharmony_ci		/*
756462306a36Sopenharmony_ci		 * There is a LOAD-LOAD control dependency between the value of
756562306a36Sopenharmony_ci		 * dev_stats_ccnt and updating the on-disk values which requires
756662306a36Sopenharmony_ci		 * reading the in-memory counters. Such control dependencies
756762306a36Sopenharmony_ci		 * require explicit read memory barriers.
756862306a36Sopenharmony_ci		 *
756962306a36Sopenharmony_ci		 * This memory barriers pairs with smp_mb__before_atomic in
757062306a36Sopenharmony_ci		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
757162306a36Sopenharmony_ci		 * barrier implied by atomic_xchg in
757262306a36Sopenharmony_ci		 * btrfs_dev_stats_read_and_reset
757362306a36Sopenharmony_ci		 */
757462306a36Sopenharmony_ci		smp_rmb();
757562306a36Sopenharmony_ci
757662306a36Sopenharmony_ci		ret = update_dev_stat_item(trans, device);
757762306a36Sopenharmony_ci		if (!ret)
757862306a36Sopenharmony_ci			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
757962306a36Sopenharmony_ci	}
758062306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
758162306a36Sopenharmony_ci
758262306a36Sopenharmony_ci	return ret;
758362306a36Sopenharmony_ci}
758462306a36Sopenharmony_ci
758562306a36Sopenharmony_civoid btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
758662306a36Sopenharmony_ci{
758762306a36Sopenharmony_ci	btrfs_dev_stat_inc(dev, index);
758862306a36Sopenharmony_ci
758962306a36Sopenharmony_ci	if (!dev->dev_stats_valid)
759062306a36Sopenharmony_ci		return;
759162306a36Sopenharmony_ci	btrfs_err_rl_in_rcu(dev->fs_info,
759262306a36Sopenharmony_ci		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
759362306a36Sopenharmony_ci			   btrfs_dev_name(dev),
759462306a36Sopenharmony_ci			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
759562306a36Sopenharmony_ci			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
759662306a36Sopenharmony_ci			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
759762306a36Sopenharmony_ci			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
759862306a36Sopenharmony_ci			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
759962306a36Sopenharmony_ci}
760062306a36Sopenharmony_ci
760162306a36Sopenharmony_cistatic void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
760262306a36Sopenharmony_ci{
760362306a36Sopenharmony_ci	int i;
760462306a36Sopenharmony_ci
760562306a36Sopenharmony_ci	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
760662306a36Sopenharmony_ci		if (btrfs_dev_stat_read(dev, i) != 0)
760762306a36Sopenharmony_ci			break;
760862306a36Sopenharmony_ci	if (i == BTRFS_DEV_STAT_VALUES_MAX)
760962306a36Sopenharmony_ci		return; /* all values == 0, suppress message */
761062306a36Sopenharmony_ci
761162306a36Sopenharmony_ci	btrfs_info_in_rcu(dev->fs_info,
761262306a36Sopenharmony_ci		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
761362306a36Sopenharmony_ci	       btrfs_dev_name(dev),
761462306a36Sopenharmony_ci	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
761562306a36Sopenharmony_ci	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
761662306a36Sopenharmony_ci	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
761762306a36Sopenharmony_ci	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
761862306a36Sopenharmony_ci	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
761962306a36Sopenharmony_ci}
762062306a36Sopenharmony_ci
762162306a36Sopenharmony_ciint btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
762262306a36Sopenharmony_ci			struct btrfs_ioctl_get_dev_stats *stats)
762362306a36Sopenharmony_ci{
762462306a36Sopenharmony_ci	BTRFS_DEV_LOOKUP_ARGS(args);
762562306a36Sopenharmony_ci	struct btrfs_device *dev;
762662306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
762762306a36Sopenharmony_ci	int i;
762862306a36Sopenharmony_ci
762962306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
763062306a36Sopenharmony_ci	args.devid = stats->devid;
763162306a36Sopenharmony_ci	dev = btrfs_find_device(fs_info->fs_devices, &args);
763262306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
763362306a36Sopenharmony_ci
763462306a36Sopenharmony_ci	if (!dev) {
763562306a36Sopenharmony_ci		btrfs_warn(fs_info, "get dev_stats failed, device not found");
763662306a36Sopenharmony_ci		return -ENODEV;
763762306a36Sopenharmony_ci	} else if (!dev->dev_stats_valid) {
763862306a36Sopenharmony_ci		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
763962306a36Sopenharmony_ci		return -ENODEV;
764062306a36Sopenharmony_ci	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
764162306a36Sopenharmony_ci		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
764262306a36Sopenharmony_ci			if (stats->nr_items > i)
764362306a36Sopenharmony_ci				stats->values[i] =
764462306a36Sopenharmony_ci					btrfs_dev_stat_read_and_reset(dev, i);
764562306a36Sopenharmony_ci			else
764662306a36Sopenharmony_ci				btrfs_dev_stat_set(dev, i, 0);
764762306a36Sopenharmony_ci		}
764862306a36Sopenharmony_ci		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
764962306a36Sopenharmony_ci			   current->comm, task_pid_nr(current));
765062306a36Sopenharmony_ci	} else {
765162306a36Sopenharmony_ci		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
765262306a36Sopenharmony_ci			if (stats->nr_items > i)
765362306a36Sopenharmony_ci				stats->values[i] = btrfs_dev_stat_read(dev, i);
765462306a36Sopenharmony_ci	}
765562306a36Sopenharmony_ci	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
765662306a36Sopenharmony_ci		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
765762306a36Sopenharmony_ci	return 0;
765862306a36Sopenharmony_ci}
765962306a36Sopenharmony_ci
766062306a36Sopenharmony_ci/*
766162306a36Sopenharmony_ci * Update the size and bytes used for each device where it changed.  This is
766262306a36Sopenharmony_ci * delayed since we would otherwise get errors while writing out the
766362306a36Sopenharmony_ci * superblocks.
766462306a36Sopenharmony_ci *
766562306a36Sopenharmony_ci * Must be invoked during transaction commit.
766662306a36Sopenharmony_ci */
766762306a36Sopenharmony_civoid btrfs_commit_device_sizes(struct btrfs_transaction *trans)
766862306a36Sopenharmony_ci{
766962306a36Sopenharmony_ci	struct btrfs_device *curr, *next;
767062306a36Sopenharmony_ci
767162306a36Sopenharmony_ci	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
767262306a36Sopenharmony_ci
767362306a36Sopenharmony_ci	if (list_empty(&trans->dev_update_list))
767462306a36Sopenharmony_ci		return;
767562306a36Sopenharmony_ci
767662306a36Sopenharmony_ci	/*
767762306a36Sopenharmony_ci	 * We don't need the device_list_mutex here.  This list is owned by the
767862306a36Sopenharmony_ci	 * transaction and the transaction must complete before the device is
767962306a36Sopenharmony_ci	 * released.
768062306a36Sopenharmony_ci	 */
768162306a36Sopenharmony_ci	mutex_lock(&trans->fs_info->chunk_mutex);
768262306a36Sopenharmony_ci	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
768362306a36Sopenharmony_ci				 post_commit_list) {
768462306a36Sopenharmony_ci		list_del_init(&curr->post_commit_list);
768562306a36Sopenharmony_ci		curr->commit_total_bytes = curr->disk_total_bytes;
768662306a36Sopenharmony_ci		curr->commit_bytes_used = curr->bytes_used;
768762306a36Sopenharmony_ci	}
768862306a36Sopenharmony_ci	mutex_unlock(&trans->fs_info->chunk_mutex);
768962306a36Sopenharmony_ci}
769062306a36Sopenharmony_ci
769162306a36Sopenharmony_ci/*
769262306a36Sopenharmony_ci * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
769362306a36Sopenharmony_ci */
769462306a36Sopenharmony_ciint btrfs_bg_type_to_factor(u64 flags)
769562306a36Sopenharmony_ci{
769662306a36Sopenharmony_ci	const int index = btrfs_bg_flags_to_raid_index(flags);
769762306a36Sopenharmony_ci
769862306a36Sopenharmony_ci	return btrfs_raid_array[index].ncopies;
769962306a36Sopenharmony_ci}
770062306a36Sopenharmony_ci
770162306a36Sopenharmony_ci
770262306a36Sopenharmony_ci
770362306a36Sopenharmony_cistatic int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
770462306a36Sopenharmony_ci				 u64 chunk_offset, u64 devid,
770562306a36Sopenharmony_ci				 u64 physical_offset, u64 physical_len)
770662306a36Sopenharmony_ci{
770762306a36Sopenharmony_ci	struct btrfs_dev_lookup_args args = { .devid = devid };
770862306a36Sopenharmony_ci	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
770962306a36Sopenharmony_ci	struct extent_map *em;
771062306a36Sopenharmony_ci	struct map_lookup *map;
771162306a36Sopenharmony_ci	struct btrfs_device *dev;
771262306a36Sopenharmony_ci	u64 stripe_len;
771362306a36Sopenharmony_ci	bool found = false;
771462306a36Sopenharmony_ci	int ret = 0;
771562306a36Sopenharmony_ci	int i;
771662306a36Sopenharmony_ci
771762306a36Sopenharmony_ci	read_lock(&em_tree->lock);
771862306a36Sopenharmony_ci	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
771962306a36Sopenharmony_ci	read_unlock(&em_tree->lock);
772062306a36Sopenharmony_ci
772162306a36Sopenharmony_ci	if (!em) {
772262306a36Sopenharmony_ci		btrfs_err(fs_info,
772362306a36Sopenharmony_ci"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
772462306a36Sopenharmony_ci			  physical_offset, devid);
772562306a36Sopenharmony_ci		ret = -EUCLEAN;
772662306a36Sopenharmony_ci		goto out;
772762306a36Sopenharmony_ci	}
772862306a36Sopenharmony_ci
772962306a36Sopenharmony_ci	map = em->map_lookup;
773062306a36Sopenharmony_ci	stripe_len = btrfs_calc_stripe_length(em);
773162306a36Sopenharmony_ci	if (physical_len != stripe_len) {
773262306a36Sopenharmony_ci		btrfs_err(fs_info,
773362306a36Sopenharmony_ci"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
773462306a36Sopenharmony_ci			  physical_offset, devid, em->start, physical_len,
773562306a36Sopenharmony_ci			  stripe_len);
773662306a36Sopenharmony_ci		ret = -EUCLEAN;
773762306a36Sopenharmony_ci		goto out;
773862306a36Sopenharmony_ci	}
773962306a36Sopenharmony_ci
774062306a36Sopenharmony_ci	/*
774162306a36Sopenharmony_ci	 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
774262306a36Sopenharmony_ci	 * space. Although kernel can handle it without problem, better to warn
774362306a36Sopenharmony_ci	 * the users.
774462306a36Sopenharmony_ci	 */
774562306a36Sopenharmony_ci	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
774662306a36Sopenharmony_ci		btrfs_warn(fs_info,
774762306a36Sopenharmony_ci		"devid %llu physical %llu len %llu inside the reserved space",
774862306a36Sopenharmony_ci			   devid, physical_offset, physical_len);
774962306a36Sopenharmony_ci
775062306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
775162306a36Sopenharmony_ci		if (map->stripes[i].dev->devid == devid &&
775262306a36Sopenharmony_ci		    map->stripes[i].physical == physical_offset) {
775362306a36Sopenharmony_ci			found = true;
775462306a36Sopenharmony_ci			if (map->verified_stripes >= map->num_stripes) {
775562306a36Sopenharmony_ci				btrfs_err(fs_info,
775662306a36Sopenharmony_ci				"too many dev extents for chunk %llu found",
775762306a36Sopenharmony_ci					  em->start);
775862306a36Sopenharmony_ci				ret = -EUCLEAN;
775962306a36Sopenharmony_ci				goto out;
776062306a36Sopenharmony_ci			}
776162306a36Sopenharmony_ci			map->verified_stripes++;
776262306a36Sopenharmony_ci			break;
776362306a36Sopenharmony_ci		}
776462306a36Sopenharmony_ci	}
776562306a36Sopenharmony_ci	if (!found) {
776662306a36Sopenharmony_ci		btrfs_err(fs_info,
776762306a36Sopenharmony_ci	"dev extent physical offset %llu devid %llu has no corresponding chunk",
776862306a36Sopenharmony_ci			physical_offset, devid);
776962306a36Sopenharmony_ci		ret = -EUCLEAN;
777062306a36Sopenharmony_ci	}
777162306a36Sopenharmony_ci
777262306a36Sopenharmony_ci	/* Make sure no dev extent is beyond device boundary */
777362306a36Sopenharmony_ci	dev = btrfs_find_device(fs_info->fs_devices, &args);
777462306a36Sopenharmony_ci	if (!dev) {
777562306a36Sopenharmony_ci		btrfs_err(fs_info, "failed to find devid %llu", devid);
777662306a36Sopenharmony_ci		ret = -EUCLEAN;
777762306a36Sopenharmony_ci		goto out;
777862306a36Sopenharmony_ci	}
777962306a36Sopenharmony_ci
778062306a36Sopenharmony_ci	if (physical_offset + physical_len > dev->disk_total_bytes) {
778162306a36Sopenharmony_ci		btrfs_err(fs_info,
778262306a36Sopenharmony_ci"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
778362306a36Sopenharmony_ci			  devid, physical_offset, physical_len,
778462306a36Sopenharmony_ci			  dev->disk_total_bytes);
778562306a36Sopenharmony_ci		ret = -EUCLEAN;
778662306a36Sopenharmony_ci		goto out;
778762306a36Sopenharmony_ci	}
778862306a36Sopenharmony_ci
778962306a36Sopenharmony_ci	if (dev->zone_info) {
779062306a36Sopenharmony_ci		u64 zone_size = dev->zone_info->zone_size;
779162306a36Sopenharmony_ci
779262306a36Sopenharmony_ci		if (!IS_ALIGNED(physical_offset, zone_size) ||
779362306a36Sopenharmony_ci		    !IS_ALIGNED(physical_len, zone_size)) {
779462306a36Sopenharmony_ci			btrfs_err(fs_info,
779562306a36Sopenharmony_ci"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
779662306a36Sopenharmony_ci				  devid, physical_offset, physical_len);
779762306a36Sopenharmony_ci			ret = -EUCLEAN;
779862306a36Sopenharmony_ci			goto out;
779962306a36Sopenharmony_ci		}
780062306a36Sopenharmony_ci	}
780162306a36Sopenharmony_ci
780262306a36Sopenharmony_ciout:
780362306a36Sopenharmony_ci	free_extent_map(em);
780462306a36Sopenharmony_ci	return ret;
780562306a36Sopenharmony_ci}
780662306a36Sopenharmony_ci
780762306a36Sopenharmony_cistatic int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
780862306a36Sopenharmony_ci{
780962306a36Sopenharmony_ci	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
781062306a36Sopenharmony_ci	struct extent_map *em;
781162306a36Sopenharmony_ci	struct rb_node *node;
781262306a36Sopenharmony_ci	int ret = 0;
781362306a36Sopenharmony_ci
781462306a36Sopenharmony_ci	read_lock(&em_tree->lock);
781562306a36Sopenharmony_ci	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
781662306a36Sopenharmony_ci		em = rb_entry(node, struct extent_map, rb_node);
781762306a36Sopenharmony_ci		if (em->map_lookup->num_stripes !=
781862306a36Sopenharmony_ci		    em->map_lookup->verified_stripes) {
781962306a36Sopenharmony_ci			btrfs_err(fs_info,
782062306a36Sopenharmony_ci			"chunk %llu has missing dev extent, have %d expect %d",
782162306a36Sopenharmony_ci				  em->start, em->map_lookup->verified_stripes,
782262306a36Sopenharmony_ci				  em->map_lookup->num_stripes);
782362306a36Sopenharmony_ci			ret = -EUCLEAN;
782462306a36Sopenharmony_ci			goto out;
782562306a36Sopenharmony_ci		}
782662306a36Sopenharmony_ci	}
782762306a36Sopenharmony_ciout:
782862306a36Sopenharmony_ci	read_unlock(&em_tree->lock);
782962306a36Sopenharmony_ci	return ret;
783062306a36Sopenharmony_ci}
783162306a36Sopenharmony_ci
783262306a36Sopenharmony_ci/*
783362306a36Sopenharmony_ci * Ensure that all dev extents are mapped to correct chunk, otherwise
783462306a36Sopenharmony_ci * later chunk allocation/free would cause unexpected behavior.
783562306a36Sopenharmony_ci *
783662306a36Sopenharmony_ci * NOTE: This will iterate through the whole device tree, which should be of
783762306a36Sopenharmony_ci * the same size level as the chunk tree.  This slightly increases mount time.
783862306a36Sopenharmony_ci */
783962306a36Sopenharmony_ciint btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
784062306a36Sopenharmony_ci{
784162306a36Sopenharmony_ci	struct btrfs_path *path;
784262306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
784362306a36Sopenharmony_ci	struct btrfs_key key;
784462306a36Sopenharmony_ci	u64 prev_devid = 0;
784562306a36Sopenharmony_ci	u64 prev_dev_ext_end = 0;
784662306a36Sopenharmony_ci	int ret = 0;
784762306a36Sopenharmony_ci
784862306a36Sopenharmony_ci	/*
784962306a36Sopenharmony_ci	 * We don't have a dev_root because we mounted with ignorebadroots and
785062306a36Sopenharmony_ci	 * failed to load the root, so we want to skip the verification in this
785162306a36Sopenharmony_ci	 * case for sure.
785262306a36Sopenharmony_ci	 *
785362306a36Sopenharmony_ci	 * However if the dev root is fine, but the tree itself is corrupted
785462306a36Sopenharmony_ci	 * we'd still fail to mount.  This verification is only to make sure
785562306a36Sopenharmony_ci	 * writes can happen safely, so instead just bypass this check
785662306a36Sopenharmony_ci	 * completely in the case of IGNOREBADROOTS.
785762306a36Sopenharmony_ci	 */
785862306a36Sopenharmony_ci	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
785962306a36Sopenharmony_ci		return 0;
786062306a36Sopenharmony_ci
786162306a36Sopenharmony_ci	key.objectid = 1;
786262306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
786362306a36Sopenharmony_ci	key.offset = 0;
786462306a36Sopenharmony_ci
786562306a36Sopenharmony_ci	path = btrfs_alloc_path();
786662306a36Sopenharmony_ci	if (!path)
786762306a36Sopenharmony_ci		return -ENOMEM;
786862306a36Sopenharmony_ci
786962306a36Sopenharmony_ci	path->reada = READA_FORWARD;
787062306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
787162306a36Sopenharmony_ci	if (ret < 0)
787262306a36Sopenharmony_ci		goto out;
787362306a36Sopenharmony_ci
787462306a36Sopenharmony_ci	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
787562306a36Sopenharmony_ci		ret = btrfs_next_leaf(root, path);
787662306a36Sopenharmony_ci		if (ret < 0)
787762306a36Sopenharmony_ci			goto out;
787862306a36Sopenharmony_ci		/* No dev extents at all? Not good */
787962306a36Sopenharmony_ci		if (ret > 0) {
788062306a36Sopenharmony_ci			ret = -EUCLEAN;
788162306a36Sopenharmony_ci			goto out;
788262306a36Sopenharmony_ci		}
788362306a36Sopenharmony_ci	}
788462306a36Sopenharmony_ci	while (1) {
788562306a36Sopenharmony_ci		struct extent_buffer *leaf = path->nodes[0];
788662306a36Sopenharmony_ci		struct btrfs_dev_extent *dext;
788762306a36Sopenharmony_ci		int slot = path->slots[0];
788862306a36Sopenharmony_ci		u64 chunk_offset;
788962306a36Sopenharmony_ci		u64 physical_offset;
789062306a36Sopenharmony_ci		u64 physical_len;
789162306a36Sopenharmony_ci		u64 devid;
789262306a36Sopenharmony_ci
789362306a36Sopenharmony_ci		btrfs_item_key_to_cpu(leaf, &key, slot);
789462306a36Sopenharmony_ci		if (key.type != BTRFS_DEV_EXTENT_KEY)
789562306a36Sopenharmony_ci			break;
789662306a36Sopenharmony_ci		devid = key.objectid;
789762306a36Sopenharmony_ci		physical_offset = key.offset;
789862306a36Sopenharmony_ci
789962306a36Sopenharmony_ci		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
790062306a36Sopenharmony_ci		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
790162306a36Sopenharmony_ci		physical_len = btrfs_dev_extent_length(leaf, dext);
790262306a36Sopenharmony_ci
790362306a36Sopenharmony_ci		/* Check if this dev extent overlaps with the previous one */
790462306a36Sopenharmony_ci		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
790562306a36Sopenharmony_ci			btrfs_err(fs_info,
790662306a36Sopenharmony_ci"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
790762306a36Sopenharmony_ci				  devid, physical_offset, prev_dev_ext_end);
790862306a36Sopenharmony_ci			ret = -EUCLEAN;
790962306a36Sopenharmony_ci			goto out;
791062306a36Sopenharmony_ci		}
791162306a36Sopenharmony_ci
791262306a36Sopenharmony_ci		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
791362306a36Sopenharmony_ci					    physical_offset, physical_len);
791462306a36Sopenharmony_ci		if (ret < 0)
791562306a36Sopenharmony_ci			goto out;
791662306a36Sopenharmony_ci		prev_devid = devid;
791762306a36Sopenharmony_ci		prev_dev_ext_end = physical_offset + physical_len;
791862306a36Sopenharmony_ci
791962306a36Sopenharmony_ci		ret = btrfs_next_item(root, path);
792062306a36Sopenharmony_ci		if (ret < 0)
792162306a36Sopenharmony_ci			goto out;
792262306a36Sopenharmony_ci		if (ret > 0) {
792362306a36Sopenharmony_ci			ret = 0;
792462306a36Sopenharmony_ci			break;
792562306a36Sopenharmony_ci		}
792662306a36Sopenharmony_ci	}
792762306a36Sopenharmony_ci
792862306a36Sopenharmony_ci	/* Ensure all chunks have corresponding dev extents */
792962306a36Sopenharmony_ci	ret = verify_chunk_dev_extent_mapping(fs_info);
793062306a36Sopenharmony_ciout:
793162306a36Sopenharmony_ci	btrfs_free_path(path);
793262306a36Sopenharmony_ci	return ret;
793362306a36Sopenharmony_ci}
793462306a36Sopenharmony_ci
793562306a36Sopenharmony_ci/*
793662306a36Sopenharmony_ci * Check whether the given block group or device is pinned by any inode being
793762306a36Sopenharmony_ci * used as a swapfile.
793862306a36Sopenharmony_ci */
793962306a36Sopenharmony_cibool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
794062306a36Sopenharmony_ci{
794162306a36Sopenharmony_ci	struct btrfs_swapfile_pin *sp;
794262306a36Sopenharmony_ci	struct rb_node *node;
794362306a36Sopenharmony_ci
794462306a36Sopenharmony_ci	spin_lock(&fs_info->swapfile_pins_lock);
794562306a36Sopenharmony_ci	node = fs_info->swapfile_pins.rb_node;
794662306a36Sopenharmony_ci	while (node) {
794762306a36Sopenharmony_ci		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
794862306a36Sopenharmony_ci		if (ptr < sp->ptr)
794962306a36Sopenharmony_ci			node = node->rb_left;
795062306a36Sopenharmony_ci		else if (ptr > sp->ptr)
795162306a36Sopenharmony_ci			node = node->rb_right;
795262306a36Sopenharmony_ci		else
795362306a36Sopenharmony_ci			break;
795462306a36Sopenharmony_ci	}
795562306a36Sopenharmony_ci	spin_unlock(&fs_info->swapfile_pins_lock);
795662306a36Sopenharmony_ci	return node != NULL;
795762306a36Sopenharmony_ci}
795862306a36Sopenharmony_ci
795962306a36Sopenharmony_cistatic int relocating_repair_kthread(void *data)
796062306a36Sopenharmony_ci{
796162306a36Sopenharmony_ci	struct btrfs_block_group *cache = data;
796262306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = cache->fs_info;
796362306a36Sopenharmony_ci	u64 target;
796462306a36Sopenharmony_ci	int ret = 0;
796562306a36Sopenharmony_ci
796662306a36Sopenharmony_ci	target = cache->start;
796762306a36Sopenharmony_ci	btrfs_put_block_group(cache);
796862306a36Sopenharmony_ci
796962306a36Sopenharmony_ci	sb_start_write(fs_info->sb);
797062306a36Sopenharmony_ci	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
797162306a36Sopenharmony_ci		btrfs_info(fs_info,
797262306a36Sopenharmony_ci			   "zoned: skip relocating block group %llu to repair: EBUSY",
797362306a36Sopenharmony_ci			   target);
797462306a36Sopenharmony_ci		sb_end_write(fs_info->sb);
797562306a36Sopenharmony_ci		return -EBUSY;
797662306a36Sopenharmony_ci	}
797762306a36Sopenharmony_ci
797862306a36Sopenharmony_ci	mutex_lock(&fs_info->reclaim_bgs_lock);
797962306a36Sopenharmony_ci
798062306a36Sopenharmony_ci	/* Ensure block group still exists */
798162306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, target);
798262306a36Sopenharmony_ci	if (!cache)
798362306a36Sopenharmony_ci		goto out;
798462306a36Sopenharmony_ci
798562306a36Sopenharmony_ci	if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
798662306a36Sopenharmony_ci		goto out;
798762306a36Sopenharmony_ci
798862306a36Sopenharmony_ci	ret = btrfs_may_alloc_data_chunk(fs_info, target);
798962306a36Sopenharmony_ci	if (ret < 0)
799062306a36Sopenharmony_ci		goto out;
799162306a36Sopenharmony_ci
799262306a36Sopenharmony_ci	btrfs_info(fs_info,
799362306a36Sopenharmony_ci		   "zoned: relocating block group %llu to repair IO failure",
799462306a36Sopenharmony_ci		   target);
799562306a36Sopenharmony_ci	ret = btrfs_relocate_chunk(fs_info, target);
799662306a36Sopenharmony_ci
799762306a36Sopenharmony_ciout:
799862306a36Sopenharmony_ci	if (cache)
799962306a36Sopenharmony_ci		btrfs_put_block_group(cache);
800062306a36Sopenharmony_ci	mutex_unlock(&fs_info->reclaim_bgs_lock);
800162306a36Sopenharmony_ci	btrfs_exclop_finish(fs_info);
800262306a36Sopenharmony_ci	sb_end_write(fs_info->sb);
800362306a36Sopenharmony_ci
800462306a36Sopenharmony_ci	return ret;
800562306a36Sopenharmony_ci}
800662306a36Sopenharmony_ci
800762306a36Sopenharmony_cibool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
800862306a36Sopenharmony_ci{
800962306a36Sopenharmony_ci	struct btrfs_block_group *cache;
801062306a36Sopenharmony_ci
801162306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
801262306a36Sopenharmony_ci		return false;
801362306a36Sopenharmony_ci
801462306a36Sopenharmony_ci	/* Do not attempt to repair in degraded state */
801562306a36Sopenharmony_ci	if (btrfs_test_opt(fs_info, DEGRADED))
801662306a36Sopenharmony_ci		return true;
801762306a36Sopenharmony_ci
801862306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, logical);
801962306a36Sopenharmony_ci	if (!cache)
802062306a36Sopenharmony_ci		return true;
802162306a36Sopenharmony_ci
802262306a36Sopenharmony_ci	if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
802362306a36Sopenharmony_ci		btrfs_put_block_group(cache);
802462306a36Sopenharmony_ci		return true;
802562306a36Sopenharmony_ci	}
802662306a36Sopenharmony_ci
802762306a36Sopenharmony_ci	kthread_run(relocating_repair_kthread, cache,
802862306a36Sopenharmony_ci		    "btrfs-relocating-repair");
802962306a36Sopenharmony_ci
803062306a36Sopenharmony_ci	return true;
803162306a36Sopenharmony_ci}
803262306a36Sopenharmony_ci
803362306a36Sopenharmony_cistatic void map_raid56_repair_block(struct btrfs_io_context *bioc,
803462306a36Sopenharmony_ci				    struct btrfs_io_stripe *smap,
803562306a36Sopenharmony_ci				    u64 logical)
803662306a36Sopenharmony_ci{
803762306a36Sopenharmony_ci	int data_stripes = nr_bioc_data_stripes(bioc);
803862306a36Sopenharmony_ci	int i;
803962306a36Sopenharmony_ci
804062306a36Sopenharmony_ci	for (i = 0; i < data_stripes; i++) {
804162306a36Sopenharmony_ci		u64 stripe_start = bioc->full_stripe_logical +
804262306a36Sopenharmony_ci				   btrfs_stripe_nr_to_offset(i);
804362306a36Sopenharmony_ci
804462306a36Sopenharmony_ci		if (logical >= stripe_start &&
804562306a36Sopenharmony_ci		    logical < stripe_start + BTRFS_STRIPE_LEN)
804662306a36Sopenharmony_ci			break;
804762306a36Sopenharmony_ci	}
804862306a36Sopenharmony_ci	ASSERT(i < data_stripes);
804962306a36Sopenharmony_ci	smap->dev = bioc->stripes[i].dev;
805062306a36Sopenharmony_ci	smap->physical = bioc->stripes[i].physical +
805162306a36Sopenharmony_ci			((logical - bioc->full_stripe_logical) &
805262306a36Sopenharmony_ci			 BTRFS_STRIPE_LEN_MASK);
805362306a36Sopenharmony_ci}
805462306a36Sopenharmony_ci
805562306a36Sopenharmony_ci/*
805662306a36Sopenharmony_ci * Map a repair write into a single device.
805762306a36Sopenharmony_ci *
805862306a36Sopenharmony_ci * A repair write is triggered by read time repair or scrub, which would only
805962306a36Sopenharmony_ci * update the contents of a single device.
806062306a36Sopenharmony_ci * Not update any other mirrors nor go through RMW path.
806162306a36Sopenharmony_ci *
806262306a36Sopenharmony_ci * Callers should ensure:
806362306a36Sopenharmony_ci *
806462306a36Sopenharmony_ci * - Call btrfs_bio_counter_inc_blocked() first
806562306a36Sopenharmony_ci * - The range does not cross stripe boundary
806662306a36Sopenharmony_ci * - Has a valid @mirror_num passed in.
806762306a36Sopenharmony_ci */
806862306a36Sopenharmony_ciint btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
806962306a36Sopenharmony_ci			   struct btrfs_io_stripe *smap, u64 logical,
807062306a36Sopenharmony_ci			   u32 length, int mirror_num)
807162306a36Sopenharmony_ci{
807262306a36Sopenharmony_ci	struct btrfs_io_context *bioc = NULL;
807362306a36Sopenharmony_ci	u64 map_length = length;
807462306a36Sopenharmony_ci	int mirror_ret = mirror_num;
807562306a36Sopenharmony_ci	int ret;
807662306a36Sopenharmony_ci
807762306a36Sopenharmony_ci	ASSERT(mirror_num > 0);
807862306a36Sopenharmony_ci
807962306a36Sopenharmony_ci	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
808062306a36Sopenharmony_ci			      &bioc, smap, &mirror_ret, true);
808162306a36Sopenharmony_ci	if (ret < 0)
808262306a36Sopenharmony_ci		return ret;
808362306a36Sopenharmony_ci
808462306a36Sopenharmony_ci	/* The map range should not cross stripe boundary. */
808562306a36Sopenharmony_ci	ASSERT(map_length >= length);
808662306a36Sopenharmony_ci
808762306a36Sopenharmony_ci	/* Already mapped to single stripe. */
808862306a36Sopenharmony_ci	if (!bioc)
808962306a36Sopenharmony_ci		goto out;
809062306a36Sopenharmony_ci
809162306a36Sopenharmony_ci	/* Map the RAID56 multi-stripe writes to a single one. */
809262306a36Sopenharmony_ci	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
809362306a36Sopenharmony_ci		map_raid56_repair_block(bioc, smap, logical);
809462306a36Sopenharmony_ci		goto out;
809562306a36Sopenharmony_ci	}
809662306a36Sopenharmony_ci
809762306a36Sopenharmony_ci	ASSERT(mirror_num <= bioc->num_stripes);
809862306a36Sopenharmony_ci	smap->dev = bioc->stripes[mirror_num - 1].dev;
809962306a36Sopenharmony_ci	smap->physical = bioc->stripes[mirror_num - 1].physical;
810062306a36Sopenharmony_ciout:
810162306a36Sopenharmony_ci	btrfs_put_bioc(bioc);
810262306a36Sopenharmony_ci	ASSERT(smap->dev);
810362306a36Sopenharmony_ci	return 0;
810462306a36Sopenharmony_ci}
8105