162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci
362306a36Sopenharmony_ci#include <linux/bitops.h>
462306a36Sopenharmony_ci#include <linux/slab.h>
562306a36Sopenharmony_ci#include <linux/blkdev.h>
662306a36Sopenharmony_ci#include <linux/sched/mm.h>
762306a36Sopenharmony_ci#include <linux/atomic.h>
862306a36Sopenharmony_ci#include <linux/vmalloc.h>
962306a36Sopenharmony_ci#include "ctree.h"
1062306a36Sopenharmony_ci#include "volumes.h"
1162306a36Sopenharmony_ci#include "zoned.h"
1262306a36Sopenharmony_ci#include "rcu-string.h"
1362306a36Sopenharmony_ci#include "disk-io.h"
1462306a36Sopenharmony_ci#include "block-group.h"
1562306a36Sopenharmony_ci#include "transaction.h"
1662306a36Sopenharmony_ci#include "dev-replace.h"
1762306a36Sopenharmony_ci#include "space-info.h"
1862306a36Sopenharmony_ci#include "super.h"
1962306a36Sopenharmony_ci#include "fs.h"
2062306a36Sopenharmony_ci#include "accessors.h"
2162306a36Sopenharmony_ci#include "bio.h"
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_ci/* Maximum number of zones to report per blkdev_report_zones() call */
2462306a36Sopenharmony_ci#define BTRFS_REPORT_NR_ZONES   4096
2562306a36Sopenharmony_ci/* Invalid allocation pointer value for missing devices */
2662306a36Sopenharmony_ci#define WP_MISSING_DEV ((u64)-1)
2762306a36Sopenharmony_ci/* Pseudo write pointer value for conventional zone */
2862306a36Sopenharmony_ci#define WP_CONVENTIONAL ((u64)-2)
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci/*
3162306a36Sopenharmony_ci * Location of the first zone of superblock logging zone pairs.
3262306a36Sopenharmony_ci *
3362306a36Sopenharmony_ci * - primary superblock:    0B (zone 0)
3462306a36Sopenharmony_ci * - first copy:          512G (zone starting at that offset)
3562306a36Sopenharmony_ci * - second copy:           4T (zone starting at that offset)
3662306a36Sopenharmony_ci */
3762306a36Sopenharmony_ci#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
3862306a36Sopenharmony_ci#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
3962306a36Sopenharmony_ci#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_ci#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
4262306a36Sopenharmony_ci#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_ci/* Number of superblock log zones */
4562306a36Sopenharmony_ci#define BTRFS_NR_SB_LOG_ZONES 2
4662306a36Sopenharmony_ci
4762306a36Sopenharmony_ci/*
4862306a36Sopenharmony_ci * Minimum of active zones we need:
4962306a36Sopenharmony_ci *
5062306a36Sopenharmony_ci * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
5162306a36Sopenharmony_ci * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
5262306a36Sopenharmony_ci * - 1 zone for tree-log dedicated block group
5362306a36Sopenharmony_ci * - 1 zone for relocation
5462306a36Sopenharmony_ci */
5562306a36Sopenharmony_ci#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci/*
5862306a36Sopenharmony_ci * Minimum / maximum supported zone size. Currently, SMR disks have a zone
5962306a36Sopenharmony_ci * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
6062306a36Sopenharmony_ci * We do not expect the zone size to become larger than 8GiB or smaller than
6162306a36Sopenharmony_ci * 4MiB in the near future.
6262306a36Sopenharmony_ci */
6362306a36Sopenharmony_ci#define BTRFS_MAX_ZONE_SIZE		SZ_8G
6462306a36Sopenharmony_ci#define BTRFS_MIN_ZONE_SIZE		SZ_4M
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
6762306a36Sopenharmony_ci
6862306a36Sopenharmony_cistatic void wait_eb_writebacks(struct btrfs_block_group *block_group);
6962306a36Sopenharmony_cistatic int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_cistatic inline bool sb_zone_is_full(const struct blk_zone *zone)
7262306a36Sopenharmony_ci{
7362306a36Sopenharmony_ci	return (zone->cond == BLK_ZONE_COND_FULL) ||
7462306a36Sopenharmony_ci		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
7562306a36Sopenharmony_ci}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_cistatic int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
7862306a36Sopenharmony_ci{
7962306a36Sopenharmony_ci	struct blk_zone *zones = data;
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	memcpy(&zones[idx], zone, sizeof(*zone));
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci	return 0;
8462306a36Sopenharmony_ci}
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_cistatic int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
8762306a36Sopenharmony_ci			    u64 *wp_ret)
8862306a36Sopenharmony_ci{
8962306a36Sopenharmony_ci	bool empty[BTRFS_NR_SB_LOG_ZONES];
9062306a36Sopenharmony_ci	bool full[BTRFS_NR_SB_LOG_ZONES];
9162306a36Sopenharmony_ci	sector_t sector;
9262306a36Sopenharmony_ci	int i;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
9562306a36Sopenharmony_ci		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
9662306a36Sopenharmony_ci		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
9762306a36Sopenharmony_ci		full[i] = sb_zone_is_full(&zones[i]);
9862306a36Sopenharmony_ci	}
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	/*
10162306a36Sopenharmony_ci	 * Possible states of log buffer zones
10262306a36Sopenharmony_ci	 *
10362306a36Sopenharmony_ci	 *           Empty[0]  In use[0]  Full[0]
10462306a36Sopenharmony_ci	 * Empty[1]         *          0        1
10562306a36Sopenharmony_ci	 * In use[1]        x          x        1
10662306a36Sopenharmony_ci	 * Full[1]          0          0        C
10762306a36Sopenharmony_ci	 *
10862306a36Sopenharmony_ci	 * Log position:
10962306a36Sopenharmony_ci	 *   *: Special case, no superblock is written
11062306a36Sopenharmony_ci	 *   0: Use write pointer of zones[0]
11162306a36Sopenharmony_ci	 *   1: Use write pointer of zones[1]
11262306a36Sopenharmony_ci	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
11362306a36Sopenharmony_ci	 *      one determined by generation
11462306a36Sopenharmony_ci	 *   x: Invalid state
11562306a36Sopenharmony_ci	 */
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_ci	if (empty[0] && empty[1]) {
11862306a36Sopenharmony_ci		/* Special case to distinguish no superblock to read */
11962306a36Sopenharmony_ci		*wp_ret = zones[0].start << SECTOR_SHIFT;
12062306a36Sopenharmony_ci		return -ENOENT;
12162306a36Sopenharmony_ci	} else if (full[0] && full[1]) {
12262306a36Sopenharmony_ci		/* Compare two super blocks */
12362306a36Sopenharmony_ci		struct address_space *mapping = bdev->bd_inode->i_mapping;
12462306a36Sopenharmony_ci		struct page *page[BTRFS_NR_SB_LOG_ZONES];
12562306a36Sopenharmony_ci		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
12662306a36Sopenharmony_ci		int i;
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
12962306a36Sopenharmony_ci			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
13062306a36Sopenharmony_ci			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
13162306a36Sopenharmony_ci						BTRFS_SUPER_INFO_SIZE;
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci			page[i] = read_cache_page_gfp(mapping,
13462306a36Sopenharmony_ci					bytenr >> PAGE_SHIFT, GFP_NOFS);
13562306a36Sopenharmony_ci			if (IS_ERR(page[i])) {
13662306a36Sopenharmony_ci				if (i == 1)
13762306a36Sopenharmony_ci					btrfs_release_disk_super(super[0]);
13862306a36Sopenharmony_ci				return PTR_ERR(page[i]);
13962306a36Sopenharmony_ci			}
14062306a36Sopenharmony_ci			super[i] = page_address(page[i]);
14162306a36Sopenharmony_ci		}
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci		if (btrfs_super_generation(super[0]) >
14462306a36Sopenharmony_ci		    btrfs_super_generation(super[1]))
14562306a36Sopenharmony_ci			sector = zones[1].start;
14662306a36Sopenharmony_ci		else
14762306a36Sopenharmony_ci			sector = zones[0].start;
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
15062306a36Sopenharmony_ci			btrfs_release_disk_super(super[i]);
15162306a36Sopenharmony_ci	} else if (!full[0] && (empty[1] || full[1])) {
15262306a36Sopenharmony_ci		sector = zones[0].wp;
15362306a36Sopenharmony_ci	} else if (full[0]) {
15462306a36Sopenharmony_ci		sector = zones[1].wp;
15562306a36Sopenharmony_ci	} else {
15662306a36Sopenharmony_ci		return -EUCLEAN;
15762306a36Sopenharmony_ci	}
15862306a36Sopenharmony_ci	*wp_ret = sector << SECTOR_SHIFT;
15962306a36Sopenharmony_ci	return 0;
16062306a36Sopenharmony_ci}
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci/*
16362306a36Sopenharmony_ci * Get the first zone number of the superblock mirror
16462306a36Sopenharmony_ci */
16562306a36Sopenharmony_cistatic inline u32 sb_zone_number(int shift, int mirror)
16662306a36Sopenharmony_ci{
16762306a36Sopenharmony_ci	u64 zone = U64_MAX;
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
17062306a36Sopenharmony_ci	switch (mirror) {
17162306a36Sopenharmony_ci	case 0: zone = 0; break;
17262306a36Sopenharmony_ci	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
17362306a36Sopenharmony_ci	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
17462306a36Sopenharmony_ci	}
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_ci	ASSERT(zone <= U32_MAX);
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	return (u32)zone;
17962306a36Sopenharmony_ci}
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_cistatic inline sector_t zone_start_sector(u32 zone_number,
18262306a36Sopenharmony_ci					 struct block_device *bdev)
18362306a36Sopenharmony_ci{
18462306a36Sopenharmony_ci	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
18562306a36Sopenharmony_ci}
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_cistatic inline u64 zone_start_physical(u32 zone_number,
18862306a36Sopenharmony_ci				      struct btrfs_zoned_device_info *zone_info)
18962306a36Sopenharmony_ci{
19062306a36Sopenharmony_ci	return (u64)zone_number << zone_info->zone_size_shift;
19162306a36Sopenharmony_ci}
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci/*
19462306a36Sopenharmony_ci * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
19562306a36Sopenharmony_ci * device into static sized chunks and fake a conventional zone on each of
19662306a36Sopenharmony_ci * them.
19762306a36Sopenharmony_ci */
19862306a36Sopenharmony_cistatic int emulate_report_zones(struct btrfs_device *device, u64 pos,
19962306a36Sopenharmony_ci				struct blk_zone *zones, unsigned int nr_zones)
20062306a36Sopenharmony_ci{
20162306a36Sopenharmony_ci	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
20262306a36Sopenharmony_ci	sector_t bdev_size = bdev_nr_sectors(device->bdev);
20362306a36Sopenharmony_ci	unsigned int i;
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	pos >>= SECTOR_SHIFT;
20662306a36Sopenharmony_ci	for (i = 0; i < nr_zones; i++) {
20762306a36Sopenharmony_ci		zones[i].start = i * zone_sectors + pos;
20862306a36Sopenharmony_ci		zones[i].len = zone_sectors;
20962306a36Sopenharmony_ci		zones[i].capacity = zone_sectors;
21062306a36Sopenharmony_ci		zones[i].wp = zones[i].start + zone_sectors;
21162306a36Sopenharmony_ci		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
21262306a36Sopenharmony_ci		zones[i].cond = BLK_ZONE_COND_NOT_WP;
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci		if (zones[i].wp >= bdev_size) {
21562306a36Sopenharmony_ci			i++;
21662306a36Sopenharmony_ci			break;
21762306a36Sopenharmony_ci		}
21862306a36Sopenharmony_ci	}
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	return i;
22162306a36Sopenharmony_ci}
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_cistatic int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
22462306a36Sopenharmony_ci			       struct blk_zone *zones, unsigned int *nr_zones)
22562306a36Sopenharmony_ci{
22662306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zinfo = device->zone_info;
22762306a36Sopenharmony_ci	int ret;
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci	if (!*nr_zones)
23062306a36Sopenharmony_ci		return 0;
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ci	if (!bdev_is_zoned(device->bdev)) {
23362306a36Sopenharmony_ci		ret = emulate_report_zones(device, pos, zones, *nr_zones);
23462306a36Sopenharmony_ci		*nr_zones = ret;
23562306a36Sopenharmony_ci		return 0;
23662306a36Sopenharmony_ci	}
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci	/* Check cache */
23962306a36Sopenharmony_ci	if (zinfo->zone_cache) {
24062306a36Sopenharmony_ci		unsigned int i;
24162306a36Sopenharmony_ci		u32 zno;
24262306a36Sopenharmony_ci
24362306a36Sopenharmony_ci		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
24462306a36Sopenharmony_ci		zno = pos >> zinfo->zone_size_shift;
24562306a36Sopenharmony_ci		/*
24662306a36Sopenharmony_ci		 * We cannot report zones beyond the zone end. So, it is OK to
24762306a36Sopenharmony_ci		 * cap *nr_zones to at the end.
24862306a36Sopenharmony_ci		 */
24962306a36Sopenharmony_ci		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci		for (i = 0; i < *nr_zones; i++) {
25262306a36Sopenharmony_ci			struct blk_zone *zone_info;
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci			zone_info = &zinfo->zone_cache[zno + i];
25562306a36Sopenharmony_ci			if (!zone_info->len)
25662306a36Sopenharmony_ci				break;
25762306a36Sopenharmony_ci		}
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci		if (i == *nr_zones) {
26062306a36Sopenharmony_ci			/* Cache hit on all the zones */
26162306a36Sopenharmony_ci			memcpy(zones, zinfo->zone_cache + zno,
26262306a36Sopenharmony_ci			       sizeof(*zinfo->zone_cache) * *nr_zones);
26362306a36Sopenharmony_ci			return 0;
26462306a36Sopenharmony_ci		}
26562306a36Sopenharmony_ci	}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
26862306a36Sopenharmony_ci				  copy_zone_info_cb, zones);
26962306a36Sopenharmony_ci	if (ret < 0) {
27062306a36Sopenharmony_ci		btrfs_err_in_rcu(device->fs_info,
27162306a36Sopenharmony_ci				 "zoned: failed to read zone %llu on %s (devid %llu)",
27262306a36Sopenharmony_ci				 pos, rcu_str_deref(device->name),
27362306a36Sopenharmony_ci				 device->devid);
27462306a36Sopenharmony_ci		return ret;
27562306a36Sopenharmony_ci	}
27662306a36Sopenharmony_ci	*nr_zones = ret;
27762306a36Sopenharmony_ci	if (!ret)
27862306a36Sopenharmony_ci		return -EIO;
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	/* Populate cache */
28162306a36Sopenharmony_ci	if (zinfo->zone_cache) {
28262306a36Sopenharmony_ci		u32 zno = pos >> zinfo->zone_size_shift;
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_ci		memcpy(zinfo->zone_cache + zno, zones,
28562306a36Sopenharmony_ci		       sizeof(*zinfo->zone_cache) * *nr_zones);
28662306a36Sopenharmony_ci	}
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	return 0;
28962306a36Sopenharmony_ci}
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci/* The emulated zone size is determined from the size of device extent */
29262306a36Sopenharmony_cistatic int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
29362306a36Sopenharmony_ci{
29462306a36Sopenharmony_ci	struct btrfs_path *path;
29562306a36Sopenharmony_ci	struct btrfs_root *root = fs_info->dev_root;
29662306a36Sopenharmony_ci	struct btrfs_key key;
29762306a36Sopenharmony_ci	struct extent_buffer *leaf;
29862306a36Sopenharmony_ci	struct btrfs_dev_extent *dext;
29962306a36Sopenharmony_ci	int ret = 0;
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	key.objectid = 1;
30262306a36Sopenharmony_ci	key.type = BTRFS_DEV_EXTENT_KEY;
30362306a36Sopenharmony_ci	key.offset = 0;
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_ci	path = btrfs_alloc_path();
30662306a36Sopenharmony_ci	if (!path)
30762306a36Sopenharmony_ci		return -ENOMEM;
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
31062306a36Sopenharmony_ci	if (ret < 0)
31162306a36Sopenharmony_ci		goto out;
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
31462306a36Sopenharmony_ci		ret = btrfs_next_leaf(root, path);
31562306a36Sopenharmony_ci		if (ret < 0)
31662306a36Sopenharmony_ci			goto out;
31762306a36Sopenharmony_ci		/* No dev extents at all? Not good */
31862306a36Sopenharmony_ci		if (ret > 0) {
31962306a36Sopenharmony_ci			ret = -EUCLEAN;
32062306a36Sopenharmony_ci			goto out;
32162306a36Sopenharmony_ci		}
32262306a36Sopenharmony_ci	}
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci	leaf = path->nodes[0];
32562306a36Sopenharmony_ci	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
32662306a36Sopenharmony_ci	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
32762306a36Sopenharmony_ci	ret = 0;
32862306a36Sopenharmony_ci
32962306a36Sopenharmony_ciout:
33062306a36Sopenharmony_ci	btrfs_free_path(path);
33162306a36Sopenharmony_ci
33262306a36Sopenharmony_ci	return ret;
33362306a36Sopenharmony_ci}
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ciint btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
33662306a36Sopenharmony_ci{
33762306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
33862306a36Sopenharmony_ci	struct btrfs_device *device;
33962306a36Sopenharmony_ci	int ret = 0;
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	/* fs_info->zone_size might not set yet. Use the incomapt flag here. */
34262306a36Sopenharmony_ci	if (!btrfs_fs_incompat(fs_info, ZONED))
34362306a36Sopenharmony_ci		return 0;
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
34662306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
34762306a36Sopenharmony_ci		/* We can skip reading of zone info for missing devices */
34862306a36Sopenharmony_ci		if (!device->bdev)
34962306a36Sopenharmony_ci			continue;
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci		ret = btrfs_get_dev_zone_info(device, true);
35262306a36Sopenharmony_ci		if (ret)
35362306a36Sopenharmony_ci			break;
35462306a36Sopenharmony_ci	}
35562306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_ci	return ret;
35862306a36Sopenharmony_ci}
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_ciint btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
36162306a36Sopenharmony_ci{
36262306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = device->fs_info;
36362306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zone_info = NULL;
36462306a36Sopenharmony_ci	struct block_device *bdev = device->bdev;
36562306a36Sopenharmony_ci	unsigned int max_active_zones;
36662306a36Sopenharmony_ci	unsigned int nactive;
36762306a36Sopenharmony_ci	sector_t nr_sectors;
36862306a36Sopenharmony_ci	sector_t sector = 0;
36962306a36Sopenharmony_ci	struct blk_zone *zones = NULL;
37062306a36Sopenharmony_ci	unsigned int i, nreported = 0, nr_zones;
37162306a36Sopenharmony_ci	sector_t zone_sectors;
37262306a36Sopenharmony_ci	char *model, *emulated;
37362306a36Sopenharmony_ci	int ret;
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	/*
37662306a36Sopenharmony_ci	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
37762306a36Sopenharmony_ci	 * yet be set.
37862306a36Sopenharmony_ci	 */
37962306a36Sopenharmony_ci	if (!btrfs_fs_incompat(fs_info, ZONED))
38062306a36Sopenharmony_ci		return 0;
38162306a36Sopenharmony_ci
38262306a36Sopenharmony_ci	if (device->zone_info)
38362306a36Sopenharmony_ci		return 0;
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
38662306a36Sopenharmony_ci	if (!zone_info)
38762306a36Sopenharmony_ci		return -ENOMEM;
38862306a36Sopenharmony_ci
38962306a36Sopenharmony_ci	device->zone_info = zone_info;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	if (!bdev_is_zoned(bdev)) {
39262306a36Sopenharmony_ci		if (!fs_info->zone_size) {
39362306a36Sopenharmony_ci			ret = calculate_emulated_zone_size(fs_info);
39462306a36Sopenharmony_ci			if (ret)
39562306a36Sopenharmony_ci				goto out;
39662306a36Sopenharmony_ci		}
39762306a36Sopenharmony_ci
39862306a36Sopenharmony_ci		ASSERT(fs_info->zone_size);
39962306a36Sopenharmony_ci		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
40062306a36Sopenharmony_ci	} else {
40162306a36Sopenharmony_ci		zone_sectors = bdev_zone_sectors(bdev);
40262306a36Sopenharmony_ci	}
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci	ASSERT(is_power_of_two_u64(zone_sectors));
40562306a36Sopenharmony_ci	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	/* We reject devices with a zone size larger than 8GB */
40862306a36Sopenharmony_ci	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
40962306a36Sopenharmony_ci		btrfs_err_in_rcu(fs_info,
41062306a36Sopenharmony_ci		"zoned: %s: zone size %llu larger than supported maximum %llu",
41162306a36Sopenharmony_ci				 rcu_str_deref(device->name),
41262306a36Sopenharmony_ci				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
41362306a36Sopenharmony_ci		ret = -EINVAL;
41462306a36Sopenharmony_ci		goto out;
41562306a36Sopenharmony_ci	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
41662306a36Sopenharmony_ci		btrfs_err_in_rcu(fs_info,
41762306a36Sopenharmony_ci		"zoned: %s: zone size %llu smaller than supported minimum %u",
41862306a36Sopenharmony_ci				 rcu_str_deref(device->name),
41962306a36Sopenharmony_ci				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
42062306a36Sopenharmony_ci		ret = -EINVAL;
42162306a36Sopenharmony_ci		goto out;
42262306a36Sopenharmony_ci	}
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	nr_sectors = bdev_nr_sectors(bdev);
42562306a36Sopenharmony_ci	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
42662306a36Sopenharmony_ci	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
42762306a36Sopenharmony_ci	if (!IS_ALIGNED(nr_sectors, zone_sectors))
42862306a36Sopenharmony_ci		zone_info->nr_zones++;
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_ci	max_active_zones = bdev_max_active_zones(bdev);
43162306a36Sopenharmony_ci	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
43262306a36Sopenharmony_ci		btrfs_err_in_rcu(fs_info,
43362306a36Sopenharmony_ci"zoned: %s: max active zones %u is too small, need at least %u active zones",
43462306a36Sopenharmony_ci				 rcu_str_deref(device->name), max_active_zones,
43562306a36Sopenharmony_ci				 BTRFS_MIN_ACTIVE_ZONES);
43662306a36Sopenharmony_ci		ret = -EINVAL;
43762306a36Sopenharmony_ci		goto out;
43862306a36Sopenharmony_ci	}
43962306a36Sopenharmony_ci	zone_info->max_active_zones = max_active_zones;
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
44262306a36Sopenharmony_ci	if (!zone_info->seq_zones) {
44362306a36Sopenharmony_ci		ret = -ENOMEM;
44462306a36Sopenharmony_ci		goto out;
44562306a36Sopenharmony_ci	}
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_ci	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
44862306a36Sopenharmony_ci	if (!zone_info->empty_zones) {
44962306a36Sopenharmony_ci		ret = -ENOMEM;
45062306a36Sopenharmony_ci		goto out;
45162306a36Sopenharmony_ci	}
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_ci	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
45462306a36Sopenharmony_ci	if (!zone_info->active_zones) {
45562306a36Sopenharmony_ci		ret = -ENOMEM;
45662306a36Sopenharmony_ci		goto out;
45762306a36Sopenharmony_ci	}
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
46062306a36Sopenharmony_ci	if (!zones) {
46162306a36Sopenharmony_ci		ret = -ENOMEM;
46262306a36Sopenharmony_ci		goto out;
46362306a36Sopenharmony_ci	}
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci	/*
46662306a36Sopenharmony_ci	 * Enable zone cache only for a zoned device. On a non-zoned device, we
46762306a36Sopenharmony_ci	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
46862306a36Sopenharmony_ci	 * use the cache.
46962306a36Sopenharmony_ci	 */
47062306a36Sopenharmony_ci	if (populate_cache && bdev_is_zoned(device->bdev)) {
47162306a36Sopenharmony_ci		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
47262306a36Sopenharmony_ci						sizeof(struct blk_zone));
47362306a36Sopenharmony_ci		if (!zone_info->zone_cache) {
47462306a36Sopenharmony_ci			btrfs_err_in_rcu(device->fs_info,
47562306a36Sopenharmony_ci				"zoned: failed to allocate zone cache for %s",
47662306a36Sopenharmony_ci				rcu_str_deref(device->name));
47762306a36Sopenharmony_ci			ret = -ENOMEM;
47862306a36Sopenharmony_ci			goto out;
47962306a36Sopenharmony_ci		}
48062306a36Sopenharmony_ci	}
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_ci	/* Get zones type */
48362306a36Sopenharmony_ci	nactive = 0;
48462306a36Sopenharmony_ci	while (sector < nr_sectors) {
48562306a36Sopenharmony_ci		nr_zones = BTRFS_REPORT_NR_ZONES;
48662306a36Sopenharmony_ci		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
48762306a36Sopenharmony_ci					  &nr_zones);
48862306a36Sopenharmony_ci		if (ret)
48962306a36Sopenharmony_ci			goto out;
49062306a36Sopenharmony_ci
49162306a36Sopenharmony_ci		for (i = 0; i < nr_zones; i++) {
49262306a36Sopenharmony_ci			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
49362306a36Sopenharmony_ci				__set_bit(nreported, zone_info->seq_zones);
49462306a36Sopenharmony_ci			switch (zones[i].cond) {
49562306a36Sopenharmony_ci			case BLK_ZONE_COND_EMPTY:
49662306a36Sopenharmony_ci				__set_bit(nreported, zone_info->empty_zones);
49762306a36Sopenharmony_ci				break;
49862306a36Sopenharmony_ci			case BLK_ZONE_COND_IMP_OPEN:
49962306a36Sopenharmony_ci			case BLK_ZONE_COND_EXP_OPEN:
50062306a36Sopenharmony_ci			case BLK_ZONE_COND_CLOSED:
50162306a36Sopenharmony_ci				__set_bit(nreported, zone_info->active_zones);
50262306a36Sopenharmony_ci				nactive++;
50362306a36Sopenharmony_ci				break;
50462306a36Sopenharmony_ci			}
50562306a36Sopenharmony_ci			nreported++;
50662306a36Sopenharmony_ci		}
50762306a36Sopenharmony_ci		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
50862306a36Sopenharmony_ci	}
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	if (nreported != zone_info->nr_zones) {
51162306a36Sopenharmony_ci		btrfs_err_in_rcu(device->fs_info,
51262306a36Sopenharmony_ci				 "inconsistent number of zones on %s (%u/%u)",
51362306a36Sopenharmony_ci				 rcu_str_deref(device->name), nreported,
51462306a36Sopenharmony_ci				 zone_info->nr_zones);
51562306a36Sopenharmony_ci		ret = -EIO;
51662306a36Sopenharmony_ci		goto out;
51762306a36Sopenharmony_ci	}
51862306a36Sopenharmony_ci
51962306a36Sopenharmony_ci	if (max_active_zones) {
52062306a36Sopenharmony_ci		if (nactive > max_active_zones) {
52162306a36Sopenharmony_ci			btrfs_err_in_rcu(device->fs_info,
52262306a36Sopenharmony_ci			"zoned: %u active zones on %s exceeds max_active_zones %u",
52362306a36Sopenharmony_ci					 nactive, rcu_str_deref(device->name),
52462306a36Sopenharmony_ci					 max_active_zones);
52562306a36Sopenharmony_ci			ret = -EIO;
52662306a36Sopenharmony_ci			goto out;
52762306a36Sopenharmony_ci		}
52862306a36Sopenharmony_ci		atomic_set(&zone_info->active_zones_left,
52962306a36Sopenharmony_ci			   max_active_zones - nactive);
53062306a36Sopenharmony_ci		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
53162306a36Sopenharmony_ci	}
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci	/* Validate superblock log */
53462306a36Sopenharmony_ci	nr_zones = BTRFS_NR_SB_LOG_ZONES;
53562306a36Sopenharmony_ci	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
53662306a36Sopenharmony_ci		u32 sb_zone;
53762306a36Sopenharmony_ci		u64 sb_wp;
53862306a36Sopenharmony_ci		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
54162306a36Sopenharmony_ci		if (sb_zone + 1 >= zone_info->nr_zones)
54262306a36Sopenharmony_ci			continue;
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci		ret = btrfs_get_dev_zones(device,
54562306a36Sopenharmony_ci					  zone_start_physical(sb_zone, zone_info),
54662306a36Sopenharmony_ci					  &zone_info->sb_zones[sb_pos],
54762306a36Sopenharmony_ci					  &nr_zones);
54862306a36Sopenharmony_ci		if (ret)
54962306a36Sopenharmony_ci			goto out;
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
55262306a36Sopenharmony_ci			btrfs_err_in_rcu(device->fs_info,
55362306a36Sopenharmony_ci	"zoned: failed to read super block log zone info at devid %llu zone %u",
55462306a36Sopenharmony_ci					 device->devid, sb_zone);
55562306a36Sopenharmony_ci			ret = -EUCLEAN;
55662306a36Sopenharmony_ci			goto out;
55762306a36Sopenharmony_ci		}
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci		/*
56062306a36Sopenharmony_ci		 * If zones[0] is conventional, always use the beginning of the
56162306a36Sopenharmony_ci		 * zone to record superblock. No need to validate in that case.
56262306a36Sopenharmony_ci		 */
56362306a36Sopenharmony_ci		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
56462306a36Sopenharmony_ci		    BLK_ZONE_TYPE_CONVENTIONAL)
56562306a36Sopenharmony_ci			continue;
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci		ret = sb_write_pointer(device->bdev,
56862306a36Sopenharmony_ci				       &zone_info->sb_zones[sb_pos], &sb_wp);
56962306a36Sopenharmony_ci		if (ret != -ENOENT && ret) {
57062306a36Sopenharmony_ci			btrfs_err_in_rcu(device->fs_info,
57162306a36Sopenharmony_ci			"zoned: super block log zone corrupted devid %llu zone %u",
57262306a36Sopenharmony_ci					 device->devid, sb_zone);
57362306a36Sopenharmony_ci			ret = -EUCLEAN;
57462306a36Sopenharmony_ci			goto out;
57562306a36Sopenharmony_ci		}
57662306a36Sopenharmony_ci	}
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	kvfree(zones);
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	switch (bdev_zoned_model(bdev)) {
58262306a36Sopenharmony_ci	case BLK_ZONED_HM:
58362306a36Sopenharmony_ci		model = "host-managed zoned";
58462306a36Sopenharmony_ci		emulated = "";
58562306a36Sopenharmony_ci		break;
58662306a36Sopenharmony_ci	case BLK_ZONED_HA:
58762306a36Sopenharmony_ci		model = "host-aware zoned";
58862306a36Sopenharmony_ci		emulated = "";
58962306a36Sopenharmony_ci		break;
59062306a36Sopenharmony_ci	case BLK_ZONED_NONE:
59162306a36Sopenharmony_ci		model = "regular";
59262306a36Sopenharmony_ci		emulated = "emulated ";
59362306a36Sopenharmony_ci		break;
59462306a36Sopenharmony_ci	default:
59562306a36Sopenharmony_ci		/* Just in case */
59662306a36Sopenharmony_ci		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
59762306a36Sopenharmony_ci				 bdev_zoned_model(bdev),
59862306a36Sopenharmony_ci				 rcu_str_deref(device->name));
59962306a36Sopenharmony_ci		ret = -EOPNOTSUPP;
60062306a36Sopenharmony_ci		goto out_free_zone_info;
60162306a36Sopenharmony_ci	}
60262306a36Sopenharmony_ci
60362306a36Sopenharmony_ci	btrfs_info_in_rcu(fs_info,
60462306a36Sopenharmony_ci		"%s block device %s, %u %szones of %llu bytes",
60562306a36Sopenharmony_ci		model, rcu_str_deref(device->name), zone_info->nr_zones,
60662306a36Sopenharmony_ci		emulated, zone_info->zone_size);
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	return 0;
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ciout:
61162306a36Sopenharmony_ci	kvfree(zones);
61262306a36Sopenharmony_ciout_free_zone_info:
61362306a36Sopenharmony_ci	btrfs_destroy_dev_zone_info(device);
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci	return ret;
61662306a36Sopenharmony_ci}
61762306a36Sopenharmony_ci
61862306a36Sopenharmony_civoid btrfs_destroy_dev_zone_info(struct btrfs_device *device)
61962306a36Sopenharmony_ci{
62062306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zone_info = device->zone_info;
62162306a36Sopenharmony_ci
62262306a36Sopenharmony_ci	if (!zone_info)
62362306a36Sopenharmony_ci		return;
62462306a36Sopenharmony_ci
62562306a36Sopenharmony_ci	bitmap_free(zone_info->active_zones);
62662306a36Sopenharmony_ci	bitmap_free(zone_info->seq_zones);
62762306a36Sopenharmony_ci	bitmap_free(zone_info->empty_zones);
62862306a36Sopenharmony_ci	vfree(zone_info->zone_cache);
62962306a36Sopenharmony_ci	kfree(zone_info);
63062306a36Sopenharmony_ci	device->zone_info = NULL;
63162306a36Sopenharmony_ci}
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_cistruct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
63462306a36Sopenharmony_ci{
63562306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zone_info;
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
63862306a36Sopenharmony_ci	if (!zone_info)
63962306a36Sopenharmony_ci		return NULL;
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
64262306a36Sopenharmony_ci	if (!zone_info->seq_zones)
64362306a36Sopenharmony_ci		goto out;
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
64662306a36Sopenharmony_ci		    zone_info->nr_zones);
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
64962306a36Sopenharmony_ci	if (!zone_info->empty_zones)
65062306a36Sopenharmony_ci		goto out;
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
65362306a36Sopenharmony_ci		    zone_info->nr_zones);
65462306a36Sopenharmony_ci
65562306a36Sopenharmony_ci	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
65662306a36Sopenharmony_ci	if (!zone_info->active_zones)
65762306a36Sopenharmony_ci		goto out;
65862306a36Sopenharmony_ci
65962306a36Sopenharmony_ci	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
66062306a36Sopenharmony_ci		    zone_info->nr_zones);
66162306a36Sopenharmony_ci	zone_info->zone_cache = NULL;
66262306a36Sopenharmony_ci
66362306a36Sopenharmony_ci	return zone_info;
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ciout:
66662306a36Sopenharmony_ci	bitmap_free(zone_info->seq_zones);
66762306a36Sopenharmony_ci	bitmap_free(zone_info->empty_zones);
66862306a36Sopenharmony_ci	bitmap_free(zone_info->active_zones);
66962306a36Sopenharmony_ci	kfree(zone_info);
67062306a36Sopenharmony_ci	return NULL;
67162306a36Sopenharmony_ci}
67262306a36Sopenharmony_ci
67362306a36Sopenharmony_ciint btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
67462306a36Sopenharmony_ci		       struct blk_zone *zone)
67562306a36Sopenharmony_ci{
67662306a36Sopenharmony_ci	unsigned int nr_zones = 1;
67762306a36Sopenharmony_ci	int ret;
67862306a36Sopenharmony_ci
67962306a36Sopenharmony_ci	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
68062306a36Sopenharmony_ci	if (ret != 0 || !nr_zones)
68162306a36Sopenharmony_ci		return ret ? ret : -EIO;
68262306a36Sopenharmony_ci
68362306a36Sopenharmony_ci	return 0;
68462306a36Sopenharmony_ci}
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_cistatic int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
68762306a36Sopenharmony_ci{
68862306a36Sopenharmony_ci	struct btrfs_device *device;
68962306a36Sopenharmony_ci
69062306a36Sopenharmony_ci	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
69162306a36Sopenharmony_ci		if (device->bdev &&
69262306a36Sopenharmony_ci		    bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
69362306a36Sopenharmony_ci			btrfs_err(fs_info,
69462306a36Sopenharmony_ci				"zoned: mode not enabled but zoned device found: %pg",
69562306a36Sopenharmony_ci				device->bdev);
69662306a36Sopenharmony_ci			return -EINVAL;
69762306a36Sopenharmony_ci		}
69862306a36Sopenharmony_ci	}
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ci	return 0;
70162306a36Sopenharmony_ci}
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ciint btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
70462306a36Sopenharmony_ci{
70562306a36Sopenharmony_ci	struct queue_limits *lim = &fs_info->limits;
70662306a36Sopenharmony_ci	struct btrfs_device *device;
70762306a36Sopenharmony_ci	u64 zone_size = 0;
70862306a36Sopenharmony_ci	int ret;
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci	/*
71162306a36Sopenharmony_ci	 * Host-Managed devices can't be used without the ZONED flag.  With the
71262306a36Sopenharmony_ci	 * ZONED all devices can be used, using zone emulation if required.
71362306a36Sopenharmony_ci	 */
71462306a36Sopenharmony_ci	if (!btrfs_fs_incompat(fs_info, ZONED))
71562306a36Sopenharmony_ci		return btrfs_check_for_zoned_device(fs_info);
71662306a36Sopenharmony_ci
71762306a36Sopenharmony_ci	blk_set_stacking_limits(lim);
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
72062306a36Sopenharmony_ci		struct btrfs_zoned_device_info *zone_info = device->zone_info;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci		if (!device->bdev)
72362306a36Sopenharmony_ci			continue;
72462306a36Sopenharmony_ci
72562306a36Sopenharmony_ci		if (!zone_size) {
72662306a36Sopenharmony_ci			zone_size = zone_info->zone_size;
72762306a36Sopenharmony_ci		} else if (zone_info->zone_size != zone_size) {
72862306a36Sopenharmony_ci			btrfs_err(fs_info,
72962306a36Sopenharmony_ci		"zoned: unequal block device zone sizes: have %llu found %llu",
73062306a36Sopenharmony_ci				  zone_info->zone_size, zone_size);
73162306a36Sopenharmony_ci			return -EINVAL;
73262306a36Sopenharmony_ci		}
73362306a36Sopenharmony_ci
73462306a36Sopenharmony_ci		/*
73562306a36Sopenharmony_ci		 * With the zoned emulation, we can have non-zoned device on the
73662306a36Sopenharmony_ci		 * zoned mode. In this case, we don't have a valid max zone
73762306a36Sopenharmony_ci		 * append size.
73862306a36Sopenharmony_ci		 */
73962306a36Sopenharmony_ci		if (bdev_is_zoned(device->bdev)) {
74062306a36Sopenharmony_ci			blk_stack_limits(lim,
74162306a36Sopenharmony_ci					 &bdev_get_queue(device->bdev)->limits,
74262306a36Sopenharmony_ci					 0);
74362306a36Sopenharmony_ci		}
74462306a36Sopenharmony_ci	}
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_ci	/*
74762306a36Sopenharmony_ci	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
74862306a36Sopenharmony_ci	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
74962306a36Sopenharmony_ci	 * check the alignment here.
75062306a36Sopenharmony_ci	 */
75162306a36Sopenharmony_ci	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
75262306a36Sopenharmony_ci		btrfs_err(fs_info,
75362306a36Sopenharmony_ci			  "zoned: zone size %llu not aligned to stripe %u",
75462306a36Sopenharmony_ci			  zone_size, BTRFS_STRIPE_LEN);
75562306a36Sopenharmony_ci		return -EINVAL;
75662306a36Sopenharmony_ci	}
75762306a36Sopenharmony_ci
75862306a36Sopenharmony_ci	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
75962306a36Sopenharmony_ci		btrfs_err(fs_info, "zoned: mixed block groups not supported");
76062306a36Sopenharmony_ci		return -EINVAL;
76162306a36Sopenharmony_ci	}
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_ci	fs_info->zone_size = zone_size;
76462306a36Sopenharmony_ci	/*
76562306a36Sopenharmony_ci	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
76662306a36Sopenharmony_ci	 * Technically, we can have multiple pages per segment. But, since
76762306a36Sopenharmony_ci	 * we add the pages one by one to a bio, and cannot increase the
76862306a36Sopenharmony_ci	 * metadata reservation even if it increases the number of extents, it
76962306a36Sopenharmony_ci	 * is safe to stick with the limit.
77062306a36Sopenharmony_ci	 */
77162306a36Sopenharmony_ci	fs_info->max_zone_append_size = ALIGN_DOWN(
77262306a36Sopenharmony_ci		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
77362306a36Sopenharmony_ci		     (u64)lim->max_sectors << SECTOR_SHIFT,
77462306a36Sopenharmony_ci		     (u64)lim->max_segments << PAGE_SHIFT),
77562306a36Sopenharmony_ci		fs_info->sectorsize);
77662306a36Sopenharmony_ci	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
77762306a36Sopenharmony_ci	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
77862306a36Sopenharmony_ci		fs_info->max_extent_size = fs_info->max_zone_append_size;
77962306a36Sopenharmony_ci
78062306a36Sopenharmony_ci	/*
78162306a36Sopenharmony_ci	 * Check mount options here, because we might change fs_info->zoned
78262306a36Sopenharmony_ci	 * from fs_info->zone_size.
78362306a36Sopenharmony_ci	 */
78462306a36Sopenharmony_ci	ret = btrfs_check_mountopts_zoned(fs_info);
78562306a36Sopenharmony_ci	if (ret)
78662306a36Sopenharmony_ci		return ret;
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
78962306a36Sopenharmony_ci	return 0;
79062306a36Sopenharmony_ci}
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ciint btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
79362306a36Sopenharmony_ci{
79462306a36Sopenharmony_ci	if (!btrfs_is_zoned(info))
79562306a36Sopenharmony_ci		return 0;
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci	/*
79862306a36Sopenharmony_ci	 * Space cache writing is not COWed. Disable that to avoid write errors
79962306a36Sopenharmony_ci	 * in sequential zones.
80062306a36Sopenharmony_ci	 */
80162306a36Sopenharmony_ci	if (btrfs_test_opt(info, SPACE_CACHE)) {
80262306a36Sopenharmony_ci		btrfs_err(info, "zoned: space cache v1 is not supported");
80362306a36Sopenharmony_ci		return -EINVAL;
80462306a36Sopenharmony_ci	}
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	if (btrfs_test_opt(info, NODATACOW)) {
80762306a36Sopenharmony_ci		btrfs_err(info, "zoned: NODATACOW not supported");
80862306a36Sopenharmony_ci		return -EINVAL;
80962306a36Sopenharmony_ci	}
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_ci	btrfs_clear_and_info(info, DISCARD_ASYNC,
81262306a36Sopenharmony_ci			"zoned: async discard ignored and disabled for zoned mode");
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_ci	return 0;
81562306a36Sopenharmony_ci}
81662306a36Sopenharmony_ci
81762306a36Sopenharmony_cistatic int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
81862306a36Sopenharmony_ci			   int rw, u64 *bytenr_ret)
81962306a36Sopenharmony_ci{
82062306a36Sopenharmony_ci	u64 wp;
82162306a36Sopenharmony_ci	int ret;
82262306a36Sopenharmony_ci
82362306a36Sopenharmony_ci	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
82462306a36Sopenharmony_ci		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
82562306a36Sopenharmony_ci		return 0;
82662306a36Sopenharmony_ci	}
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci	ret = sb_write_pointer(bdev, zones, &wp);
82962306a36Sopenharmony_ci	if (ret != -ENOENT && ret < 0)
83062306a36Sopenharmony_ci		return ret;
83162306a36Sopenharmony_ci
83262306a36Sopenharmony_ci	if (rw == WRITE) {
83362306a36Sopenharmony_ci		struct blk_zone *reset = NULL;
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_ci		if (wp == zones[0].start << SECTOR_SHIFT)
83662306a36Sopenharmony_ci			reset = &zones[0];
83762306a36Sopenharmony_ci		else if (wp == zones[1].start << SECTOR_SHIFT)
83862306a36Sopenharmony_ci			reset = &zones[1];
83962306a36Sopenharmony_ci
84062306a36Sopenharmony_ci		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
84162306a36Sopenharmony_ci			ASSERT(sb_zone_is_full(reset));
84262306a36Sopenharmony_ci
84362306a36Sopenharmony_ci			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
84462306a36Sopenharmony_ci					       reset->start, reset->len,
84562306a36Sopenharmony_ci					       GFP_NOFS);
84662306a36Sopenharmony_ci			if (ret)
84762306a36Sopenharmony_ci				return ret;
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci			reset->cond = BLK_ZONE_COND_EMPTY;
85062306a36Sopenharmony_ci			reset->wp = reset->start;
85162306a36Sopenharmony_ci		}
85262306a36Sopenharmony_ci	} else if (ret != -ENOENT) {
85362306a36Sopenharmony_ci		/*
85462306a36Sopenharmony_ci		 * For READ, we want the previous one. Move write pointer to
85562306a36Sopenharmony_ci		 * the end of a zone, if it is at the head of a zone.
85662306a36Sopenharmony_ci		 */
85762306a36Sopenharmony_ci		u64 zone_end = 0;
85862306a36Sopenharmony_ci
85962306a36Sopenharmony_ci		if (wp == zones[0].start << SECTOR_SHIFT)
86062306a36Sopenharmony_ci			zone_end = zones[1].start + zones[1].capacity;
86162306a36Sopenharmony_ci		else if (wp == zones[1].start << SECTOR_SHIFT)
86262306a36Sopenharmony_ci			zone_end = zones[0].start + zones[0].capacity;
86362306a36Sopenharmony_ci		if (zone_end)
86462306a36Sopenharmony_ci			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
86562306a36Sopenharmony_ci					BTRFS_SUPER_INFO_SIZE);
86662306a36Sopenharmony_ci
86762306a36Sopenharmony_ci		wp -= BTRFS_SUPER_INFO_SIZE;
86862306a36Sopenharmony_ci	}
86962306a36Sopenharmony_ci
87062306a36Sopenharmony_ci	*bytenr_ret = wp;
87162306a36Sopenharmony_ci	return 0;
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci}
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ciint btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
87662306a36Sopenharmony_ci			       u64 *bytenr_ret)
87762306a36Sopenharmony_ci{
87862306a36Sopenharmony_ci	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
87962306a36Sopenharmony_ci	sector_t zone_sectors;
88062306a36Sopenharmony_ci	u32 sb_zone;
88162306a36Sopenharmony_ci	int ret;
88262306a36Sopenharmony_ci	u8 zone_sectors_shift;
88362306a36Sopenharmony_ci	sector_t nr_sectors;
88462306a36Sopenharmony_ci	u32 nr_zones;
88562306a36Sopenharmony_ci
88662306a36Sopenharmony_ci	if (!bdev_is_zoned(bdev)) {
88762306a36Sopenharmony_ci		*bytenr_ret = btrfs_sb_offset(mirror);
88862306a36Sopenharmony_ci		return 0;
88962306a36Sopenharmony_ci	}
89062306a36Sopenharmony_ci
89162306a36Sopenharmony_ci	ASSERT(rw == READ || rw == WRITE);
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_ci	zone_sectors = bdev_zone_sectors(bdev);
89462306a36Sopenharmony_ci	if (!is_power_of_2(zone_sectors))
89562306a36Sopenharmony_ci		return -EINVAL;
89662306a36Sopenharmony_ci	zone_sectors_shift = ilog2(zone_sectors);
89762306a36Sopenharmony_ci	nr_sectors = bdev_nr_sectors(bdev);
89862306a36Sopenharmony_ci	nr_zones = nr_sectors >> zone_sectors_shift;
89962306a36Sopenharmony_ci
90062306a36Sopenharmony_ci	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
90162306a36Sopenharmony_ci	if (sb_zone + 1 >= nr_zones)
90262306a36Sopenharmony_ci		return -ENOENT;
90362306a36Sopenharmony_ci
90462306a36Sopenharmony_ci	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
90562306a36Sopenharmony_ci				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
90662306a36Sopenharmony_ci				  zones);
90762306a36Sopenharmony_ci	if (ret < 0)
90862306a36Sopenharmony_ci		return ret;
90962306a36Sopenharmony_ci	if (ret != BTRFS_NR_SB_LOG_ZONES)
91062306a36Sopenharmony_ci		return -EIO;
91162306a36Sopenharmony_ci
91262306a36Sopenharmony_ci	return sb_log_location(bdev, zones, rw, bytenr_ret);
91362306a36Sopenharmony_ci}
91462306a36Sopenharmony_ci
91562306a36Sopenharmony_ciint btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
91662306a36Sopenharmony_ci			  u64 *bytenr_ret)
91762306a36Sopenharmony_ci{
91862306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zinfo = device->zone_info;
91962306a36Sopenharmony_ci	u32 zone_num;
92062306a36Sopenharmony_ci
92162306a36Sopenharmony_ci	/*
92262306a36Sopenharmony_ci	 * For a zoned filesystem on a non-zoned block device, use the same
92362306a36Sopenharmony_ci	 * super block locations as regular filesystem. Doing so, the super
92462306a36Sopenharmony_ci	 * block can always be retrieved and the zoned flag of the volume
92562306a36Sopenharmony_ci	 * detected from the super block information.
92662306a36Sopenharmony_ci	 */
92762306a36Sopenharmony_ci	if (!bdev_is_zoned(device->bdev)) {
92862306a36Sopenharmony_ci		*bytenr_ret = btrfs_sb_offset(mirror);
92962306a36Sopenharmony_ci		return 0;
93062306a36Sopenharmony_ci	}
93162306a36Sopenharmony_ci
93262306a36Sopenharmony_ci	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
93362306a36Sopenharmony_ci	if (zone_num + 1 >= zinfo->nr_zones)
93462306a36Sopenharmony_ci		return -ENOENT;
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_ci	return sb_log_location(device->bdev,
93762306a36Sopenharmony_ci			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
93862306a36Sopenharmony_ci			       rw, bytenr_ret);
93962306a36Sopenharmony_ci}
94062306a36Sopenharmony_ci
94162306a36Sopenharmony_cistatic inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
94262306a36Sopenharmony_ci				  int mirror)
94362306a36Sopenharmony_ci{
94462306a36Sopenharmony_ci	u32 zone_num;
94562306a36Sopenharmony_ci
94662306a36Sopenharmony_ci	if (!zinfo)
94762306a36Sopenharmony_ci		return false;
94862306a36Sopenharmony_ci
94962306a36Sopenharmony_ci	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
95062306a36Sopenharmony_ci	if (zone_num + 1 >= zinfo->nr_zones)
95162306a36Sopenharmony_ci		return false;
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci	if (!test_bit(zone_num, zinfo->seq_zones))
95462306a36Sopenharmony_ci		return false;
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci	return true;
95762306a36Sopenharmony_ci}
95862306a36Sopenharmony_ci
95962306a36Sopenharmony_ciint btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
96062306a36Sopenharmony_ci{
96162306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zinfo = device->zone_info;
96262306a36Sopenharmony_ci	struct blk_zone *zone;
96362306a36Sopenharmony_ci	int i;
96462306a36Sopenharmony_ci
96562306a36Sopenharmony_ci	if (!is_sb_log_zone(zinfo, mirror))
96662306a36Sopenharmony_ci		return 0;
96762306a36Sopenharmony_ci
96862306a36Sopenharmony_ci	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
96962306a36Sopenharmony_ci	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
97062306a36Sopenharmony_ci		/* Advance the next zone */
97162306a36Sopenharmony_ci		if (zone->cond == BLK_ZONE_COND_FULL) {
97262306a36Sopenharmony_ci			zone++;
97362306a36Sopenharmony_ci			continue;
97462306a36Sopenharmony_ci		}
97562306a36Sopenharmony_ci
97662306a36Sopenharmony_ci		if (zone->cond == BLK_ZONE_COND_EMPTY)
97762306a36Sopenharmony_ci			zone->cond = BLK_ZONE_COND_IMP_OPEN;
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci		zone->wp += SUPER_INFO_SECTORS;
98062306a36Sopenharmony_ci
98162306a36Sopenharmony_ci		if (sb_zone_is_full(zone)) {
98262306a36Sopenharmony_ci			/*
98362306a36Sopenharmony_ci			 * No room left to write new superblock. Since
98462306a36Sopenharmony_ci			 * superblock is written with REQ_SYNC, it is safe to
98562306a36Sopenharmony_ci			 * finish the zone now.
98662306a36Sopenharmony_ci			 *
98762306a36Sopenharmony_ci			 * If the write pointer is exactly at the capacity,
98862306a36Sopenharmony_ci			 * explicit ZONE_FINISH is not necessary.
98962306a36Sopenharmony_ci			 */
99062306a36Sopenharmony_ci			if (zone->wp != zone->start + zone->capacity) {
99162306a36Sopenharmony_ci				int ret;
99262306a36Sopenharmony_ci
99362306a36Sopenharmony_ci				ret = blkdev_zone_mgmt(device->bdev,
99462306a36Sopenharmony_ci						REQ_OP_ZONE_FINISH, zone->start,
99562306a36Sopenharmony_ci						zone->len, GFP_NOFS);
99662306a36Sopenharmony_ci				if (ret)
99762306a36Sopenharmony_ci					return ret;
99862306a36Sopenharmony_ci			}
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_ci			zone->wp = zone->start + zone->len;
100162306a36Sopenharmony_ci			zone->cond = BLK_ZONE_COND_FULL;
100262306a36Sopenharmony_ci		}
100362306a36Sopenharmony_ci		return 0;
100462306a36Sopenharmony_ci	}
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	/* All the zones are FULL. Should not reach here. */
100762306a36Sopenharmony_ci	ASSERT(0);
100862306a36Sopenharmony_ci	return -EIO;
100962306a36Sopenharmony_ci}
101062306a36Sopenharmony_ci
101162306a36Sopenharmony_ciint btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
101262306a36Sopenharmony_ci{
101362306a36Sopenharmony_ci	sector_t zone_sectors;
101462306a36Sopenharmony_ci	sector_t nr_sectors;
101562306a36Sopenharmony_ci	u8 zone_sectors_shift;
101662306a36Sopenharmony_ci	u32 sb_zone;
101762306a36Sopenharmony_ci	u32 nr_zones;
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	zone_sectors = bdev_zone_sectors(bdev);
102062306a36Sopenharmony_ci	zone_sectors_shift = ilog2(zone_sectors);
102162306a36Sopenharmony_ci	nr_sectors = bdev_nr_sectors(bdev);
102262306a36Sopenharmony_ci	nr_zones = nr_sectors >> zone_sectors_shift;
102362306a36Sopenharmony_ci
102462306a36Sopenharmony_ci	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
102562306a36Sopenharmony_ci	if (sb_zone + 1 >= nr_zones)
102662306a36Sopenharmony_ci		return -ENOENT;
102762306a36Sopenharmony_ci
102862306a36Sopenharmony_ci	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
102962306a36Sopenharmony_ci				zone_start_sector(sb_zone, bdev),
103062306a36Sopenharmony_ci				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
103162306a36Sopenharmony_ci}
103262306a36Sopenharmony_ci
103362306a36Sopenharmony_ci/*
103462306a36Sopenharmony_ci * Find allocatable zones within a given region.
103562306a36Sopenharmony_ci *
103662306a36Sopenharmony_ci * @device:	the device to allocate a region on
103762306a36Sopenharmony_ci * @hole_start: the position of the hole to allocate the region
103862306a36Sopenharmony_ci * @num_bytes:	size of wanted region
103962306a36Sopenharmony_ci * @hole_end:	the end of the hole
104062306a36Sopenharmony_ci * @return:	position of allocatable zones
104162306a36Sopenharmony_ci *
104262306a36Sopenharmony_ci * Allocatable region should not contain any superblock locations.
104362306a36Sopenharmony_ci */
104462306a36Sopenharmony_ciu64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
104562306a36Sopenharmony_ci				 u64 hole_end, u64 num_bytes)
104662306a36Sopenharmony_ci{
104762306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zinfo = device->zone_info;
104862306a36Sopenharmony_ci	const u8 shift = zinfo->zone_size_shift;
104962306a36Sopenharmony_ci	u64 nzones = num_bytes >> shift;
105062306a36Sopenharmony_ci	u64 pos = hole_start;
105162306a36Sopenharmony_ci	u64 begin, end;
105262306a36Sopenharmony_ci	bool have_sb;
105362306a36Sopenharmony_ci	int i;
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
105662306a36Sopenharmony_ci	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
105762306a36Sopenharmony_ci
105862306a36Sopenharmony_ci	while (pos < hole_end) {
105962306a36Sopenharmony_ci		begin = pos >> shift;
106062306a36Sopenharmony_ci		end = begin + nzones;
106162306a36Sopenharmony_ci
106262306a36Sopenharmony_ci		if (end > zinfo->nr_zones)
106362306a36Sopenharmony_ci			return hole_end;
106462306a36Sopenharmony_ci
106562306a36Sopenharmony_ci		/* Check if zones in the region are all empty */
106662306a36Sopenharmony_ci		if (btrfs_dev_is_sequential(device, pos) &&
106762306a36Sopenharmony_ci		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
106862306a36Sopenharmony_ci			pos += zinfo->zone_size;
106962306a36Sopenharmony_ci			continue;
107062306a36Sopenharmony_ci		}
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci		have_sb = false;
107362306a36Sopenharmony_ci		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
107462306a36Sopenharmony_ci			u32 sb_zone;
107562306a36Sopenharmony_ci			u64 sb_pos;
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_ci			sb_zone = sb_zone_number(shift, i);
107862306a36Sopenharmony_ci			if (!(end <= sb_zone ||
107962306a36Sopenharmony_ci			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
108062306a36Sopenharmony_ci				have_sb = true;
108162306a36Sopenharmony_ci				pos = zone_start_physical(
108262306a36Sopenharmony_ci					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
108362306a36Sopenharmony_ci				break;
108462306a36Sopenharmony_ci			}
108562306a36Sopenharmony_ci
108662306a36Sopenharmony_ci			/* We also need to exclude regular superblock positions */
108762306a36Sopenharmony_ci			sb_pos = btrfs_sb_offset(i);
108862306a36Sopenharmony_ci			if (!(pos + num_bytes <= sb_pos ||
108962306a36Sopenharmony_ci			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
109062306a36Sopenharmony_ci				have_sb = true;
109162306a36Sopenharmony_ci				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
109262306a36Sopenharmony_ci					    zinfo->zone_size);
109362306a36Sopenharmony_ci				break;
109462306a36Sopenharmony_ci			}
109562306a36Sopenharmony_ci		}
109662306a36Sopenharmony_ci		if (!have_sb)
109762306a36Sopenharmony_ci			break;
109862306a36Sopenharmony_ci	}
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_ci	return pos;
110162306a36Sopenharmony_ci}
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_cistatic bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
110462306a36Sopenharmony_ci{
110562306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zone_info = device->zone_info;
110662306a36Sopenharmony_ci	unsigned int zno = (pos >> zone_info->zone_size_shift);
110762306a36Sopenharmony_ci
110862306a36Sopenharmony_ci	/* We can use any number of zones */
110962306a36Sopenharmony_ci	if (zone_info->max_active_zones == 0)
111062306a36Sopenharmony_ci		return true;
111162306a36Sopenharmony_ci
111262306a36Sopenharmony_ci	if (!test_bit(zno, zone_info->active_zones)) {
111362306a36Sopenharmony_ci		/* Active zone left? */
111462306a36Sopenharmony_ci		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
111562306a36Sopenharmony_ci			return false;
111662306a36Sopenharmony_ci		if (test_and_set_bit(zno, zone_info->active_zones)) {
111762306a36Sopenharmony_ci			/* Someone already set the bit */
111862306a36Sopenharmony_ci			atomic_inc(&zone_info->active_zones_left);
111962306a36Sopenharmony_ci		}
112062306a36Sopenharmony_ci	}
112162306a36Sopenharmony_ci
112262306a36Sopenharmony_ci	return true;
112362306a36Sopenharmony_ci}
112462306a36Sopenharmony_ci
112562306a36Sopenharmony_cistatic void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
112662306a36Sopenharmony_ci{
112762306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zone_info = device->zone_info;
112862306a36Sopenharmony_ci	unsigned int zno = (pos >> zone_info->zone_size_shift);
112962306a36Sopenharmony_ci
113062306a36Sopenharmony_ci	/* We can use any number of zones */
113162306a36Sopenharmony_ci	if (zone_info->max_active_zones == 0)
113262306a36Sopenharmony_ci		return;
113362306a36Sopenharmony_ci
113462306a36Sopenharmony_ci	if (test_and_clear_bit(zno, zone_info->active_zones))
113562306a36Sopenharmony_ci		atomic_inc(&zone_info->active_zones_left);
113662306a36Sopenharmony_ci}
113762306a36Sopenharmony_ci
113862306a36Sopenharmony_ciint btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
113962306a36Sopenharmony_ci			    u64 length, u64 *bytes)
114062306a36Sopenharmony_ci{
114162306a36Sopenharmony_ci	int ret;
114262306a36Sopenharmony_ci
114362306a36Sopenharmony_ci	*bytes = 0;
114462306a36Sopenharmony_ci	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
114562306a36Sopenharmony_ci			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
114662306a36Sopenharmony_ci			       GFP_NOFS);
114762306a36Sopenharmony_ci	if (ret)
114862306a36Sopenharmony_ci		return ret;
114962306a36Sopenharmony_ci
115062306a36Sopenharmony_ci	*bytes = length;
115162306a36Sopenharmony_ci	while (length) {
115262306a36Sopenharmony_ci		btrfs_dev_set_zone_empty(device, physical);
115362306a36Sopenharmony_ci		btrfs_dev_clear_active_zone(device, physical);
115462306a36Sopenharmony_ci		physical += device->zone_info->zone_size;
115562306a36Sopenharmony_ci		length -= device->zone_info->zone_size;
115662306a36Sopenharmony_ci	}
115762306a36Sopenharmony_ci
115862306a36Sopenharmony_ci	return 0;
115962306a36Sopenharmony_ci}
116062306a36Sopenharmony_ci
116162306a36Sopenharmony_ciint btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
116262306a36Sopenharmony_ci{
116362306a36Sopenharmony_ci	struct btrfs_zoned_device_info *zinfo = device->zone_info;
116462306a36Sopenharmony_ci	const u8 shift = zinfo->zone_size_shift;
116562306a36Sopenharmony_ci	unsigned long begin = start >> shift;
116662306a36Sopenharmony_ci	unsigned long nbits = size >> shift;
116762306a36Sopenharmony_ci	u64 pos;
116862306a36Sopenharmony_ci	int ret;
116962306a36Sopenharmony_ci
117062306a36Sopenharmony_ci	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
117162306a36Sopenharmony_ci	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
117262306a36Sopenharmony_ci
117362306a36Sopenharmony_ci	if (begin + nbits > zinfo->nr_zones)
117462306a36Sopenharmony_ci		return -ERANGE;
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_ci	/* All the zones are conventional */
117762306a36Sopenharmony_ci	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
117862306a36Sopenharmony_ci		return 0;
117962306a36Sopenharmony_ci
118062306a36Sopenharmony_ci	/* All the zones are sequential and empty */
118162306a36Sopenharmony_ci	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
118262306a36Sopenharmony_ci	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
118362306a36Sopenharmony_ci		return 0;
118462306a36Sopenharmony_ci
118562306a36Sopenharmony_ci	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
118662306a36Sopenharmony_ci		u64 reset_bytes;
118762306a36Sopenharmony_ci
118862306a36Sopenharmony_ci		if (!btrfs_dev_is_sequential(device, pos) ||
118962306a36Sopenharmony_ci		    btrfs_dev_is_empty_zone(device, pos))
119062306a36Sopenharmony_ci			continue;
119162306a36Sopenharmony_ci
119262306a36Sopenharmony_ci		/* Free regions should be empty */
119362306a36Sopenharmony_ci		btrfs_warn_in_rcu(
119462306a36Sopenharmony_ci			device->fs_info,
119562306a36Sopenharmony_ci		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
119662306a36Sopenharmony_ci			rcu_str_deref(device->name), device->devid, pos >> shift);
119762306a36Sopenharmony_ci		WARN_ON_ONCE(1);
119862306a36Sopenharmony_ci
119962306a36Sopenharmony_ci		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
120062306a36Sopenharmony_ci					      &reset_bytes);
120162306a36Sopenharmony_ci		if (ret)
120262306a36Sopenharmony_ci			return ret;
120362306a36Sopenharmony_ci	}
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci	return 0;
120662306a36Sopenharmony_ci}
120762306a36Sopenharmony_ci
/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. The allocation offset
 * is set to the end of the highest addressed extent in the block group.
 */
121462306a36Sopenharmony_cistatic int calculate_alloc_pointer(struct btrfs_block_group *cache,
121562306a36Sopenharmony_ci				   u64 *offset_ret, bool new)
121662306a36Sopenharmony_ci{
121762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = cache->fs_info;
121862306a36Sopenharmony_ci	struct btrfs_root *root;
121962306a36Sopenharmony_ci	struct btrfs_path *path;
122062306a36Sopenharmony_ci	struct btrfs_key key;
122162306a36Sopenharmony_ci	struct btrfs_key found_key;
122262306a36Sopenharmony_ci	int ret;
122362306a36Sopenharmony_ci	u64 length;
122462306a36Sopenharmony_ci
	/*
	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 *
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.
	 */
123562306a36Sopenharmony_ci	if (new) {
123662306a36Sopenharmony_ci		*offset_ret = 0;
123762306a36Sopenharmony_ci		return 0;
123862306a36Sopenharmony_ci	}
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci	path = btrfs_alloc_path();
124162306a36Sopenharmony_ci	if (!path)
124262306a36Sopenharmony_ci		return -ENOMEM;
124362306a36Sopenharmony_ci
124462306a36Sopenharmony_ci	key.objectid = cache->start + cache->length;
124562306a36Sopenharmony_ci	key.type = 0;
124662306a36Sopenharmony_ci	key.offset = 0;
124762306a36Sopenharmony_ci
124862306a36Sopenharmony_ci	root = btrfs_extent_root(fs_info, key.objectid);
124962306a36Sopenharmony_ci	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
125062306a36Sopenharmony_ci	/* We should not find the exact match */
125162306a36Sopenharmony_ci	if (!ret)
125262306a36Sopenharmony_ci		ret = -EUCLEAN;
125362306a36Sopenharmony_ci	if (ret < 0)
125462306a36Sopenharmony_ci		goto out;
125562306a36Sopenharmony_ci
125662306a36Sopenharmony_ci	ret = btrfs_previous_extent_item(root, path, cache->start);
125762306a36Sopenharmony_ci	if (ret) {
125862306a36Sopenharmony_ci		if (ret == 1) {
125962306a36Sopenharmony_ci			ret = 0;
126062306a36Sopenharmony_ci			*offset_ret = 0;
126162306a36Sopenharmony_ci		}
126262306a36Sopenharmony_ci		goto out;
126362306a36Sopenharmony_ci	}
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
126662306a36Sopenharmony_ci
126762306a36Sopenharmony_ci	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
126862306a36Sopenharmony_ci		length = found_key.offset;
126962306a36Sopenharmony_ci	else
127062306a36Sopenharmony_ci		length = fs_info->nodesize;
127162306a36Sopenharmony_ci
127262306a36Sopenharmony_ci	if (!(found_key.objectid >= cache->start &&
127362306a36Sopenharmony_ci	       found_key.objectid + length <= cache->start + cache->length)) {
127462306a36Sopenharmony_ci		ret = -EUCLEAN;
127562306a36Sopenharmony_ci		goto out;
127662306a36Sopenharmony_ci	}
127762306a36Sopenharmony_ci	*offset_ret = found_key.objectid + length - cache->start;
127862306a36Sopenharmony_ci	ret = 0;
127962306a36Sopenharmony_ci
128062306a36Sopenharmony_ciout:
128162306a36Sopenharmony_ci	btrfs_free_path(path);
128262306a36Sopenharmony_ci	return ret;
128362306a36Sopenharmony_ci}
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ciint btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
128662306a36Sopenharmony_ci{
128762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = cache->fs_info;
128862306a36Sopenharmony_ci	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
128962306a36Sopenharmony_ci	struct extent_map *em;
129062306a36Sopenharmony_ci	struct map_lookup *map;
129162306a36Sopenharmony_ci	struct btrfs_device *device;
129262306a36Sopenharmony_ci	u64 logical = cache->start;
129362306a36Sopenharmony_ci	u64 length = cache->length;
129462306a36Sopenharmony_ci	int ret;
129562306a36Sopenharmony_ci	int i;
129662306a36Sopenharmony_ci	unsigned int nofs_flag;
129762306a36Sopenharmony_ci	u64 *alloc_offsets = NULL;
129862306a36Sopenharmony_ci	u64 *caps = NULL;
129962306a36Sopenharmony_ci	u64 *physical = NULL;
130062306a36Sopenharmony_ci	unsigned long *active = NULL;
130162306a36Sopenharmony_ci	u64 last_alloc = 0;
130262306a36Sopenharmony_ci	u32 num_sequential = 0, num_conventional = 0;
130362306a36Sopenharmony_ci
130462306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
130562306a36Sopenharmony_ci		return 0;
130662306a36Sopenharmony_ci
130762306a36Sopenharmony_ci	/* Sanity check */
130862306a36Sopenharmony_ci	if (!IS_ALIGNED(length, fs_info->zone_size)) {
130962306a36Sopenharmony_ci		btrfs_err(fs_info,
131062306a36Sopenharmony_ci		"zoned: block group %llu len %llu unaligned to zone size %llu",
131162306a36Sopenharmony_ci			  logical, length, fs_info->zone_size);
131262306a36Sopenharmony_ci		return -EIO;
131362306a36Sopenharmony_ci	}
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci	/* Get the chunk mapping */
131662306a36Sopenharmony_ci	read_lock(&em_tree->lock);
131762306a36Sopenharmony_ci	em = lookup_extent_mapping(em_tree, logical, length);
131862306a36Sopenharmony_ci	read_unlock(&em_tree->lock);
131962306a36Sopenharmony_ci
132062306a36Sopenharmony_ci	if (!em)
132162306a36Sopenharmony_ci		return -EINVAL;
132262306a36Sopenharmony_ci
132362306a36Sopenharmony_ci	map = em->map_lookup;
132462306a36Sopenharmony_ci
132562306a36Sopenharmony_ci	cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
132662306a36Sopenharmony_ci	if (!cache->physical_map) {
132762306a36Sopenharmony_ci		ret = -ENOMEM;
132862306a36Sopenharmony_ci		goto out;
132962306a36Sopenharmony_ci	}
133062306a36Sopenharmony_ci
133162306a36Sopenharmony_ci	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
133262306a36Sopenharmony_ci	if (!alloc_offsets) {
133362306a36Sopenharmony_ci		ret = -ENOMEM;
133462306a36Sopenharmony_ci		goto out;
133562306a36Sopenharmony_ci	}
133662306a36Sopenharmony_ci
133762306a36Sopenharmony_ci	caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
133862306a36Sopenharmony_ci	if (!caps) {
133962306a36Sopenharmony_ci		ret = -ENOMEM;
134062306a36Sopenharmony_ci		goto out;
134162306a36Sopenharmony_ci	}
134262306a36Sopenharmony_ci
134362306a36Sopenharmony_ci	physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
134462306a36Sopenharmony_ci	if (!physical) {
134562306a36Sopenharmony_ci		ret = -ENOMEM;
134662306a36Sopenharmony_ci		goto out;
134762306a36Sopenharmony_ci	}
134862306a36Sopenharmony_ci
134962306a36Sopenharmony_ci	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
135062306a36Sopenharmony_ci	if (!active) {
135162306a36Sopenharmony_ci		ret = -ENOMEM;
135262306a36Sopenharmony_ci		goto out;
135362306a36Sopenharmony_ci	}
135462306a36Sopenharmony_ci
135562306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
135662306a36Sopenharmony_ci		bool is_sequential;
135762306a36Sopenharmony_ci		struct blk_zone zone;
135862306a36Sopenharmony_ci		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
135962306a36Sopenharmony_ci		int dev_replace_is_ongoing = 0;
136062306a36Sopenharmony_ci
136162306a36Sopenharmony_ci		device = map->stripes[i].dev;
136262306a36Sopenharmony_ci		physical[i] = map->stripes[i].physical;
136362306a36Sopenharmony_ci
136462306a36Sopenharmony_ci		if (device->bdev == NULL) {
136562306a36Sopenharmony_ci			alloc_offsets[i] = WP_MISSING_DEV;
136662306a36Sopenharmony_ci			continue;
136762306a36Sopenharmony_ci		}
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci		is_sequential = btrfs_dev_is_sequential(device, physical[i]);
137062306a36Sopenharmony_ci		if (is_sequential)
137162306a36Sopenharmony_ci			num_sequential++;
137262306a36Sopenharmony_ci		else
137362306a36Sopenharmony_ci			num_conventional++;
137462306a36Sopenharmony_ci
137562306a36Sopenharmony_ci		/*
137662306a36Sopenharmony_ci		 * Consider a zone as active if we can allow any number of
137762306a36Sopenharmony_ci		 * active zones.
137862306a36Sopenharmony_ci		 */
137962306a36Sopenharmony_ci		if (!device->zone_info->max_active_zones)
138062306a36Sopenharmony_ci			__set_bit(i, active);
138162306a36Sopenharmony_ci
138262306a36Sopenharmony_ci		if (!is_sequential) {
138362306a36Sopenharmony_ci			alloc_offsets[i] = WP_CONVENTIONAL;
138462306a36Sopenharmony_ci			continue;
138562306a36Sopenharmony_ci		}
138662306a36Sopenharmony_ci
138762306a36Sopenharmony_ci		/*
138862306a36Sopenharmony_ci		 * This zone will be used for allocation, so mark this zone
138962306a36Sopenharmony_ci		 * non-empty.
139062306a36Sopenharmony_ci		 */
139162306a36Sopenharmony_ci		btrfs_dev_clear_zone_empty(device, physical[i]);
139262306a36Sopenharmony_ci
139362306a36Sopenharmony_ci		down_read(&dev_replace->rwsem);
139462306a36Sopenharmony_ci		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
139562306a36Sopenharmony_ci		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
139662306a36Sopenharmony_ci			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
139762306a36Sopenharmony_ci		up_read(&dev_replace->rwsem);
139862306a36Sopenharmony_ci
139962306a36Sopenharmony_ci		/*
140062306a36Sopenharmony_ci		 * The group is mapped to a sequential zone. Get the zone write
140162306a36Sopenharmony_ci		 * pointer to determine the allocation offset within the zone.
140262306a36Sopenharmony_ci		 */
140362306a36Sopenharmony_ci		WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
140462306a36Sopenharmony_ci		nofs_flag = memalloc_nofs_save();
140562306a36Sopenharmony_ci		ret = btrfs_get_dev_zone(device, physical[i], &zone);
140662306a36Sopenharmony_ci		memalloc_nofs_restore(nofs_flag);
140762306a36Sopenharmony_ci		if (ret == -EIO || ret == -EOPNOTSUPP) {
140862306a36Sopenharmony_ci			ret = 0;
140962306a36Sopenharmony_ci			alloc_offsets[i] = WP_MISSING_DEV;
141062306a36Sopenharmony_ci			continue;
141162306a36Sopenharmony_ci		} else if (ret) {
141262306a36Sopenharmony_ci			goto out;
141362306a36Sopenharmony_ci		}
141462306a36Sopenharmony_ci
141562306a36Sopenharmony_ci		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
141662306a36Sopenharmony_ci			btrfs_err_in_rcu(fs_info,
141762306a36Sopenharmony_ci	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
141862306a36Sopenharmony_ci				zone.start << SECTOR_SHIFT,
141962306a36Sopenharmony_ci				rcu_str_deref(device->name), device->devid);
142062306a36Sopenharmony_ci			ret = -EIO;
142162306a36Sopenharmony_ci			goto out;
142262306a36Sopenharmony_ci		}
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci		caps[i] = (zone.capacity << SECTOR_SHIFT);
142562306a36Sopenharmony_ci
142662306a36Sopenharmony_ci		switch (zone.cond) {
142762306a36Sopenharmony_ci		case BLK_ZONE_COND_OFFLINE:
142862306a36Sopenharmony_ci		case BLK_ZONE_COND_READONLY:
142962306a36Sopenharmony_ci			btrfs_err(fs_info,
143062306a36Sopenharmony_ci		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
143162306a36Sopenharmony_ci				  physical[i] >> device->zone_info->zone_size_shift,
143262306a36Sopenharmony_ci				  rcu_str_deref(device->name), device->devid);
143362306a36Sopenharmony_ci			alloc_offsets[i] = WP_MISSING_DEV;
143462306a36Sopenharmony_ci			break;
143562306a36Sopenharmony_ci		case BLK_ZONE_COND_EMPTY:
143662306a36Sopenharmony_ci			alloc_offsets[i] = 0;
143762306a36Sopenharmony_ci			break;
143862306a36Sopenharmony_ci		case BLK_ZONE_COND_FULL:
143962306a36Sopenharmony_ci			alloc_offsets[i] = caps[i];
144062306a36Sopenharmony_ci			break;
144162306a36Sopenharmony_ci		default:
144262306a36Sopenharmony_ci			/* Partially used zone */
144362306a36Sopenharmony_ci			alloc_offsets[i] =
144462306a36Sopenharmony_ci					((zone.wp - zone.start) << SECTOR_SHIFT);
144562306a36Sopenharmony_ci			__set_bit(i, active);
144662306a36Sopenharmony_ci			break;
144762306a36Sopenharmony_ci		}
144862306a36Sopenharmony_ci	}
144962306a36Sopenharmony_ci
145062306a36Sopenharmony_ci	if (num_sequential > 0)
145162306a36Sopenharmony_ci		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
145262306a36Sopenharmony_ci
145362306a36Sopenharmony_ci	if (num_conventional > 0) {
145462306a36Sopenharmony_ci		/* Zone capacity is always zone size in emulation */
145562306a36Sopenharmony_ci		cache->zone_capacity = cache->length;
145662306a36Sopenharmony_ci		ret = calculate_alloc_pointer(cache, &last_alloc, new);
145762306a36Sopenharmony_ci		if (ret) {
145862306a36Sopenharmony_ci			btrfs_err(fs_info,
145962306a36Sopenharmony_ci			"zoned: failed to determine allocation offset of bg %llu",
146062306a36Sopenharmony_ci				  cache->start);
146162306a36Sopenharmony_ci			goto out;
146262306a36Sopenharmony_ci		} else if (map->num_stripes == num_conventional) {
146362306a36Sopenharmony_ci			cache->alloc_offset = last_alloc;
146462306a36Sopenharmony_ci			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
146562306a36Sopenharmony_ci			goto out;
146662306a36Sopenharmony_ci		}
146762306a36Sopenharmony_ci	}
146862306a36Sopenharmony_ci
146962306a36Sopenharmony_ci	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
147062306a36Sopenharmony_ci	case 0: /* single */
147162306a36Sopenharmony_ci		if (alloc_offsets[0] == WP_MISSING_DEV) {
147262306a36Sopenharmony_ci			btrfs_err(fs_info,
147362306a36Sopenharmony_ci			"zoned: cannot recover write pointer for zone %llu",
147462306a36Sopenharmony_ci				physical[0]);
147562306a36Sopenharmony_ci			ret = -EIO;
147662306a36Sopenharmony_ci			goto out;
147762306a36Sopenharmony_ci		}
147862306a36Sopenharmony_ci		cache->alloc_offset = alloc_offsets[0];
147962306a36Sopenharmony_ci		cache->zone_capacity = caps[0];
148062306a36Sopenharmony_ci		if (test_bit(0, active))
148162306a36Sopenharmony_ci			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
148262306a36Sopenharmony_ci		break;
148362306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_DUP:
148462306a36Sopenharmony_ci		if (map->type & BTRFS_BLOCK_GROUP_DATA) {
148562306a36Sopenharmony_ci			btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
148662306a36Sopenharmony_ci			ret = -EINVAL;
148762306a36Sopenharmony_ci			goto out;
148862306a36Sopenharmony_ci		}
148962306a36Sopenharmony_ci		if (alloc_offsets[0] == WP_MISSING_DEV) {
149062306a36Sopenharmony_ci			btrfs_err(fs_info,
149162306a36Sopenharmony_ci			"zoned: cannot recover write pointer for zone %llu",
149262306a36Sopenharmony_ci				physical[0]);
149362306a36Sopenharmony_ci			ret = -EIO;
149462306a36Sopenharmony_ci			goto out;
149562306a36Sopenharmony_ci		}
149662306a36Sopenharmony_ci		if (alloc_offsets[1] == WP_MISSING_DEV) {
149762306a36Sopenharmony_ci			btrfs_err(fs_info,
149862306a36Sopenharmony_ci			"zoned: cannot recover write pointer for zone %llu",
149962306a36Sopenharmony_ci				physical[1]);
150062306a36Sopenharmony_ci			ret = -EIO;
150162306a36Sopenharmony_ci			goto out;
150262306a36Sopenharmony_ci		}
150362306a36Sopenharmony_ci		if (alloc_offsets[0] != alloc_offsets[1]) {
150462306a36Sopenharmony_ci			btrfs_err(fs_info,
150562306a36Sopenharmony_ci			"zoned: write pointer offset mismatch of zones in DUP profile");
150662306a36Sopenharmony_ci			ret = -EIO;
150762306a36Sopenharmony_ci			goto out;
150862306a36Sopenharmony_ci		}
150962306a36Sopenharmony_ci		if (test_bit(0, active) != test_bit(1, active)) {
151062306a36Sopenharmony_ci			if (!btrfs_zone_activate(cache)) {
151162306a36Sopenharmony_ci				ret = -EIO;
151262306a36Sopenharmony_ci				goto out;
151362306a36Sopenharmony_ci			}
151462306a36Sopenharmony_ci		} else {
151562306a36Sopenharmony_ci			if (test_bit(0, active))
151662306a36Sopenharmony_ci				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
151762306a36Sopenharmony_ci					&cache->runtime_flags);
151862306a36Sopenharmony_ci		}
151962306a36Sopenharmony_ci		cache->alloc_offset = alloc_offsets[0];
152062306a36Sopenharmony_ci		cache->zone_capacity = min(caps[0], caps[1]);
152162306a36Sopenharmony_ci		break;
152262306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_RAID1:
152362306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_RAID0:
152462306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_RAID10:
152562306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_RAID5:
152662306a36Sopenharmony_ci	case BTRFS_BLOCK_GROUP_RAID6:
152762306a36Sopenharmony_ci		/* non-single profiles are not supported yet */
152862306a36Sopenharmony_ci	default:
152962306a36Sopenharmony_ci		btrfs_err(fs_info, "zoned: profile %s not yet supported",
153062306a36Sopenharmony_ci			  btrfs_bg_type_to_raid_name(map->type));
153162306a36Sopenharmony_ci		ret = -EINVAL;
153262306a36Sopenharmony_ci		goto out;
153362306a36Sopenharmony_ci	}
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ciout:
153662306a36Sopenharmony_ci	if (cache->alloc_offset > fs_info->zone_size) {
153762306a36Sopenharmony_ci		btrfs_err(fs_info,
153862306a36Sopenharmony_ci			"zoned: invalid write pointer %llu in block group %llu",
153962306a36Sopenharmony_ci			cache->alloc_offset, cache->start);
154062306a36Sopenharmony_ci		ret = -EIO;
154162306a36Sopenharmony_ci	}
154262306a36Sopenharmony_ci
154362306a36Sopenharmony_ci	if (cache->alloc_offset > cache->zone_capacity) {
154462306a36Sopenharmony_ci		btrfs_err(fs_info,
154562306a36Sopenharmony_ci"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
154662306a36Sopenharmony_ci			  cache->alloc_offset, cache->zone_capacity,
154762306a36Sopenharmony_ci			  cache->start);
154862306a36Sopenharmony_ci		ret = -EIO;
154962306a36Sopenharmony_ci	}
155062306a36Sopenharmony_ci
155162306a36Sopenharmony_ci	/* An extent is allocated after the write pointer */
155262306a36Sopenharmony_ci	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
155362306a36Sopenharmony_ci		btrfs_err(fs_info,
155462306a36Sopenharmony_ci			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
155562306a36Sopenharmony_ci			  logical, last_alloc, cache->alloc_offset);
155662306a36Sopenharmony_ci		ret = -EIO;
155762306a36Sopenharmony_ci	}
155862306a36Sopenharmony_ci
155962306a36Sopenharmony_ci	if (!ret) {
156062306a36Sopenharmony_ci		cache->meta_write_pointer = cache->alloc_offset + cache->start;
156162306a36Sopenharmony_ci		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
156262306a36Sopenharmony_ci			btrfs_get_block_group(cache);
156362306a36Sopenharmony_ci			spin_lock(&fs_info->zone_active_bgs_lock);
156462306a36Sopenharmony_ci			list_add_tail(&cache->active_bg_list,
156562306a36Sopenharmony_ci				      &fs_info->zone_active_bgs);
156662306a36Sopenharmony_ci			spin_unlock(&fs_info->zone_active_bgs_lock);
156762306a36Sopenharmony_ci		}
156862306a36Sopenharmony_ci	} else {
156962306a36Sopenharmony_ci		kfree(cache->physical_map);
157062306a36Sopenharmony_ci		cache->physical_map = NULL;
157162306a36Sopenharmony_ci	}
157262306a36Sopenharmony_ci	bitmap_free(active);
157362306a36Sopenharmony_ci	kfree(physical);
157462306a36Sopenharmony_ci	kfree(caps);
157562306a36Sopenharmony_ci	kfree(alloc_offsets);
157662306a36Sopenharmony_ci	free_extent_map(em);
157762306a36Sopenharmony_ci
157862306a36Sopenharmony_ci	return ret;
157962306a36Sopenharmony_ci}
158062306a36Sopenharmony_ci
158162306a36Sopenharmony_civoid btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
158262306a36Sopenharmony_ci{
158362306a36Sopenharmony_ci	u64 unusable, free;
158462306a36Sopenharmony_ci
158562306a36Sopenharmony_ci	if (!btrfs_is_zoned(cache->fs_info))
158662306a36Sopenharmony_ci		return;
158762306a36Sopenharmony_ci
158862306a36Sopenharmony_ci	WARN_ON(cache->bytes_super != 0);
158962306a36Sopenharmony_ci	unusable = (cache->alloc_offset - cache->used) +
159062306a36Sopenharmony_ci		   (cache->length - cache->zone_capacity);
159162306a36Sopenharmony_ci	free = cache->zone_capacity - cache->alloc_offset;
159262306a36Sopenharmony_ci
159362306a36Sopenharmony_ci	/* We only need ->free_space in ALLOC_SEQ block groups */
159462306a36Sopenharmony_ci	cache->cached = BTRFS_CACHE_FINISHED;
159562306a36Sopenharmony_ci	cache->free_space_ctl->free_space = free;
159662306a36Sopenharmony_ci	cache->zone_unusable = unusable;
159762306a36Sopenharmony_ci}
159862306a36Sopenharmony_ci
159962306a36Sopenharmony_civoid btrfs_redirty_list_add(struct btrfs_transaction *trans,
160062306a36Sopenharmony_ci			    struct extent_buffer *eb)
160162306a36Sopenharmony_ci{
160262306a36Sopenharmony_ci	if (!btrfs_is_zoned(eb->fs_info) ||
160362306a36Sopenharmony_ci	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
160462306a36Sopenharmony_ci		return;
160562306a36Sopenharmony_ci
160662306a36Sopenharmony_ci	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
160762306a36Sopenharmony_ci
160862306a36Sopenharmony_ci	memzero_extent_buffer(eb, 0, eb->len);
160962306a36Sopenharmony_ci	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
161062306a36Sopenharmony_ci	set_extent_buffer_dirty(eb);
161162306a36Sopenharmony_ci	set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
161262306a36Sopenharmony_ci			EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
161362306a36Sopenharmony_ci}
161462306a36Sopenharmony_ci
161562306a36Sopenharmony_cibool btrfs_use_zone_append(struct btrfs_bio *bbio)
161662306a36Sopenharmony_ci{
161762306a36Sopenharmony_ci	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
161862306a36Sopenharmony_ci	struct btrfs_inode *inode = bbio->inode;
161962306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = bbio->fs_info;
162062306a36Sopenharmony_ci	struct btrfs_block_group *cache;
162162306a36Sopenharmony_ci	bool ret = false;
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
162462306a36Sopenharmony_ci		return false;
162562306a36Sopenharmony_ci
162662306a36Sopenharmony_ci	if (!inode || !is_data_inode(&inode->vfs_inode))
162762306a36Sopenharmony_ci		return false;
162862306a36Sopenharmony_ci
162962306a36Sopenharmony_ci	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
163062306a36Sopenharmony_ci		return false;
163162306a36Sopenharmony_ci
163262306a36Sopenharmony_ci	/*
163362306a36Sopenharmony_ci	 * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
163462306a36Sopenharmony_ci	 * extent layout the relocation code has.
163562306a36Sopenharmony_ci	 * Furthermore we have set aside own block-group from which only the
163662306a36Sopenharmony_ci	 * relocation "process" can allocate and make sure only one process at a
163762306a36Sopenharmony_ci	 * time can add pages to an extent that gets relocated, so it's safe to
163862306a36Sopenharmony_ci	 * use regular REQ_OP_WRITE for this special case.
163962306a36Sopenharmony_ci	 */
164062306a36Sopenharmony_ci	if (btrfs_is_data_reloc_root(inode->root))
164162306a36Sopenharmony_ci		return false;
164262306a36Sopenharmony_ci
164362306a36Sopenharmony_ci	cache = btrfs_lookup_block_group(fs_info, start);
164462306a36Sopenharmony_ci	ASSERT(cache);
164562306a36Sopenharmony_ci	if (!cache)
164662306a36Sopenharmony_ci		return false;
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_ci	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
164962306a36Sopenharmony_ci	btrfs_put_block_group(cache);
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci	return ret;
165262306a36Sopenharmony_ci}
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_civoid btrfs_record_physical_zoned(struct btrfs_bio *bbio)
165562306a36Sopenharmony_ci{
165662306a36Sopenharmony_ci	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
165762306a36Sopenharmony_ci	struct btrfs_ordered_sum *sum = bbio->sums;
165862306a36Sopenharmony_ci
165962306a36Sopenharmony_ci	if (physical < bbio->orig_physical)
166062306a36Sopenharmony_ci		sum->logical -= bbio->orig_physical - physical;
166162306a36Sopenharmony_ci	else
166262306a36Sopenharmony_ci		sum->logical += physical - bbio->orig_physical;
166362306a36Sopenharmony_ci}
166462306a36Sopenharmony_ci
166562306a36Sopenharmony_cistatic void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
166662306a36Sopenharmony_ci					u64 logical)
166762306a36Sopenharmony_ci{
166862306a36Sopenharmony_ci	struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
166962306a36Sopenharmony_ci	struct extent_map *em;
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci	ordered->disk_bytenr = logical;
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci	write_lock(&em_tree->lock);
167462306a36Sopenharmony_ci	em = search_extent_mapping(em_tree, ordered->file_offset,
167562306a36Sopenharmony_ci				   ordered->num_bytes);
167662306a36Sopenharmony_ci	em->block_start = logical;
167762306a36Sopenharmony_ci	free_extent_map(em);
167862306a36Sopenharmony_ci	write_unlock(&em_tree->lock);
167962306a36Sopenharmony_ci}
168062306a36Sopenharmony_ci
168162306a36Sopenharmony_cistatic bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
168262306a36Sopenharmony_ci				      u64 logical, u64 len)
168362306a36Sopenharmony_ci{
168462306a36Sopenharmony_ci	struct btrfs_ordered_extent *new;
168562306a36Sopenharmony_ci
168662306a36Sopenharmony_ci	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
168762306a36Sopenharmony_ci	    split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
168862306a36Sopenharmony_ci			     ordered->num_bytes, len, logical))
168962306a36Sopenharmony_ci		return false;
169062306a36Sopenharmony_ci
169162306a36Sopenharmony_ci	new = btrfs_split_ordered_extent(ordered, len);
169262306a36Sopenharmony_ci	if (IS_ERR(new))
169362306a36Sopenharmony_ci		return false;
169462306a36Sopenharmony_ci	new->disk_bytenr = logical;
169562306a36Sopenharmony_ci	btrfs_finish_one_ordered(new);
169662306a36Sopenharmony_ci	return true;
169762306a36Sopenharmony_ci}
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_civoid btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
170062306a36Sopenharmony_ci{
170162306a36Sopenharmony_ci	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
170262306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = inode->root->fs_info;
170362306a36Sopenharmony_ci	struct btrfs_ordered_sum *sum;
170462306a36Sopenharmony_ci	u64 logical, len;
170562306a36Sopenharmony_ci
170662306a36Sopenharmony_ci	/*
170762306a36Sopenharmony_ci	 * Write to pre-allocated region is for the data relocation, and so
170862306a36Sopenharmony_ci	 * it should use WRITE operation. No split/rewrite are necessary.
170962306a36Sopenharmony_ci	 */
171062306a36Sopenharmony_ci	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
171162306a36Sopenharmony_ci		return;
171262306a36Sopenharmony_ci
171362306a36Sopenharmony_ci	ASSERT(!list_empty(&ordered->list));
171462306a36Sopenharmony_ci	/* The ordered->list can be empty in the above pre-alloc case. */
171562306a36Sopenharmony_ci	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
171662306a36Sopenharmony_ci	logical = sum->logical;
171762306a36Sopenharmony_ci	len = sum->len;
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_ci	while (len < ordered->disk_num_bytes) {
172062306a36Sopenharmony_ci		sum = list_next_entry(sum, list);
172162306a36Sopenharmony_ci		if (sum->logical == logical + len) {
172262306a36Sopenharmony_ci			len += sum->len;
172362306a36Sopenharmony_ci			continue;
172462306a36Sopenharmony_ci		}
172562306a36Sopenharmony_ci		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
172662306a36Sopenharmony_ci			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
172762306a36Sopenharmony_ci			btrfs_err(fs_info, "failed to split ordered extent");
172862306a36Sopenharmony_ci			goto out;
172962306a36Sopenharmony_ci		}
173062306a36Sopenharmony_ci		logical = sum->logical;
173162306a36Sopenharmony_ci		len = sum->len;
173262306a36Sopenharmony_ci	}
173362306a36Sopenharmony_ci
173462306a36Sopenharmony_ci	if (ordered->disk_bytenr != logical)
173562306a36Sopenharmony_ci		btrfs_rewrite_logical_zoned(ordered, logical);
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ciout:
173862306a36Sopenharmony_ci	/*
173962306a36Sopenharmony_ci	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
174062306a36Sopenharmony_ci	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
174162306a36Sopenharmony_ci	 * addresses and don't contain actual checksums.  We thus must free them
174262306a36Sopenharmony_ci	 * here so that we don't attempt to log the csums later.
174362306a36Sopenharmony_ci	 */
174462306a36Sopenharmony_ci	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
174562306a36Sopenharmony_ci	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
174662306a36Sopenharmony_ci		while ((sum = list_first_entry_or_null(&ordered->list,
174762306a36Sopenharmony_ci						       typeof(*sum), list))) {
174862306a36Sopenharmony_ci			list_del(&sum->list);
174962306a36Sopenharmony_ci			kfree(sum);
175062306a36Sopenharmony_ci		}
175162306a36Sopenharmony_ci	}
175262306a36Sopenharmony_ci}
175362306a36Sopenharmony_ci
175462306a36Sopenharmony_cistatic bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
175562306a36Sopenharmony_ci			       struct btrfs_block_group **active_bg)
175662306a36Sopenharmony_ci{
175762306a36Sopenharmony_ci	const struct writeback_control *wbc = ctx->wbc;
175862306a36Sopenharmony_ci	struct btrfs_block_group *block_group = ctx->zoned_bg;
175962306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = block_group->fs_info;
176062306a36Sopenharmony_ci
176162306a36Sopenharmony_ci	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
176262306a36Sopenharmony_ci		return true;
176362306a36Sopenharmony_ci
176462306a36Sopenharmony_ci	if (fs_info->treelog_bg == block_group->start) {
176562306a36Sopenharmony_ci		if (!btrfs_zone_activate(block_group)) {
176662306a36Sopenharmony_ci			int ret_fin = btrfs_zone_finish_one_bg(fs_info);
176762306a36Sopenharmony_ci
176862306a36Sopenharmony_ci			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
176962306a36Sopenharmony_ci				return false;
177062306a36Sopenharmony_ci		}
177162306a36Sopenharmony_ci	} else if (*active_bg != block_group) {
177262306a36Sopenharmony_ci		struct btrfs_block_group *tgt = *active_bg;
177362306a36Sopenharmony_ci
177462306a36Sopenharmony_ci		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
177562306a36Sopenharmony_ci		lockdep_assert_held(&fs_info->zoned_meta_io_lock);
177662306a36Sopenharmony_ci
177762306a36Sopenharmony_ci		if (tgt) {
177862306a36Sopenharmony_ci			/*
177962306a36Sopenharmony_ci			 * If there is an unsent IO left in the allocated area,
178062306a36Sopenharmony_ci			 * we cannot wait for them as it may cause a deadlock.
178162306a36Sopenharmony_ci			 */
178262306a36Sopenharmony_ci			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
178362306a36Sopenharmony_ci				if (wbc->sync_mode == WB_SYNC_NONE ||
178462306a36Sopenharmony_ci				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
178562306a36Sopenharmony_ci					return false;
178662306a36Sopenharmony_ci			}
178762306a36Sopenharmony_ci
178862306a36Sopenharmony_ci			/* Pivot active metadata/system block group. */
178962306a36Sopenharmony_ci			btrfs_zoned_meta_io_unlock(fs_info);
179062306a36Sopenharmony_ci			wait_eb_writebacks(tgt);
179162306a36Sopenharmony_ci			do_zone_finish(tgt, true);
179262306a36Sopenharmony_ci			btrfs_zoned_meta_io_lock(fs_info);
179362306a36Sopenharmony_ci			if (*active_bg == tgt) {
179462306a36Sopenharmony_ci				btrfs_put_block_group(tgt);
179562306a36Sopenharmony_ci				*active_bg = NULL;
179662306a36Sopenharmony_ci			}
179762306a36Sopenharmony_ci		}
179862306a36Sopenharmony_ci		if (!btrfs_zone_activate(block_group))
179962306a36Sopenharmony_ci			return false;
180062306a36Sopenharmony_ci		if (*active_bg != block_group) {
180162306a36Sopenharmony_ci			ASSERT(*active_bg == NULL);
180262306a36Sopenharmony_ci			*active_bg = block_group;
180362306a36Sopenharmony_ci			btrfs_get_block_group(block_group);
180462306a36Sopenharmony_ci		}
180562306a36Sopenharmony_ci	}
180662306a36Sopenharmony_ci
180762306a36Sopenharmony_ci	return true;
180862306a36Sopenharmony_ci}
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci/*
181162306a36Sopenharmony_ci * Check if @ctx->eb is aligned to the write pointer.
181262306a36Sopenharmony_ci *
181362306a36Sopenharmony_ci * Return:
181462306a36Sopenharmony_ci *   0:        @ctx->eb is at the write pointer. You can write it.
181562306a36Sopenharmony_ci *   -EAGAIN:  There is a hole. The caller should handle the case.
181662306a36Sopenharmony_ci *   -EBUSY:   There is a hole, but the caller can just bail out.
181762306a36Sopenharmony_ci */
181862306a36Sopenharmony_ciint btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
181962306a36Sopenharmony_ci				   struct btrfs_eb_write_context *ctx)
182062306a36Sopenharmony_ci{
182162306a36Sopenharmony_ci	const struct writeback_control *wbc = ctx->wbc;
182262306a36Sopenharmony_ci	const struct extent_buffer *eb = ctx->eb;
182362306a36Sopenharmony_ci	struct btrfs_block_group *block_group = ctx->zoned_bg;
182462306a36Sopenharmony_ci
182562306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
182662306a36Sopenharmony_ci		return 0;
182762306a36Sopenharmony_ci
182862306a36Sopenharmony_ci	if (block_group) {
182962306a36Sopenharmony_ci		if (block_group->start > eb->start ||
183062306a36Sopenharmony_ci		    block_group->start + block_group->length <= eb->start) {
183162306a36Sopenharmony_ci			btrfs_put_block_group(block_group);
183262306a36Sopenharmony_ci			block_group = NULL;
183362306a36Sopenharmony_ci			ctx->zoned_bg = NULL;
183462306a36Sopenharmony_ci		}
183562306a36Sopenharmony_ci	}
183662306a36Sopenharmony_ci
183762306a36Sopenharmony_ci	if (!block_group) {
183862306a36Sopenharmony_ci		block_group = btrfs_lookup_block_group(fs_info, eb->start);
183962306a36Sopenharmony_ci		if (!block_group)
184062306a36Sopenharmony_ci			return 0;
184162306a36Sopenharmony_ci		ctx->zoned_bg = block_group;
184262306a36Sopenharmony_ci	}
184362306a36Sopenharmony_ci
184462306a36Sopenharmony_ci	if (block_group->meta_write_pointer == eb->start) {
184562306a36Sopenharmony_ci		struct btrfs_block_group **tgt;
184662306a36Sopenharmony_ci
184762306a36Sopenharmony_ci		if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
184862306a36Sopenharmony_ci			return 0;
184962306a36Sopenharmony_ci
185062306a36Sopenharmony_ci		if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
185162306a36Sopenharmony_ci			tgt = &fs_info->active_system_bg;
185262306a36Sopenharmony_ci		else
185362306a36Sopenharmony_ci			tgt = &fs_info->active_meta_bg;
185462306a36Sopenharmony_ci		if (check_bg_is_active(ctx, tgt))
185562306a36Sopenharmony_ci			return 0;
185662306a36Sopenharmony_ci	}
185762306a36Sopenharmony_ci
185862306a36Sopenharmony_ci	/*
185962306a36Sopenharmony_ci	 * Since we may release fs_info->zoned_meta_io_lock, someone can already
186062306a36Sopenharmony_ci	 * start writing this eb. In that case, we can just bail out.
186162306a36Sopenharmony_ci	 */
186262306a36Sopenharmony_ci	if (block_group->meta_write_pointer > eb->start)
186362306a36Sopenharmony_ci		return -EBUSY;
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ci	/* If for_sync, this hole will be filled with trasnsaction commit. */
186662306a36Sopenharmony_ci	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
186762306a36Sopenharmony_ci		return -EAGAIN;
186862306a36Sopenharmony_ci	return -EBUSY;
186962306a36Sopenharmony_ci}
187062306a36Sopenharmony_ci
187162306a36Sopenharmony_ciint btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
187262306a36Sopenharmony_ci{
187362306a36Sopenharmony_ci	if (!btrfs_dev_is_sequential(device, physical))
187462306a36Sopenharmony_ci		return -EOPNOTSUPP;
187562306a36Sopenharmony_ci
187662306a36Sopenharmony_ci	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
187762306a36Sopenharmony_ci				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
187862306a36Sopenharmony_ci}
187962306a36Sopenharmony_ci
188062306a36Sopenharmony_cistatic int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
188162306a36Sopenharmony_ci			  struct blk_zone *zone)
188262306a36Sopenharmony_ci{
188362306a36Sopenharmony_ci	struct btrfs_io_context *bioc = NULL;
188462306a36Sopenharmony_ci	u64 mapped_length = PAGE_SIZE;
188562306a36Sopenharmony_ci	unsigned int nofs_flag;
188662306a36Sopenharmony_ci	int nmirrors;
188762306a36Sopenharmony_ci	int i, ret;
188862306a36Sopenharmony_ci
188962306a36Sopenharmony_ci	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
189062306a36Sopenharmony_ci			      &mapped_length, &bioc, NULL, NULL, 1);
189162306a36Sopenharmony_ci	if (ret || !bioc || mapped_length < PAGE_SIZE) {
189262306a36Sopenharmony_ci		ret = -EIO;
189362306a36Sopenharmony_ci		goto out_put_bioc;
189462306a36Sopenharmony_ci	}
189562306a36Sopenharmony_ci
189662306a36Sopenharmony_ci	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
189762306a36Sopenharmony_ci		ret = -EINVAL;
189862306a36Sopenharmony_ci		goto out_put_bioc;
189962306a36Sopenharmony_ci	}
190062306a36Sopenharmony_ci
190162306a36Sopenharmony_ci	nofs_flag = memalloc_nofs_save();
190262306a36Sopenharmony_ci	nmirrors = (int)bioc->num_stripes;
190362306a36Sopenharmony_ci	for (i = 0; i < nmirrors; i++) {
190462306a36Sopenharmony_ci		u64 physical = bioc->stripes[i].physical;
190562306a36Sopenharmony_ci		struct btrfs_device *dev = bioc->stripes[i].dev;
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_ci		/* Missing device */
190862306a36Sopenharmony_ci		if (!dev->bdev)
190962306a36Sopenharmony_ci			continue;
191062306a36Sopenharmony_ci
191162306a36Sopenharmony_ci		ret = btrfs_get_dev_zone(dev, physical, zone);
191262306a36Sopenharmony_ci		/* Failing device */
191362306a36Sopenharmony_ci		if (ret == -EIO || ret == -EOPNOTSUPP)
191462306a36Sopenharmony_ci			continue;
191562306a36Sopenharmony_ci		break;
191662306a36Sopenharmony_ci	}
191762306a36Sopenharmony_ci	memalloc_nofs_restore(nofs_flag);
191862306a36Sopenharmony_ciout_put_bioc:
191962306a36Sopenharmony_ci	btrfs_put_bioc(bioc);
192062306a36Sopenharmony_ci	return ret;
192162306a36Sopenharmony_ci}
192262306a36Sopenharmony_ci
192362306a36Sopenharmony_ci/*
192462306a36Sopenharmony_ci * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
192562306a36Sopenharmony_ci * filling zeros between @physical_pos to a write pointer of dev-replace
192662306a36Sopenharmony_ci * source device.
192762306a36Sopenharmony_ci */
192862306a36Sopenharmony_ciint btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
192962306a36Sopenharmony_ci				    u64 physical_start, u64 physical_pos)
193062306a36Sopenharmony_ci{
193162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
193262306a36Sopenharmony_ci	struct blk_zone zone;
193362306a36Sopenharmony_ci	u64 length;
193462306a36Sopenharmony_ci	u64 wp;
193562306a36Sopenharmony_ci	int ret;
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_ci	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
193862306a36Sopenharmony_ci		return 0;
193962306a36Sopenharmony_ci
194062306a36Sopenharmony_ci	ret = read_zone_info(fs_info, logical, &zone);
194162306a36Sopenharmony_ci	if (ret)
194262306a36Sopenharmony_ci		return ret;
194362306a36Sopenharmony_ci
194462306a36Sopenharmony_ci	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
194562306a36Sopenharmony_ci
194662306a36Sopenharmony_ci	if (physical_pos == wp)
194762306a36Sopenharmony_ci		return 0;
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci	if (physical_pos > wp)
195062306a36Sopenharmony_ci		return -EUCLEAN;
195162306a36Sopenharmony_ci
195262306a36Sopenharmony_ci	length = wp - physical_pos;
195362306a36Sopenharmony_ci	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
195462306a36Sopenharmony_ci}
195562306a36Sopenharmony_ci
195662306a36Sopenharmony_ci/*
195762306a36Sopenharmony_ci * Activate block group and underlying device zones
195862306a36Sopenharmony_ci *
195962306a36Sopenharmony_ci * @block_group: the block group to activate
196062306a36Sopenharmony_ci *
196162306a36Sopenharmony_ci * Return: true on success, false otherwise
196262306a36Sopenharmony_ci */
196362306a36Sopenharmony_cibool btrfs_zone_activate(struct btrfs_block_group *block_group)
196462306a36Sopenharmony_ci{
196562306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = block_group->fs_info;
196662306a36Sopenharmony_ci	struct map_lookup *map;
196762306a36Sopenharmony_ci	struct btrfs_device *device;
196862306a36Sopenharmony_ci	u64 physical;
196962306a36Sopenharmony_ci	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
197062306a36Sopenharmony_ci	bool ret;
197162306a36Sopenharmony_ci	int i;
197262306a36Sopenharmony_ci
197362306a36Sopenharmony_ci	if (!btrfs_is_zoned(block_group->fs_info))
197462306a36Sopenharmony_ci		return true;
197562306a36Sopenharmony_ci
197662306a36Sopenharmony_ci	map = block_group->physical_map;
197762306a36Sopenharmony_ci
197862306a36Sopenharmony_ci	spin_lock(&fs_info->zone_active_bgs_lock);
197962306a36Sopenharmony_ci	spin_lock(&block_group->lock);
198062306a36Sopenharmony_ci	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
198162306a36Sopenharmony_ci		ret = true;
198262306a36Sopenharmony_ci		goto out_unlock;
198362306a36Sopenharmony_ci	}
198462306a36Sopenharmony_ci
198562306a36Sopenharmony_ci	/* No space left */
198662306a36Sopenharmony_ci	if (btrfs_zoned_bg_is_full(block_group)) {
198762306a36Sopenharmony_ci		ret = false;
198862306a36Sopenharmony_ci		goto out_unlock;
198962306a36Sopenharmony_ci	}
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
199262306a36Sopenharmony_ci		struct btrfs_zoned_device_info *zinfo;
199362306a36Sopenharmony_ci		int reserved = 0;
199462306a36Sopenharmony_ci
199562306a36Sopenharmony_ci		device = map->stripes[i].dev;
199662306a36Sopenharmony_ci		physical = map->stripes[i].physical;
199762306a36Sopenharmony_ci		zinfo = device->zone_info;
199862306a36Sopenharmony_ci
199962306a36Sopenharmony_ci		if (zinfo->max_active_zones == 0)
200062306a36Sopenharmony_ci			continue;
200162306a36Sopenharmony_ci
200262306a36Sopenharmony_ci		if (is_data)
200362306a36Sopenharmony_ci			reserved = zinfo->reserved_active_zones;
200462306a36Sopenharmony_ci		/*
200562306a36Sopenharmony_ci		 * For the data block group, leave active zones for one
200662306a36Sopenharmony_ci		 * metadata block group and one system block group.
200762306a36Sopenharmony_ci		 */
200862306a36Sopenharmony_ci		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
200962306a36Sopenharmony_ci			ret = false;
201062306a36Sopenharmony_ci			goto out_unlock;
201162306a36Sopenharmony_ci		}
201262306a36Sopenharmony_ci
201362306a36Sopenharmony_ci		if (!btrfs_dev_set_active_zone(device, physical)) {
201462306a36Sopenharmony_ci			/* Cannot activate the zone */
201562306a36Sopenharmony_ci			ret = false;
201662306a36Sopenharmony_ci			goto out_unlock;
201762306a36Sopenharmony_ci		}
201862306a36Sopenharmony_ci		if (!is_data)
201962306a36Sopenharmony_ci			zinfo->reserved_active_zones--;
202062306a36Sopenharmony_ci	}
202162306a36Sopenharmony_ci
202262306a36Sopenharmony_ci	/* Successfully activated all the zones */
202362306a36Sopenharmony_ci	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
202462306a36Sopenharmony_ci	spin_unlock(&block_group->lock);
202562306a36Sopenharmony_ci
202662306a36Sopenharmony_ci	/* For the active block group list */
202762306a36Sopenharmony_ci	btrfs_get_block_group(block_group);
202862306a36Sopenharmony_ci	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
202962306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
203062306a36Sopenharmony_ci
203162306a36Sopenharmony_ci	return true;
203262306a36Sopenharmony_ci
203362306a36Sopenharmony_ciout_unlock:
203462306a36Sopenharmony_ci	spin_unlock(&block_group->lock);
203562306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
203662306a36Sopenharmony_ci	return ret;
203762306a36Sopenharmony_ci}
203862306a36Sopenharmony_ci
203962306a36Sopenharmony_cistatic void wait_eb_writebacks(struct btrfs_block_group *block_group)
204062306a36Sopenharmony_ci{
204162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = block_group->fs_info;
204262306a36Sopenharmony_ci	const u64 end = block_group->start + block_group->length;
204362306a36Sopenharmony_ci	struct radix_tree_iter iter;
204462306a36Sopenharmony_ci	struct extent_buffer *eb;
204562306a36Sopenharmony_ci	void __rcu **slot;
204662306a36Sopenharmony_ci
204762306a36Sopenharmony_ci	rcu_read_lock();
204862306a36Sopenharmony_ci	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
204962306a36Sopenharmony_ci				 block_group->start >> fs_info->sectorsize_bits) {
205062306a36Sopenharmony_ci		eb = radix_tree_deref_slot(slot);
205162306a36Sopenharmony_ci		if (!eb)
205262306a36Sopenharmony_ci			continue;
205362306a36Sopenharmony_ci		if (radix_tree_deref_retry(eb)) {
205462306a36Sopenharmony_ci			slot = radix_tree_iter_retry(&iter);
205562306a36Sopenharmony_ci			continue;
205662306a36Sopenharmony_ci		}
205762306a36Sopenharmony_ci
205862306a36Sopenharmony_ci		if (eb->start < block_group->start)
205962306a36Sopenharmony_ci			continue;
206062306a36Sopenharmony_ci		if (eb->start >= end)
206162306a36Sopenharmony_ci			break;
206262306a36Sopenharmony_ci
206362306a36Sopenharmony_ci		slot = radix_tree_iter_resume(slot, &iter);
206462306a36Sopenharmony_ci		rcu_read_unlock();
206562306a36Sopenharmony_ci		wait_on_extent_buffer_writeback(eb);
206662306a36Sopenharmony_ci		rcu_read_lock();
206762306a36Sopenharmony_ci	}
206862306a36Sopenharmony_ci	rcu_read_unlock();
206962306a36Sopenharmony_ci}
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_cistatic int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
207262306a36Sopenharmony_ci{
207362306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = block_group->fs_info;
207462306a36Sopenharmony_ci	struct map_lookup *map;
207562306a36Sopenharmony_ci	const bool is_metadata = (block_group->flags &
207662306a36Sopenharmony_ci			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
207762306a36Sopenharmony_ci	int ret = 0;
207862306a36Sopenharmony_ci	int i;
207962306a36Sopenharmony_ci
208062306a36Sopenharmony_ci	spin_lock(&block_group->lock);
208162306a36Sopenharmony_ci	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
208262306a36Sopenharmony_ci		spin_unlock(&block_group->lock);
208362306a36Sopenharmony_ci		return 0;
208462306a36Sopenharmony_ci	}
208562306a36Sopenharmony_ci
208662306a36Sopenharmony_ci	/* Check if we have unwritten allocated space */
208762306a36Sopenharmony_ci	if (is_metadata &&
208862306a36Sopenharmony_ci	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
208962306a36Sopenharmony_ci		spin_unlock(&block_group->lock);
209062306a36Sopenharmony_ci		return -EAGAIN;
209162306a36Sopenharmony_ci	}
209262306a36Sopenharmony_ci
209362306a36Sopenharmony_ci	/*
209462306a36Sopenharmony_ci	 * If we are sure that the block group is full (= no more room left for
209562306a36Sopenharmony_ci	 * new allocation) and the IO for the last usable block is completed, we
209662306a36Sopenharmony_ci	 * don't need to wait for the other IOs. This holds because we ensure
209762306a36Sopenharmony_ci	 * the sequential IO submissions using the ZONE_APPEND command for data
209862306a36Sopenharmony_ci	 * and block_group->meta_write_pointer for metadata.
209962306a36Sopenharmony_ci	 */
210062306a36Sopenharmony_ci	if (!fully_written) {
210162306a36Sopenharmony_ci		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
210262306a36Sopenharmony_ci			spin_unlock(&block_group->lock);
210362306a36Sopenharmony_ci			return -EAGAIN;
210462306a36Sopenharmony_ci		}
210562306a36Sopenharmony_ci		spin_unlock(&block_group->lock);
210662306a36Sopenharmony_ci
210762306a36Sopenharmony_ci		ret = btrfs_inc_block_group_ro(block_group, false);
210862306a36Sopenharmony_ci		if (ret)
210962306a36Sopenharmony_ci			return ret;
211062306a36Sopenharmony_ci
211162306a36Sopenharmony_ci		/* Ensure all writes in this block group finish */
211262306a36Sopenharmony_ci		btrfs_wait_block_group_reservations(block_group);
211362306a36Sopenharmony_ci		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
211462306a36Sopenharmony_ci		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
211562306a36Sopenharmony_ci					 block_group->length);
211662306a36Sopenharmony_ci		/* Wait for extent buffers to be written. */
211762306a36Sopenharmony_ci		if (is_metadata)
211862306a36Sopenharmony_ci			wait_eb_writebacks(block_group);
211962306a36Sopenharmony_ci
212062306a36Sopenharmony_ci		spin_lock(&block_group->lock);
212162306a36Sopenharmony_ci
212262306a36Sopenharmony_ci		/*
212362306a36Sopenharmony_ci		 * Bail out if someone already deactivated the block group, or
212462306a36Sopenharmony_ci		 * allocated space is left in the block group.
212562306a36Sopenharmony_ci		 */
212662306a36Sopenharmony_ci		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
212762306a36Sopenharmony_ci			      &block_group->runtime_flags)) {
212862306a36Sopenharmony_ci			spin_unlock(&block_group->lock);
212962306a36Sopenharmony_ci			btrfs_dec_block_group_ro(block_group);
213062306a36Sopenharmony_ci			return 0;
213162306a36Sopenharmony_ci		}
213262306a36Sopenharmony_ci
213362306a36Sopenharmony_ci		if (block_group->reserved ||
213462306a36Sopenharmony_ci		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
213562306a36Sopenharmony_ci			     &block_group->runtime_flags)) {
213662306a36Sopenharmony_ci			spin_unlock(&block_group->lock);
213762306a36Sopenharmony_ci			btrfs_dec_block_group_ro(block_group);
213862306a36Sopenharmony_ci			return -EAGAIN;
213962306a36Sopenharmony_ci		}
214062306a36Sopenharmony_ci	}
214162306a36Sopenharmony_ci
214262306a36Sopenharmony_ci	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
214362306a36Sopenharmony_ci	block_group->alloc_offset = block_group->zone_capacity;
214462306a36Sopenharmony_ci	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
214562306a36Sopenharmony_ci		block_group->meta_write_pointer = block_group->start +
214662306a36Sopenharmony_ci						  block_group->zone_capacity;
214762306a36Sopenharmony_ci	block_group->free_space_ctl->free_space = 0;
214862306a36Sopenharmony_ci	btrfs_clear_treelog_bg(block_group);
214962306a36Sopenharmony_ci	btrfs_clear_data_reloc_bg(block_group);
215062306a36Sopenharmony_ci	spin_unlock(&block_group->lock);
215162306a36Sopenharmony_ci
215262306a36Sopenharmony_ci	map = block_group->physical_map;
215362306a36Sopenharmony_ci	for (i = 0; i < map->num_stripes; i++) {
215462306a36Sopenharmony_ci		struct btrfs_device *device = map->stripes[i].dev;
215562306a36Sopenharmony_ci		const u64 physical = map->stripes[i].physical;
215662306a36Sopenharmony_ci		struct btrfs_zoned_device_info *zinfo = device->zone_info;
215762306a36Sopenharmony_ci
215862306a36Sopenharmony_ci		if (zinfo->max_active_zones == 0)
215962306a36Sopenharmony_ci			continue;
216062306a36Sopenharmony_ci
216162306a36Sopenharmony_ci		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
216262306a36Sopenharmony_ci				       physical >> SECTOR_SHIFT,
216362306a36Sopenharmony_ci				       zinfo->zone_size >> SECTOR_SHIFT,
216462306a36Sopenharmony_ci				       GFP_NOFS);
216562306a36Sopenharmony_ci
216662306a36Sopenharmony_ci		if (ret)
216762306a36Sopenharmony_ci			return ret;
216862306a36Sopenharmony_ci
216962306a36Sopenharmony_ci		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
217062306a36Sopenharmony_ci			zinfo->reserved_active_zones++;
217162306a36Sopenharmony_ci		btrfs_dev_clear_active_zone(device, physical);
217262306a36Sopenharmony_ci	}
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci	if (!fully_written)
217562306a36Sopenharmony_ci		btrfs_dec_block_group_ro(block_group);
217662306a36Sopenharmony_ci
217762306a36Sopenharmony_ci	spin_lock(&fs_info->zone_active_bgs_lock);
217862306a36Sopenharmony_ci	ASSERT(!list_empty(&block_group->active_bg_list));
217962306a36Sopenharmony_ci	list_del_init(&block_group->active_bg_list);
218062306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
218162306a36Sopenharmony_ci
218262306a36Sopenharmony_ci	/* For active_bg_list */
218362306a36Sopenharmony_ci	btrfs_put_block_group(block_group);
218462306a36Sopenharmony_ci
218562306a36Sopenharmony_ci	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
218662306a36Sopenharmony_ci
218762306a36Sopenharmony_ci	return 0;
218862306a36Sopenharmony_ci}
218962306a36Sopenharmony_ci
219062306a36Sopenharmony_ciint btrfs_zone_finish(struct btrfs_block_group *block_group)
219162306a36Sopenharmony_ci{
219262306a36Sopenharmony_ci	if (!btrfs_is_zoned(block_group->fs_info))
219362306a36Sopenharmony_ci		return 0;
219462306a36Sopenharmony_ci
219562306a36Sopenharmony_ci	return do_zone_finish(block_group, false);
219662306a36Sopenharmony_ci}
219762306a36Sopenharmony_ci
219862306a36Sopenharmony_cibool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
219962306a36Sopenharmony_ci{
220062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
220162306a36Sopenharmony_ci	struct btrfs_device *device;
220262306a36Sopenharmony_ci	bool ret = false;
220362306a36Sopenharmony_ci
220462306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
220562306a36Sopenharmony_ci		return true;
220662306a36Sopenharmony_ci
220762306a36Sopenharmony_ci	/* Check if there is a device with active zones left */
220862306a36Sopenharmony_ci	mutex_lock(&fs_info->chunk_mutex);
220962306a36Sopenharmony_ci	spin_lock(&fs_info->zone_active_bgs_lock);
221062306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
221162306a36Sopenharmony_ci		struct btrfs_zoned_device_info *zinfo = device->zone_info;
221262306a36Sopenharmony_ci		int reserved = 0;
221362306a36Sopenharmony_ci
221462306a36Sopenharmony_ci		if (!device->bdev)
221562306a36Sopenharmony_ci			continue;
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_ci		if (!zinfo->max_active_zones) {
221862306a36Sopenharmony_ci			ret = true;
221962306a36Sopenharmony_ci			break;
222062306a36Sopenharmony_ci		}
222162306a36Sopenharmony_ci
222262306a36Sopenharmony_ci		if (flags & BTRFS_BLOCK_GROUP_DATA)
222362306a36Sopenharmony_ci			reserved = zinfo->reserved_active_zones;
222462306a36Sopenharmony_ci
222562306a36Sopenharmony_ci		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
222662306a36Sopenharmony_ci		case 0: /* single */
222762306a36Sopenharmony_ci			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
222862306a36Sopenharmony_ci			break;
222962306a36Sopenharmony_ci		case BTRFS_BLOCK_GROUP_DUP:
223062306a36Sopenharmony_ci			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
223162306a36Sopenharmony_ci			break;
223262306a36Sopenharmony_ci		}
223362306a36Sopenharmony_ci		if (ret)
223462306a36Sopenharmony_ci			break;
223562306a36Sopenharmony_ci	}
223662306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
223762306a36Sopenharmony_ci	mutex_unlock(&fs_info->chunk_mutex);
223862306a36Sopenharmony_ci
223962306a36Sopenharmony_ci	if (!ret)
224062306a36Sopenharmony_ci		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
224162306a36Sopenharmony_ci
224262306a36Sopenharmony_ci	return ret;
224362306a36Sopenharmony_ci}
224462306a36Sopenharmony_ci
224562306a36Sopenharmony_civoid btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
224662306a36Sopenharmony_ci{
224762306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
224862306a36Sopenharmony_ci	u64 min_alloc_bytes;
224962306a36Sopenharmony_ci
225062306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
225162306a36Sopenharmony_ci		return;
225262306a36Sopenharmony_ci
225362306a36Sopenharmony_ci	block_group = btrfs_lookup_block_group(fs_info, logical);
225462306a36Sopenharmony_ci	ASSERT(block_group);
225562306a36Sopenharmony_ci
225662306a36Sopenharmony_ci	/* No MIXED_BG on zoned btrfs. */
225762306a36Sopenharmony_ci	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
225862306a36Sopenharmony_ci		min_alloc_bytes = fs_info->sectorsize;
225962306a36Sopenharmony_ci	else
226062306a36Sopenharmony_ci		min_alloc_bytes = fs_info->nodesize;
226162306a36Sopenharmony_ci
226262306a36Sopenharmony_ci	/* Bail out if we can allocate more data from this block group. */
226362306a36Sopenharmony_ci	if (logical + length + min_alloc_bytes <=
226462306a36Sopenharmony_ci	    block_group->start + block_group->zone_capacity)
226562306a36Sopenharmony_ci		goto out;
226662306a36Sopenharmony_ci
226762306a36Sopenharmony_ci	do_zone_finish(block_group, true);
226862306a36Sopenharmony_ci
226962306a36Sopenharmony_ciout:
227062306a36Sopenharmony_ci	btrfs_put_block_group(block_group);
227162306a36Sopenharmony_ci}
227262306a36Sopenharmony_ci
227362306a36Sopenharmony_cistatic void btrfs_zone_finish_endio_workfn(struct work_struct *work)
227462306a36Sopenharmony_ci{
227562306a36Sopenharmony_ci	struct btrfs_block_group *bg =
227662306a36Sopenharmony_ci		container_of(work, struct btrfs_block_group, zone_finish_work);
227762306a36Sopenharmony_ci
227862306a36Sopenharmony_ci	wait_on_extent_buffer_writeback(bg->last_eb);
227962306a36Sopenharmony_ci	free_extent_buffer(bg->last_eb);
228062306a36Sopenharmony_ci	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
228162306a36Sopenharmony_ci	btrfs_put_block_group(bg);
228262306a36Sopenharmony_ci}
228362306a36Sopenharmony_ci
228462306a36Sopenharmony_civoid btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
228562306a36Sopenharmony_ci				   struct extent_buffer *eb)
228662306a36Sopenharmony_ci{
228762306a36Sopenharmony_ci	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
228862306a36Sopenharmony_ci	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
228962306a36Sopenharmony_ci		return;
229062306a36Sopenharmony_ci
229162306a36Sopenharmony_ci	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
229262306a36Sopenharmony_ci		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
229362306a36Sopenharmony_ci			  bg->start);
229462306a36Sopenharmony_ci		return;
229562306a36Sopenharmony_ci	}
229662306a36Sopenharmony_ci
229762306a36Sopenharmony_ci	/* For the work */
229862306a36Sopenharmony_ci	btrfs_get_block_group(bg);
229962306a36Sopenharmony_ci	atomic_inc(&eb->refs);
230062306a36Sopenharmony_ci	bg->last_eb = eb;
230162306a36Sopenharmony_ci	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
230262306a36Sopenharmony_ci	queue_work(system_unbound_wq, &bg->zone_finish_work);
230362306a36Sopenharmony_ci}
230462306a36Sopenharmony_ci
230562306a36Sopenharmony_civoid btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
230662306a36Sopenharmony_ci{
230762306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = bg->fs_info;
230862306a36Sopenharmony_ci
230962306a36Sopenharmony_ci	spin_lock(&fs_info->relocation_bg_lock);
231062306a36Sopenharmony_ci	if (fs_info->data_reloc_bg == bg->start)
231162306a36Sopenharmony_ci		fs_info->data_reloc_bg = 0;
231262306a36Sopenharmony_ci	spin_unlock(&fs_info->relocation_bg_lock);
231362306a36Sopenharmony_ci}
231462306a36Sopenharmony_ci
231562306a36Sopenharmony_civoid btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
231662306a36Sopenharmony_ci{
231762306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
231862306a36Sopenharmony_ci	struct btrfs_device *device;
231962306a36Sopenharmony_ci
232062306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
232162306a36Sopenharmony_ci		return;
232262306a36Sopenharmony_ci
232362306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
232462306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
232562306a36Sopenharmony_ci		if (device->zone_info) {
232662306a36Sopenharmony_ci			vfree(device->zone_info->zone_cache);
232762306a36Sopenharmony_ci			device->zone_info->zone_cache = NULL;
232862306a36Sopenharmony_ci		}
232962306a36Sopenharmony_ci	}
233062306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
233162306a36Sopenharmony_ci}
233262306a36Sopenharmony_ci
233362306a36Sopenharmony_cibool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
233462306a36Sopenharmony_ci{
233562306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
233662306a36Sopenharmony_ci	struct btrfs_device *device;
233762306a36Sopenharmony_ci	u64 used = 0;
233862306a36Sopenharmony_ci	u64 total = 0;
233962306a36Sopenharmony_ci	u64 factor;
234062306a36Sopenharmony_ci
234162306a36Sopenharmony_ci	ASSERT(btrfs_is_zoned(fs_info));
234262306a36Sopenharmony_ci
234362306a36Sopenharmony_ci	if (fs_info->bg_reclaim_threshold == 0)
234462306a36Sopenharmony_ci		return false;
234562306a36Sopenharmony_ci
234662306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
234762306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
234862306a36Sopenharmony_ci		if (!device->bdev)
234962306a36Sopenharmony_ci			continue;
235062306a36Sopenharmony_ci
235162306a36Sopenharmony_ci		total += device->disk_total_bytes;
235262306a36Sopenharmony_ci		used += device->bytes_used;
235362306a36Sopenharmony_ci	}
235462306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
235562306a36Sopenharmony_ci
235662306a36Sopenharmony_ci	factor = div64_u64(used * 100, total);
235762306a36Sopenharmony_ci	return factor >= fs_info->bg_reclaim_threshold;
235862306a36Sopenharmony_ci}
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_civoid btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
236162306a36Sopenharmony_ci				       u64 length)
236262306a36Sopenharmony_ci{
236362306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
236462306a36Sopenharmony_ci
236562306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info))
236662306a36Sopenharmony_ci		return;
236762306a36Sopenharmony_ci
236862306a36Sopenharmony_ci	block_group = btrfs_lookup_block_group(fs_info, logical);
236962306a36Sopenharmony_ci	/* It should be called on a previous data relocation block group. */
237062306a36Sopenharmony_ci	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
237162306a36Sopenharmony_ci
237262306a36Sopenharmony_ci	spin_lock(&block_group->lock);
237362306a36Sopenharmony_ci	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
237462306a36Sopenharmony_ci		goto out;
237562306a36Sopenharmony_ci
237662306a36Sopenharmony_ci	/* All relocation extents are written. */
237762306a36Sopenharmony_ci	if (block_group->start + block_group->alloc_offset == logical + length) {
237862306a36Sopenharmony_ci		/*
237962306a36Sopenharmony_ci		 * Now, release this block group for further allocations and
238062306a36Sopenharmony_ci		 * zone finish.
238162306a36Sopenharmony_ci		 */
238262306a36Sopenharmony_ci		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
238362306a36Sopenharmony_ci			  &block_group->runtime_flags);
238462306a36Sopenharmony_ci	}
238562306a36Sopenharmony_ci
238662306a36Sopenharmony_ciout:
238762306a36Sopenharmony_ci	spin_unlock(&block_group->lock);
238862306a36Sopenharmony_ci	btrfs_put_block_group(block_group);
238962306a36Sopenharmony_ci}
239062306a36Sopenharmony_ci
239162306a36Sopenharmony_ciint btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
239262306a36Sopenharmony_ci{
239362306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
239462306a36Sopenharmony_ci	struct btrfs_block_group *min_bg = NULL;
239562306a36Sopenharmony_ci	u64 min_avail = U64_MAX;
239662306a36Sopenharmony_ci	int ret;
239762306a36Sopenharmony_ci
239862306a36Sopenharmony_ci	spin_lock(&fs_info->zone_active_bgs_lock);
239962306a36Sopenharmony_ci	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
240062306a36Sopenharmony_ci			    active_bg_list) {
240162306a36Sopenharmony_ci		u64 avail;
240262306a36Sopenharmony_ci
240362306a36Sopenharmony_ci		spin_lock(&block_group->lock);
240462306a36Sopenharmony_ci		if (block_group->reserved || block_group->alloc_offset == 0 ||
240562306a36Sopenharmony_ci		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
240662306a36Sopenharmony_ci		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
240762306a36Sopenharmony_ci			spin_unlock(&block_group->lock);
240862306a36Sopenharmony_ci			continue;
240962306a36Sopenharmony_ci		}
241062306a36Sopenharmony_ci
241162306a36Sopenharmony_ci		avail = block_group->zone_capacity - block_group->alloc_offset;
241262306a36Sopenharmony_ci		if (min_avail > avail) {
241362306a36Sopenharmony_ci			if (min_bg)
241462306a36Sopenharmony_ci				btrfs_put_block_group(min_bg);
241562306a36Sopenharmony_ci			min_bg = block_group;
241662306a36Sopenharmony_ci			min_avail = avail;
241762306a36Sopenharmony_ci			btrfs_get_block_group(min_bg);
241862306a36Sopenharmony_ci		}
241962306a36Sopenharmony_ci		spin_unlock(&block_group->lock);
242062306a36Sopenharmony_ci	}
242162306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
242262306a36Sopenharmony_ci
242362306a36Sopenharmony_ci	if (!min_bg)
242462306a36Sopenharmony_ci		return 0;
242562306a36Sopenharmony_ci
242662306a36Sopenharmony_ci	ret = btrfs_zone_finish(min_bg);
242762306a36Sopenharmony_ci	btrfs_put_block_group(min_bg);
242862306a36Sopenharmony_ci
242962306a36Sopenharmony_ci	return ret < 0 ? ret : 1;
243062306a36Sopenharmony_ci}
243162306a36Sopenharmony_ci
243262306a36Sopenharmony_ciint btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
243362306a36Sopenharmony_ci				struct btrfs_space_info *space_info,
243462306a36Sopenharmony_ci				bool do_finish)
243562306a36Sopenharmony_ci{
243662306a36Sopenharmony_ci	struct btrfs_block_group *bg;
243762306a36Sopenharmony_ci	int index;
243862306a36Sopenharmony_ci
243962306a36Sopenharmony_ci	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
244062306a36Sopenharmony_ci		return 0;
244162306a36Sopenharmony_ci
244262306a36Sopenharmony_ci	for (;;) {
244362306a36Sopenharmony_ci		int ret;
244462306a36Sopenharmony_ci		bool need_finish = false;
244562306a36Sopenharmony_ci
244662306a36Sopenharmony_ci		down_read(&space_info->groups_sem);
244762306a36Sopenharmony_ci		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
244862306a36Sopenharmony_ci			list_for_each_entry(bg, &space_info->block_groups[index],
244962306a36Sopenharmony_ci					    list) {
245062306a36Sopenharmony_ci				if (!spin_trylock(&bg->lock))
245162306a36Sopenharmony_ci					continue;
245262306a36Sopenharmony_ci				if (btrfs_zoned_bg_is_full(bg) ||
245362306a36Sopenharmony_ci				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
245462306a36Sopenharmony_ci					     &bg->runtime_flags)) {
245562306a36Sopenharmony_ci					spin_unlock(&bg->lock);
245662306a36Sopenharmony_ci					continue;
245762306a36Sopenharmony_ci				}
245862306a36Sopenharmony_ci				spin_unlock(&bg->lock);
245962306a36Sopenharmony_ci
246062306a36Sopenharmony_ci				if (btrfs_zone_activate(bg)) {
246162306a36Sopenharmony_ci					up_read(&space_info->groups_sem);
246262306a36Sopenharmony_ci					return 1;
246362306a36Sopenharmony_ci				}
246462306a36Sopenharmony_ci
246562306a36Sopenharmony_ci				need_finish = true;
246662306a36Sopenharmony_ci			}
246762306a36Sopenharmony_ci		}
246862306a36Sopenharmony_ci		up_read(&space_info->groups_sem);
246962306a36Sopenharmony_ci
247062306a36Sopenharmony_ci		if (!do_finish || !need_finish)
247162306a36Sopenharmony_ci			break;
247262306a36Sopenharmony_ci
247362306a36Sopenharmony_ci		ret = btrfs_zone_finish_one_bg(fs_info);
247462306a36Sopenharmony_ci		if (ret == 0)
247562306a36Sopenharmony_ci			break;
247662306a36Sopenharmony_ci		if (ret < 0)
247762306a36Sopenharmony_ci			return ret;
247862306a36Sopenharmony_ci	}
247962306a36Sopenharmony_ci
248062306a36Sopenharmony_ci	return 0;
248162306a36Sopenharmony_ci}
248262306a36Sopenharmony_ci
248362306a36Sopenharmony_ci/*
248462306a36Sopenharmony_ci * Reserve zones for one metadata block group, one tree-log block group, and one
248562306a36Sopenharmony_ci * system block group.
248662306a36Sopenharmony_ci */
248762306a36Sopenharmony_civoid btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
248862306a36Sopenharmony_ci{
248962306a36Sopenharmony_ci	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
249062306a36Sopenharmony_ci	struct btrfs_block_group *block_group;
249162306a36Sopenharmony_ci	struct btrfs_device *device;
249262306a36Sopenharmony_ci	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
249362306a36Sopenharmony_ci	unsigned int metadata_reserve = 2;
249462306a36Sopenharmony_ci	/* Reserve a zone for SINGLE system block group. */
249562306a36Sopenharmony_ci	unsigned int system_reserve = 1;
249662306a36Sopenharmony_ci
249762306a36Sopenharmony_ci	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
249862306a36Sopenharmony_ci		return;
249962306a36Sopenharmony_ci
250062306a36Sopenharmony_ci	/*
250162306a36Sopenharmony_ci	 * This function is called from the mount context. So, there is no
250262306a36Sopenharmony_ci	 * parallel process touching the bits. No need for read_seqretry().
250362306a36Sopenharmony_ci	 */
250462306a36Sopenharmony_ci	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
250562306a36Sopenharmony_ci		metadata_reserve = 4;
250662306a36Sopenharmony_ci	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
250762306a36Sopenharmony_ci		system_reserve = 2;
250862306a36Sopenharmony_ci
250962306a36Sopenharmony_ci	/* Apply the reservation on all the devices. */
251062306a36Sopenharmony_ci	mutex_lock(&fs_devices->device_list_mutex);
251162306a36Sopenharmony_ci	list_for_each_entry(device, &fs_devices->devices, dev_list) {
251262306a36Sopenharmony_ci		if (!device->bdev)
251362306a36Sopenharmony_ci			continue;
251462306a36Sopenharmony_ci
251562306a36Sopenharmony_ci		device->zone_info->reserved_active_zones =
251662306a36Sopenharmony_ci			metadata_reserve + system_reserve;
251762306a36Sopenharmony_ci	}
251862306a36Sopenharmony_ci	mutex_unlock(&fs_devices->device_list_mutex);
251962306a36Sopenharmony_ci
252062306a36Sopenharmony_ci	/* Release reservation for currently active block groups. */
252162306a36Sopenharmony_ci	spin_lock(&fs_info->zone_active_bgs_lock);
252262306a36Sopenharmony_ci	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
252362306a36Sopenharmony_ci		struct map_lookup *map = block_group->physical_map;
252462306a36Sopenharmony_ci
252562306a36Sopenharmony_ci		if (!(block_group->flags &
252662306a36Sopenharmony_ci		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
252762306a36Sopenharmony_ci			continue;
252862306a36Sopenharmony_ci
252962306a36Sopenharmony_ci		for (int i = 0; i < map->num_stripes; i++)
253062306a36Sopenharmony_ci			map->stripes[i].dev->zone_info->reserved_active_zones--;
253162306a36Sopenharmony_ci	}
253262306a36Sopenharmony_ci	spin_unlock(&fs_info->zone_active_bgs_lock);
253362306a36Sopenharmony_ci}
2534