162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2021 Western Digital Corporation or its affiliates.
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/blkdev.h>
762306a36Sopenharmony_ci#include <linux/mm.h>
862306a36Sopenharmony_ci#include <linux/sched/mm.h>
962306a36Sopenharmony_ci#include <linux/slab.h>
1062306a36Sopenharmony_ci#include <linux/bitmap.h>
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include "dm-core.h"
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci#define DM_MSG_PREFIX "zone"
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#define DM_ZONE_INVALID_WP_OFST		UINT_MAX
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci/*
1962306a36Sopenharmony_ci * For internal zone reports bypassing the top BIO submission path.
2062306a36Sopenharmony_ci */
2162306a36Sopenharmony_cistatic int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
2262306a36Sopenharmony_ci				  sector_t sector, unsigned int nr_zones,
2362306a36Sopenharmony_ci				  report_zones_cb cb, void *data)
2462306a36Sopenharmony_ci{
2562306a36Sopenharmony_ci	struct gendisk *disk = md->disk;
2662306a36Sopenharmony_ci	int ret;
2762306a36Sopenharmony_ci	struct dm_report_zones_args args = {
2862306a36Sopenharmony_ci		.next_sector = sector,
2962306a36Sopenharmony_ci		.orig_data = data,
3062306a36Sopenharmony_ci		.orig_cb = cb,
3162306a36Sopenharmony_ci	};
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci	do {
3462306a36Sopenharmony_ci		struct dm_target *tgt;
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_ci		tgt = dm_table_find_target(t, args.next_sector);
3762306a36Sopenharmony_ci		if (WARN_ON_ONCE(!tgt->type->report_zones))
3862306a36Sopenharmony_ci			return -EIO;
3962306a36Sopenharmony_ci
4062306a36Sopenharmony_ci		args.tgt = tgt;
4162306a36Sopenharmony_ci		ret = tgt->type->report_zones(tgt, &args,
4262306a36Sopenharmony_ci					      nr_zones - args.zone_idx);
4362306a36Sopenharmony_ci		if (ret < 0)
4462306a36Sopenharmony_ci			return ret;
4562306a36Sopenharmony_ci	} while (args.zone_idx < nr_zones &&
4662306a36Sopenharmony_ci		 args.next_sector < get_capacity(disk));
4762306a36Sopenharmony_ci
4862306a36Sopenharmony_ci	return args.zone_idx;
4962306a36Sopenharmony_ci}
5062306a36Sopenharmony_ci
5162306a36Sopenharmony_ci/*
5262306a36Sopenharmony_ci * User facing dm device block device report zone operation. This calls the
5362306a36Sopenharmony_ci * report_zones operation for each target of a device table. This operation is
5462306a36Sopenharmony_ci * generally implemented by targets using dm_report_zones().
5562306a36Sopenharmony_ci */
5662306a36Sopenharmony_ciint dm_blk_report_zones(struct gendisk *disk, sector_t sector,
5762306a36Sopenharmony_ci			unsigned int nr_zones, report_zones_cb cb, void *data)
5862306a36Sopenharmony_ci{
5962306a36Sopenharmony_ci	struct mapped_device *md = disk->private_data;
6062306a36Sopenharmony_ci	struct dm_table *map;
6162306a36Sopenharmony_ci	int srcu_idx, ret;
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	if (dm_suspended_md(md))
6462306a36Sopenharmony_ci		return -EAGAIN;
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci	map = dm_get_live_table(md, &srcu_idx);
6762306a36Sopenharmony_ci	if (!map)
6862306a36Sopenharmony_ci		return -EIO;
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci	ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci	dm_put_live_table(md, srcu_idx);
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci	return ret;
7562306a36Sopenharmony_ci}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_cistatic int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
7862306a36Sopenharmony_ci			      void *data)
7962306a36Sopenharmony_ci{
8062306a36Sopenharmony_ci	struct dm_report_zones_args *args = data;
8162306a36Sopenharmony_ci	sector_t sector_diff = args->tgt->begin - args->start;
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci	/*
8462306a36Sopenharmony_ci	 * Ignore zones beyond the target range.
8562306a36Sopenharmony_ci	 */
8662306a36Sopenharmony_ci	if (zone->start >= args->start + args->tgt->len)
8762306a36Sopenharmony_ci		return 0;
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci	/*
9062306a36Sopenharmony_ci	 * Remap the start sector and write pointer position of the zone
9162306a36Sopenharmony_ci	 * to match its position in the target range.
9262306a36Sopenharmony_ci	 */
9362306a36Sopenharmony_ci	zone->start += sector_diff;
9462306a36Sopenharmony_ci	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
9562306a36Sopenharmony_ci		if (zone->cond == BLK_ZONE_COND_FULL)
9662306a36Sopenharmony_ci			zone->wp = zone->start + zone->len;
9762306a36Sopenharmony_ci		else if (zone->cond == BLK_ZONE_COND_EMPTY)
9862306a36Sopenharmony_ci			zone->wp = zone->start;
9962306a36Sopenharmony_ci		else
10062306a36Sopenharmony_ci			zone->wp += sector_diff;
10162306a36Sopenharmony_ci	}
10262306a36Sopenharmony_ci
10362306a36Sopenharmony_ci	args->next_sector = zone->start + zone->len;
10462306a36Sopenharmony_ci	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
10562306a36Sopenharmony_ci}
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci/*
10862306a36Sopenharmony_ci * Helper for drivers of zoned targets to implement struct target_type
10962306a36Sopenharmony_ci * report_zones operation.
11062306a36Sopenharmony_ci */
11162306a36Sopenharmony_ciint dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
11262306a36Sopenharmony_ci		    struct dm_report_zones_args *args, unsigned int nr_zones)
11362306a36Sopenharmony_ci{
11462306a36Sopenharmony_ci	/*
11562306a36Sopenharmony_ci	 * Set the target mapping start sector first so that
11662306a36Sopenharmony_ci	 * dm_report_zones_cb() can correctly remap zone information.
11762306a36Sopenharmony_ci	 */
11862306a36Sopenharmony_ci	args->start = start;
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_ci	return blkdev_report_zones(bdev, sector, nr_zones,
12162306a36Sopenharmony_ci				   dm_report_zones_cb, args);
12262306a36Sopenharmony_ci}
12362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_report_zones);
12462306a36Sopenharmony_ci
12562306a36Sopenharmony_cibool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
12662306a36Sopenharmony_ci{
12762306a36Sopenharmony_ci	struct request_queue *q = md->queue;
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci	if (!blk_queue_is_zoned(q))
13062306a36Sopenharmony_ci		return false;
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci	switch (bio_op(bio)) {
13362306a36Sopenharmony_ci	case REQ_OP_WRITE_ZEROES:
13462306a36Sopenharmony_ci	case REQ_OP_WRITE:
13562306a36Sopenharmony_ci		return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
13662306a36Sopenharmony_ci	default:
13762306a36Sopenharmony_ci		return false;
13862306a36Sopenharmony_ci	}
13962306a36Sopenharmony_ci}
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_civoid dm_cleanup_zoned_dev(struct mapped_device *md)
14262306a36Sopenharmony_ci{
14362306a36Sopenharmony_ci	if (md->disk) {
14462306a36Sopenharmony_ci		bitmap_free(md->disk->conv_zones_bitmap);
14562306a36Sopenharmony_ci		md->disk->conv_zones_bitmap = NULL;
14662306a36Sopenharmony_ci		bitmap_free(md->disk->seq_zones_wlock);
14762306a36Sopenharmony_ci		md->disk->seq_zones_wlock = NULL;
14862306a36Sopenharmony_ci	}
14962306a36Sopenharmony_ci
15062306a36Sopenharmony_ci	kvfree(md->zwp_offset);
15162306a36Sopenharmony_ci	md->zwp_offset = NULL;
15262306a36Sopenharmony_ci	md->nr_zones = 0;
15362306a36Sopenharmony_ci}
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_cistatic unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
15662306a36Sopenharmony_ci{
15762306a36Sopenharmony_ci	switch (zone->cond) {
15862306a36Sopenharmony_ci	case BLK_ZONE_COND_IMP_OPEN:
15962306a36Sopenharmony_ci	case BLK_ZONE_COND_EXP_OPEN:
16062306a36Sopenharmony_ci	case BLK_ZONE_COND_CLOSED:
16162306a36Sopenharmony_ci		return zone->wp - zone->start;
16262306a36Sopenharmony_ci	case BLK_ZONE_COND_FULL:
16362306a36Sopenharmony_ci		return zone->len;
16462306a36Sopenharmony_ci	case BLK_ZONE_COND_EMPTY:
16562306a36Sopenharmony_ci	case BLK_ZONE_COND_NOT_WP:
16662306a36Sopenharmony_ci	case BLK_ZONE_COND_OFFLINE:
16762306a36Sopenharmony_ci	case BLK_ZONE_COND_READONLY:
16862306a36Sopenharmony_ci	default:
16962306a36Sopenharmony_ci		/*
17062306a36Sopenharmony_ci		 * Conventional, offline and read-only zones do not have a valid
17162306a36Sopenharmony_ci		 * write pointer. Use 0 as for an empty zone.
17262306a36Sopenharmony_ci		 */
17362306a36Sopenharmony_ci		return 0;
17462306a36Sopenharmony_ci	}
17562306a36Sopenharmony_ci}
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_cistatic int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
17862306a36Sopenharmony_ci				 void *data)
17962306a36Sopenharmony_ci{
18062306a36Sopenharmony_ci	struct mapped_device *md = data;
18162306a36Sopenharmony_ci	struct gendisk *disk = md->disk;
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci	switch (zone->type) {
18462306a36Sopenharmony_ci	case BLK_ZONE_TYPE_CONVENTIONAL:
18562306a36Sopenharmony_ci		if (!disk->conv_zones_bitmap) {
18662306a36Sopenharmony_ci			disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
18762306a36Sopenharmony_ci								GFP_NOIO);
18862306a36Sopenharmony_ci			if (!disk->conv_zones_bitmap)
18962306a36Sopenharmony_ci				return -ENOMEM;
19062306a36Sopenharmony_ci		}
19162306a36Sopenharmony_ci		set_bit(idx, disk->conv_zones_bitmap);
19262306a36Sopenharmony_ci		break;
19362306a36Sopenharmony_ci	case BLK_ZONE_TYPE_SEQWRITE_REQ:
19462306a36Sopenharmony_ci	case BLK_ZONE_TYPE_SEQWRITE_PREF:
19562306a36Sopenharmony_ci		if (!disk->seq_zones_wlock) {
19662306a36Sopenharmony_ci			disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
19762306a36Sopenharmony_ci							      GFP_NOIO);
19862306a36Sopenharmony_ci			if (!disk->seq_zones_wlock)
19962306a36Sopenharmony_ci				return -ENOMEM;
20062306a36Sopenharmony_ci		}
20162306a36Sopenharmony_ci		if (!md->zwp_offset) {
20262306a36Sopenharmony_ci			md->zwp_offset =
20362306a36Sopenharmony_ci				kvcalloc(disk->nr_zones, sizeof(unsigned int),
20462306a36Sopenharmony_ci					 GFP_KERNEL);
20562306a36Sopenharmony_ci			if (!md->zwp_offset)
20662306a36Sopenharmony_ci				return -ENOMEM;
20762306a36Sopenharmony_ci		}
20862306a36Sopenharmony_ci		md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci		break;
21162306a36Sopenharmony_ci	default:
21262306a36Sopenharmony_ci		DMERR("Invalid zone type 0x%x at sectors %llu",
21362306a36Sopenharmony_ci		      (int)zone->type, zone->start);
21462306a36Sopenharmony_ci		return -ENODEV;
21562306a36Sopenharmony_ci	}
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	return 0;
21862306a36Sopenharmony_ci}
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci/*
22162306a36Sopenharmony_ci * Revalidate the zones of a mapped device to initialize resource necessary
22262306a36Sopenharmony_ci * for zone append emulation. Note that we cannot simply use the block layer
22362306a36Sopenharmony_ci * blk_revalidate_disk_zones() function here as the mapped device is suspended
22462306a36Sopenharmony_ci * (this is called from __bind() context).
22562306a36Sopenharmony_ci */
22662306a36Sopenharmony_cistatic int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
22762306a36Sopenharmony_ci{
22862306a36Sopenharmony_ci	struct gendisk *disk = md->disk;
22962306a36Sopenharmony_ci	unsigned int noio_flag;
23062306a36Sopenharmony_ci	int ret;
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ci	/*
23362306a36Sopenharmony_ci	 * Check if something changed. If yes, cleanup the current resources
23462306a36Sopenharmony_ci	 * and reallocate everything.
23562306a36Sopenharmony_ci	 */
23662306a36Sopenharmony_ci	if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
23762306a36Sopenharmony_ci		dm_cleanup_zoned_dev(md);
23862306a36Sopenharmony_ci	if (md->nr_zones)
23962306a36Sopenharmony_ci		return 0;
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_ci	/*
24262306a36Sopenharmony_ci	 * Scan all zones to initialize everything. Ensure that all vmalloc
24362306a36Sopenharmony_ci	 * operations in this context are done as if GFP_NOIO was specified.
24462306a36Sopenharmony_ci	 */
24562306a36Sopenharmony_ci	noio_flag = memalloc_noio_save();
24662306a36Sopenharmony_ci	ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
24762306a36Sopenharmony_ci				     dm_zone_revalidate_cb, md);
24862306a36Sopenharmony_ci	memalloc_noio_restore(noio_flag);
24962306a36Sopenharmony_ci	if (ret < 0)
25062306a36Sopenharmony_ci		goto err;
25162306a36Sopenharmony_ci	if (ret != disk->nr_zones) {
25262306a36Sopenharmony_ci		ret = -EIO;
25362306a36Sopenharmony_ci		goto err;
25462306a36Sopenharmony_ci	}
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	md->nr_zones = disk->nr_zones;
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_ci	return 0;
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_cierr:
26162306a36Sopenharmony_ci	DMERR("Revalidate zones failed %d", ret);
26262306a36Sopenharmony_ci	dm_cleanup_zoned_dev(md);
26362306a36Sopenharmony_ci	return ret;
26462306a36Sopenharmony_ci}
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_cistatic int device_not_zone_append_capable(struct dm_target *ti,
26762306a36Sopenharmony_ci					  struct dm_dev *dev, sector_t start,
26862306a36Sopenharmony_ci					  sector_t len, void *data)
26962306a36Sopenharmony_ci{
27062306a36Sopenharmony_ci	return !bdev_is_zoned(dev->bdev);
27162306a36Sopenharmony_ci}
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_cistatic bool dm_table_supports_zone_append(struct dm_table *t)
27462306a36Sopenharmony_ci{
27562306a36Sopenharmony_ci	for (unsigned int i = 0; i < t->num_targets; i++) {
27662306a36Sopenharmony_ci		struct dm_target *ti = dm_table_get_target(t, i);
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci		if (ti->emulate_zone_append)
27962306a36Sopenharmony_ci			return false;
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_ci		if (!ti->type->iterate_devices ||
28262306a36Sopenharmony_ci		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
28362306a36Sopenharmony_ci			return false;
28462306a36Sopenharmony_ci	}
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	return true;
28762306a36Sopenharmony_ci}
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ciint dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
29062306a36Sopenharmony_ci{
29162306a36Sopenharmony_ci	struct mapped_device *md = t->md;
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	/*
29462306a36Sopenharmony_ci	 * For a zoned target, the number of zones should be updated for the
29562306a36Sopenharmony_ci	 * correct value to be exposed in sysfs queue/nr_zones.
29662306a36Sopenharmony_ci	 */
29762306a36Sopenharmony_ci	WARN_ON_ONCE(queue_is_mq(q));
29862306a36Sopenharmony_ci	md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
29962306a36Sopenharmony_ci
30062306a36Sopenharmony_ci	/* Check if zone append is natively supported */
30162306a36Sopenharmony_ci	if (dm_table_supports_zone_append(t)) {
30262306a36Sopenharmony_ci		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
30362306a36Sopenharmony_ci		dm_cleanup_zoned_dev(md);
30462306a36Sopenharmony_ci		return 0;
30562306a36Sopenharmony_ci	}
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci	/*
30862306a36Sopenharmony_ci	 * Mark the mapped device as needing zone append emulation and
30962306a36Sopenharmony_ci	 * initialize the emulation resources once the capacity is set.
31062306a36Sopenharmony_ci	 */
31162306a36Sopenharmony_ci	set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
31262306a36Sopenharmony_ci	if (!get_capacity(md->disk))
31362306a36Sopenharmony_ci		return 0;
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	return dm_revalidate_zones(md, t);
31662306a36Sopenharmony_ci}
31762306a36Sopenharmony_ci
/*
 * Zone report callback used by dm_update_zone_wp_offset(): store the write
 * pointer offset of @zone in the unsigned int pointed to by @data. @idx is
 * unused. Always returns 0.
 */
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
				       void *data)
{
	*(unsigned int *)data = dm_get_zone_wp_offset(zone);

	return 0;
}
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_cistatic int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
32962306a36Sopenharmony_ci				    unsigned int *wp_ofst)
33062306a36Sopenharmony_ci{
33162306a36Sopenharmony_ci	sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
33262306a36Sopenharmony_ci	unsigned int noio_flag;
33362306a36Sopenharmony_ci	struct dm_table *t;
33462306a36Sopenharmony_ci	int srcu_idx, ret;
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	t = dm_get_live_table(md, &srcu_idx);
33762306a36Sopenharmony_ci	if (!t)
33862306a36Sopenharmony_ci		return -EIO;
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	/*
34162306a36Sopenharmony_ci	 * Ensure that all memory allocations in this context are done as if
34262306a36Sopenharmony_ci	 * GFP_NOIO was specified.
34362306a36Sopenharmony_ci	 */
34462306a36Sopenharmony_ci	noio_flag = memalloc_noio_save();
34562306a36Sopenharmony_ci	ret = dm_blk_do_report_zones(md, t, sector, 1,
34662306a36Sopenharmony_ci				     dm_update_zone_wp_offset_cb, wp_ofst);
34762306a36Sopenharmony_ci	memalloc_noio_restore(noio_flag);
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci	dm_put_live_table(md, srcu_idx);
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci	if (ret != 1)
35262306a36Sopenharmony_ci		return -EIO;
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	return 0;
35562306a36Sopenharmony_ci}
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_cistruct orig_bio_details {
35862306a36Sopenharmony_ci	enum req_op op;
35962306a36Sopenharmony_ci	unsigned int nr_sectors;
36062306a36Sopenharmony_ci};
36162306a36Sopenharmony_ci
36262306a36Sopenharmony_ci/*
36362306a36Sopenharmony_ci * First phase of BIO mapping for targets with zone append emulation:
36462306a36Sopenharmony_ci * check all BIO that change a zone writer pointer and change zone
36562306a36Sopenharmony_ci * append operations into regular write operations.
36662306a36Sopenharmony_ci */
36762306a36Sopenharmony_cistatic bool dm_zone_map_bio_begin(struct mapped_device *md,
36862306a36Sopenharmony_ci				  unsigned int zno, struct bio *clone)
36962306a36Sopenharmony_ci{
37062306a36Sopenharmony_ci	sector_t zsectors = bdev_zone_sectors(md->disk->part0);
37162306a36Sopenharmony_ci	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci	/*
37462306a36Sopenharmony_ci	 * If the target zone is in an error state, recover by inspecting the
37562306a36Sopenharmony_ci	 * zone to get its current write pointer position. Note that since the
37662306a36Sopenharmony_ci	 * target zone is already locked, a BIO issuing context should never
37762306a36Sopenharmony_ci	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
37862306a36Sopenharmony_ci	 */
37962306a36Sopenharmony_ci	if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
38062306a36Sopenharmony_ci		if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
38162306a36Sopenharmony_ci			return false;
38262306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
38362306a36Sopenharmony_ci	}
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	switch (bio_op(clone)) {
38662306a36Sopenharmony_ci	case REQ_OP_ZONE_RESET:
38762306a36Sopenharmony_ci	case REQ_OP_ZONE_FINISH:
38862306a36Sopenharmony_ci		return true;
38962306a36Sopenharmony_ci	case REQ_OP_WRITE_ZEROES:
39062306a36Sopenharmony_ci	case REQ_OP_WRITE:
39162306a36Sopenharmony_ci		/* Writes must be aligned to the zone write pointer */
39262306a36Sopenharmony_ci		if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
39362306a36Sopenharmony_ci			return false;
39462306a36Sopenharmony_ci		break;
39562306a36Sopenharmony_ci	case REQ_OP_ZONE_APPEND:
39662306a36Sopenharmony_ci		/*
39762306a36Sopenharmony_ci		 * Change zone append operations into a non-mergeable regular
39862306a36Sopenharmony_ci		 * writes directed at the current write pointer position of the
39962306a36Sopenharmony_ci		 * target zone.
40062306a36Sopenharmony_ci		 */
40162306a36Sopenharmony_ci		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
40262306a36Sopenharmony_ci			(clone->bi_opf & (~REQ_OP_MASK));
40362306a36Sopenharmony_ci		clone->bi_iter.bi_sector += zwp_offset;
40462306a36Sopenharmony_ci		break;
40562306a36Sopenharmony_ci	default:
40662306a36Sopenharmony_ci		DMWARN_LIMIT("Invalid BIO operation");
40762306a36Sopenharmony_ci		return false;
40862306a36Sopenharmony_ci	}
40962306a36Sopenharmony_ci
41062306a36Sopenharmony_ci	/* Cannot write to a full zone */
41162306a36Sopenharmony_ci	if (zwp_offset >= zsectors)
41262306a36Sopenharmony_ci		return false;
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci	return true;
41562306a36Sopenharmony_ci}
41662306a36Sopenharmony_ci
41762306a36Sopenharmony_ci/*
41862306a36Sopenharmony_ci * Second phase of BIO mapping for targets with zone append emulation:
41962306a36Sopenharmony_ci * update the zone write pointer offset array to account for the additional
42062306a36Sopenharmony_ci * data written to a zone. Note that at this point, the remapped clone BIO
42162306a36Sopenharmony_ci * may already have completed, so we do not touch it.
42262306a36Sopenharmony_ci */
42362306a36Sopenharmony_cistatic blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
42462306a36Sopenharmony_ci					struct orig_bio_details *orig_bio_details,
42562306a36Sopenharmony_ci					unsigned int nr_sectors)
42662306a36Sopenharmony_ci{
42762306a36Sopenharmony_ci	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	/* The clone BIO may already have been completed and failed */
43062306a36Sopenharmony_ci	if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
43162306a36Sopenharmony_ci		return BLK_STS_IOERR;
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci	/* Update the zone wp offset */
43462306a36Sopenharmony_ci	switch (orig_bio_details->op) {
43562306a36Sopenharmony_ci	case REQ_OP_ZONE_RESET:
43662306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno], 0);
43762306a36Sopenharmony_ci		return BLK_STS_OK;
43862306a36Sopenharmony_ci	case REQ_OP_ZONE_FINISH:
43962306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno],
44062306a36Sopenharmony_ci			   bdev_zone_sectors(md->disk->part0));
44162306a36Sopenharmony_ci		return BLK_STS_OK;
44262306a36Sopenharmony_ci	case REQ_OP_WRITE_ZEROES:
44362306a36Sopenharmony_ci	case REQ_OP_WRITE:
44462306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
44562306a36Sopenharmony_ci		return BLK_STS_OK;
44662306a36Sopenharmony_ci	case REQ_OP_ZONE_APPEND:
44762306a36Sopenharmony_ci		/*
44862306a36Sopenharmony_ci		 * Check that the target did not truncate the write operation
44962306a36Sopenharmony_ci		 * emulating a zone append.
45062306a36Sopenharmony_ci		 */
45162306a36Sopenharmony_ci		if (nr_sectors != orig_bio_details->nr_sectors) {
45262306a36Sopenharmony_ci			DMWARN_LIMIT("Truncated write for zone append");
45362306a36Sopenharmony_ci			return BLK_STS_IOERR;
45462306a36Sopenharmony_ci		}
45562306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
45662306a36Sopenharmony_ci		return BLK_STS_OK;
45762306a36Sopenharmony_ci	default:
45862306a36Sopenharmony_ci		DMWARN_LIMIT("Invalid BIO operation");
45962306a36Sopenharmony_ci		return BLK_STS_IOERR;
46062306a36Sopenharmony_ci	}
46162306a36Sopenharmony_ci}
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_cistatic inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
46462306a36Sopenharmony_ci				struct bio *clone)
46562306a36Sopenharmony_ci{
46662306a36Sopenharmony_ci	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
46762306a36Sopenharmony_ci		return;
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
47062306a36Sopenharmony_ci	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
47162306a36Sopenharmony_ci}
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_cistatic inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
47462306a36Sopenharmony_ci				  struct bio *clone)
47562306a36Sopenharmony_ci{
47662306a36Sopenharmony_ci	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
47762306a36Sopenharmony_ci		return;
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci	WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
48062306a36Sopenharmony_ci	clear_bit_unlock(zno, disk->seq_zones_wlock);
48162306a36Sopenharmony_ci	smp_mb__after_atomic();
48262306a36Sopenharmony_ci	wake_up_bit(disk->seq_zones_wlock, zno);
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
48562306a36Sopenharmony_ci}
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_cistatic bool dm_need_zone_wp_tracking(struct bio *bio)
48862306a36Sopenharmony_ci{
48962306a36Sopenharmony_ci	/*
49062306a36Sopenharmony_ci	 * Special processing is not needed for operations that do not need the
49162306a36Sopenharmony_ci	 * zone write lock, that is, all operations that target conventional
49262306a36Sopenharmony_ci	 * zones and all operations that do not modify directly a sequential
49362306a36Sopenharmony_ci	 * zone write pointer.
49462306a36Sopenharmony_ci	 */
49562306a36Sopenharmony_ci	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
49662306a36Sopenharmony_ci		return false;
49762306a36Sopenharmony_ci	switch (bio_op(bio)) {
49862306a36Sopenharmony_ci	case REQ_OP_WRITE_ZEROES:
49962306a36Sopenharmony_ci	case REQ_OP_WRITE:
50062306a36Sopenharmony_ci	case REQ_OP_ZONE_RESET:
50162306a36Sopenharmony_ci	case REQ_OP_ZONE_FINISH:
50262306a36Sopenharmony_ci	case REQ_OP_ZONE_APPEND:
50362306a36Sopenharmony_ci		return bio_zone_is_seq(bio);
50462306a36Sopenharmony_ci	default:
50562306a36Sopenharmony_ci		return false;
50662306a36Sopenharmony_ci	}
50762306a36Sopenharmony_ci}
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci/*
51062306a36Sopenharmony_ci * Special IO mapping for targets needing zone append emulation.
51162306a36Sopenharmony_ci */
51262306a36Sopenharmony_ciint dm_zone_map_bio(struct dm_target_io *tio)
51362306a36Sopenharmony_ci{
51462306a36Sopenharmony_ci	struct dm_io *io = tio->io;
51562306a36Sopenharmony_ci	struct dm_target *ti = tio->ti;
51662306a36Sopenharmony_ci	struct mapped_device *md = io->md;
51762306a36Sopenharmony_ci	struct bio *clone = &tio->clone;
51862306a36Sopenharmony_ci	struct orig_bio_details orig_bio_details;
51962306a36Sopenharmony_ci	unsigned int zno;
52062306a36Sopenharmony_ci	blk_status_t sts;
52162306a36Sopenharmony_ci	int r;
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	/*
52462306a36Sopenharmony_ci	 * IOs that do not change a zone write pointer do not need
52562306a36Sopenharmony_ci	 * any additional special processing.
52662306a36Sopenharmony_ci	 */
52762306a36Sopenharmony_ci	if (!dm_need_zone_wp_tracking(clone))
52862306a36Sopenharmony_ci		return ti->type->map(ti, clone);
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci	/* Lock the target zone */
53162306a36Sopenharmony_ci	zno = bio_zone_no(clone);
53262306a36Sopenharmony_ci	dm_zone_lock(md->disk, zno, clone);
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_ci	orig_bio_details.nr_sectors = bio_sectors(clone);
53562306a36Sopenharmony_ci	orig_bio_details.op = bio_op(clone);
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci	/*
53862306a36Sopenharmony_ci	 * Check that the bio and the target zone write pointer offset are
53962306a36Sopenharmony_ci	 * both valid, and if the bio is a zone append, remap it to a write.
54062306a36Sopenharmony_ci	 */
54162306a36Sopenharmony_ci	if (!dm_zone_map_bio_begin(md, zno, clone)) {
54262306a36Sopenharmony_ci		dm_zone_unlock(md->disk, zno, clone);
54362306a36Sopenharmony_ci		return DM_MAPIO_KILL;
54462306a36Sopenharmony_ci	}
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	/* Let the target do its work */
54762306a36Sopenharmony_ci	r = ti->type->map(ti, clone);
54862306a36Sopenharmony_ci	switch (r) {
54962306a36Sopenharmony_ci	case DM_MAPIO_SUBMITTED:
55062306a36Sopenharmony_ci		/*
55162306a36Sopenharmony_ci		 * The target submitted the clone BIO. The target zone will
55262306a36Sopenharmony_ci		 * be unlocked on completion of the clone.
55362306a36Sopenharmony_ci		 */
55462306a36Sopenharmony_ci		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
55562306a36Sopenharmony_ci					  *tio->len_ptr);
55662306a36Sopenharmony_ci		break;
55762306a36Sopenharmony_ci	case DM_MAPIO_REMAPPED:
55862306a36Sopenharmony_ci		/*
55962306a36Sopenharmony_ci		 * The target only remapped the clone BIO. In case of error,
56062306a36Sopenharmony_ci		 * unlock the target zone here as the clone will not be
56162306a36Sopenharmony_ci		 * submitted.
56262306a36Sopenharmony_ci		 */
56362306a36Sopenharmony_ci		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
56462306a36Sopenharmony_ci					  *tio->len_ptr);
56562306a36Sopenharmony_ci		if (sts != BLK_STS_OK)
56662306a36Sopenharmony_ci			dm_zone_unlock(md->disk, zno, clone);
56762306a36Sopenharmony_ci		break;
56862306a36Sopenharmony_ci	case DM_MAPIO_REQUEUE:
56962306a36Sopenharmony_ci	case DM_MAPIO_KILL:
57062306a36Sopenharmony_ci	default:
57162306a36Sopenharmony_ci		dm_zone_unlock(md->disk, zno, clone);
57262306a36Sopenharmony_ci		sts = BLK_STS_IOERR;
57362306a36Sopenharmony_ci		break;
57462306a36Sopenharmony_ci	}
57562306a36Sopenharmony_ci
57662306a36Sopenharmony_ci	if (sts != BLK_STS_OK)
57762306a36Sopenharmony_ci		return DM_MAPIO_KILL;
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	return r;
58062306a36Sopenharmony_ci}
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci/*
58362306a36Sopenharmony_ci * IO completion callback called from clone_endio().
58462306a36Sopenharmony_ci */
58562306a36Sopenharmony_civoid dm_zone_endio(struct dm_io *io, struct bio *clone)
58662306a36Sopenharmony_ci{
58762306a36Sopenharmony_ci	struct mapped_device *md = io->md;
58862306a36Sopenharmony_ci	struct gendisk *disk = md->disk;
58962306a36Sopenharmony_ci	struct bio *orig_bio = io->orig_bio;
59062306a36Sopenharmony_ci	unsigned int zwp_offset;
59162306a36Sopenharmony_ci	unsigned int zno;
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ci	/*
59462306a36Sopenharmony_ci	 * For targets that do not emulate zone append, we only need to
59562306a36Sopenharmony_ci	 * handle native zone-append bios.
59662306a36Sopenharmony_ci	 */
59762306a36Sopenharmony_ci	if (!dm_emulate_zone_append(md)) {
59862306a36Sopenharmony_ci		/*
59962306a36Sopenharmony_ci		 * Get the offset within the zone of the written sector
60062306a36Sopenharmony_ci		 * and add that to the original bio sector position.
60162306a36Sopenharmony_ci		 */
60262306a36Sopenharmony_ci		if (clone->bi_status == BLK_STS_OK &&
60362306a36Sopenharmony_ci		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
60462306a36Sopenharmony_ci			sector_t mask =
60562306a36Sopenharmony_ci				(sector_t)bdev_zone_sectors(disk->part0) - 1;
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci			orig_bio->bi_iter.bi_sector +=
60862306a36Sopenharmony_ci				clone->bi_iter.bi_sector & mask;
60962306a36Sopenharmony_ci		}
61062306a36Sopenharmony_ci
61162306a36Sopenharmony_ci		return;
61262306a36Sopenharmony_ci	}
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci	/*
61562306a36Sopenharmony_ci	 * For targets that do emulate zone append, if the clone BIO does not
61662306a36Sopenharmony_ci	 * own the target zone write lock, we have nothing to do.
61762306a36Sopenharmony_ci	 */
61862306a36Sopenharmony_ci	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
61962306a36Sopenharmony_ci		return;
62062306a36Sopenharmony_ci
62162306a36Sopenharmony_ci	zno = bio_zone_no(orig_bio);
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ci	if (clone->bi_status != BLK_STS_OK) {
62462306a36Sopenharmony_ci		/*
62562306a36Sopenharmony_ci		 * BIOs that modify a zone write pointer may leave the zone
62662306a36Sopenharmony_ci		 * in an unknown state in case of failure (e.g. the write
62762306a36Sopenharmony_ci		 * pointer was only partially advanced). In this case, set
62862306a36Sopenharmony_ci		 * the target zone write pointer as invalid unless it is
62962306a36Sopenharmony_ci		 * already being updated.
63062306a36Sopenharmony_ci		 */
63162306a36Sopenharmony_ci		WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
63262306a36Sopenharmony_ci	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
63362306a36Sopenharmony_ci		/*
63462306a36Sopenharmony_ci		 * Get the written sector for zone append operation that were
63562306a36Sopenharmony_ci		 * emulated using regular write operations.
63662306a36Sopenharmony_ci		 */
63762306a36Sopenharmony_ci		zwp_offset = READ_ONCE(md->zwp_offset[zno]);
63862306a36Sopenharmony_ci		if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
63962306a36Sopenharmony_ci			WRITE_ONCE(md->zwp_offset[zno],
64062306a36Sopenharmony_ci				   DM_ZONE_INVALID_WP_OFST);
64162306a36Sopenharmony_ci		else
64262306a36Sopenharmony_ci			orig_bio->bi_iter.bi_sector +=
64362306a36Sopenharmony_ci				zwp_offset - bio_sectors(orig_bio);
64462306a36Sopenharmony_ci	}
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_ci	dm_zone_unlock(disk, zno, clone);
64762306a36Sopenharmony_ci}
648