162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Module Author: Kiyoshi Ueda
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * This file is released under the GPL.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * Throughput oriented path selector.
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include "dm.h"
1362306a36Sopenharmony_ci#include "dm-path-selector.h"
1462306a36Sopenharmony_ci
1562306a36Sopenharmony_ci#include <linux/slab.h>
1662306a36Sopenharmony_ci#include <linux/module.h>
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci#define DM_MSG_PREFIX	"multipath service-time"
1962306a36Sopenharmony_ci#define ST_MIN_IO	1
2062306a36Sopenharmony_ci#define ST_MAX_RELATIVE_THROUGHPUT	100
2162306a36Sopenharmony_ci#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
2262306a36Sopenharmony_ci#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
2362306a36Sopenharmony_ci#define ST_VERSION	"0.3.0"
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_cistruct selector {
2662306a36Sopenharmony_ci	struct list_head valid_paths;
2762306a36Sopenharmony_ci	struct list_head failed_paths;
2862306a36Sopenharmony_ci	spinlock_t lock;
2962306a36Sopenharmony_ci};
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_cistruct path_info {
3262306a36Sopenharmony_ci	struct list_head list;
3362306a36Sopenharmony_ci	struct dm_path *path;
3462306a36Sopenharmony_ci	unsigned int repeat_count;
3562306a36Sopenharmony_ci	unsigned int relative_throughput;
3662306a36Sopenharmony_ci	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
3762306a36Sopenharmony_ci};
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_cistatic struct selector *alloc_selector(void)
4062306a36Sopenharmony_ci{
4162306a36Sopenharmony_ci	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ci	if (s) {
4462306a36Sopenharmony_ci		INIT_LIST_HEAD(&s->valid_paths);
4562306a36Sopenharmony_ci		INIT_LIST_HEAD(&s->failed_paths);
4662306a36Sopenharmony_ci		spin_lock_init(&s->lock);
4762306a36Sopenharmony_ci	}
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_ci	return s;
5062306a36Sopenharmony_ci}
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_cistatic int st_create(struct path_selector *ps, unsigned int argc, char **argv)
5362306a36Sopenharmony_ci{
5462306a36Sopenharmony_ci	struct selector *s = alloc_selector();
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_ci	if (!s)
5762306a36Sopenharmony_ci		return -ENOMEM;
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	ps->context = s;
6062306a36Sopenharmony_ci	return 0;
6162306a36Sopenharmony_ci}
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_cistatic void free_paths(struct list_head *paths)
6462306a36Sopenharmony_ci{
6562306a36Sopenharmony_ci	struct path_info *pi, *next;
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci	list_for_each_entry_safe(pi, next, paths, list) {
6862306a36Sopenharmony_ci		list_del(&pi->list);
6962306a36Sopenharmony_ci		kfree(pi);
7062306a36Sopenharmony_ci	}
7162306a36Sopenharmony_ci}
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_cistatic void st_destroy(struct path_selector *ps)
7462306a36Sopenharmony_ci{
7562306a36Sopenharmony_ci	struct selector *s = ps->context;
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_ci	free_paths(&s->valid_paths);
7862306a36Sopenharmony_ci	free_paths(&s->failed_paths);
7962306a36Sopenharmony_ci	kfree(s);
8062306a36Sopenharmony_ci	ps->context = NULL;
8162306a36Sopenharmony_ci}
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_cistatic int st_status(struct path_selector *ps, struct dm_path *path,
8462306a36Sopenharmony_ci		     status_type_t type, char *result, unsigned int maxlen)
8562306a36Sopenharmony_ci{
8662306a36Sopenharmony_ci	unsigned int sz = 0;
8762306a36Sopenharmony_ci	struct path_info *pi;
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci	if (!path)
9062306a36Sopenharmony_ci		DMEMIT("0 ");
9162306a36Sopenharmony_ci	else {
9262306a36Sopenharmony_ci		pi = path->pscontext;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci		switch (type) {
9562306a36Sopenharmony_ci		case STATUSTYPE_INFO:
9662306a36Sopenharmony_ci			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
9762306a36Sopenharmony_ci			       pi->relative_throughput);
9862306a36Sopenharmony_ci			break;
9962306a36Sopenharmony_ci		case STATUSTYPE_TABLE:
10062306a36Sopenharmony_ci			DMEMIT("%u %u ", pi->repeat_count,
10162306a36Sopenharmony_ci			       pi->relative_throughput);
10262306a36Sopenharmony_ci			break;
10362306a36Sopenharmony_ci		case STATUSTYPE_IMA:
10462306a36Sopenharmony_ci			result[0] = '\0';
10562306a36Sopenharmony_ci			break;
10662306a36Sopenharmony_ci		}
10762306a36Sopenharmony_ci	}
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	return sz;
11062306a36Sopenharmony_ci}
11162306a36Sopenharmony_ci
11262306a36Sopenharmony_cistatic int st_add_path(struct path_selector *ps, struct dm_path *path,
11362306a36Sopenharmony_ci		       int argc, char **argv, char **error)
11462306a36Sopenharmony_ci{
11562306a36Sopenharmony_ci	struct selector *s = ps->context;
11662306a36Sopenharmony_ci	struct path_info *pi;
11762306a36Sopenharmony_ci	unsigned int repeat_count = ST_MIN_IO;
11862306a36Sopenharmony_ci	unsigned int relative_throughput = 1;
11962306a36Sopenharmony_ci	char dummy;
12062306a36Sopenharmony_ci	unsigned long flags;
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci	/*
12362306a36Sopenharmony_ci	 * Arguments: [<repeat_count> [<relative_throughput>]]
12462306a36Sopenharmony_ci	 *	<repeat_count>: The number of I/Os before switching path.
12562306a36Sopenharmony_ci	 *			If not given, default (ST_MIN_IO) is used.
12662306a36Sopenharmony_ci	 *	<relative_throughput>: The relative throughput value of
12762306a36Sopenharmony_ci	 *			the path among all paths in the path-group.
12862306a36Sopenharmony_ci	 *			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
12962306a36Sopenharmony_ci	 *			If not given, minimum value '1' is used.
13062306a36Sopenharmony_ci	 *			If '0' is given, the path isn't selected while
13162306a36Sopenharmony_ci	 *			other paths having a positive value are	available.
13262306a36Sopenharmony_ci	 */
13362306a36Sopenharmony_ci	if (argc > 2) {
13462306a36Sopenharmony_ci		*error = "service-time ps: incorrect number of arguments";
13562306a36Sopenharmony_ci		return -EINVAL;
13662306a36Sopenharmony_ci	}
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
13962306a36Sopenharmony_ci		*error = "service-time ps: invalid repeat count";
14062306a36Sopenharmony_ci		return -EINVAL;
14162306a36Sopenharmony_ci	}
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci	if (repeat_count > 1) {
14462306a36Sopenharmony_ci		DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
14562306a36Sopenharmony_ci		repeat_count = 1;
14662306a36Sopenharmony_ci	}
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_ci	if ((argc == 2) &&
14962306a36Sopenharmony_ci	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
15062306a36Sopenharmony_ci	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
15162306a36Sopenharmony_ci		*error = "service-time ps: invalid relative_throughput value";
15262306a36Sopenharmony_ci		return -EINVAL;
15362306a36Sopenharmony_ci	}
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	/* allocate the path */
15662306a36Sopenharmony_ci	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
15762306a36Sopenharmony_ci	if (!pi) {
15862306a36Sopenharmony_ci		*error = "service-time ps: Error allocating path context";
15962306a36Sopenharmony_ci		return -ENOMEM;
16062306a36Sopenharmony_ci	}
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci	pi->path = path;
16362306a36Sopenharmony_ci	pi->repeat_count = repeat_count;
16462306a36Sopenharmony_ci	pi->relative_throughput = relative_throughput;
16562306a36Sopenharmony_ci	atomic_set(&pi->in_flight_size, 0);
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci	path->pscontext = pi;
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
17062306a36Sopenharmony_ci	list_add_tail(&pi->list, &s->valid_paths);
17162306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	return 0;
17462306a36Sopenharmony_ci}
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_cistatic void st_fail_path(struct path_selector *ps, struct dm_path *path)
17762306a36Sopenharmony_ci{
17862306a36Sopenharmony_ci	struct selector *s = ps->context;
17962306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
18062306a36Sopenharmony_ci	unsigned long flags;
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
18362306a36Sopenharmony_ci	list_move(&pi->list, &s->failed_paths);
18462306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
18562306a36Sopenharmony_ci}
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_cistatic int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
18862306a36Sopenharmony_ci{
18962306a36Sopenharmony_ci	struct selector *s = ps->context;
19062306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
19162306a36Sopenharmony_ci	unsigned long flags;
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
19462306a36Sopenharmony_ci	list_move_tail(&pi->list, &s->valid_paths);
19562306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ci	return 0;
19862306a36Sopenharmony_ci}
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci/*
20162306a36Sopenharmony_ci * Compare the estimated service time of 2 paths, pi1 and pi2,
20262306a36Sopenharmony_ci * for the incoming I/O.
20362306a36Sopenharmony_ci *
20462306a36Sopenharmony_ci * Returns:
20562306a36Sopenharmony_ci * < 0 : pi1 is better
20662306a36Sopenharmony_ci * 0   : no difference between pi1 and pi2
20762306a36Sopenharmony_ci * > 0 : pi2 is better
20862306a36Sopenharmony_ci *
20962306a36Sopenharmony_ci * Description:
21062306a36Sopenharmony_ci * Basically, the service time is estimated by:
21162306a36Sopenharmony_ci *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
21262306a36Sopenharmony_ci * To reduce the calculation, some optimizations are made.
21362306a36Sopenharmony_ci * (See comments inline)
21462306a36Sopenharmony_ci */
21562306a36Sopenharmony_cistatic int st_compare_load(struct path_info *pi1, struct path_info *pi2,
21662306a36Sopenharmony_ci			   size_t incoming)
21762306a36Sopenharmony_ci{
21862306a36Sopenharmony_ci	size_t sz1, sz2, st1, st2;
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	sz1 = atomic_read(&pi1->in_flight_size);
22162306a36Sopenharmony_ci	sz2 = atomic_read(&pi2->in_flight_size);
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci	/*
22462306a36Sopenharmony_ci	 * Case 1: Both have same throughput value. Choose less loaded path.
22562306a36Sopenharmony_ci	 */
22662306a36Sopenharmony_ci	if (pi1->relative_throughput == pi2->relative_throughput)
22762306a36Sopenharmony_ci		return sz1 - sz2;
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci	/*
23062306a36Sopenharmony_ci	 * Case 2a: Both have same load. Choose higher throughput path.
23162306a36Sopenharmony_ci	 * Case 2b: One path has no throughput value. Choose the other one.
23262306a36Sopenharmony_ci	 */
23362306a36Sopenharmony_ci	if (sz1 == sz2 ||
23462306a36Sopenharmony_ci	    !pi1->relative_throughput || !pi2->relative_throughput)
23562306a36Sopenharmony_ci		return pi2->relative_throughput - pi1->relative_throughput;
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_ci	/*
23862306a36Sopenharmony_ci	 * Case 3: Calculate service time. Choose faster path.
23962306a36Sopenharmony_ci	 *         Service time using pi1:
24062306a36Sopenharmony_ci	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
24162306a36Sopenharmony_ci	 *         Service time using pi2:
24262306a36Sopenharmony_ci	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
24362306a36Sopenharmony_ci	 *
24462306a36Sopenharmony_ci	 *         To avoid the division, transform the expression to use
24562306a36Sopenharmony_ci	 *         multiplication.
24662306a36Sopenharmony_ci	 *         Because ->relative_throughput > 0 here, if st1 < st2,
24762306a36Sopenharmony_ci	 *         the expressions below are the same meaning:
24862306a36Sopenharmony_ci	 *             (sz1 + incoming) / pi1->relative_throughput <
24962306a36Sopenharmony_ci	 *                 (sz2 + incoming) / pi2->relative_throughput
25062306a36Sopenharmony_ci	 *             (sz1 + incoming) * pi2->relative_throughput <
25162306a36Sopenharmony_ci	 *                 (sz2 + incoming) * pi1->relative_throughput
25262306a36Sopenharmony_ci	 *         So use the later one.
25362306a36Sopenharmony_ci	 */
25462306a36Sopenharmony_ci	sz1 += incoming;
25562306a36Sopenharmony_ci	sz2 += incoming;
25662306a36Sopenharmony_ci	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
25762306a36Sopenharmony_ci		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
25862306a36Sopenharmony_ci		/*
25962306a36Sopenharmony_ci		 * Size may be too big for multiplying pi->relative_throughput
26062306a36Sopenharmony_ci		 * and overflow.
26162306a36Sopenharmony_ci		 * To avoid the overflow and mis-selection, shift down both.
26262306a36Sopenharmony_ci		 */
26362306a36Sopenharmony_ci		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
26462306a36Sopenharmony_ci		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
26562306a36Sopenharmony_ci	}
26662306a36Sopenharmony_ci	st1 = sz1 * pi2->relative_throughput;
26762306a36Sopenharmony_ci	st2 = sz2 * pi1->relative_throughput;
26862306a36Sopenharmony_ci	if (st1 != st2)
26962306a36Sopenharmony_ci		return st1 - st2;
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci	/*
27262306a36Sopenharmony_ci	 * Case 4: Service time is equal. Choose higher throughput path.
27362306a36Sopenharmony_ci	 */
27462306a36Sopenharmony_ci	return pi2->relative_throughput - pi1->relative_throughput;
27562306a36Sopenharmony_ci}
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_cistatic struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes)
27862306a36Sopenharmony_ci{
27962306a36Sopenharmony_ci	struct selector *s = ps->context;
28062306a36Sopenharmony_ci	struct path_info *pi = NULL, *best = NULL;
28162306a36Sopenharmony_ci	struct dm_path *ret = NULL;
28262306a36Sopenharmony_ci	unsigned long flags;
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
28562306a36Sopenharmony_ci	if (list_empty(&s->valid_paths))
28662306a36Sopenharmony_ci		goto out;
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	list_for_each_entry(pi, &s->valid_paths, list)
28962306a36Sopenharmony_ci		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
29062306a36Sopenharmony_ci			best = pi;
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci	if (!best)
29362306a36Sopenharmony_ci		goto out;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	/* Move most recently used to least preferred to evenly balance. */
29662306a36Sopenharmony_ci	list_move_tail(&best->list, &s->valid_paths);
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	ret = best->path;
29962306a36Sopenharmony_ciout:
30062306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
30162306a36Sopenharmony_ci	return ret;
30262306a36Sopenharmony_ci}
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_cistatic int st_start_io(struct path_selector *ps, struct dm_path *path,
30562306a36Sopenharmony_ci		       size_t nr_bytes)
30662306a36Sopenharmony_ci{
30762306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ci	atomic_add(nr_bytes, &pi->in_flight_size);
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	return 0;
31262306a36Sopenharmony_ci}
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_cistatic int st_end_io(struct path_selector *ps, struct dm_path *path,
31562306a36Sopenharmony_ci		     size_t nr_bytes, u64 start_time)
31662306a36Sopenharmony_ci{
31762306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci	atomic_sub(nr_bytes, &pi->in_flight_size);
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	return 0;
32262306a36Sopenharmony_ci}
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_cistatic struct path_selector_type st_ps = {
32562306a36Sopenharmony_ci	.name		= "service-time",
32662306a36Sopenharmony_ci	.module		= THIS_MODULE,
32762306a36Sopenharmony_ci	.table_args	= 2,
32862306a36Sopenharmony_ci	.info_args	= 2,
32962306a36Sopenharmony_ci	.create		= st_create,
33062306a36Sopenharmony_ci	.destroy	= st_destroy,
33162306a36Sopenharmony_ci	.status		= st_status,
33262306a36Sopenharmony_ci	.add_path	= st_add_path,
33362306a36Sopenharmony_ci	.fail_path	= st_fail_path,
33462306a36Sopenharmony_ci	.reinstate_path	= st_reinstate_path,
33562306a36Sopenharmony_ci	.select_path	= st_select_path,
33662306a36Sopenharmony_ci	.start_io	= st_start_io,
33762306a36Sopenharmony_ci	.end_io		= st_end_io,
33862306a36Sopenharmony_ci};
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_cistatic int __init dm_st_init(void)
34162306a36Sopenharmony_ci{
34262306a36Sopenharmony_ci	int r = dm_register_path_selector(&st_ps);
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	if (r < 0)
34562306a36Sopenharmony_ci		DMERR("register failed %d", r);
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	DMINFO("version " ST_VERSION " loaded");
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci	return r;
35062306a36Sopenharmony_ci}
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_cistatic void __exit dm_st_exit(void)
35362306a36Sopenharmony_ci{
35462306a36Sopenharmony_ci	int r = dm_unregister_path_selector(&st_ps);
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	if (r < 0)
35762306a36Sopenharmony_ci		DMERR("unregister failed %d", r);
35862306a36Sopenharmony_ci}
35962306a36Sopenharmony_ci
36062306a36Sopenharmony_cimodule_init(dm_st_init);
36162306a36Sopenharmony_cimodule_exit(dm_st_exit);
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ciMODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
36462306a36Sopenharmony_ciMODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
36562306a36Sopenharmony_ciMODULE_LICENSE("GPL");
366