162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Historical Service Time
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci *  Keeps a time-weighted exponential moving average of the historical
662306a36Sopenharmony_ci *  service time. Estimates future service time based on the historical
762306a36Sopenharmony_ci *  service time and the number of outstanding requests.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci *  Marks paths stale if they have not finished within hst *
1062306a36Sopenharmony_ci *  num_paths. If a path is stale and unused, we will send a single
1162306a36Sopenharmony_ci *  request to probe in case the path has improved. This situation
1262306a36Sopenharmony_ci *  generally arises if the path is so much worse than others that it
1362306a36Sopenharmony_ci *  will never have the best estimated service time, or if the entire
1462306a36Sopenharmony_ci *  multipath device is unused. If a path is stale and in use, limit the
1562306a36Sopenharmony_ci *  number of requests it can receive with the assumption that the path
1662306a36Sopenharmony_ci *  has become degraded.
1762306a36Sopenharmony_ci *
1862306a36Sopenharmony_ci *  To avoid repeatedly calculating exponents for time weighting, times
1962306a36Sopenharmony_ci *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
2062306a36Sopenharmony_ci *  ns, and the weighting is pre-calculated.
2162306a36Sopenharmony_ci *
2262306a36Sopenharmony_ci */
2362306a36Sopenharmony_ci
2462306a36Sopenharmony_ci#include "dm.h"
2562306a36Sopenharmony_ci#include "dm-path-selector.h"
2662306a36Sopenharmony_ci
2762306a36Sopenharmony_ci#include <linux/blkdev.h>
2862306a36Sopenharmony_ci#include <linux/slab.h>
2962306a36Sopenharmony_ci#include <linux/module.h>
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci#define DM_MSG_PREFIX	"multipath historical-service-time"
3362306a36Sopenharmony_ci#define HST_MIN_IO 1
3462306a36Sopenharmony_ci#define HST_VERSION "0.1.1"
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_ci#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
3762306a36Sopenharmony_ci#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
3862306a36Sopenharmony_ci#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
3962306a36Sopenharmony_ci#define HST_FIXED_95 972
4062306a36Sopenharmony_ci#define HST_MAX_INFLIGHT HST_FIXED_1
4162306a36Sopenharmony_ci#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
4262306a36Sopenharmony_ci#define HST_WEIGHT_COUNT 64ULL
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_cistruct selector {
4562306a36Sopenharmony_ci	struct list_head valid_paths;
4662306a36Sopenharmony_ci	struct list_head failed_paths;
4762306a36Sopenharmony_ci	int valid_count;
4862306a36Sopenharmony_ci	spinlock_t lock;
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ci	unsigned int weights[HST_WEIGHT_COUNT];
5162306a36Sopenharmony_ci	unsigned int threshold_multiplier;
5262306a36Sopenharmony_ci};
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_cistruct path_info {
5562306a36Sopenharmony_ci	struct list_head list;
5662306a36Sopenharmony_ci	struct dm_path *path;
5762306a36Sopenharmony_ci	unsigned int repeat_count;
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	spinlock_t lock;
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ci	u64 historical_service_time; /* Fixed point */
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	u64 stale_after;
6462306a36Sopenharmony_ci	u64 last_finish;
6562306a36Sopenharmony_ci
6662306a36Sopenharmony_ci	u64 outstanding;
6762306a36Sopenharmony_ci};
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_ci/**
7062306a36Sopenharmony_ci * fixed_power - compute: x^n, in O(log n) time
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci * @x:         base of the power
7362306a36Sopenharmony_ci * @frac_bits: fractional bits of @x
7462306a36Sopenharmony_ci * @n:         power to raise @x to.
7562306a36Sopenharmony_ci *
7662306a36Sopenharmony_ci * By exploiting the relation between the definition of the natural power
7762306a36Sopenharmony_ci * function: x^n := x*x*...*x (x multiplied by itself for n times), and
7862306a36Sopenharmony_ci * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
7962306a36Sopenharmony_ci * (where: n_i \elem {0, 1}, the binary vector representing n),
8062306a36Sopenharmony_ci * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
8162306a36Sopenharmony_ci * of course trivially computable in O(log_2 n), the length of our binary
8262306a36Sopenharmony_ci * vector.
8362306a36Sopenharmony_ci *
8462306a36Sopenharmony_ci * (see: kernel/sched/loadavg.c)
8562306a36Sopenharmony_ci */
8662306a36Sopenharmony_cistatic u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
8762306a36Sopenharmony_ci{
8862306a36Sopenharmony_ci	unsigned long result = 1UL << frac_bits;
8962306a36Sopenharmony_ci
9062306a36Sopenharmony_ci	if (n) {
9162306a36Sopenharmony_ci		for (;;) {
9262306a36Sopenharmony_ci			if (n & 1) {
9362306a36Sopenharmony_ci				result *= x;
9462306a36Sopenharmony_ci				result += 1UL << (frac_bits - 1);
9562306a36Sopenharmony_ci				result >>= frac_bits;
9662306a36Sopenharmony_ci			}
9762306a36Sopenharmony_ci			n >>= 1;
9862306a36Sopenharmony_ci			if (!n)
9962306a36Sopenharmony_ci				break;
10062306a36Sopenharmony_ci			x *= x;
10162306a36Sopenharmony_ci			x += 1UL << (frac_bits - 1);
10262306a36Sopenharmony_ci			x >>= frac_bits;
10362306a36Sopenharmony_ci		}
10462306a36Sopenharmony_ci	}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	return result;
10762306a36Sopenharmony_ci}
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci/*
11062306a36Sopenharmony_ci * Calculate the next value of an exponential moving average
11162306a36Sopenharmony_ci * a_1 = a_0 * e + a * (1 - e)
11262306a36Sopenharmony_ci *
11362306a36Sopenharmony_ci * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
11462306a36Sopenharmony_ci * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
11562306a36Sopenharmony_ci * @weight: [0, HST_FIXED_1]
11662306a36Sopenharmony_ci *
11762306a36Sopenharmony_ci * Note:
11862306a36Sopenharmony_ci *   To account for multiple periods in the same calculation,
11962306a36Sopenharmony_ci *   a_n = a_0 * e^n + a * (1 - e^n),
12062306a36Sopenharmony_ci *   so call fixed_ema(last, next, pow(weight, N))
12162306a36Sopenharmony_ci */
12262306a36Sopenharmony_cistatic u64 fixed_ema(u64 last, u64 next, u64 weight)
12362306a36Sopenharmony_ci{
12462306a36Sopenharmony_ci	last *= weight;
12562306a36Sopenharmony_ci	last += next * (HST_FIXED_1 - weight);
12662306a36Sopenharmony_ci	last += 1ULL << (HST_FIXED_SHIFT - 1);
12762306a36Sopenharmony_ci	return last >> HST_FIXED_SHIFT;
12862306a36Sopenharmony_ci}
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_cistatic struct selector *alloc_selector(void)
13162306a36Sopenharmony_ci{
13262306a36Sopenharmony_ci	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_ci	if (s) {
13562306a36Sopenharmony_ci		INIT_LIST_HEAD(&s->valid_paths);
13662306a36Sopenharmony_ci		INIT_LIST_HEAD(&s->failed_paths);
13762306a36Sopenharmony_ci		spin_lock_init(&s->lock);
13862306a36Sopenharmony_ci		s->valid_count = 0;
13962306a36Sopenharmony_ci	}
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_ci	return s;
14262306a36Sopenharmony_ci}
14362306a36Sopenharmony_ci
14462306a36Sopenharmony_ci/*
14562306a36Sopenharmony_ci * Get the weight for a given time span.
14662306a36Sopenharmony_ci */
14762306a36Sopenharmony_cistatic u64 hst_weight(struct path_selector *ps, u64 delta)
14862306a36Sopenharmony_ci{
14962306a36Sopenharmony_ci	struct selector *s = ps->context;
15062306a36Sopenharmony_ci	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
15162306a36Sopenharmony_ci			   HST_WEIGHT_COUNT - 1);
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_ci	return s->weights[bucket];
15462306a36Sopenharmony_ci}
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci/*
15762306a36Sopenharmony_ci * Set up the weights array.
15862306a36Sopenharmony_ci *
15962306a36Sopenharmony_ci * weights[len-1] = 0
16062306a36Sopenharmony_ci * weights[n] = base ^ (n + 1)
16162306a36Sopenharmony_ci */
16262306a36Sopenharmony_cistatic void hst_set_weights(struct path_selector *ps, unsigned int base)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	struct selector *s = ps->context;
16562306a36Sopenharmony_ci	int i;
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci	if (base >= HST_FIXED_1)
16862306a36Sopenharmony_ci		return;
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
17162306a36Sopenharmony_ci		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
17262306a36Sopenharmony_ci	s->weights[HST_WEIGHT_COUNT - 1] = 0;
17362306a36Sopenharmony_ci}
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_cistatic int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
17662306a36Sopenharmony_ci{
17762306a36Sopenharmony_ci	struct selector *s;
17862306a36Sopenharmony_ci	unsigned int base_weight = HST_FIXED_95;
17962306a36Sopenharmony_ci	unsigned int threshold_multiplier = 0;
18062306a36Sopenharmony_ci	char dummy;
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	/*
18362306a36Sopenharmony_ci	 * Arguments: [<base_weight> [<threshold_multiplier>]]
18462306a36Sopenharmony_ci	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
18562306a36Sopenharmony_ci	 *                  value of 0 will completely ignore any history.
18662306a36Sopenharmony_ci	 *                  If not given, default (HST_FIXED_95) is used.
18762306a36Sopenharmony_ci	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
18862306a36Sopenharmony_ci	 *                  be considered different. That is, a path is
18962306a36Sopenharmony_ci	 *                  considered different iff (p1 > N * p2) where p1
19062306a36Sopenharmony_ci	 *                  is the path with higher service time. A threshold
19162306a36Sopenharmony_ci	 *                  of 1 or 0 has no effect. Defaults to 0.
19262306a36Sopenharmony_ci	 */
19362306a36Sopenharmony_ci	if (argc > 2)
19462306a36Sopenharmony_ci		return -EINVAL;
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
19762306a36Sopenharmony_ci	     base_weight >= HST_FIXED_1)) {
19862306a36Sopenharmony_ci		return -EINVAL;
19962306a36Sopenharmony_ci	}
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_ci	if (argc > 1 && (sscanf(argv[1], "%u%c",
20262306a36Sopenharmony_ci				&threshold_multiplier, &dummy) != 1)) {
20362306a36Sopenharmony_ci		return -EINVAL;
20462306a36Sopenharmony_ci	}
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci	s = alloc_selector();
20762306a36Sopenharmony_ci	if (!s)
20862306a36Sopenharmony_ci		return -ENOMEM;
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	ps->context = s;
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	hst_set_weights(ps, base_weight);
21362306a36Sopenharmony_ci	s->threshold_multiplier = threshold_multiplier;
21462306a36Sopenharmony_ci	return 0;
21562306a36Sopenharmony_ci}
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_cistatic void free_paths(struct list_head *paths)
21862306a36Sopenharmony_ci{
21962306a36Sopenharmony_ci	struct path_info *pi, *next;
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ci	list_for_each_entry_safe(pi, next, paths, list) {
22262306a36Sopenharmony_ci		list_del(&pi->list);
22362306a36Sopenharmony_ci		kfree(pi);
22462306a36Sopenharmony_ci	}
22562306a36Sopenharmony_ci}
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_cistatic void hst_destroy(struct path_selector *ps)
22862306a36Sopenharmony_ci{
22962306a36Sopenharmony_ci	struct selector *s = ps->context;
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci	free_paths(&s->valid_paths);
23262306a36Sopenharmony_ci	free_paths(&s->failed_paths);
23362306a36Sopenharmony_ci	kfree(s);
23462306a36Sopenharmony_ci	ps->context = NULL;
23562306a36Sopenharmony_ci}
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_cistatic int hst_status(struct path_selector *ps, struct dm_path *path,
23862306a36Sopenharmony_ci		     status_type_t type, char *result, unsigned int maxlen)
23962306a36Sopenharmony_ci{
24062306a36Sopenharmony_ci	unsigned int sz = 0;
24162306a36Sopenharmony_ci	struct path_info *pi;
24262306a36Sopenharmony_ci
24362306a36Sopenharmony_ci	if (!path) {
24462306a36Sopenharmony_ci		struct selector *s = ps->context;
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
24762306a36Sopenharmony_ci	} else {
24862306a36Sopenharmony_ci		pi = path->pscontext;
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci		switch (type) {
25162306a36Sopenharmony_ci		case STATUSTYPE_INFO:
25262306a36Sopenharmony_ci			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
25362306a36Sopenharmony_ci			       pi->outstanding, pi->stale_after);
25462306a36Sopenharmony_ci			break;
25562306a36Sopenharmony_ci		case STATUSTYPE_TABLE:
25662306a36Sopenharmony_ci			DMEMIT("0 ");
25762306a36Sopenharmony_ci			break;
25862306a36Sopenharmony_ci		case STATUSTYPE_IMA:
25962306a36Sopenharmony_ci			*result = '\0';
26062306a36Sopenharmony_ci			break;
26162306a36Sopenharmony_ci		}
26262306a36Sopenharmony_ci	}
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	return sz;
26562306a36Sopenharmony_ci}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_cistatic int hst_add_path(struct path_selector *ps, struct dm_path *path,
26862306a36Sopenharmony_ci		       int argc, char **argv, char **error)
26962306a36Sopenharmony_ci{
27062306a36Sopenharmony_ci	struct selector *s = ps->context;
27162306a36Sopenharmony_ci	struct path_info *pi;
27262306a36Sopenharmony_ci	unsigned int repeat_count = HST_MIN_IO;
27362306a36Sopenharmony_ci	char dummy;
27462306a36Sopenharmony_ci	unsigned long flags;
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	/*
27762306a36Sopenharmony_ci	 * Arguments: [<repeat_count>]
27862306a36Sopenharmony_ci	 *   <repeat_count>: The number of I/Os before switching path.
27962306a36Sopenharmony_ci	 *                   If not given, default (HST_MIN_IO) is used.
28062306a36Sopenharmony_ci	 */
28162306a36Sopenharmony_ci	if (argc > 1) {
28262306a36Sopenharmony_ci		*error = "historical-service-time ps: incorrect number of arguments";
28362306a36Sopenharmony_ci		return -EINVAL;
28462306a36Sopenharmony_ci	}
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
28762306a36Sopenharmony_ci		*error = "historical-service-time ps: invalid repeat count";
28862306a36Sopenharmony_ci		return -EINVAL;
28962306a36Sopenharmony_ci	}
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci	/* allocate the path */
29262306a36Sopenharmony_ci	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
29362306a36Sopenharmony_ci	if (!pi) {
29462306a36Sopenharmony_ci		*error = "historical-service-time ps: Error allocating path context";
29562306a36Sopenharmony_ci		return -ENOMEM;
29662306a36Sopenharmony_ci	}
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	pi->path = path;
29962306a36Sopenharmony_ci	pi->repeat_count = repeat_count;
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	pi->historical_service_time = HST_FIXED_1;
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	spin_lock_init(&pi->lock);
30462306a36Sopenharmony_ci	pi->outstanding = 0;
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	pi->stale_after = 0;
30762306a36Sopenharmony_ci	pi->last_finish = 0;
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ci	path->pscontext = pi;
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
31262306a36Sopenharmony_ci	list_add_tail(&pi->list, &s->valid_paths);
31362306a36Sopenharmony_ci	s->valid_count++;
31462306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
31562306a36Sopenharmony_ci
31662306a36Sopenharmony_ci	return 0;
31762306a36Sopenharmony_ci}
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_cistatic void hst_fail_path(struct path_selector *ps, struct dm_path *path)
32062306a36Sopenharmony_ci{
32162306a36Sopenharmony_ci	struct selector *s = ps->context;
32262306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
32362306a36Sopenharmony_ci	unsigned long flags;
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
32662306a36Sopenharmony_ci	list_move(&pi->list, &s->failed_paths);
32762306a36Sopenharmony_ci	s->valid_count--;
32862306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
32962306a36Sopenharmony_ci}
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_cistatic int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
33262306a36Sopenharmony_ci{
33362306a36Sopenharmony_ci	struct selector *s = ps->context;
33462306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
33562306a36Sopenharmony_ci	unsigned long flags;
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
33862306a36Sopenharmony_ci	list_move_tail(&pi->list, &s->valid_paths);
33962306a36Sopenharmony_ci	s->valid_count++;
34062306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	return 0;
34362306a36Sopenharmony_ci}
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_cistatic void hst_fill_compare(struct path_info *pi, u64 *hst,
34662306a36Sopenharmony_ci			     u64 *out, u64 *stale)
34762306a36Sopenharmony_ci{
34862306a36Sopenharmony_ci	unsigned long flags;
34962306a36Sopenharmony_ci
35062306a36Sopenharmony_ci	spin_lock_irqsave(&pi->lock, flags);
35162306a36Sopenharmony_ci	*hst = pi->historical_service_time;
35262306a36Sopenharmony_ci	*out = pi->outstanding;
35362306a36Sopenharmony_ci	*stale = pi->stale_after;
35462306a36Sopenharmony_ci	spin_unlock_irqrestore(&pi->lock, flags);
35562306a36Sopenharmony_ci}
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_ci/*
35862306a36Sopenharmony_ci * Compare the estimated service time of 2 paths, pi1 and pi2,
35962306a36Sopenharmony_ci * for the incoming I/O.
36062306a36Sopenharmony_ci *
36162306a36Sopenharmony_ci * Returns:
36262306a36Sopenharmony_ci * < 0 : pi1 is better
36362306a36Sopenharmony_ci * 0   : no difference between pi1 and pi2
36462306a36Sopenharmony_ci * > 0 : pi2 is better
36562306a36Sopenharmony_ci *
36662306a36Sopenharmony_ci */
36762306a36Sopenharmony_cistatic long long hst_compare(struct path_info *pi1, struct path_info *pi2,
36862306a36Sopenharmony_ci			     u64 time_now, struct path_selector *ps)
36962306a36Sopenharmony_ci{
37062306a36Sopenharmony_ci	struct selector *s = ps->context;
37162306a36Sopenharmony_ci	u64 hst1, hst2;
37262306a36Sopenharmony_ci	long long out1, out2, stale1, stale2;
37362306a36Sopenharmony_ci	int pi2_better, over_threshold;
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci	hst_fill_compare(pi1, &hst1, &out1, &stale1);
37662306a36Sopenharmony_ci	hst_fill_compare(pi2, &hst2, &out2, &stale2);
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci	/* Check here if estimated latency for two paths are too similar.
37962306a36Sopenharmony_ci	 * If this is the case, we skip extra calculation and just compare
38062306a36Sopenharmony_ci	 * outstanding requests. In this case, any unloaded paths will
38162306a36Sopenharmony_ci	 * be preferred.
38262306a36Sopenharmony_ci	 */
38362306a36Sopenharmony_ci	if (hst1 > hst2)
38462306a36Sopenharmony_ci		over_threshold = hst1 > (s->threshold_multiplier * hst2);
38562306a36Sopenharmony_ci	else
38662306a36Sopenharmony_ci		over_threshold = hst2 > (s->threshold_multiplier * hst1);
38762306a36Sopenharmony_ci
38862306a36Sopenharmony_ci	if (!over_threshold)
38962306a36Sopenharmony_ci		return out1 - out2;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	/*
39262306a36Sopenharmony_ci	 * If an unloaded path is stale, choose it. If both paths are unloaded,
39362306a36Sopenharmony_ci	 * choose path that is the most stale.
39462306a36Sopenharmony_ci	 * (If one path is loaded, choose the other)
39562306a36Sopenharmony_ci	 */
39662306a36Sopenharmony_ci	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
39762306a36Sopenharmony_ci	    (!out1 && !out2))
39862306a36Sopenharmony_ci		return (!out2 * stale1) - (!out1 * stale2);
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci	/* Compare estimated service time. If outstanding is the same, we
40162306a36Sopenharmony_ci	 * don't need to multiply
40262306a36Sopenharmony_ci	 */
40362306a36Sopenharmony_ci	if (out1 == out2) {
40462306a36Sopenharmony_ci		pi2_better = hst1 > hst2;
40562306a36Sopenharmony_ci	} else {
40662306a36Sopenharmony_ci		/* Potential overflow with out >= 1024 */
40762306a36Sopenharmony_ci		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
40862306a36Sopenharmony_ci			     out2 >= HST_MAX_INFLIGHT)) {
40962306a36Sopenharmony_ci			/* If over 1023 in-flights, we may overflow if hst
41062306a36Sopenharmony_ci			 * is at max. (With this shift we still overflow at
41162306a36Sopenharmony_ci			 * 1048576 in-flights, which is high enough).
41262306a36Sopenharmony_ci			 */
41362306a36Sopenharmony_ci			hst1 >>= HST_FIXED_SHIFT;
41462306a36Sopenharmony_ci			hst2 >>= HST_FIXED_SHIFT;
41562306a36Sopenharmony_ci		}
41662306a36Sopenharmony_ci		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
41762306a36Sopenharmony_ci	}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	/* In the case that the 'winner' is stale, limit to equal usage. */
42062306a36Sopenharmony_ci	if (pi2_better) {
42162306a36Sopenharmony_ci		if (stale2 < time_now)
42262306a36Sopenharmony_ci			return out1 - out2;
42362306a36Sopenharmony_ci		return 1;
42462306a36Sopenharmony_ci	}
42562306a36Sopenharmony_ci	if (stale1 < time_now)
42662306a36Sopenharmony_ci		return out1 - out2;
42762306a36Sopenharmony_ci	return -1;
42862306a36Sopenharmony_ci}
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_cistatic struct dm_path *hst_select_path(struct path_selector *ps,
43162306a36Sopenharmony_ci				       size_t nr_bytes)
43262306a36Sopenharmony_ci{
43362306a36Sopenharmony_ci	struct selector *s = ps->context;
43462306a36Sopenharmony_ci	struct path_info *pi = NULL, *best = NULL;
43562306a36Sopenharmony_ci	u64 time_now = ktime_get_ns();
43662306a36Sopenharmony_ci	struct dm_path *ret = NULL;
43762306a36Sopenharmony_ci	unsigned long flags;
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci	spin_lock_irqsave(&s->lock, flags);
44062306a36Sopenharmony_ci	if (list_empty(&s->valid_paths))
44162306a36Sopenharmony_ci		goto out;
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	list_for_each_entry(pi, &s->valid_paths, list) {
44462306a36Sopenharmony_ci		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
44562306a36Sopenharmony_ci			best = pi;
44662306a36Sopenharmony_ci	}
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_ci	if (!best)
44962306a36Sopenharmony_ci		goto out;
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci	/* Move last used path to end (least preferred in case of ties) */
45262306a36Sopenharmony_ci	list_move_tail(&best->list, &s->valid_paths);
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci	ret = best->path;
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ciout:
45762306a36Sopenharmony_ci	spin_unlock_irqrestore(&s->lock, flags);
45862306a36Sopenharmony_ci	return ret;
45962306a36Sopenharmony_ci}
46062306a36Sopenharmony_ci
46162306a36Sopenharmony_cistatic int hst_start_io(struct path_selector *ps, struct dm_path *path,
46262306a36Sopenharmony_ci			size_t nr_bytes)
46362306a36Sopenharmony_ci{
46462306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
46562306a36Sopenharmony_ci	unsigned long flags;
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci	spin_lock_irqsave(&pi->lock, flags);
46862306a36Sopenharmony_ci	pi->outstanding++;
46962306a36Sopenharmony_ci	spin_unlock_irqrestore(&pi->lock, flags);
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	return 0;
47262306a36Sopenharmony_ci}
47362306a36Sopenharmony_ci
47462306a36Sopenharmony_cistatic u64 path_service_time(struct path_info *pi, u64 start_time)
47562306a36Sopenharmony_ci{
47662306a36Sopenharmony_ci	u64 now = ktime_get_ns();
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_ci	/* if a previous disk request has finished after this IO was
47962306a36Sopenharmony_ci	 * sent to the hardware, pretend the submission happened
48062306a36Sopenharmony_ci	 * serially.
48162306a36Sopenharmony_ci	 */
48262306a36Sopenharmony_ci	if (time_after64(pi->last_finish, start_time))
48362306a36Sopenharmony_ci		start_time = pi->last_finish;
48462306a36Sopenharmony_ci
48562306a36Sopenharmony_ci	pi->last_finish = now;
48662306a36Sopenharmony_ci	if (time_before64(now, start_time))
48762306a36Sopenharmony_ci		return 0;
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci	return now - start_time;
49062306a36Sopenharmony_ci}
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_cistatic int hst_end_io(struct path_selector *ps, struct dm_path *path,
49362306a36Sopenharmony_ci		      size_t nr_bytes, u64 start_time)
49462306a36Sopenharmony_ci{
49562306a36Sopenharmony_ci	struct path_info *pi = path->pscontext;
49662306a36Sopenharmony_ci	struct selector *s = ps->context;
49762306a36Sopenharmony_ci	unsigned long flags;
49862306a36Sopenharmony_ci	u64 st;
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	spin_lock_irqsave(&pi->lock, flags);
50162306a36Sopenharmony_ci
50262306a36Sopenharmony_ci	st = path_service_time(pi, start_time);
50362306a36Sopenharmony_ci	pi->outstanding--;
50462306a36Sopenharmony_ci	pi->historical_service_time =
50562306a36Sopenharmony_ci		fixed_ema(pi->historical_service_time,
50662306a36Sopenharmony_ci			  min(st * HST_FIXED_1, HST_FIXED_MAX),
50762306a36Sopenharmony_ci			  hst_weight(ps, st));
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	/*
51062306a36Sopenharmony_ci	 * On request end, mark path as fresh. If a path hasn't
51162306a36Sopenharmony_ci	 * finished any requests within the fresh period, the estimated
51262306a36Sopenharmony_ci	 * service time is considered too optimistic and we limit the
51362306a36Sopenharmony_ci	 * maximum requests on that path.
51462306a36Sopenharmony_ci	 */
51562306a36Sopenharmony_ci	pi->stale_after = pi->last_finish +
51662306a36Sopenharmony_ci		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci	spin_unlock_irqrestore(&pi->lock, flags);
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci	return 0;
52162306a36Sopenharmony_ci}
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_cistatic struct path_selector_type hst_ps = {
52462306a36Sopenharmony_ci	.name		= "historical-service-time",
52562306a36Sopenharmony_ci	.module		= THIS_MODULE,
52662306a36Sopenharmony_ci	.features	= DM_PS_USE_HR_TIMER,
52762306a36Sopenharmony_ci	.table_args	= 1,
52862306a36Sopenharmony_ci	.info_args	= 3,
52962306a36Sopenharmony_ci	.create		= hst_create,
53062306a36Sopenharmony_ci	.destroy	= hst_destroy,
53162306a36Sopenharmony_ci	.status		= hst_status,
53262306a36Sopenharmony_ci	.add_path	= hst_add_path,
53362306a36Sopenharmony_ci	.fail_path	= hst_fail_path,
53462306a36Sopenharmony_ci	.reinstate_path	= hst_reinstate_path,
53562306a36Sopenharmony_ci	.select_path	= hst_select_path,
53662306a36Sopenharmony_ci	.start_io	= hst_start_io,
53762306a36Sopenharmony_ci	.end_io		= hst_end_io,
53862306a36Sopenharmony_ci};
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_cistatic int __init dm_hst_init(void)
54162306a36Sopenharmony_ci{
54262306a36Sopenharmony_ci	int r = dm_register_path_selector(&hst_ps);
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci	if (r < 0)
54562306a36Sopenharmony_ci		DMERR("register failed %d", r);
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci	DMINFO("version " HST_VERSION " loaded");
54862306a36Sopenharmony_ci
54962306a36Sopenharmony_ci	return r;
55062306a36Sopenharmony_ci}
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_cistatic void __exit dm_hst_exit(void)
55362306a36Sopenharmony_ci{
55462306a36Sopenharmony_ci	int r = dm_unregister_path_selector(&hst_ps);
55562306a36Sopenharmony_ci
55662306a36Sopenharmony_ci	if (r < 0)
55762306a36Sopenharmony_ci		DMERR("unregister failed %d", r);
55862306a36Sopenharmony_ci}
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_cimodule_init(dm_hst_init);
56162306a36Sopenharmony_cimodule_exit(dm_hst_exit);
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ciMODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
56462306a36Sopenharmony_ciMODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
56562306a36Sopenharmony_ciMODULE_LICENSE("GPL");
566