162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Historical Service Time 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Keeps a time-weighted exponential moving average of the historical 662306a36Sopenharmony_ci * service time. Estimates future service time based on the historical 762306a36Sopenharmony_ci * service time and the number of outstanding requests. 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * Marks paths stale if they have not finished within hst * 1062306a36Sopenharmony_ci * num_paths. If a path is stale and unused, we will send a single 1162306a36Sopenharmony_ci * request to probe in case the path has improved. This situation 1262306a36Sopenharmony_ci * generally arises if the path is so much worse than others that it 1362306a36Sopenharmony_ci * will never have the best estimated service time, or if the entire 1462306a36Sopenharmony_ci * multipath device is unused. If a path is stale and in use, limit the 1562306a36Sopenharmony_ci * number of requests it can receive with the assumption that the path 1662306a36Sopenharmony_ci * has become degraded. 1762306a36Sopenharmony_ci * 1862306a36Sopenharmony_ci * To avoid repeatedly calculating exponents for time weighting, times 1962306a36Sopenharmony_ci * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) 2062306a36Sopenharmony_ci * ns, and the weighting is pre-calculated. 2162306a36Sopenharmony_ci * 2262306a36Sopenharmony_ci */ 2362306a36Sopenharmony_ci 2462306a36Sopenharmony_ci#include "dm.h" 2562306a36Sopenharmony_ci#include "dm-path-selector.h" 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci#include <linux/blkdev.h> 2862306a36Sopenharmony_ci#include <linux/slab.h> 2962306a36Sopenharmony_ci#include <linux/module.h> 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci 3262306a36Sopenharmony_ci#define DM_MSG_PREFIX "multipath historical-service-time" 3362306a36Sopenharmony_ci#define HST_MIN_IO 1 3462306a36Sopenharmony_ci#define HST_VERSION "0.1.1" 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ 3762306a36Sopenharmony_ci#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) 3862306a36Sopenharmony_ci#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) 3962306a36Sopenharmony_ci#define HST_FIXED_95 972 4062306a36Sopenharmony_ci#define HST_MAX_INFLIGHT HST_FIXED_1 4162306a36Sopenharmony_ci#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ 4262306a36Sopenharmony_ci#define HST_WEIGHT_COUNT 64ULL 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_cistruct selector { 4562306a36Sopenharmony_ci struct list_head valid_paths; 4662306a36Sopenharmony_ci struct list_head failed_paths; 4762306a36Sopenharmony_ci int valid_count; 4862306a36Sopenharmony_ci spinlock_t lock; 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_ci unsigned int weights[HST_WEIGHT_COUNT]; 5162306a36Sopenharmony_ci unsigned int threshold_multiplier; 5262306a36Sopenharmony_ci}; 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_cistruct path_info { 5562306a36Sopenharmony_ci struct list_head list; 5662306a36Sopenharmony_ci struct dm_path *path; 5762306a36Sopenharmony_ci unsigned int repeat_count; 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_ci spinlock_t lock; 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ci u64 historical_service_time; /* Fixed point */ 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_ci u64 stale_after; 6462306a36Sopenharmony_ci u64 last_finish; 6562306a36Sopenharmony_ci 6662306a36Sopenharmony_ci u64 outstanding; 6762306a36Sopenharmony_ci}; 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_ci/** 7062306a36Sopenharmony_ci * fixed_power - compute: x^n, in O(log n) time 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * @x: base of the power 7362306a36Sopenharmony_ci * @frac_bits: fractional bits of @x 7462306a36Sopenharmony_ci * @n: power to raise @x to. 7562306a36Sopenharmony_ci * 7662306a36Sopenharmony_ci * By exploiting the relation between the definition of the natural power 7762306a36Sopenharmony_ci * function: x^n := x*x*...*x (x multiplied by itself for n times), and 7862306a36Sopenharmony_ci * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 7962306a36Sopenharmony_ci * (where: n_i \elem {0, 1}, the binary vector representing n), 8062306a36Sopenharmony_ci * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 8162306a36Sopenharmony_ci * of course trivially computable in O(log_2 n), the length of our binary 8262306a36Sopenharmony_ci * vector. 8362306a36Sopenharmony_ci * 8462306a36Sopenharmony_ci * (see: kernel/sched/loadavg.c) 8562306a36Sopenharmony_ci */ 8662306a36Sopenharmony_cistatic u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) 8762306a36Sopenharmony_ci{ 8862306a36Sopenharmony_ci unsigned long result = 1UL << frac_bits; 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci if (n) { 9162306a36Sopenharmony_ci for (;;) { 9262306a36Sopenharmony_ci if (n & 1) { 9362306a36Sopenharmony_ci result *= x; 9462306a36Sopenharmony_ci result += 1UL << (frac_bits - 1); 9562306a36Sopenharmony_ci result >>= frac_bits; 9662306a36Sopenharmony_ci } 9762306a36Sopenharmony_ci n >>= 1; 9862306a36Sopenharmony_ci if (!n) 9962306a36Sopenharmony_ci break; 10062306a36Sopenharmony_ci x *= x; 10162306a36Sopenharmony_ci x += 1UL << (frac_bits - 1); 10262306a36Sopenharmony_ci x >>= frac_bits; 10362306a36Sopenharmony_ci } 10462306a36Sopenharmony_ci } 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci return result; 10762306a36Sopenharmony_ci} 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci/* 11062306a36Sopenharmony_ci * Calculate the next value of an exponential moving average 11162306a36Sopenharmony_ci * a_1 = a_0 * e + a * (1 - e) 11262306a36Sopenharmony_ci * 11362306a36Sopenharmony_ci * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 11462306a36Sopenharmony_ci * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 11562306a36Sopenharmony_ci * @weight: [0, HST_FIXED_1] 11662306a36Sopenharmony_ci * 11762306a36Sopenharmony_ci * Note: 11862306a36Sopenharmony_ci * To account for multiple periods in the same calculation, 11962306a36Sopenharmony_ci * a_n = a_0 * e^n + a * (1 - e^n), 12062306a36Sopenharmony_ci * so call fixed_ema(last, next, pow(weight, N)) 12162306a36Sopenharmony_ci */ 12262306a36Sopenharmony_cistatic u64 fixed_ema(u64 last, u64 next, u64 weight) 12362306a36Sopenharmony_ci{ 12462306a36Sopenharmony_ci last *= weight; 12562306a36Sopenharmony_ci last += next * (HST_FIXED_1 - weight); 12662306a36Sopenharmony_ci last += 1ULL << (HST_FIXED_SHIFT - 1); 12762306a36Sopenharmony_ci return last >> HST_FIXED_SHIFT; 12862306a36Sopenharmony_ci} 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_cistatic struct selector *alloc_selector(void) 13162306a36Sopenharmony_ci{ 13262306a36Sopenharmony_ci struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_ci if (s) { 13562306a36Sopenharmony_ci INIT_LIST_HEAD(&s->valid_paths); 13662306a36Sopenharmony_ci INIT_LIST_HEAD(&s->failed_paths); 13762306a36Sopenharmony_ci spin_lock_init(&s->lock); 13862306a36Sopenharmony_ci s->valid_count = 0; 13962306a36Sopenharmony_ci } 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci return s; 14262306a36Sopenharmony_ci} 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci/* 14562306a36Sopenharmony_ci * Get the weight for a given time span. 14662306a36Sopenharmony_ci */ 14762306a36Sopenharmony_cistatic u64 hst_weight(struct path_selector *ps, u64 delta) 14862306a36Sopenharmony_ci{ 14962306a36Sopenharmony_ci struct selector *s = ps->context; 15062306a36Sopenharmony_ci int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, 15162306a36Sopenharmony_ci HST_WEIGHT_COUNT - 1); 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci return s->weights[bucket]; 15462306a36Sopenharmony_ci} 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci/* 15762306a36Sopenharmony_ci * Set up the weights array. 15862306a36Sopenharmony_ci * 15962306a36Sopenharmony_ci * weights[len-1] = 0 16062306a36Sopenharmony_ci * weights[n] = base ^ (n + 1) 16162306a36Sopenharmony_ci */ 16262306a36Sopenharmony_cistatic void hst_set_weights(struct path_selector *ps, unsigned int base) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci struct selector *s = ps->context; 16562306a36Sopenharmony_ci int i; 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci if (base >= HST_FIXED_1) 16862306a36Sopenharmony_ci return; 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) 17162306a36Sopenharmony_ci s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); 17262306a36Sopenharmony_ci s->weights[HST_WEIGHT_COUNT - 1] = 0; 17362306a36Sopenharmony_ci} 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_cistatic int hst_create(struct path_selector *ps, unsigned int argc, char **argv) 17662306a36Sopenharmony_ci{ 17762306a36Sopenharmony_ci struct selector *s; 17862306a36Sopenharmony_ci unsigned int base_weight = HST_FIXED_95; 17962306a36Sopenharmony_ci unsigned int threshold_multiplier = 0; 18062306a36Sopenharmony_ci char dummy; 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci /* 18362306a36Sopenharmony_ci * Arguments: [<base_weight> [<threshold_multiplier>]] 18462306a36Sopenharmony_ci * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A 18562306a36Sopenharmony_ci * value of 0 will completely ignore any history. 18662306a36Sopenharmony_ci * If not given, default (HST_FIXED_95) is used. 18762306a36Sopenharmony_ci * <threshold_multiplier>: Minimum threshold multiplier for paths to 18862306a36Sopenharmony_ci * be considered different. That is, a path is 18962306a36Sopenharmony_ci * considered different iff (p1 > N * p2) where p1 19062306a36Sopenharmony_ci * is the path with higher service time. A threshold 19162306a36Sopenharmony_ci * of 1 or 0 has no effect. Defaults to 0. 19262306a36Sopenharmony_ci */ 19362306a36Sopenharmony_ci if (argc > 2) 19462306a36Sopenharmony_ci return -EINVAL; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || 19762306a36Sopenharmony_ci base_weight >= HST_FIXED_1)) { 19862306a36Sopenharmony_ci return -EINVAL; 19962306a36Sopenharmony_ci } 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci if (argc > 1 && (sscanf(argv[1], "%u%c", 20262306a36Sopenharmony_ci &threshold_multiplier, &dummy) != 1)) { 20362306a36Sopenharmony_ci return -EINVAL; 20462306a36Sopenharmony_ci } 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci s = alloc_selector(); 20762306a36Sopenharmony_ci if (!s) 20862306a36Sopenharmony_ci return -ENOMEM; 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci ps->context = s; 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci hst_set_weights(ps, base_weight); 21362306a36Sopenharmony_ci s->threshold_multiplier = threshold_multiplier; 21462306a36Sopenharmony_ci return 0; 21562306a36Sopenharmony_ci} 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_cistatic void free_paths(struct list_head *paths) 21862306a36Sopenharmony_ci{ 21962306a36Sopenharmony_ci struct path_info *pi, *next; 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci list_for_each_entry_safe(pi, next, paths, list) { 22262306a36Sopenharmony_ci list_del(&pi->list); 22362306a36Sopenharmony_ci kfree(pi); 22462306a36Sopenharmony_ci } 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_cistatic void hst_destroy(struct path_selector *ps) 22862306a36Sopenharmony_ci{ 22962306a36Sopenharmony_ci struct selector *s = ps->context; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci free_paths(&s->valid_paths); 23262306a36Sopenharmony_ci free_paths(&s->failed_paths); 23362306a36Sopenharmony_ci kfree(s); 23462306a36Sopenharmony_ci ps->context = NULL; 23562306a36Sopenharmony_ci} 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_cistatic int hst_status(struct path_selector *ps, struct dm_path *path, 23862306a36Sopenharmony_ci status_type_t type, char *result, unsigned int maxlen) 23962306a36Sopenharmony_ci{ 24062306a36Sopenharmony_ci unsigned int sz = 0; 24162306a36Sopenharmony_ci struct path_info *pi; 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci if (!path) { 24462306a36Sopenharmony_ci struct selector *s = ps->context; 24562306a36Sopenharmony_ci 24662306a36Sopenharmony_ci DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); 24762306a36Sopenharmony_ci } else { 24862306a36Sopenharmony_ci pi = path->pscontext; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci switch (type) { 25162306a36Sopenharmony_ci case STATUSTYPE_INFO: 25262306a36Sopenharmony_ci DMEMIT("%llu %llu %llu ", pi->historical_service_time, 25362306a36Sopenharmony_ci pi->outstanding, pi->stale_after); 25462306a36Sopenharmony_ci break; 25562306a36Sopenharmony_ci case STATUSTYPE_TABLE: 25662306a36Sopenharmony_ci DMEMIT("0 "); 25762306a36Sopenharmony_ci break; 25862306a36Sopenharmony_ci case STATUSTYPE_IMA: 25962306a36Sopenharmony_ci *result = '\0'; 26062306a36Sopenharmony_ci break; 26162306a36Sopenharmony_ci } 26262306a36Sopenharmony_ci } 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci return sz; 26562306a36Sopenharmony_ci} 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_cistatic int hst_add_path(struct path_selector *ps, struct dm_path *path, 26862306a36Sopenharmony_ci int argc, char **argv, char **error) 26962306a36Sopenharmony_ci{ 27062306a36Sopenharmony_ci struct selector *s = ps->context; 27162306a36Sopenharmony_ci struct path_info *pi; 27262306a36Sopenharmony_ci unsigned int repeat_count = HST_MIN_IO; 27362306a36Sopenharmony_ci char dummy; 27462306a36Sopenharmony_ci unsigned long flags; 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci /* 27762306a36Sopenharmony_ci * Arguments: [<repeat_count>] 27862306a36Sopenharmony_ci * <repeat_count>: The number of I/Os before switching path. 27962306a36Sopenharmony_ci * If not given, default (HST_MIN_IO) is used. 28062306a36Sopenharmony_ci */ 28162306a36Sopenharmony_ci if (argc > 1) { 28262306a36Sopenharmony_ci *error = "historical-service-time ps: incorrect number of arguments"; 28362306a36Sopenharmony_ci return -EINVAL; 28462306a36Sopenharmony_ci } 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 28762306a36Sopenharmony_ci *error = "historical-service-time ps: invalid repeat count"; 28862306a36Sopenharmony_ci return -EINVAL; 28962306a36Sopenharmony_ci } 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci /* allocate the path */ 29262306a36Sopenharmony_ci pi = kmalloc(sizeof(*pi), GFP_KERNEL); 29362306a36Sopenharmony_ci if (!pi) { 29462306a36Sopenharmony_ci *error = "historical-service-time ps: Error allocating path context"; 29562306a36Sopenharmony_ci return -ENOMEM; 29662306a36Sopenharmony_ci } 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci pi->path = path; 29962306a36Sopenharmony_ci pi->repeat_count = repeat_count; 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci pi->historical_service_time = HST_FIXED_1; 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci spin_lock_init(&pi->lock); 30462306a36Sopenharmony_ci pi->outstanding = 0; 30562306a36Sopenharmony_ci 30662306a36Sopenharmony_ci pi->stale_after = 0; 30762306a36Sopenharmony_ci pi->last_finish = 0; 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ci path->pscontext = pi; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 31262306a36Sopenharmony_ci list_add_tail(&pi->list, &s->valid_paths); 31362306a36Sopenharmony_ci s->valid_count++; 31462306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 31562306a36Sopenharmony_ci 31662306a36Sopenharmony_ci return 0; 31762306a36Sopenharmony_ci} 31862306a36Sopenharmony_ci 31962306a36Sopenharmony_cistatic void hst_fail_path(struct path_selector *ps, struct dm_path *path) 32062306a36Sopenharmony_ci{ 32162306a36Sopenharmony_ci struct selector *s = ps->context; 32262306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 32362306a36Sopenharmony_ci unsigned long flags; 32462306a36Sopenharmony_ci 32562306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 32662306a36Sopenharmony_ci list_move(&pi->list, &s->failed_paths); 32762306a36Sopenharmony_ci s->valid_count--; 32862306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 32962306a36Sopenharmony_ci} 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_cistatic int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) 33262306a36Sopenharmony_ci{ 33362306a36Sopenharmony_ci struct selector *s = ps->context; 33462306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 33562306a36Sopenharmony_ci unsigned long flags; 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 33862306a36Sopenharmony_ci list_move_tail(&pi->list, &s->valid_paths); 33962306a36Sopenharmony_ci s->valid_count++; 34062306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_ci return 0; 34362306a36Sopenharmony_ci} 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_cistatic void hst_fill_compare(struct path_info *pi, u64 *hst, 34662306a36Sopenharmony_ci u64 *out, u64 *stale) 34762306a36Sopenharmony_ci{ 34862306a36Sopenharmony_ci unsigned long flags; 34962306a36Sopenharmony_ci 35062306a36Sopenharmony_ci spin_lock_irqsave(&pi->lock, flags); 35162306a36Sopenharmony_ci *hst = pi->historical_service_time; 35262306a36Sopenharmony_ci *out = pi->outstanding; 35362306a36Sopenharmony_ci *stale = pi->stale_after; 35462306a36Sopenharmony_ci spin_unlock_irqrestore(&pi->lock, flags); 35562306a36Sopenharmony_ci} 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci/* 35862306a36Sopenharmony_ci * Compare the estimated service time of 2 paths, pi1 and pi2, 35962306a36Sopenharmony_ci * for the incoming I/O. 36062306a36Sopenharmony_ci * 36162306a36Sopenharmony_ci * Returns: 36262306a36Sopenharmony_ci * < 0 : pi1 is better 36362306a36Sopenharmony_ci * 0 : no difference between pi1 and pi2 36462306a36Sopenharmony_ci * > 0 : pi2 is better 36562306a36Sopenharmony_ci * 36662306a36Sopenharmony_ci */ 36762306a36Sopenharmony_cistatic long long hst_compare(struct path_info *pi1, struct path_info *pi2, 36862306a36Sopenharmony_ci u64 time_now, struct path_selector *ps) 36962306a36Sopenharmony_ci{ 37062306a36Sopenharmony_ci struct selector *s = ps->context; 37162306a36Sopenharmony_ci u64 hst1, hst2; 37262306a36Sopenharmony_ci long long out1, out2, stale1, stale2; 37362306a36Sopenharmony_ci int pi2_better, over_threshold; 37462306a36Sopenharmony_ci 37562306a36Sopenharmony_ci hst_fill_compare(pi1, &hst1, &out1, &stale1); 37662306a36Sopenharmony_ci hst_fill_compare(pi2, &hst2, &out2, &stale2); 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_ci /* Check here if estimated latency for two paths are too similar. 37962306a36Sopenharmony_ci * If this is the case, we skip extra calculation and just compare 38062306a36Sopenharmony_ci * outstanding requests. In this case, any unloaded paths will 38162306a36Sopenharmony_ci * be preferred. 38262306a36Sopenharmony_ci */ 38362306a36Sopenharmony_ci if (hst1 > hst2) 38462306a36Sopenharmony_ci over_threshold = hst1 > (s->threshold_multiplier * hst2); 38562306a36Sopenharmony_ci else 38662306a36Sopenharmony_ci over_threshold = hst2 > (s->threshold_multiplier * hst1); 38762306a36Sopenharmony_ci 38862306a36Sopenharmony_ci if (!over_threshold) 38962306a36Sopenharmony_ci return out1 - out2; 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci /* 39262306a36Sopenharmony_ci * If an unloaded path is stale, choose it. If both paths are unloaded, 39362306a36Sopenharmony_ci * choose path that is the most stale. 39462306a36Sopenharmony_ci * (If one path is loaded, choose the other) 39562306a36Sopenharmony_ci */ 39662306a36Sopenharmony_ci if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || 39762306a36Sopenharmony_ci (!out1 && !out2)) 39862306a36Sopenharmony_ci return (!out2 * stale1) - (!out1 * stale2); 39962306a36Sopenharmony_ci 40062306a36Sopenharmony_ci /* Compare estimated service time. If outstanding is the same, we 40162306a36Sopenharmony_ci * don't need to multiply 40262306a36Sopenharmony_ci */ 40362306a36Sopenharmony_ci if (out1 == out2) { 40462306a36Sopenharmony_ci pi2_better = hst1 > hst2; 40562306a36Sopenharmony_ci } else { 40662306a36Sopenharmony_ci /* Potential overflow with out >= 1024 */ 40762306a36Sopenharmony_ci if (unlikely(out1 >= HST_MAX_INFLIGHT || 40862306a36Sopenharmony_ci out2 >= HST_MAX_INFLIGHT)) { 40962306a36Sopenharmony_ci /* If over 1023 in-flights, we may overflow if hst 41062306a36Sopenharmony_ci * is at max. (With this shift we still overflow at 41162306a36Sopenharmony_ci * 1048576 in-flights, which is high enough). 41262306a36Sopenharmony_ci */ 41362306a36Sopenharmony_ci hst1 >>= HST_FIXED_SHIFT; 41462306a36Sopenharmony_ci hst2 >>= HST_FIXED_SHIFT; 41562306a36Sopenharmony_ci } 41662306a36Sopenharmony_ci pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; 41762306a36Sopenharmony_ci } 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci /* In the case that the 'winner' is stale, limit to equal usage. */ 42062306a36Sopenharmony_ci if (pi2_better) { 42162306a36Sopenharmony_ci if (stale2 < time_now) 42262306a36Sopenharmony_ci return out1 - out2; 42362306a36Sopenharmony_ci return 1; 42462306a36Sopenharmony_ci } 42562306a36Sopenharmony_ci if (stale1 < time_now) 42662306a36Sopenharmony_ci return out1 - out2; 42762306a36Sopenharmony_ci return -1; 42862306a36Sopenharmony_ci} 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_cistatic struct dm_path *hst_select_path(struct path_selector *ps, 43162306a36Sopenharmony_ci size_t nr_bytes) 43262306a36Sopenharmony_ci{ 43362306a36Sopenharmony_ci struct selector *s = ps->context; 43462306a36Sopenharmony_ci struct path_info *pi = NULL, *best = NULL; 43562306a36Sopenharmony_ci u64 time_now = ktime_get_ns(); 43662306a36Sopenharmony_ci struct dm_path *ret = NULL; 43762306a36Sopenharmony_ci unsigned long flags; 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 44062306a36Sopenharmony_ci if (list_empty(&s->valid_paths)) 44162306a36Sopenharmony_ci goto out; 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ci list_for_each_entry(pi, &s->valid_paths, list) { 44462306a36Sopenharmony_ci if (!best || (hst_compare(pi, best, time_now, ps) < 0)) 44562306a36Sopenharmony_ci best = pi; 44662306a36Sopenharmony_ci } 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci if (!best) 44962306a36Sopenharmony_ci goto out; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci /* Move last used path to end (least preferred in case of ties) */ 45262306a36Sopenharmony_ci list_move_tail(&best->list, &s->valid_paths); 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci ret = best->path; 45562306a36Sopenharmony_ci 45662306a36Sopenharmony_ciout: 45762306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 45862306a36Sopenharmony_ci return ret; 45962306a36Sopenharmony_ci} 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_cistatic int hst_start_io(struct path_selector *ps, struct dm_path *path, 46262306a36Sopenharmony_ci size_t nr_bytes) 46362306a36Sopenharmony_ci{ 46462306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 46562306a36Sopenharmony_ci unsigned long flags; 46662306a36Sopenharmony_ci 46762306a36Sopenharmony_ci spin_lock_irqsave(&pi->lock, flags); 46862306a36Sopenharmony_ci pi->outstanding++; 46962306a36Sopenharmony_ci spin_unlock_irqrestore(&pi->lock, flags); 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci return 0; 47262306a36Sopenharmony_ci} 47362306a36Sopenharmony_ci 47462306a36Sopenharmony_cistatic u64 path_service_time(struct path_info *pi, u64 start_time) 47562306a36Sopenharmony_ci{ 47662306a36Sopenharmony_ci u64 now = ktime_get_ns(); 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci /* if a previous disk request has finished after this IO was 47962306a36Sopenharmony_ci * sent to the hardware, pretend the submission happened 48062306a36Sopenharmony_ci * serially. 48162306a36Sopenharmony_ci */ 48262306a36Sopenharmony_ci if (time_after64(pi->last_finish, start_time)) 48362306a36Sopenharmony_ci start_time = pi->last_finish; 48462306a36Sopenharmony_ci 48562306a36Sopenharmony_ci pi->last_finish = now; 48662306a36Sopenharmony_ci if (time_before64(now, start_time)) 48762306a36Sopenharmony_ci return 0; 48862306a36Sopenharmony_ci 48962306a36Sopenharmony_ci return now - start_time; 49062306a36Sopenharmony_ci} 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_cistatic int hst_end_io(struct path_selector *ps, struct dm_path *path, 49362306a36Sopenharmony_ci size_t nr_bytes, u64 start_time) 49462306a36Sopenharmony_ci{ 49562306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 49662306a36Sopenharmony_ci struct selector *s = ps->context; 49762306a36Sopenharmony_ci unsigned long flags; 49862306a36Sopenharmony_ci u64 st; 49962306a36Sopenharmony_ci 50062306a36Sopenharmony_ci spin_lock_irqsave(&pi->lock, flags); 50162306a36Sopenharmony_ci 50262306a36Sopenharmony_ci st = path_service_time(pi, start_time); 50362306a36Sopenharmony_ci pi->outstanding--; 50462306a36Sopenharmony_ci pi->historical_service_time = 50562306a36Sopenharmony_ci fixed_ema(pi->historical_service_time, 50662306a36Sopenharmony_ci min(st * HST_FIXED_1, HST_FIXED_MAX), 50762306a36Sopenharmony_ci hst_weight(ps, st)); 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci /* 51062306a36Sopenharmony_ci * On request end, mark path as fresh. If a path hasn't 51162306a36Sopenharmony_ci * finished any requests within the fresh period, the estimated 51262306a36Sopenharmony_ci * service time is considered too optimistic and we limit the 51362306a36Sopenharmony_ci * maximum requests on that path. 51462306a36Sopenharmony_ci */ 51562306a36Sopenharmony_ci pi->stale_after = pi->last_finish + 51662306a36Sopenharmony_ci (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); 51762306a36Sopenharmony_ci 51862306a36Sopenharmony_ci spin_unlock_irqrestore(&pi->lock, flags); 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci return 0; 52162306a36Sopenharmony_ci} 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_cistatic struct path_selector_type hst_ps = { 52462306a36Sopenharmony_ci .name = "historical-service-time", 52562306a36Sopenharmony_ci .module = THIS_MODULE, 52662306a36Sopenharmony_ci .features = DM_PS_USE_HR_TIMER, 52762306a36Sopenharmony_ci .table_args = 1, 52862306a36Sopenharmony_ci .info_args = 3, 52962306a36Sopenharmony_ci .create = hst_create, 53062306a36Sopenharmony_ci .destroy = hst_destroy, 53162306a36Sopenharmony_ci .status = hst_status, 53262306a36Sopenharmony_ci .add_path = hst_add_path, 53362306a36Sopenharmony_ci .fail_path = hst_fail_path, 53462306a36Sopenharmony_ci .reinstate_path = hst_reinstate_path, 53562306a36Sopenharmony_ci .select_path = hst_select_path, 53662306a36Sopenharmony_ci .start_io = hst_start_io, 53762306a36Sopenharmony_ci .end_io = hst_end_io, 53862306a36Sopenharmony_ci}; 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_cistatic int __init dm_hst_init(void) 54162306a36Sopenharmony_ci{ 54262306a36Sopenharmony_ci int r = dm_register_path_selector(&hst_ps); 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_ci if (r < 0) 54562306a36Sopenharmony_ci DMERR("register failed %d", r); 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_ci DMINFO("version " HST_VERSION " loaded"); 54862306a36Sopenharmony_ci 54962306a36Sopenharmony_ci return r; 55062306a36Sopenharmony_ci} 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_cistatic void __exit dm_hst_exit(void) 55362306a36Sopenharmony_ci{ 55462306a36Sopenharmony_ci int r = dm_unregister_path_selector(&hst_ps); 55562306a36Sopenharmony_ci 55662306a36Sopenharmony_ci if (r < 0) 55762306a36Sopenharmony_ci DMERR("unregister failed %d", r); 55862306a36Sopenharmony_ci} 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_cimodule_init(dm_hst_init); 56162306a36Sopenharmony_cimodule_exit(dm_hst_exit); 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_ciMODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); 56462306a36Sopenharmony_ciMODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); 56562306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 566