162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Module Author: Kiyoshi Ueda 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * This file is released under the GPL. 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * Throughput oriented path selector. 1062306a36Sopenharmony_ci */ 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci#include "dm.h" 1362306a36Sopenharmony_ci#include "dm-path-selector.h" 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci#include <linux/slab.h> 1662306a36Sopenharmony_ci#include <linux/module.h> 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ci#define DM_MSG_PREFIX "multipath service-time" 1962306a36Sopenharmony_ci#define ST_MIN_IO 1 2062306a36Sopenharmony_ci#define ST_MAX_RELATIVE_THROUGHPUT 100 2162306a36Sopenharmony_ci#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 2262306a36Sopenharmony_ci#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) 2362306a36Sopenharmony_ci#define ST_VERSION "0.3.0" 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_cistruct selector { 2662306a36Sopenharmony_ci struct list_head valid_paths; 2762306a36Sopenharmony_ci struct list_head failed_paths; 2862306a36Sopenharmony_ci spinlock_t lock; 2962306a36Sopenharmony_ci}; 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_cistruct path_info { 3262306a36Sopenharmony_ci struct list_head list; 3362306a36Sopenharmony_ci struct dm_path *path; 3462306a36Sopenharmony_ci unsigned int repeat_count; 3562306a36Sopenharmony_ci unsigned int relative_throughput; 3662306a36Sopenharmony_ci atomic_t in_flight_size; /* Total size of in-flight I/Os */ 3762306a36Sopenharmony_ci}; 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_cistatic struct selector *alloc_selector(void) 4062306a36Sopenharmony_ci{ 4162306a36Sopenharmony_ci struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci if (s) { 4462306a36Sopenharmony_ci INIT_LIST_HEAD(&s->valid_paths); 4562306a36Sopenharmony_ci INIT_LIST_HEAD(&s->failed_paths); 4662306a36Sopenharmony_ci spin_lock_init(&s->lock); 4762306a36Sopenharmony_ci } 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci return s; 5062306a36Sopenharmony_ci} 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_cistatic int st_create(struct path_selector *ps, unsigned int argc, char **argv) 5362306a36Sopenharmony_ci{ 5462306a36Sopenharmony_ci struct selector *s = alloc_selector(); 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci if (!s) 5762306a36Sopenharmony_ci return -ENOMEM; 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_ci ps->context = s; 6062306a36Sopenharmony_ci return 0; 6162306a36Sopenharmony_ci} 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_cistatic void free_paths(struct list_head *paths) 6462306a36Sopenharmony_ci{ 6562306a36Sopenharmony_ci struct path_info *pi, *next; 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci list_for_each_entry_safe(pi, next, paths, list) { 6862306a36Sopenharmony_ci list_del(&pi->list); 6962306a36Sopenharmony_ci kfree(pi); 7062306a36Sopenharmony_ci } 7162306a36Sopenharmony_ci} 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_cistatic void st_destroy(struct path_selector *ps) 7462306a36Sopenharmony_ci{ 7562306a36Sopenharmony_ci struct selector *s = ps->context; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci free_paths(&s->valid_paths); 7862306a36Sopenharmony_ci free_paths(&s->failed_paths); 7962306a36Sopenharmony_ci kfree(s); 8062306a36Sopenharmony_ci ps->context = NULL; 8162306a36Sopenharmony_ci} 8262306a36Sopenharmony_ci 8362306a36Sopenharmony_cistatic int st_status(struct path_selector *ps, struct dm_path *path, 8462306a36Sopenharmony_ci status_type_t type, char *result, unsigned int maxlen) 8562306a36Sopenharmony_ci{ 8662306a36Sopenharmony_ci unsigned int sz = 0; 8762306a36Sopenharmony_ci struct path_info *pi; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci if (!path) 9062306a36Sopenharmony_ci DMEMIT("0 "); 9162306a36Sopenharmony_ci else { 9262306a36Sopenharmony_ci pi = path->pscontext; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci switch (type) { 9562306a36Sopenharmony_ci case STATUSTYPE_INFO: 9662306a36Sopenharmony_ci DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), 9762306a36Sopenharmony_ci pi->relative_throughput); 9862306a36Sopenharmony_ci break; 9962306a36Sopenharmony_ci case STATUSTYPE_TABLE: 10062306a36Sopenharmony_ci DMEMIT("%u %u ", pi->repeat_count, 10162306a36Sopenharmony_ci pi->relative_throughput); 10262306a36Sopenharmony_ci break; 10362306a36Sopenharmony_ci case STATUSTYPE_IMA: 10462306a36Sopenharmony_ci result[0] = '\0'; 10562306a36Sopenharmony_ci break; 10662306a36Sopenharmony_ci } 10762306a36Sopenharmony_ci } 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci return sz; 11062306a36Sopenharmony_ci} 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_cistatic int st_add_path(struct path_selector *ps, struct dm_path *path, 11362306a36Sopenharmony_ci int argc, char **argv, char **error) 11462306a36Sopenharmony_ci{ 11562306a36Sopenharmony_ci struct selector *s = ps->context; 11662306a36Sopenharmony_ci struct path_info *pi; 11762306a36Sopenharmony_ci unsigned int repeat_count = ST_MIN_IO; 11862306a36Sopenharmony_ci unsigned int relative_throughput = 1; 11962306a36Sopenharmony_ci char dummy; 12062306a36Sopenharmony_ci unsigned long flags; 12162306a36Sopenharmony_ci 12262306a36Sopenharmony_ci /* 12362306a36Sopenharmony_ci * Arguments: [<repeat_count> [<relative_throughput>]] 12462306a36Sopenharmony_ci * <repeat_count>: The number of I/Os before switching path. 12562306a36Sopenharmony_ci * If not given, default (ST_MIN_IO) is used. 12662306a36Sopenharmony_ci * <relative_throughput>: The relative throughput value of 12762306a36Sopenharmony_ci * the path among all paths in the path-group. 12862306a36Sopenharmony_ci * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> 12962306a36Sopenharmony_ci * If not given, minimum value '1' is used. 13062306a36Sopenharmony_ci * If '0' is given, the path isn't selected while 13162306a36Sopenharmony_ci * other paths having a positive value are available. 13262306a36Sopenharmony_ci */ 13362306a36Sopenharmony_ci if (argc > 2) { 13462306a36Sopenharmony_ci *error = "service-time ps: incorrect number of arguments"; 13562306a36Sopenharmony_ci return -EINVAL; 13662306a36Sopenharmony_ci } 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 13962306a36Sopenharmony_ci *error = "service-time ps: invalid repeat count"; 14062306a36Sopenharmony_ci return -EINVAL; 14162306a36Sopenharmony_ci } 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci if (repeat_count > 1) { 14462306a36Sopenharmony_ci DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); 14562306a36Sopenharmony_ci repeat_count = 1; 14662306a36Sopenharmony_ci } 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci if ((argc == 2) && 14962306a36Sopenharmony_ci (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || 15062306a36Sopenharmony_ci relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 15162306a36Sopenharmony_ci *error = "service-time ps: invalid relative_throughput value"; 15262306a36Sopenharmony_ci return -EINVAL; 15362306a36Sopenharmony_ci } 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci /* allocate the path */ 15662306a36Sopenharmony_ci pi = kmalloc(sizeof(*pi), GFP_KERNEL); 15762306a36Sopenharmony_ci if (!pi) { 15862306a36Sopenharmony_ci *error = "service-time ps: Error allocating path context"; 15962306a36Sopenharmony_ci return -ENOMEM; 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_ci pi->path = path; 16362306a36Sopenharmony_ci pi->repeat_count = repeat_count; 16462306a36Sopenharmony_ci pi->relative_throughput = relative_throughput; 16562306a36Sopenharmony_ci atomic_set(&pi->in_flight_size, 0); 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci path->pscontext = pi; 16862306a36Sopenharmony_ci 16962306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 17062306a36Sopenharmony_ci list_add_tail(&pi->list, &s->valid_paths); 17162306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci return 0; 17462306a36Sopenharmony_ci} 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_cistatic void st_fail_path(struct path_selector *ps, struct dm_path *path) 17762306a36Sopenharmony_ci{ 17862306a36Sopenharmony_ci struct selector *s = ps->context; 17962306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 18062306a36Sopenharmony_ci unsigned long flags; 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 18362306a36Sopenharmony_ci list_move(&pi->list, &s->failed_paths); 18462306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 18562306a36Sopenharmony_ci} 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_cistatic int st_reinstate_path(struct path_selector *ps, struct dm_path *path) 18862306a36Sopenharmony_ci{ 18962306a36Sopenharmony_ci struct selector *s = ps->context; 19062306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 19162306a36Sopenharmony_ci unsigned long flags; 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 19462306a36Sopenharmony_ci list_move_tail(&pi->list, &s->valid_paths); 19562306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci return 0; 19862306a36Sopenharmony_ci} 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci/* 20162306a36Sopenharmony_ci * Compare the estimated service time of 2 paths, pi1 and pi2, 20262306a36Sopenharmony_ci * for the incoming I/O. 20362306a36Sopenharmony_ci * 20462306a36Sopenharmony_ci * Returns: 20562306a36Sopenharmony_ci * < 0 : pi1 is better 20662306a36Sopenharmony_ci * 0 : no difference between pi1 and pi2 20762306a36Sopenharmony_ci * > 0 : pi2 is better 20862306a36Sopenharmony_ci * 20962306a36Sopenharmony_ci * Description: 21062306a36Sopenharmony_ci * Basically, the service time is estimated by: 21162306a36Sopenharmony_ci * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' 21262306a36Sopenharmony_ci * To reduce the calculation, some optimizations are made. 21362306a36Sopenharmony_ci * (See comments inline) 21462306a36Sopenharmony_ci */ 21562306a36Sopenharmony_cistatic int st_compare_load(struct path_info *pi1, struct path_info *pi2, 21662306a36Sopenharmony_ci size_t incoming) 21762306a36Sopenharmony_ci{ 21862306a36Sopenharmony_ci size_t sz1, sz2, st1, st2; 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci sz1 = atomic_read(&pi1->in_flight_size); 22162306a36Sopenharmony_ci sz2 = atomic_read(&pi2->in_flight_size); 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci /* 22462306a36Sopenharmony_ci * Case 1: Both have same throughput value. Choose less loaded path. 22562306a36Sopenharmony_ci */ 22662306a36Sopenharmony_ci if (pi1->relative_throughput == pi2->relative_throughput) 22762306a36Sopenharmony_ci return sz1 - sz2; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci /* 23062306a36Sopenharmony_ci * Case 2a: Both have same load. Choose higher throughput path. 23162306a36Sopenharmony_ci * Case 2b: One path has no throughput value. Choose the other one. 23262306a36Sopenharmony_ci */ 23362306a36Sopenharmony_ci if (sz1 == sz2 || 23462306a36Sopenharmony_ci !pi1->relative_throughput || !pi2->relative_throughput) 23562306a36Sopenharmony_ci return pi2->relative_throughput - pi1->relative_throughput; 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci /* 23862306a36Sopenharmony_ci * Case 3: Calculate service time. Choose faster path. 23962306a36Sopenharmony_ci * Service time using pi1: 24062306a36Sopenharmony_ci * st1 = (sz1 + incoming) / pi1->relative_throughput 24162306a36Sopenharmony_ci * Service time using pi2: 24262306a36Sopenharmony_ci * st2 = (sz2 + incoming) / pi2->relative_throughput 24362306a36Sopenharmony_ci * 24462306a36Sopenharmony_ci * To avoid the division, transform the expression to use 24562306a36Sopenharmony_ci * multiplication. 24662306a36Sopenharmony_ci * Because ->relative_throughput > 0 here, if st1 < st2, 24762306a36Sopenharmony_ci * the expressions below are the same meaning: 24862306a36Sopenharmony_ci * (sz1 + incoming) / pi1->relative_throughput < 24962306a36Sopenharmony_ci * (sz2 + incoming) / pi2->relative_throughput 25062306a36Sopenharmony_ci * (sz1 + incoming) * pi2->relative_throughput < 25162306a36Sopenharmony_ci * (sz2 + incoming) * pi1->relative_throughput 25262306a36Sopenharmony_ci * So use the later one. 25362306a36Sopenharmony_ci */ 25462306a36Sopenharmony_ci sz1 += incoming; 25562306a36Sopenharmony_ci sz2 += incoming; 25662306a36Sopenharmony_ci if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || 25762306a36Sopenharmony_ci sz2 >= ST_MAX_INFLIGHT_SIZE)) { 25862306a36Sopenharmony_ci /* 25962306a36Sopenharmony_ci * Size may be too big for multiplying pi->relative_throughput 26062306a36Sopenharmony_ci * and overflow. 26162306a36Sopenharmony_ci * To avoid the overflow and mis-selection, shift down both. 26262306a36Sopenharmony_ci */ 26362306a36Sopenharmony_ci sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; 26462306a36Sopenharmony_ci sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; 26562306a36Sopenharmony_ci } 26662306a36Sopenharmony_ci st1 = sz1 * pi2->relative_throughput; 26762306a36Sopenharmony_ci st2 = sz2 * pi1->relative_throughput; 26862306a36Sopenharmony_ci if (st1 != st2) 26962306a36Sopenharmony_ci return st1 - st2; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci /* 27262306a36Sopenharmony_ci * Case 4: Service time is equal. Choose higher throughput path. 27362306a36Sopenharmony_ci */ 27462306a36Sopenharmony_ci return pi2->relative_throughput - pi1->relative_throughput; 27562306a36Sopenharmony_ci} 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_cistatic struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) 27862306a36Sopenharmony_ci{ 27962306a36Sopenharmony_ci struct selector *s = ps->context; 28062306a36Sopenharmony_ci struct path_info *pi = NULL, *best = NULL; 28162306a36Sopenharmony_ci struct dm_path *ret = NULL; 28262306a36Sopenharmony_ci unsigned long flags; 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_ci spin_lock_irqsave(&s->lock, flags); 28562306a36Sopenharmony_ci if (list_empty(&s->valid_paths)) 28662306a36Sopenharmony_ci goto out; 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci list_for_each_entry(pi, &s->valid_paths, list) 28962306a36Sopenharmony_ci if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) 29062306a36Sopenharmony_ci best = pi; 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci if (!best) 29362306a36Sopenharmony_ci goto out; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci /* Move most recently used to least preferred to evenly balance. */ 29662306a36Sopenharmony_ci list_move_tail(&best->list, &s->valid_paths); 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci ret = best->path; 29962306a36Sopenharmony_ciout: 30062306a36Sopenharmony_ci spin_unlock_irqrestore(&s->lock, flags); 30162306a36Sopenharmony_ci return ret; 30262306a36Sopenharmony_ci} 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_cistatic int st_start_io(struct path_selector *ps, struct dm_path *path, 30562306a36Sopenharmony_ci size_t nr_bytes) 30662306a36Sopenharmony_ci{ 30762306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ci atomic_add(nr_bytes, &pi->in_flight_size); 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci return 0; 31262306a36Sopenharmony_ci} 31362306a36Sopenharmony_ci 31462306a36Sopenharmony_cistatic int st_end_io(struct path_selector *ps, struct dm_path *path, 31562306a36Sopenharmony_ci size_t nr_bytes, u64 start_time) 31662306a36Sopenharmony_ci{ 31762306a36Sopenharmony_ci struct path_info *pi = path->pscontext; 31862306a36Sopenharmony_ci 31962306a36Sopenharmony_ci atomic_sub(nr_bytes, &pi->in_flight_size); 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci return 0; 32262306a36Sopenharmony_ci} 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_cistatic struct path_selector_type st_ps = { 32562306a36Sopenharmony_ci .name = "service-time", 32662306a36Sopenharmony_ci .module = THIS_MODULE, 32762306a36Sopenharmony_ci .table_args = 2, 32862306a36Sopenharmony_ci .info_args = 2, 32962306a36Sopenharmony_ci .create = st_create, 33062306a36Sopenharmony_ci .destroy = st_destroy, 33162306a36Sopenharmony_ci .status = st_status, 33262306a36Sopenharmony_ci .add_path = st_add_path, 33362306a36Sopenharmony_ci .fail_path = st_fail_path, 33462306a36Sopenharmony_ci .reinstate_path = st_reinstate_path, 33562306a36Sopenharmony_ci .select_path = st_select_path, 33662306a36Sopenharmony_ci .start_io = st_start_io, 33762306a36Sopenharmony_ci .end_io = st_end_io, 33862306a36Sopenharmony_ci}; 33962306a36Sopenharmony_ci 34062306a36Sopenharmony_cistatic int __init dm_st_init(void) 34162306a36Sopenharmony_ci{ 34262306a36Sopenharmony_ci int r = dm_register_path_selector(&st_ps); 34362306a36Sopenharmony_ci 34462306a36Sopenharmony_ci if (r < 0) 34562306a36Sopenharmony_ci DMERR("register failed %d", r); 34662306a36Sopenharmony_ci 34762306a36Sopenharmony_ci DMINFO("version " ST_VERSION " loaded"); 34862306a36Sopenharmony_ci 34962306a36Sopenharmony_ci return r; 35062306a36Sopenharmony_ci} 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_cistatic void __exit dm_st_exit(void) 35362306a36Sopenharmony_ci{ 35462306a36Sopenharmony_ci int r = dm_unregister_path_selector(&st_ps); 35562306a36Sopenharmony_ci 35662306a36Sopenharmony_ci if (r < 0) 35762306a36Sopenharmony_ci DMERR("unregister failed %d", r); 35862306a36Sopenharmony_ci} 35962306a36Sopenharmony_ci 36062306a36Sopenharmony_cimodule_init(dm_st_init); 36162306a36Sopenharmony_cimodule_exit(dm_st_exit); 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ciMODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); 36462306a36Sopenharmony_ciMODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); 36562306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 366