162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * Ceph - scalable distributed file system 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * Copyright (C) 2015 Intel Corporation All Rights Reserved 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * This is free software; you can redistribute it and/or 762306a36Sopenharmony_ci * modify it under the terms of the GNU Lesser General Public 862306a36Sopenharmony_ci * License version 2.1, as published by the Free Software 962306a36Sopenharmony_ci * Foundation. See file COPYING. 1062306a36Sopenharmony_ci * 1162306a36Sopenharmony_ci */ 1262306a36Sopenharmony_ci 1362306a36Sopenharmony_ci#ifdef __KERNEL__ 1462306a36Sopenharmony_ci# include <linux/string.h> 1562306a36Sopenharmony_ci# include <linux/slab.h> 1662306a36Sopenharmony_ci# include <linux/bug.h> 1762306a36Sopenharmony_ci# include <linux/kernel.h> 1862306a36Sopenharmony_ci# include <linux/crush/crush.h> 1962306a36Sopenharmony_ci# include <linux/crush/hash.h> 2062306a36Sopenharmony_ci# include <linux/crush/mapper.h> 2162306a36Sopenharmony_ci#else 2262306a36Sopenharmony_ci# include "crush_compat.h" 2362306a36Sopenharmony_ci# include "crush.h" 2462306a36Sopenharmony_ci# include "hash.h" 2562306a36Sopenharmony_ci# include "mapper.h" 2662306a36Sopenharmony_ci#endif 2762306a36Sopenharmony_ci#include "crush_ln_table.h" 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci#define dprintk(args...) /* printf(args) */ 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci/* 3262306a36Sopenharmony_ci * Implement the core CRUSH mapping algorithm. 3362306a36Sopenharmony_ci */ 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci/** 3662306a36Sopenharmony_ci * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. 3762306a36Sopenharmony_ci * @map: the crush_map 3862306a36Sopenharmony_ci * @ruleset: the storage ruleset id (user defined) 3962306a36Sopenharmony_ci * @type: storage ruleset type (user defined) 4062306a36Sopenharmony_ci * @size: output set size 4162306a36Sopenharmony_ci */ 4262306a36Sopenharmony_ciint crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) 4362306a36Sopenharmony_ci{ 4462306a36Sopenharmony_ci __u32 i; 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci for (i = 0; i < map->max_rules; i++) { 4762306a36Sopenharmony_ci if (map->rules[i] && 4862306a36Sopenharmony_ci map->rules[i]->mask.ruleset == ruleset && 4962306a36Sopenharmony_ci map->rules[i]->mask.type == type && 5062306a36Sopenharmony_ci map->rules[i]->mask.min_size <= size && 5162306a36Sopenharmony_ci map->rules[i]->mask.max_size >= size) 5262306a36Sopenharmony_ci return i; 5362306a36Sopenharmony_ci } 5462306a36Sopenharmony_ci return -1; 5562306a36Sopenharmony_ci} 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_ci/* 5862306a36Sopenharmony_ci * bucket choose methods 5962306a36Sopenharmony_ci * 6062306a36Sopenharmony_ci * For each bucket algorithm, we have a "choose" method that, given a 6162306a36Sopenharmony_ci * crush input @x and replica position (usually, position in output set) @r, 6262306a36Sopenharmony_ci * will produce an item in the bucket. 6362306a36Sopenharmony_ci */ 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci/* 6662306a36Sopenharmony_ci * Choose based on a random permutation of the bucket. 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * We used to use some prime number arithmetic to do this, but it 6962306a36Sopenharmony_ci * wasn't very random, and had some other bad behaviors. Instead, we 7062306a36Sopenharmony_ci * calculate an actual random permutation of the bucket members. 7162306a36Sopenharmony_ci * Since this is expensive, we optimize for the r=0 case, which 7262306a36Sopenharmony_ci * captures the vast majority of calls. 7362306a36Sopenharmony_ci */ 7462306a36Sopenharmony_cistatic int bucket_perm_choose(const struct crush_bucket *bucket, 7562306a36Sopenharmony_ci struct crush_work_bucket *work, 7662306a36Sopenharmony_ci int x, int r) 7762306a36Sopenharmony_ci{ 7862306a36Sopenharmony_ci unsigned int pr = r % bucket->size; 7962306a36Sopenharmony_ci unsigned int i, s; 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci /* start a new permutation if @x has changed */ 8262306a36Sopenharmony_ci if (work->perm_x != (__u32)x || work->perm_n == 0) { 8362306a36Sopenharmony_ci dprintk("bucket %d new x=%d\n", bucket->id, x); 8462306a36Sopenharmony_ci work->perm_x = x; 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci /* optimize common r=0 case */ 8762306a36Sopenharmony_ci if (pr == 0) { 8862306a36Sopenharmony_ci s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 8962306a36Sopenharmony_ci bucket->size; 9062306a36Sopenharmony_ci work->perm[0] = s; 9162306a36Sopenharmony_ci work->perm_n = 0xffff; /* magic value, see below */ 9262306a36Sopenharmony_ci goto out; 9362306a36Sopenharmony_ci } 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci for (i = 0; i < bucket->size; i++) 9662306a36Sopenharmony_ci work->perm[i] = i; 9762306a36Sopenharmony_ci work->perm_n = 0; 9862306a36Sopenharmony_ci } else if (work->perm_n == 0xffff) { 9962306a36Sopenharmony_ci /* clean up after the r=0 case above */ 10062306a36Sopenharmony_ci for (i = 1; i < bucket->size; i++) 10162306a36Sopenharmony_ci work->perm[i] = i; 10262306a36Sopenharmony_ci work->perm[work->perm[0]] = 0; 10362306a36Sopenharmony_ci work->perm_n = 1; 10462306a36Sopenharmony_ci } 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci /* calculate permutation up to pr */ 10762306a36Sopenharmony_ci for (i = 0; i < work->perm_n; i++) 10862306a36Sopenharmony_ci dprintk(" perm_choose have %d: %d\n", i, work->perm[i]); 10962306a36Sopenharmony_ci while (work->perm_n <= pr) { 11062306a36Sopenharmony_ci unsigned int p = work->perm_n; 11162306a36Sopenharmony_ci /* no point in swapping the final entry */ 11262306a36Sopenharmony_ci if (p < bucket->size - 1) { 11362306a36Sopenharmony_ci i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 11462306a36Sopenharmony_ci (bucket->size - p); 11562306a36Sopenharmony_ci if (i) { 11662306a36Sopenharmony_ci unsigned int t = work->perm[p + i]; 11762306a36Sopenharmony_ci work->perm[p + i] = work->perm[p]; 11862306a36Sopenharmony_ci work->perm[p] = t; 11962306a36Sopenharmony_ci } 12062306a36Sopenharmony_ci dprintk(" perm_choose swap %d with %d\n", p, p+i); 12162306a36Sopenharmony_ci } 12262306a36Sopenharmony_ci work->perm_n++; 12362306a36Sopenharmony_ci } 12462306a36Sopenharmony_ci for (i = 0; i < bucket->size; i++) 12562306a36Sopenharmony_ci dprintk(" perm_choose %d: %d\n", i, work->perm[i]); 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci s = work->perm[pr]; 12862306a36Sopenharmony_ciout: 12962306a36Sopenharmony_ci dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 13062306a36Sopenharmony_ci bucket->size, x, r, pr, s); 13162306a36Sopenharmony_ci return bucket->items[s]; 13262306a36Sopenharmony_ci} 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_ci/* uniform */ 13562306a36Sopenharmony_cistatic int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, 13662306a36Sopenharmony_ci struct crush_work_bucket *work, int x, int r) 13762306a36Sopenharmony_ci{ 13862306a36Sopenharmony_ci return bucket_perm_choose(&bucket->h, work, x, r); 13962306a36Sopenharmony_ci} 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci/* list */ 14262306a36Sopenharmony_cistatic int bucket_list_choose(const struct crush_bucket_list *bucket, 14362306a36Sopenharmony_ci int x, int r) 14462306a36Sopenharmony_ci{ 14562306a36Sopenharmony_ci int i; 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci for (i = bucket->h.size-1; i >= 0; i--) { 14862306a36Sopenharmony_ci __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i], 14962306a36Sopenharmony_ci r, bucket->h.id); 15062306a36Sopenharmony_ci w &= 0xffff; 15162306a36Sopenharmony_ci dprintk("list_choose i=%d x=%d r=%d item %d weight %x " 15262306a36Sopenharmony_ci "sw %x rand %llx", 15362306a36Sopenharmony_ci i, x, r, bucket->h.items[i], bucket->item_weights[i], 15462306a36Sopenharmony_ci bucket->sum_weights[i], w); 15562306a36Sopenharmony_ci w *= bucket->sum_weights[i]; 15662306a36Sopenharmony_ci w = w >> 16; 15762306a36Sopenharmony_ci /*dprintk(" scaled %llx\n", w);*/ 15862306a36Sopenharmony_ci if (w < bucket->item_weights[i]) { 15962306a36Sopenharmony_ci return bucket->h.items[i]; 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci } 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci dprintk("bad list sums for bucket %d\n", bucket->h.id); 16462306a36Sopenharmony_ci return bucket->h.items[0]; 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci/* (binary) tree */ 16962306a36Sopenharmony_cistatic int height(int n) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci int h = 0; 17262306a36Sopenharmony_ci while ((n & 1) == 0) { 17362306a36Sopenharmony_ci h++; 17462306a36Sopenharmony_ci n = n >> 1; 17562306a36Sopenharmony_ci } 17662306a36Sopenharmony_ci return h; 17762306a36Sopenharmony_ci} 17862306a36Sopenharmony_ci 17962306a36Sopenharmony_cistatic int left(int x) 18062306a36Sopenharmony_ci{ 18162306a36Sopenharmony_ci int h = height(x); 18262306a36Sopenharmony_ci return x - (1 << (h-1)); 18362306a36Sopenharmony_ci} 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_cistatic int right(int x) 18662306a36Sopenharmony_ci{ 18762306a36Sopenharmony_ci int h = height(x); 18862306a36Sopenharmony_ci return x + (1 << (h-1)); 18962306a36Sopenharmony_ci} 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_cistatic int terminal(int x) 19262306a36Sopenharmony_ci{ 19362306a36Sopenharmony_ci return x & 1; 19462306a36Sopenharmony_ci} 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_cistatic int bucket_tree_choose(const struct crush_bucket_tree *bucket, 19762306a36Sopenharmony_ci int x, int r) 19862306a36Sopenharmony_ci{ 19962306a36Sopenharmony_ci int n; 20062306a36Sopenharmony_ci __u32 w; 20162306a36Sopenharmony_ci __u64 t; 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci /* start at root */ 20462306a36Sopenharmony_ci n = bucket->num_nodes >> 1; 20562306a36Sopenharmony_ci 20662306a36Sopenharmony_ci while (!terminal(n)) { 20762306a36Sopenharmony_ci int l; 20862306a36Sopenharmony_ci /* pick point in [0, w) */ 20962306a36Sopenharmony_ci w = bucket->node_weights[n]; 21062306a36Sopenharmony_ci t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, 21162306a36Sopenharmony_ci bucket->h.id) * (__u64)w; 21262306a36Sopenharmony_ci t = t >> 32; 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci /* descend to the left or right? */ 21562306a36Sopenharmony_ci l = left(n); 21662306a36Sopenharmony_ci if (t < bucket->node_weights[l]) 21762306a36Sopenharmony_ci n = l; 21862306a36Sopenharmony_ci else 21962306a36Sopenharmony_ci n = right(n); 22062306a36Sopenharmony_ci } 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_ci return bucket->h.items[n >> 1]; 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci/* straw */ 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_cistatic int bucket_straw_choose(const struct crush_bucket_straw *bucket, 22962306a36Sopenharmony_ci int x, int r) 23062306a36Sopenharmony_ci{ 23162306a36Sopenharmony_ci __u32 i; 23262306a36Sopenharmony_ci int high = 0; 23362306a36Sopenharmony_ci __u64 high_draw = 0; 23462306a36Sopenharmony_ci __u64 draw; 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci for (i = 0; i < bucket->h.size; i++) { 23762306a36Sopenharmony_ci draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); 23862306a36Sopenharmony_ci draw &= 0xffff; 23962306a36Sopenharmony_ci draw *= bucket->straws[i]; 24062306a36Sopenharmony_ci if (i == 0 || draw > high_draw) { 24162306a36Sopenharmony_ci high = i; 24262306a36Sopenharmony_ci high_draw = draw; 24362306a36Sopenharmony_ci } 24462306a36Sopenharmony_ci } 24562306a36Sopenharmony_ci return bucket->h.items[high]; 24662306a36Sopenharmony_ci} 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci/* compute 2^44*log2(input+1) */ 24962306a36Sopenharmony_cistatic __u64 crush_ln(unsigned int xin) 25062306a36Sopenharmony_ci{ 25162306a36Sopenharmony_ci unsigned int x = xin; 25262306a36Sopenharmony_ci int iexpon, index1, index2; 25362306a36Sopenharmony_ci __u64 RH, LH, LL, xl64, result; 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci x++; 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci /* normalize input */ 25862306a36Sopenharmony_ci iexpon = 15; 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci /* 26162306a36Sopenharmony_ci * figure out number of bits we need to shift and 26262306a36Sopenharmony_ci * do it in one step instead of iteratively 26362306a36Sopenharmony_ci */ 26462306a36Sopenharmony_ci if (!(x & 0x18000)) { 26562306a36Sopenharmony_ci int bits = __builtin_clz(x & 0x1FFFF) - 16; 26662306a36Sopenharmony_ci x <<= bits; 26762306a36Sopenharmony_ci iexpon = 15 - bits; 26862306a36Sopenharmony_ci } 26962306a36Sopenharmony_ci 27062306a36Sopenharmony_ci index1 = (x >> 8) << 1; 27162306a36Sopenharmony_ci /* RH ~ 2^56/index1 */ 27262306a36Sopenharmony_ci RH = __RH_LH_tbl[index1 - 256]; 27362306a36Sopenharmony_ci /* LH ~ 2^48 * log2(index1/256) */ 27462306a36Sopenharmony_ci LH = __RH_LH_tbl[index1 + 1 - 256]; 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */ 27762306a36Sopenharmony_ci xl64 = (__s64)x * RH; 27862306a36Sopenharmony_ci xl64 >>= 48; 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci result = iexpon; 28162306a36Sopenharmony_ci result <<= (12 + 32); 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_ci index2 = xl64 & 0xff; 28462306a36Sopenharmony_ci /* LL ~ 2^48*log2(1.0+index2/2^15) */ 28562306a36Sopenharmony_ci LL = __LL_tbl[index2]; 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci LH = LH + LL; 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci LH >>= (48 - 12 - 32); 29062306a36Sopenharmony_ci result += LH; 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci return result; 29362306a36Sopenharmony_ci} 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci 29662306a36Sopenharmony_ci/* 29762306a36Sopenharmony_ci * straw2 29862306a36Sopenharmony_ci * 29962306a36Sopenharmony_ci * for reference, see: 30062306a36Sopenharmony_ci * 30162306a36Sopenharmony_ci * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables 30262306a36Sopenharmony_ci * 30362306a36Sopenharmony_ci */ 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_cistatic __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, 30662306a36Sopenharmony_ci const struct crush_choose_arg *arg, 30762306a36Sopenharmony_ci int position) 30862306a36Sopenharmony_ci{ 30962306a36Sopenharmony_ci if (!arg || !arg->weight_set) 31062306a36Sopenharmony_ci return bucket->item_weights; 31162306a36Sopenharmony_ci 31262306a36Sopenharmony_ci if (position >= arg->weight_set_size) 31362306a36Sopenharmony_ci position = arg->weight_set_size - 1; 31462306a36Sopenharmony_ci return arg->weight_set[position].weights; 31562306a36Sopenharmony_ci} 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_cistatic __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket, 31862306a36Sopenharmony_ci const struct crush_choose_arg *arg) 31962306a36Sopenharmony_ci{ 32062306a36Sopenharmony_ci if (!arg || !arg->ids) 32162306a36Sopenharmony_ci return bucket->h.items; 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci return arg->ids; 32462306a36Sopenharmony_ci} 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_cistatic int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, 32762306a36Sopenharmony_ci int x, int r, 32862306a36Sopenharmony_ci const struct crush_choose_arg *arg, 32962306a36Sopenharmony_ci int position) 33062306a36Sopenharmony_ci{ 33162306a36Sopenharmony_ci unsigned int i, high = 0; 33262306a36Sopenharmony_ci unsigned int u; 33362306a36Sopenharmony_ci __s64 ln, draw, high_draw = 0; 33462306a36Sopenharmony_ci __u32 *weights = get_choose_arg_weights(bucket, arg, position); 33562306a36Sopenharmony_ci __s32 *ids = get_choose_arg_ids(bucket, arg); 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ci for (i = 0; i < bucket->h.size; i++) { 33862306a36Sopenharmony_ci dprintk("weight 0x%x item %d\n", weights[i], ids[i]); 33962306a36Sopenharmony_ci if (weights[i]) { 34062306a36Sopenharmony_ci u = crush_hash32_3(bucket->h.hash, x, ids[i], r); 34162306a36Sopenharmony_ci u &= 0xffff; 34262306a36Sopenharmony_ci 34362306a36Sopenharmony_ci /* 34462306a36Sopenharmony_ci * for some reason slightly less than 0x10000 produces 34562306a36Sopenharmony_ci * a slightly more accurate distribution... probably a 34662306a36Sopenharmony_ci * rounding effect. 34762306a36Sopenharmony_ci * 34862306a36Sopenharmony_ci * the natural log lookup table maps [0,0xffff] 34962306a36Sopenharmony_ci * (corresponding to real numbers [1/0x10000, 1] to 35062306a36Sopenharmony_ci * [0, 0xffffffffffff] (corresponding to real numbers 35162306a36Sopenharmony_ci * [-11.090355,0]). 35262306a36Sopenharmony_ci */ 35362306a36Sopenharmony_ci ln = crush_ln(u) - 0x1000000000000ll; 35462306a36Sopenharmony_ci 35562306a36Sopenharmony_ci /* 35662306a36Sopenharmony_ci * divide by 16.16 fixed-point weight. note 35762306a36Sopenharmony_ci * that the ln value is negative, so a larger 35862306a36Sopenharmony_ci * weight means a larger (less negative) value 35962306a36Sopenharmony_ci * for draw. 36062306a36Sopenharmony_ci */ 36162306a36Sopenharmony_ci draw = div64_s64(ln, weights[i]); 36262306a36Sopenharmony_ci } else { 36362306a36Sopenharmony_ci draw = S64_MIN; 36462306a36Sopenharmony_ci } 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci if (i == 0 || draw > high_draw) { 36762306a36Sopenharmony_ci high = i; 36862306a36Sopenharmony_ci high_draw = draw; 36962306a36Sopenharmony_ci } 37062306a36Sopenharmony_ci } 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci return bucket->h.items[high]; 37362306a36Sopenharmony_ci} 37462306a36Sopenharmony_ci 37562306a36Sopenharmony_ci 37662306a36Sopenharmony_cistatic int crush_bucket_choose(const struct crush_bucket *in, 37762306a36Sopenharmony_ci struct crush_work_bucket *work, 37862306a36Sopenharmony_ci int x, int r, 37962306a36Sopenharmony_ci const struct crush_choose_arg *arg, 38062306a36Sopenharmony_ci int position) 38162306a36Sopenharmony_ci{ 38262306a36Sopenharmony_ci dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 38362306a36Sopenharmony_ci BUG_ON(in->size == 0); 38462306a36Sopenharmony_ci switch (in->alg) { 38562306a36Sopenharmony_ci case CRUSH_BUCKET_UNIFORM: 38662306a36Sopenharmony_ci return bucket_uniform_choose( 38762306a36Sopenharmony_ci (const struct crush_bucket_uniform *)in, 38862306a36Sopenharmony_ci work, x, r); 38962306a36Sopenharmony_ci case CRUSH_BUCKET_LIST: 39062306a36Sopenharmony_ci return bucket_list_choose((const struct crush_bucket_list *)in, 39162306a36Sopenharmony_ci x, r); 39262306a36Sopenharmony_ci case CRUSH_BUCKET_TREE: 39362306a36Sopenharmony_ci return bucket_tree_choose((const struct crush_bucket_tree *)in, 39462306a36Sopenharmony_ci x, r); 39562306a36Sopenharmony_ci case CRUSH_BUCKET_STRAW: 39662306a36Sopenharmony_ci return bucket_straw_choose( 39762306a36Sopenharmony_ci (const struct crush_bucket_straw *)in, 39862306a36Sopenharmony_ci x, r); 39962306a36Sopenharmony_ci case CRUSH_BUCKET_STRAW2: 40062306a36Sopenharmony_ci return bucket_straw2_choose( 40162306a36Sopenharmony_ci (const struct crush_bucket_straw2 *)in, 40262306a36Sopenharmony_ci x, r, arg, position); 40362306a36Sopenharmony_ci default: 40462306a36Sopenharmony_ci dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 40562306a36Sopenharmony_ci return in->items[0]; 40662306a36Sopenharmony_ci } 40762306a36Sopenharmony_ci} 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci/* 41062306a36Sopenharmony_ci * true if device is marked "out" (failed, fully offloaded) 41162306a36Sopenharmony_ci * of the cluster 41262306a36Sopenharmony_ci */ 41362306a36Sopenharmony_cistatic int is_out(const struct crush_map *map, 41462306a36Sopenharmony_ci const __u32 *weight, int weight_max, 41562306a36Sopenharmony_ci int item, int x) 41662306a36Sopenharmony_ci{ 41762306a36Sopenharmony_ci if (item >= weight_max) 41862306a36Sopenharmony_ci return 1; 41962306a36Sopenharmony_ci if (weight[item] >= 0x10000) 42062306a36Sopenharmony_ci return 0; 42162306a36Sopenharmony_ci if (weight[item] == 0) 42262306a36Sopenharmony_ci return 1; 42362306a36Sopenharmony_ci if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) 42462306a36Sopenharmony_ci < weight[item]) 42562306a36Sopenharmony_ci return 0; 42662306a36Sopenharmony_ci return 1; 42762306a36Sopenharmony_ci} 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_ci/** 43062306a36Sopenharmony_ci * crush_choose_firstn - choose numrep distinct items of given type 43162306a36Sopenharmony_ci * @map: the crush_map 43262306a36Sopenharmony_ci * @bucket: the bucket we are choose an item from 43362306a36Sopenharmony_ci * @x: crush input value 43462306a36Sopenharmony_ci * @numrep: the number of items to choose 43562306a36Sopenharmony_ci * @type: the type of item to choose 43662306a36Sopenharmony_ci * @out: pointer to output vector 43762306a36Sopenharmony_ci * @outpos: our position in that vector 43862306a36Sopenharmony_ci * @out_size: size of the out vector 43962306a36Sopenharmony_ci * @tries: number of attempts to make 44062306a36Sopenharmony_ci * @recurse_tries: number of attempts to have recursive chooseleaf make 44162306a36Sopenharmony_ci * @local_retries: localized retries 44262306a36Sopenharmony_ci * @local_fallback_retries: localized fallback retries 44362306a36Sopenharmony_ci * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 44462306a36Sopenharmony_ci * @stable: stable mode starts rep=0 in the recursive call for all replicas 44562306a36Sopenharmony_ci * @vary_r: pass r to recursive calls 44662306a36Sopenharmony_ci * @out2: second output vector for leaf items (if @recurse_to_leaf) 44762306a36Sopenharmony_ci * @parent_r: r value passed from the parent 44862306a36Sopenharmony_ci */ 44962306a36Sopenharmony_cistatic int crush_choose_firstn(const struct crush_map *map, 45062306a36Sopenharmony_ci struct crush_work *work, 45162306a36Sopenharmony_ci const struct crush_bucket *bucket, 45262306a36Sopenharmony_ci const __u32 *weight, int weight_max, 45362306a36Sopenharmony_ci int x, int numrep, int type, 45462306a36Sopenharmony_ci int *out, int outpos, 45562306a36Sopenharmony_ci int out_size, 45662306a36Sopenharmony_ci unsigned int tries, 45762306a36Sopenharmony_ci unsigned int recurse_tries, 45862306a36Sopenharmony_ci unsigned int local_retries, 45962306a36Sopenharmony_ci unsigned int local_fallback_retries, 46062306a36Sopenharmony_ci int recurse_to_leaf, 46162306a36Sopenharmony_ci unsigned int vary_r, 46262306a36Sopenharmony_ci unsigned int stable, 46362306a36Sopenharmony_ci int *out2, 46462306a36Sopenharmony_ci int parent_r, 46562306a36Sopenharmony_ci const struct crush_choose_arg *choose_args) 46662306a36Sopenharmony_ci{ 46762306a36Sopenharmony_ci int rep; 46862306a36Sopenharmony_ci unsigned int ftotal, flocal; 46962306a36Sopenharmony_ci int retry_descent, retry_bucket, skip_rep; 47062306a36Sopenharmony_ci const struct crush_bucket *in = bucket; 47162306a36Sopenharmony_ci int r; 47262306a36Sopenharmony_ci int i; 47362306a36Sopenharmony_ci int item = 0; 47462306a36Sopenharmony_ci int itemtype; 47562306a36Sopenharmony_ci int collide, reject; 47662306a36Sopenharmony_ci int count = out_size; 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", 47962306a36Sopenharmony_ci recurse_to_leaf ? "_LEAF" : "", 48062306a36Sopenharmony_ci bucket->id, x, outpos, numrep, 48162306a36Sopenharmony_ci tries, recurse_tries, local_retries, local_fallback_retries, 48262306a36Sopenharmony_ci parent_r, stable); 48362306a36Sopenharmony_ci 48462306a36Sopenharmony_ci for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { 48562306a36Sopenharmony_ci /* keep trying until we get a non-out, non-colliding item */ 48662306a36Sopenharmony_ci ftotal = 0; 48762306a36Sopenharmony_ci skip_rep = 0; 48862306a36Sopenharmony_ci do { 48962306a36Sopenharmony_ci retry_descent = 0; 49062306a36Sopenharmony_ci in = bucket; /* initial bucket */ 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_ci /* choose through intervening buckets */ 49362306a36Sopenharmony_ci flocal = 0; 49462306a36Sopenharmony_ci do { 49562306a36Sopenharmony_ci collide = 0; 49662306a36Sopenharmony_ci retry_bucket = 0; 49762306a36Sopenharmony_ci r = rep + parent_r; 49862306a36Sopenharmony_ci /* r' = r + f_total */ 49962306a36Sopenharmony_ci r += ftotal; 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_ci /* bucket choose */ 50262306a36Sopenharmony_ci if (in->size == 0) { 50362306a36Sopenharmony_ci reject = 1; 50462306a36Sopenharmony_ci goto reject; 50562306a36Sopenharmony_ci } 50662306a36Sopenharmony_ci if (local_fallback_retries > 0 && 50762306a36Sopenharmony_ci flocal >= (in->size>>1) && 50862306a36Sopenharmony_ci flocal > local_fallback_retries) 50962306a36Sopenharmony_ci item = bucket_perm_choose( 51062306a36Sopenharmony_ci in, work->work[-1-in->id], 51162306a36Sopenharmony_ci x, r); 51262306a36Sopenharmony_ci else 51362306a36Sopenharmony_ci item = crush_bucket_choose( 51462306a36Sopenharmony_ci in, work->work[-1-in->id], 51562306a36Sopenharmony_ci x, r, 51662306a36Sopenharmony_ci (choose_args ? 51762306a36Sopenharmony_ci &choose_args[-1-in->id] : NULL), 51862306a36Sopenharmony_ci outpos); 51962306a36Sopenharmony_ci if (item >= map->max_devices) { 52062306a36Sopenharmony_ci dprintk(" bad item %d\n", item); 52162306a36Sopenharmony_ci skip_rep = 1; 52262306a36Sopenharmony_ci break; 52362306a36Sopenharmony_ci } 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci /* desired type? */ 52662306a36Sopenharmony_ci if (item < 0) 52762306a36Sopenharmony_ci itemtype = map->buckets[-1-item]->type; 52862306a36Sopenharmony_ci else 52962306a36Sopenharmony_ci itemtype = 0; 53062306a36Sopenharmony_ci dprintk(" item %d type %d\n", item, itemtype); 53162306a36Sopenharmony_ci 53262306a36Sopenharmony_ci /* keep going? */ 53362306a36Sopenharmony_ci if (itemtype != type) { 53462306a36Sopenharmony_ci if (item >= 0 || 53562306a36Sopenharmony_ci (-1-item) >= map->max_buckets) { 53662306a36Sopenharmony_ci dprintk(" bad item type %d\n", type); 53762306a36Sopenharmony_ci skip_rep = 1; 53862306a36Sopenharmony_ci break; 53962306a36Sopenharmony_ci } 54062306a36Sopenharmony_ci in = map->buckets[-1-item]; 54162306a36Sopenharmony_ci retry_bucket = 1; 54262306a36Sopenharmony_ci continue; 54362306a36Sopenharmony_ci } 54462306a36Sopenharmony_ci 54562306a36Sopenharmony_ci /* collision? */ 54662306a36Sopenharmony_ci for (i = 0; i < outpos; i++) { 54762306a36Sopenharmony_ci if (out[i] == item) { 54862306a36Sopenharmony_ci collide = 1; 54962306a36Sopenharmony_ci break; 55062306a36Sopenharmony_ci } 55162306a36Sopenharmony_ci } 55262306a36Sopenharmony_ci 55362306a36Sopenharmony_ci reject = 0; 55462306a36Sopenharmony_ci if (!collide && recurse_to_leaf) { 55562306a36Sopenharmony_ci if (item < 0) { 55662306a36Sopenharmony_ci int sub_r; 55762306a36Sopenharmony_ci if (vary_r) 55862306a36Sopenharmony_ci sub_r = r >> (vary_r-1); 55962306a36Sopenharmony_ci else 56062306a36Sopenharmony_ci sub_r = 0; 56162306a36Sopenharmony_ci if (crush_choose_firstn( 56262306a36Sopenharmony_ci map, 56362306a36Sopenharmony_ci work, 56462306a36Sopenharmony_ci map->buckets[-1-item], 56562306a36Sopenharmony_ci weight, weight_max, 56662306a36Sopenharmony_ci x, stable ? 1 : outpos+1, 0, 56762306a36Sopenharmony_ci out2, outpos, count, 56862306a36Sopenharmony_ci recurse_tries, 0, 56962306a36Sopenharmony_ci local_retries, 57062306a36Sopenharmony_ci local_fallback_retries, 57162306a36Sopenharmony_ci 0, 57262306a36Sopenharmony_ci vary_r, 57362306a36Sopenharmony_ci stable, 57462306a36Sopenharmony_ci NULL, 57562306a36Sopenharmony_ci sub_r, 57662306a36Sopenharmony_ci choose_args) <= outpos) 57762306a36Sopenharmony_ci /* didn't get leaf */ 57862306a36Sopenharmony_ci reject = 1; 57962306a36Sopenharmony_ci } else { 58062306a36Sopenharmony_ci /* we already have a leaf! */ 58162306a36Sopenharmony_ci out2[outpos] = item; 58262306a36Sopenharmony_ci } 58362306a36Sopenharmony_ci } 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci if (!reject && !collide) { 58662306a36Sopenharmony_ci /* out? */ 58762306a36Sopenharmony_ci if (itemtype == 0) 58862306a36Sopenharmony_ci reject = is_out(map, weight, 58962306a36Sopenharmony_ci weight_max, 59062306a36Sopenharmony_ci item, x); 59162306a36Sopenharmony_ci } 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_cireject: 59462306a36Sopenharmony_ci if (reject || collide) { 59562306a36Sopenharmony_ci ftotal++; 59662306a36Sopenharmony_ci flocal++; 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci if (collide && flocal <= local_retries) 59962306a36Sopenharmony_ci /* retry locally a few times */ 60062306a36Sopenharmony_ci retry_bucket = 1; 60162306a36Sopenharmony_ci else if (local_fallback_retries > 0 && 60262306a36Sopenharmony_ci flocal <= in->size + local_fallback_retries) 60362306a36Sopenharmony_ci /* exhaustive bucket search */ 60462306a36Sopenharmony_ci retry_bucket = 1; 60562306a36Sopenharmony_ci else if (ftotal < tries) 60662306a36Sopenharmony_ci /* then retry descent */ 60762306a36Sopenharmony_ci retry_descent = 1; 60862306a36Sopenharmony_ci else 60962306a36Sopenharmony_ci /* else give up */ 61062306a36Sopenharmony_ci skip_rep = 1; 61162306a36Sopenharmony_ci dprintk(" reject %d collide %d " 61262306a36Sopenharmony_ci "ftotal %u flocal %u\n", 61362306a36Sopenharmony_ci reject, collide, ftotal, 61462306a36Sopenharmony_ci flocal); 61562306a36Sopenharmony_ci } 61662306a36Sopenharmony_ci } while (retry_bucket); 61762306a36Sopenharmony_ci } while (retry_descent); 61862306a36Sopenharmony_ci 61962306a36Sopenharmony_ci if (skip_rep) { 62062306a36Sopenharmony_ci dprintk("skip rep\n"); 62162306a36Sopenharmony_ci continue; 62262306a36Sopenharmony_ci } 62362306a36Sopenharmony_ci 62462306a36Sopenharmony_ci dprintk("CHOOSE got %d\n", item); 62562306a36Sopenharmony_ci out[outpos] = item; 62662306a36Sopenharmony_ci outpos++; 62762306a36Sopenharmony_ci count--; 62862306a36Sopenharmony_ci#ifndef __KERNEL__ 62962306a36Sopenharmony_ci if (map->choose_tries && ftotal <= map->choose_total_tries) 63062306a36Sopenharmony_ci map->choose_tries[ftotal]++; 63162306a36Sopenharmony_ci#endif 63262306a36Sopenharmony_ci } 63362306a36Sopenharmony_ci 63462306a36Sopenharmony_ci dprintk("CHOOSE returns %d\n", outpos); 63562306a36Sopenharmony_ci return outpos; 63662306a36Sopenharmony_ci} 63762306a36Sopenharmony_ci 63862306a36Sopenharmony_ci 63962306a36Sopenharmony_ci/** 64062306a36Sopenharmony_ci * crush_choose_indep: alternative breadth-first positionally stable mapping 64162306a36Sopenharmony_ci * 64262306a36Sopenharmony_ci */ 64362306a36Sopenharmony_cistatic void crush_choose_indep(const struct crush_map *map, 64462306a36Sopenharmony_ci struct crush_work *work, 64562306a36Sopenharmony_ci const struct crush_bucket *bucket, 64662306a36Sopenharmony_ci const __u32 *weight, int weight_max, 64762306a36Sopenharmony_ci int x, int left, int numrep, int type, 64862306a36Sopenharmony_ci int *out, int outpos, 64962306a36Sopenharmony_ci unsigned int tries, 65062306a36Sopenharmony_ci unsigned int recurse_tries, 65162306a36Sopenharmony_ci int recurse_to_leaf, 65262306a36Sopenharmony_ci int *out2, 65362306a36Sopenharmony_ci int parent_r, 65462306a36Sopenharmony_ci const struct crush_choose_arg *choose_args) 65562306a36Sopenharmony_ci{ 65662306a36Sopenharmony_ci const struct crush_bucket *in = bucket; 65762306a36Sopenharmony_ci int endpos = outpos + left; 65862306a36Sopenharmony_ci int rep; 65962306a36Sopenharmony_ci unsigned int ftotal; 66062306a36Sopenharmony_ci int r; 66162306a36Sopenharmony_ci int i; 66262306a36Sopenharmony_ci int item = 0; 66362306a36Sopenharmony_ci int itemtype; 66462306a36Sopenharmony_ci int collide; 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 66762306a36Sopenharmony_ci bucket->id, x, outpos, numrep); 66862306a36Sopenharmony_ci 66962306a36Sopenharmony_ci /* initially my result is undefined */ 67062306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 67162306a36Sopenharmony_ci out[rep] = CRUSH_ITEM_UNDEF; 67262306a36Sopenharmony_ci if (out2) 67362306a36Sopenharmony_ci out2[rep] = CRUSH_ITEM_UNDEF; 67462306a36Sopenharmony_ci } 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { 67762306a36Sopenharmony_ci#ifdef DEBUG_INDEP 67862306a36Sopenharmony_ci if (out2 && ftotal) { 67962306a36Sopenharmony_ci dprintk("%u %d a: ", ftotal, left); 68062306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 68162306a36Sopenharmony_ci dprintk(" %d", out[rep]); 68262306a36Sopenharmony_ci } 68362306a36Sopenharmony_ci dprintk("\n"); 68462306a36Sopenharmony_ci dprintk("%u %d b: ", ftotal, left); 68562306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 68662306a36Sopenharmony_ci dprintk(" %d", out2[rep]); 68762306a36Sopenharmony_ci } 68862306a36Sopenharmony_ci dprintk("\n"); 68962306a36Sopenharmony_ci } 69062306a36Sopenharmony_ci#endif 69162306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 69262306a36Sopenharmony_ci if (out[rep] != CRUSH_ITEM_UNDEF) 69362306a36Sopenharmony_ci continue; 69462306a36Sopenharmony_ci 69562306a36Sopenharmony_ci in = bucket; /* initial bucket */ 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci /* choose through intervening buckets */ 69862306a36Sopenharmony_ci for (;;) { 69962306a36Sopenharmony_ci /* note: we base the choice on the position 70062306a36Sopenharmony_ci * even in the nested call. that means that 70162306a36Sopenharmony_ci * if the first layer chooses the same bucket 70262306a36Sopenharmony_ci * in a different position, we will tend to 70362306a36Sopenharmony_ci * choose a different item in that bucket. 70462306a36Sopenharmony_ci * this will involve more devices in data 70562306a36Sopenharmony_ci * movement and tend to distribute the load. 70662306a36Sopenharmony_ci */ 70762306a36Sopenharmony_ci r = rep + parent_r; 70862306a36Sopenharmony_ci 70962306a36Sopenharmony_ci /* be careful */ 71062306a36Sopenharmony_ci if (in->alg == CRUSH_BUCKET_UNIFORM && 71162306a36Sopenharmony_ci in->size % numrep == 0) 71262306a36Sopenharmony_ci /* r'=r+(n+1)*f_total */ 71362306a36Sopenharmony_ci r += (numrep+1) * ftotal; 71462306a36Sopenharmony_ci else 71562306a36Sopenharmony_ci /* r' = r + n*f_total */ 71662306a36Sopenharmony_ci r += numrep * ftotal; 71762306a36Sopenharmony_ci 71862306a36Sopenharmony_ci /* bucket choose */ 71962306a36Sopenharmony_ci if (in->size == 0) { 72062306a36Sopenharmony_ci dprintk(" empty bucket\n"); 72162306a36Sopenharmony_ci break; 72262306a36Sopenharmony_ci } 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci item = crush_bucket_choose( 72562306a36Sopenharmony_ci in, work->work[-1-in->id], 72662306a36Sopenharmony_ci x, r, 72762306a36Sopenharmony_ci (choose_args ? 72862306a36Sopenharmony_ci &choose_args[-1-in->id] : NULL), 72962306a36Sopenharmony_ci outpos); 73062306a36Sopenharmony_ci if (item >= map->max_devices) { 73162306a36Sopenharmony_ci dprintk(" bad item %d\n", item); 73262306a36Sopenharmony_ci out[rep] = CRUSH_ITEM_NONE; 73362306a36Sopenharmony_ci if (out2) 73462306a36Sopenharmony_ci out2[rep] = CRUSH_ITEM_NONE; 73562306a36Sopenharmony_ci left--; 73662306a36Sopenharmony_ci break; 73762306a36Sopenharmony_ci } 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci /* desired type? */ 74062306a36Sopenharmony_ci if (item < 0) 74162306a36Sopenharmony_ci itemtype = map->buckets[-1-item]->type; 74262306a36Sopenharmony_ci else 74362306a36Sopenharmony_ci itemtype = 0; 74462306a36Sopenharmony_ci dprintk(" item %d type %d\n", item, itemtype); 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_ci /* keep going? */ 74762306a36Sopenharmony_ci if (itemtype != type) { 74862306a36Sopenharmony_ci if (item >= 0 || 74962306a36Sopenharmony_ci (-1-item) >= map->max_buckets) { 75062306a36Sopenharmony_ci dprintk(" bad item type %d\n", type); 75162306a36Sopenharmony_ci out[rep] = CRUSH_ITEM_NONE; 75262306a36Sopenharmony_ci if (out2) 75362306a36Sopenharmony_ci out2[rep] = 75462306a36Sopenharmony_ci CRUSH_ITEM_NONE; 75562306a36Sopenharmony_ci left--; 75662306a36Sopenharmony_ci break; 75762306a36Sopenharmony_ci } 75862306a36Sopenharmony_ci in = map->buckets[-1-item]; 75962306a36Sopenharmony_ci continue; 76062306a36Sopenharmony_ci } 76162306a36Sopenharmony_ci 76262306a36Sopenharmony_ci /* collision? */ 76362306a36Sopenharmony_ci collide = 0; 76462306a36Sopenharmony_ci for (i = outpos; i < endpos; i++) { 76562306a36Sopenharmony_ci if (out[i] == item) { 76662306a36Sopenharmony_ci collide = 1; 76762306a36Sopenharmony_ci break; 76862306a36Sopenharmony_ci } 76962306a36Sopenharmony_ci } 77062306a36Sopenharmony_ci if (collide) 77162306a36Sopenharmony_ci break; 77262306a36Sopenharmony_ci 77362306a36Sopenharmony_ci if (recurse_to_leaf) { 77462306a36Sopenharmony_ci if (item < 0) { 77562306a36Sopenharmony_ci crush_choose_indep( 77662306a36Sopenharmony_ci map, 77762306a36Sopenharmony_ci work, 77862306a36Sopenharmony_ci map->buckets[-1-item], 77962306a36Sopenharmony_ci weight, weight_max, 78062306a36Sopenharmony_ci x, 1, numrep, 0, 78162306a36Sopenharmony_ci out2, rep, 78262306a36Sopenharmony_ci recurse_tries, 0, 78362306a36Sopenharmony_ci 0, NULL, r, 78462306a36Sopenharmony_ci choose_args); 78562306a36Sopenharmony_ci if (out2[rep] == CRUSH_ITEM_NONE) { 78662306a36Sopenharmony_ci /* placed nothing; no leaf */ 78762306a36Sopenharmony_ci break; 78862306a36Sopenharmony_ci } 78962306a36Sopenharmony_ci } else { 79062306a36Sopenharmony_ci /* we already have a leaf! */ 79162306a36Sopenharmony_ci out2[rep] = item; 79262306a36Sopenharmony_ci } 79362306a36Sopenharmony_ci } 79462306a36Sopenharmony_ci 79562306a36Sopenharmony_ci /* out? */ 79662306a36Sopenharmony_ci if (itemtype == 0 && 79762306a36Sopenharmony_ci is_out(map, weight, weight_max, item, x)) 79862306a36Sopenharmony_ci break; 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci /* yay! */ 80162306a36Sopenharmony_ci out[rep] = item; 80262306a36Sopenharmony_ci left--; 80362306a36Sopenharmony_ci break; 80462306a36Sopenharmony_ci } 80562306a36Sopenharmony_ci } 80662306a36Sopenharmony_ci } 80762306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 80862306a36Sopenharmony_ci if (out[rep] == CRUSH_ITEM_UNDEF) { 80962306a36Sopenharmony_ci out[rep] = CRUSH_ITEM_NONE; 81062306a36Sopenharmony_ci } 81162306a36Sopenharmony_ci if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { 81262306a36Sopenharmony_ci out2[rep] = CRUSH_ITEM_NONE; 81362306a36Sopenharmony_ci } 81462306a36Sopenharmony_ci } 81562306a36Sopenharmony_ci#ifndef __KERNEL__ 81662306a36Sopenharmony_ci if (map->choose_tries && ftotal <= map->choose_total_tries) 81762306a36Sopenharmony_ci map->choose_tries[ftotal]++; 81862306a36Sopenharmony_ci#endif 81962306a36Sopenharmony_ci#ifdef DEBUG_INDEP 82062306a36Sopenharmony_ci if (out2) { 82162306a36Sopenharmony_ci dprintk("%u %d a: ", ftotal, left); 82262306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 82362306a36Sopenharmony_ci dprintk(" %d", out[rep]); 82462306a36Sopenharmony_ci } 82562306a36Sopenharmony_ci dprintk("\n"); 82662306a36Sopenharmony_ci dprintk("%u %d b: ", ftotal, left); 82762306a36Sopenharmony_ci for (rep = outpos; rep < endpos; rep++) { 82862306a36Sopenharmony_ci dprintk(" %d", out2[rep]); 82962306a36Sopenharmony_ci } 83062306a36Sopenharmony_ci dprintk("\n"); 83162306a36Sopenharmony_ci } 83262306a36Sopenharmony_ci#endif 83362306a36Sopenharmony_ci} 83462306a36Sopenharmony_ci 83562306a36Sopenharmony_ci 83662306a36Sopenharmony_ci/* 83762306a36Sopenharmony_ci * This takes a chunk of memory and sets it up to be a shiny new 83862306a36Sopenharmony_ci * working area for a CRUSH placement computation. It must be called 83962306a36Sopenharmony_ci * on any newly allocated memory before passing it in to 84062306a36Sopenharmony_ci * crush_do_rule. It may be used repeatedly after that, so long as the 84162306a36Sopenharmony_ci * map has not changed. If the map /has/ changed, you must make sure 84262306a36Sopenharmony_ci * the working size is no smaller than what was allocated and re-run 84362306a36Sopenharmony_ci * crush_init_workspace. 84462306a36Sopenharmony_ci * 84562306a36Sopenharmony_ci * If you do retain the working space between calls to crush, make it 84662306a36Sopenharmony_ci * thread-local. 84762306a36Sopenharmony_ci */ 84862306a36Sopenharmony_civoid crush_init_workspace(const struct crush_map *map, void *v) 84962306a36Sopenharmony_ci{ 85062306a36Sopenharmony_ci struct crush_work *w = v; 85162306a36Sopenharmony_ci __s32 b; 85262306a36Sopenharmony_ci 85362306a36Sopenharmony_ci /* 85462306a36Sopenharmony_ci * We work by moving through the available space and setting 85562306a36Sopenharmony_ci * values and pointers as we go. 85662306a36Sopenharmony_ci * 85762306a36Sopenharmony_ci * It's a bit like Forth's use of the 'allot' word since we 85862306a36Sopenharmony_ci * set the pointer first and then reserve the space for it to 85962306a36Sopenharmony_ci * point to by incrementing the point. 86062306a36Sopenharmony_ci */ 86162306a36Sopenharmony_ci v += sizeof(struct crush_work); 86262306a36Sopenharmony_ci w->work = v; 86362306a36Sopenharmony_ci v += map->max_buckets * sizeof(struct crush_work_bucket *); 86462306a36Sopenharmony_ci for (b = 0; b < map->max_buckets; ++b) { 86562306a36Sopenharmony_ci if (!map->buckets[b]) 86662306a36Sopenharmony_ci continue; 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci w->work[b] = v; 86962306a36Sopenharmony_ci switch (map->buckets[b]->alg) { 87062306a36Sopenharmony_ci default: 87162306a36Sopenharmony_ci v += sizeof(struct crush_work_bucket); 87262306a36Sopenharmony_ci break; 87362306a36Sopenharmony_ci } 87462306a36Sopenharmony_ci w->work[b]->perm_x = 0; 87562306a36Sopenharmony_ci w->work[b]->perm_n = 0; 87662306a36Sopenharmony_ci w->work[b]->perm = v; 87762306a36Sopenharmony_ci v += map->buckets[b]->size * sizeof(__u32); 87862306a36Sopenharmony_ci } 87962306a36Sopenharmony_ci BUG_ON(v - (void *)w != map->working_size); 88062306a36Sopenharmony_ci} 88162306a36Sopenharmony_ci 88262306a36Sopenharmony_ci/** 88362306a36Sopenharmony_ci * crush_do_rule - calculate a mapping with the given input and rule 88462306a36Sopenharmony_ci * @map: the crush_map 88562306a36Sopenharmony_ci * @ruleno: the rule id 88662306a36Sopenharmony_ci * @x: hash input 88762306a36Sopenharmony_ci * @result: pointer to result vector 88862306a36Sopenharmony_ci * @result_max: maximum result size 88962306a36Sopenharmony_ci * @weight: weight vector (for map leaves) 89062306a36Sopenharmony_ci * @weight_max: size of weight vector 89162306a36Sopenharmony_ci * @cwin: pointer to at least crush_work_size() bytes of memory 89262306a36Sopenharmony_ci * @choose_args: weights and ids for each known bucket 89362306a36Sopenharmony_ci */ 89462306a36Sopenharmony_ciint crush_do_rule(const struct crush_map *map, 89562306a36Sopenharmony_ci int ruleno, int x, int *result, int result_max, 89662306a36Sopenharmony_ci const __u32 *weight, int weight_max, 89762306a36Sopenharmony_ci void *cwin, const struct crush_choose_arg *choose_args) 89862306a36Sopenharmony_ci{ 89962306a36Sopenharmony_ci int result_len; 90062306a36Sopenharmony_ci struct crush_work *cw = cwin; 90162306a36Sopenharmony_ci int *a = cwin + map->working_size; 90262306a36Sopenharmony_ci int *b = a + result_max; 90362306a36Sopenharmony_ci int *c = b + result_max; 90462306a36Sopenharmony_ci int *w = a; 90562306a36Sopenharmony_ci int *o = b; 90662306a36Sopenharmony_ci int recurse_to_leaf; 90762306a36Sopenharmony_ci int wsize = 0; 90862306a36Sopenharmony_ci int osize; 90962306a36Sopenharmony_ci const struct crush_rule *rule; 91062306a36Sopenharmony_ci __u32 step; 91162306a36Sopenharmony_ci int i, j; 91262306a36Sopenharmony_ci int numrep; 91362306a36Sopenharmony_ci int out_size; 91462306a36Sopenharmony_ci /* 91562306a36Sopenharmony_ci * the original choose_total_tries value was off by one (it 91662306a36Sopenharmony_ci * counted "retries" and not "tries"). add one. 91762306a36Sopenharmony_ci */ 91862306a36Sopenharmony_ci int choose_tries = map->choose_total_tries + 1; 91962306a36Sopenharmony_ci int choose_leaf_tries = 0; 92062306a36Sopenharmony_ci /* 92162306a36Sopenharmony_ci * the local tries values were counted as "retries", though, 92262306a36Sopenharmony_ci * and need no adjustment 92362306a36Sopenharmony_ci */ 92462306a36Sopenharmony_ci int choose_local_retries = map->choose_local_tries; 92562306a36Sopenharmony_ci int choose_local_fallback_retries = map->choose_local_fallback_tries; 92662306a36Sopenharmony_ci 92762306a36Sopenharmony_ci int vary_r = map->chooseleaf_vary_r; 92862306a36Sopenharmony_ci int stable = map->chooseleaf_stable; 92962306a36Sopenharmony_ci 93062306a36Sopenharmony_ci if ((__u32)ruleno >= map->max_rules) { 93162306a36Sopenharmony_ci dprintk(" bad ruleno %d\n", ruleno); 93262306a36Sopenharmony_ci return 0; 93362306a36Sopenharmony_ci } 93462306a36Sopenharmony_ci 93562306a36Sopenharmony_ci rule = map->rules[ruleno]; 93662306a36Sopenharmony_ci result_len = 0; 93762306a36Sopenharmony_ci 93862306a36Sopenharmony_ci for (step = 0; step < rule->len; step++) { 93962306a36Sopenharmony_ci int firstn = 0; 94062306a36Sopenharmony_ci const struct crush_rule_step *curstep = &rule->steps[step]; 94162306a36Sopenharmony_ci 94262306a36Sopenharmony_ci switch (curstep->op) { 94362306a36Sopenharmony_ci case CRUSH_RULE_TAKE: 94462306a36Sopenharmony_ci if ((curstep->arg1 >= 0 && 94562306a36Sopenharmony_ci curstep->arg1 < map->max_devices) || 94662306a36Sopenharmony_ci (-1-curstep->arg1 >= 0 && 94762306a36Sopenharmony_ci -1-curstep->arg1 < map->max_buckets && 94862306a36Sopenharmony_ci map->buckets[-1-curstep->arg1])) { 94962306a36Sopenharmony_ci w[0] = curstep->arg1; 95062306a36Sopenharmony_ci wsize = 1; 95162306a36Sopenharmony_ci } else { 95262306a36Sopenharmony_ci dprintk(" bad take value %d\n", curstep->arg1); 95362306a36Sopenharmony_ci } 95462306a36Sopenharmony_ci break; 95562306a36Sopenharmony_ci 95662306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSE_TRIES: 95762306a36Sopenharmony_ci if (curstep->arg1 > 0) 95862306a36Sopenharmony_ci choose_tries = curstep->arg1; 95962306a36Sopenharmony_ci break; 96062306a36Sopenharmony_ci 96162306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSELEAF_TRIES: 96262306a36Sopenharmony_ci if (curstep->arg1 > 0) 96362306a36Sopenharmony_ci choose_leaf_tries = curstep->arg1; 96462306a36Sopenharmony_ci break; 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 96762306a36Sopenharmony_ci if (curstep->arg1 >= 0) 96862306a36Sopenharmony_ci choose_local_retries = curstep->arg1; 96962306a36Sopenharmony_ci break; 97062306a36Sopenharmony_ci 97162306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 97262306a36Sopenharmony_ci if (curstep->arg1 >= 0) 97362306a36Sopenharmony_ci choose_local_fallback_retries = curstep->arg1; 97462306a36Sopenharmony_ci break; 97562306a36Sopenharmony_ci 97662306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: 97762306a36Sopenharmony_ci if (curstep->arg1 >= 0) 97862306a36Sopenharmony_ci vary_r = curstep->arg1; 97962306a36Sopenharmony_ci break; 98062306a36Sopenharmony_ci 98162306a36Sopenharmony_ci case CRUSH_RULE_SET_CHOOSELEAF_STABLE: 98262306a36Sopenharmony_ci if (curstep->arg1 >= 0) 98362306a36Sopenharmony_ci stable = curstep->arg1; 98462306a36Sopenharmony_ci break; 98562306a36Sopenharmony_ci 98662306a36Sopenharmony_ci case CRUSH_RULE_CHOOSELEAF_FIRSTN: 98762306a36Sopenharmony_ci case CRUSH_RULE_CHOOSE_FIRSTN: 98862306a36Sopenharmony_ci firstn = 1; 98962306a36Sopenharmony_ci fallthrough; 99062306a36Sopenharmony_ci case CRUSH_RULE_CHOOSELEAF_INDEP: 99162306a36Sopenharmony_ci case CRUSH_RULE_CHOOSE_INDEP: 99262306a36Sopenharmony_ci if (wsize == 0) 99362306a36Sopenharmony_ci break; 99462306a36Sopenharmony_ci 99562306a36Sopenharmony_ci recurse_to_leaf = 99662306a36Sopenharmony_ci curstep->op == 99762306a36Sopenharmony_ci CRUSH_RULE_CHOOSELEAF_FIRSTN || 99862306a36Sopenharmony_ci curstep->op == 99962306a36Sopenharmony_ci CRUSH_RULE_CHOOSELEAF_INDEP; 100062306a36Sopenharmony_ci 100162306a36Sopenharmony_ci /* reset output */ 100262306a36Sopenharmony_ci osize = 0; 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci for (i = 0; i < wsize; i++) { 100562306a36Sopenharmony_ci int bno; 100662306a36Sopenharmony_ci numrep = curstep->arg1; 100762306a36Sopenharmony_ci if (numrep <= 0) { 100862306a36Sopenharmony_ci numrep += result_max; 100962306a36Sopenharmony_ci if (numrep <= 0) 101062306a36Sopenharmony_ci continue; 101162306a36Sopenharmony_ci } 101262306a36Sopenharmony_ci j = 0; 101362306a36Sopenharmony_ci /* make sure bucket id is valid */ 101462306a36Sopenharmony_ci bno = -1 - w[i]; 101562306a36Sopenharmony_ci if (bno < 0 || bno >= map->max_buckets) { 101662306a36Sopenharmony_ci /* w[i] is probably CRUSH_ITEM_NONE */ 101762306a36Sopenharmony_ci dprintk(" bad w[i] %d\n", w[i]); 101862306a36Sopenharmony_ci continue; 101962306a36Sopenharmony_ci } 102062306a36Sopenharmony_ci if (firstn) { 102162306a36Sopenharmony_ci int recurse_tries; 102262306a36Sopenharmony_ci if (choose_leaf_tries) 102362306a36Sopenharmony_ci recurse_tries = 102462306a36Sopenharmony_ci choose_leaf_tries; 102562306a36Sopenharmony_ci else if (map->chooseleaf_descend_once) 102662306a36Sopenharmony_ci recurse_tries = 1; 102762306a36Sopenharmony_ci else 102862306a36Sopenharmony_ci recurse_tries = choose_tries; 102962306a36Sopenharmony_ci osize += crush_choose_firstn( 103062306a36Sopenharmony_ci map, 103162306a36Sopenharmony_ci cw, 103262306a36Sopenharmony_ci map->buckets[bno], 103362306a36Sopenharmony_ci weight, weight_max, 103462306a36Sopenharmony_ci x, numrep, 103562306a36Sopenharmony_ci curstep->arg2, 103662306a36Sopenharmony_ci o+osize, j, 103762306a36Sopenharmony_ci result_max-osize, 103862306a36Sopenharmony_ci choose_tries, 103962306a36Sopenharmony_ci recurse_tries, 104062306a36Sopenharmony_ci choose_local_retries, 104162306a36Sopenharmony_ci choose_local_fallback_retries, 104262306a36Sopenharmony_ci recurse_to_leaf, 104362306a36Sopenharmony_ci vary_r, 104462306a36Sopenharmony_ci stable, 104562306a36Sopenharmony_ci c+osize, 104662306a36Sopenharmony_ci 0, 104762306a36Sopenharmony_ci choose_args); 104862306a36Sopenharmony_ci } else { 104962306a36Sopenharmony_ci out_size = ((numrep < (result_max-osize)) ? 105062306a36Sopenharmony_ci numrep : (result_max-osize)); 105162306a36Sopenharmony_ci crush_choose_indep( 105262306a36Sopenharmony_ci map, 105362306a36Sopenharmony_ci cw, 105462306a36Sopenharmony_ci map->buckets[bno], 105562306a36Sopenharmony_ci weight, weight_max, 105662306a36Sopenharmony_ci x, out_size, numrep, 105762306a36Sopenharmony_ci curstep->arg2, 105862306a36Sopenharmony_ci o+osize, j, 105962306a36Sopenharmony_ci choose_tries, 106062306a36Sopenharmony_ci choose_leaf_tries ? 106162306a36Sopenharmony_ci choose_leaf_tries : 1, 106262306a36Sopenharmony_ci recurse_to_leaf, 106362306a36Sopenharmony_ci c+osize, 106462306a36Sopenharmony_ci 0, 106562306a36Sopenharmony_ci choose_args); 106662306a36Sopenharmony_ci osize += out_size; 106762306a36Sopenharmony_ci } 106862306a36Sopenharmony_ci } 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_ci if (recurse_to_leaf) 107162306a36Sopenharmony_ci /* copy final _leaf_ values to output set */ 107262306a36Sopenharmony_ci memcpy(o, c, osize*sizeof(*o)); 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci /* swap o and w arrays */ 107562306a36Sopenharmony_ci swap(o, w); 107662306a36Sopenharmony_ci wsize = osize; 107762306a36Sopenharmony_ci break; 107862306a36Sopenharmony_ci 107962306a36Sopenharmony_ci 108062306a36Sopenharmony_ci case CRUSH_RULE_EMIT: 108162306a36Sopenharmony_ci for (i = 0; i < wsize && result_len < result_max; i++) { 108262306a36Sopenharmony_ci result[result_len] = w[i]; 108362306a36Sopenharmony_ci result_len++; 108462306a36Sopenharmony_ci } 108562306a36Sopenharmony_ci wsize = 0; 108662306a36Sopenharmony_ci break; 108762306a36Sopenharmony_ci 108862306a36Sopenharmony_ci default: 108962306a36Sopenharmony_ci dprintk(" unknown op %d at step %d\n", 109062306a36Sopenharmony_ci curstep->op, step); 109162306a36Sopenharmony_ci break; 109262306a36Sopenharmony_ci } 109362306a36Sopenharmony_ci } 109462306a36Sopenharmony_ci 109562306a36Sopenharmony_ci return result_len; 109662306a36Sopenharmony_ci} 1097