xref: /kernel/linux/linux-6.6/net/ceph/crush/mapper.c (revision 62306a36)
162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Ceph - scalable distributed file system
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Copyright (C) 2015 Intel Corporation All Rights Reserved
562306a36Sopenharmony_ci *
662306a36Sopenharmony_ci * This is free software; you can redistribute it and/or
762306a36Sopenharmony_ci * modify it under the terms of the GNU Lesser General Public
862306a36Sopenharmony_ci * License version 2.1, as published by the Free Software
962306a36Sopenharmony_ci * Foundation.  See file COPYING.
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci#ifdef __KERNEL__
1462306a36Sopenharmony_ci# include <linux/string.h>
1562306a36Sopenharmony_ci# include <linux/slab.h>
1662306a36Sopenharmony_ci# include <linux/bug.h>
1762306a36Sopenharmony_ci# include <linux/kernel.h>
1862306a36Sopenharmony_ci# include <linux/crush/crush.h>
1962306a36Sopenharmony_ci# include <linux/crush/hash.h>
2062306a36Sopenharmony_ci# include <linux/crush/mapper.h>
2162306a36Sopenharmony_ci#else
2262306a36Sopenharmony_ci# include "crush_compat.h"
2362306a36Sopenharmony_ci# include "crush.h"
2462306a36Sopenharmony_ci# include "hash.h"
2562306a36Sopenharmony_ci# include "mapper.h"
2662306a36Sopenharmony_ci#endif
2762306a36Sopenharmony_ci#include "crush_ln_table.h"
2862306a36Sopenharmony_ci
2962306a36Sopenharmony_ci#define dprintk(args...) /* printf(args) */
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci/*
3262306a36Sopenharmony_ci * Implement the core CRUSH mapping algorithm.
3362306a36Sopenharmony_ci */
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_ci/**
3662306a36Sopenharmony_ci * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
3762306a36Sopenharmony_ci * @map: the crush_map
3862306a36Sopenharmony_ci * @ruleset: the storage ruleset id (user defined)
3962306a36Sopenharmony_ci * @type: storage ruleset type (user defined)
4062306a36Sopenharmony_ci * @size: output set size
4162306a36Sopenharmony_ci */
4262306a36Sopenharmony_ciint crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
4362306a36Sopenharmony_ci{
4462306a36Sopenharmony_ci	__u32 i;
4562306a36Sopenharmony_ci
4662306a36Sopenharmony_ci	for (i = 0; i < map->max_rules; i++) {
4762306a36Sopenharmony_ci		if (map->rules[i] &&
4862306a36Sopenharmony_ci		    map->rules[i]->mask.ruleset == ruleset &&
4962306a36Sopenharmony_ci		    map->rules[i]->mask.type == type &&
5062306a36Sopenharmony_ci		    map->rules[i]->mask.min_size <= size &&
5162306a36Sopenharmony_ci		    map->rules[i]->mask.max_size >= size)
5262306a36Sopenharmony_ci			return i;
5362306a36Sopenharmony_ci	}
5462306a36Sopenharmony_ci	return -1;
5562306a36Sopenharmony_ci}
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci/*
5862306a36Sopenharmony_ci * bucket choose methods
5962306a36Sopenharmony_ci *
6062306a36Sopenharmony_ci * For each bucket algorithm, we have a "choose" method that, given a
6162306a36Sopenharmony_ci * crush input @x and replica position (usually, position in output set) @r,
6262306a36Sopenharmony_ci * will produce an item in the bucket.
6362306a36Sopenharmony_ci */
6462306a36Sopenharmony_ci
6562306a36Sopenharmony_ci/*
6662306a36Sopenharmony_ci * Choose based on a random permutation of the bucket.
6762306a36Sopenharmony_ci *
6862306a36Sopenharmony_ci * We used to use some prime number arithmetic to do this, but it
6962306a36Sopenharmony_ci * wasn't very random, and had some other bad behaviors.  Instead, we
7062306a36Sopenharmony_ci * calculate an actual random permutation of the bucket members.
7162306a36Sopenharmony_ci * Since this is expensive, we optimize for the r=0 case, which
7262306a36Sopenharmony_ci * captures the vast majority of calls.
7362306a36Sopenharmony_ci */
7462306a36Sopenharmony_cistatic int bucket_perm_choose(const struct crush_bucket *bucket,
7562306a36Sopenharmony_ci			      struct crush_work_bucket *work,
7662306a36Sopenharmony_ci			      int x, int r)
7762306a36Sopenharmony_ci{
7862306a36Sopenharmony_ci	unsigned int pr = r % bucket->size;
7962306a36Sopenharmony_ci	unsigned int i, s;
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	/* start a new permutation if @x has changed */
8262306a36Sopenharmony_ci	if (work->perm_x != (__u32)x || work->perm_n == 0) {
8362306a36Sopenharmony_ci		dprintk("bucket %d new x=%d\n", bucket->id, x);
8462306a36Sopenharmony_ci		work->perm_x = x;
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci		/* optimize common r=0 case */
8762306a36Sopenharmony_ci		if (pr == 0) {
8862306a36Sopenharmony_ci			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
8962306a36Sopenharmony_ci				bucket->size;
9062306a36Sopenharmony_ci			work->perm[0] = s;
9162306a36Sopenharmony_ci			work->perm_n = 0xffff;   /* magic value, see below */
9262306a36Sopenharmony_ci			goto out;
9362306a36Sopenharmony_ci		}
9462306a36Sopenharmony_ci
9562306a36Sopenharmony_ci		for (i = 0; i < bucket->size; i++)
9662306a36Sopenharmony_ci			work->perm[i] = i;
9762306a36Sopenharmony_ci		work->perm_n = 0;
9862306a36Sopenharmony_ci	} else if (work->perm_n == 0xffff) {
9962306a36Sopenharmony_ci		/* clean up after the r=0 case above */
10062306a36Sopenharmony_ci		for (i = 1; i < bucket->size; i++)
10162306a36Sopenharmony_ci			work->perm[i] = i;
10262306a36Sopenharmony_ci		work->perm[work->perm[0]] = 0;
10362306a36Sopenharmony_ci		work->perm_n = 1;
10462306a36Sopenharmony_ci	}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci	/* calculate permutation up to pr */
10762306a36Sopenharmony_ci	for (i = 0; i < work->perm_n; i++)
10862306a36Sopenharmony_ci		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
10962306a36Sopenharmony_ci	while (work->perm_n <= pr) {
11062306a36Sopenharmony_ci		unsigned int p = work->perm_n;
11162306a36Sopenharmony_ci		/* no point in swapping the final entry */
11262306a36Sopenharmony_ci		if (p < bucket->size - 1) {
11362306a36Sopenharmony_ci			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
11462306a36Sopenharmony_ci				(bucket->size - p);
11562306a36Sopenharmony_ci			if (i) {
11662306a36Sopenharmony_ci				unsigned int t = work->perm[p + i];
11762306a36Sopenharmony_ci				work->perm[p + i] = work->perm[p];
11862306a36Sopenharmony_ci				work->perm[p] = t;
11962306a36Sopenharmony_ci			}
12062306a36Sopenharmony_ci			dprintk(" perm_choose swap %d with %d\n", p, p+i);
12162306a36Sopenharmony_ci		}
12262306a36Sopenharmony_ci		work->perm_n++;
12362306a36Sopenharmony_ci	}
12462306a36Sopenharmony_ci	for (i = 0; i < bucket->size; i++)
12562306a36Sopenharmony_ci		dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	s = work->perm[pr];
12862306a36Sopenharmony_ciout:
12962306a36Sopenharmony_ci	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
13062306a36Sopenharmony_ci		bucket->size, x, r, pr, s);
13162306a36Sopenharmony_ci	return bucket->items[s];
13262306a36Sopenharmony_ci}
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_ci/* uniform */
13562306a36Sopenharmony_cistatic int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
13662306a36Sopenharmony_ci				 struct crush_work_bucket *work, int x, int r)
13762306a36Sopenharmony_ci{
13862306a36Sopenharmony_ci	return bucket_perm_choose(&bucket->h, work, x, r);
13962306a36Sopenharmony_ci}
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_ci/* list */
14262306a36Sopenharmony_cistatic int bucket_list_choose(const struct crush_bucket_list *bucket,
14362306a36Sopenharmony_ci			      int x, int r)
14462306a36Sopenharmony_ci{
14562306a36Sopenharmony_ci	int i;
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	for (i = bucket->h.size-1; i >= 0; i--) {
14862306a36Sopenharmony_ci		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
14962306a36Sopenharmony_ci					 r, bucket->h.id);
15062306a36Sopenharmony_ci		w &= 0xffff;
15162306a36Sopenharmony_ci		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
15262306a36Sopenharmony_ci			"sw %x rand %llx",
15362306a36Sopenharmony_ci			i, x, r, bucket->h.items[i], bucket->item_weights[i],
15462306a36Sopenharmony_ci			bucket->sum_weights[i], w);
15562306a36Sopenharmony_ci		w *= bucket->sum_weights[i];
15662306a36Sopenharmony_ci		w = w >> 16;
15762306a36Sopenharmony_ci		/*dprintk(" scaled %llx\n", w);*/
15862306a36Sopenharmony_ci		if (w < bucket->item_weights[i]) {
15962306a36Sopenharmony_ci			return bucket->h.items[i];
16062306a36Sopenharmony_ci		}
16162306a36Sopenharmony_ci	}
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci	dprintk("bad list sums for bucket %d\n", bucket->h.id);
16462306a36Sopenharmony_ci	return bucket->h.items[0];
16562306a36Sopenharmony_ci}
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci/* (binary) tree */
16962306a36Sopenharmony_cistatic int height(int n)
17062306a36Sopenharmony_ci{
17162306a36Sopenharmony_ci	int h = 0;
17262306a36Sopenharmony_ci	while ((n & 1) == 0) {
17362306a36Sopenharmony_ci		h++;
17462306a36Sopenharmony_ci		n = n >> 1;
17562306a36Sopenharmony_ci	}
17662306a36Sopenharmony_ci	return h;
17762306a36Sopenharmony_ci}
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_cistatic int left(int x)
18062306a36Sopenharmony_ci{
18162306a36Sopenharmony_ci	int h = height(x);
18262306a36Sopenharmony_ci	return x - (1 << (h-1));
18362306a36Sopenharmony_ci}
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_cistatic int right(int x)
18662306a36Sopenharmony_ci{
18762306a36Sopenharmony_ci	int h = height(x);
18862306a36Sopenharmony_ci	return x + (1 << (h-1));
18962306a36Sopenharmony_ci}
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_cistatic int terminal(int x)
19262306a36Sopenharmony_ci{
19362306a36Sopenharmony_ci	return x & 1;
19462306a36Sopenharmony_ci}
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_cistatic int bucket_tree_choose(const struct crush_bucket_tree *bucket,
19762306a36Sopenharmony_ci			      int x, int r)
19862306a36Sopenharmony_ci{
19962306a36Sopenharmony_ci	int n;
20062306a36Sopenharmony_ci	__u32 w;
20162306a36Sopenharmony_ci	__u64 t;
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci	/* start at root */
20462306a36Sopenharmony_ci	n = bucket->num_nodes >> 1;
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci	while (!terminal(n)) {
20762306a36Sopenharmony_ci		int l;
20862306a36Sopenharmony_ci		/* pick point in [0, w) */
20962306a36Sopenharmony_ci		w = bucket->node_weights[n];
21062306a36Sopenharmony_ci		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
21162306a36Sopenharmony_ci					  bucket->h.id) * (__u64)w;
21262306a36Sopenharmony_ci		t = t >> 32;
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci		/* descend to the left or right? */
21562306a36Sopenharmony_ci		l = left(n);
21662306a36Sopenharmony_ci		if (t < bucket->node_weights[l])
21762306a36Sopenharmony_ci			n = l;
21862306a36Sopenharmony_ci		else
21962306a36Sopenharmony_ci			n = right(n);
22062306a36Sopenharmony_ci	}
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_ci	return bucket->h.items[n >> 1];
22362306a36Sopenharmony_ci}
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_ci/* straw */
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_cistatic int bucket_straw_choose(const struct crush_bucket_straw *bucket,
22962306a36Sopenharmony_ci			       int x, int r)
23062306a36Sopenharmony_ci{
23162306a36Sopenharmony_ci	__u32 i;
23262306a36Sopenharmony_ci	int high = 0;
23362306a36Sopenharmony_ci	__u64 high_draw = 0;
23462306a36Sopenharmony_ci	__u64 draw;
23562306a36Sopenharmony_ci
23662306a36Sopenharmony_ci	for (i = 0; i < bucket->h.size; i++) {
23762306a36Sopenharmony_ci		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
23862306a36Sopenharmony_ci		draw &= 0xffff;
23962306a36Sopenharmony_ci		draw *= bucket->straws[i];
24062306a36Sopenharmony_ci		if (i == 0 || draw > high_draw) {
24162306a36Sopenharmony_ci			high = i;
24262306a36Sopenharmony_ci			high_draw = draw;
24362306a36Sopenharmony_ci		}
24462306a36Sopenharmony_ci	}
24562306a36Sopenharmony_ci	return bucket->h.items[high];
24662306a36Sopenharmony_ci}
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci/* compute 2^44*log2(input+1) */
24962306a36Sopenharmony_cistatic __u64 crush_ln(unsigned int xin)
25062306a36Sopenharmony_ci{
25162306a36Sopenharmony_ci	unsigned int x = xin;
25262306a36Sopenharmony_ci	int iexpon, index1, index2;
25362306a36Sopenharmony_ci	__u64 RH, LH, LL, xl64, result;
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	x++;
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	/* normalize input */
25862306a36Sopenharmony_ci	iexpon = 15;
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	/*
26162306a36Sopenharmony_ci	 * figure out number of bits we need to shift and
26262306a36Sopenharmony_ci	 * do it in one step instead of iteratively
26362306a36Sopenharmony_ci	 */
26462306a36Sopenharmony_ci	if (!(x & 0x18000)) {
26562306a36Sopenharmony_ci		int bits = __builtin_clz(x & 0x1FFFF) - 16;
26662306a36Sopenharmony_ci		x <<= bits;
26762306a36Sopenharmony_ci		iexpon = 15 - bits;
26862306a36Sopenharmony_ci	}
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	index1 = (x >> 8) << 1;
27162306a36Sopenharmony_ci	/* RH ~ 2^56/index1 */
27262306a36Sopenharmony_ci	RH = __RH_LH_tbl[index1 - 256];
27362306a36Sopenharmony_ci	/* LH ~ 2^48 * log2(index1/256) */
27462306a36Sopenharmony_ci	LH = __RH_LH_tbl[index1 + 1 - 256];
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
27762306a36Sopenharmony_ci	xl64 = (__s64)x * RH;
27862306a36Sopenharmony_ci	xl64 >>= 48;
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	result = iexpon;
28162306a36Sopenharmony_ci	result <<= (12 + 32);
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	index2 = xl64 & 0xff;
28462306a36Sopenharmony_ci	/* LL ~ 2^48*log2(1.0+index2/2^15) */
28562306a36Sopenharmony_ci	LL = __LL_tbl[index2];
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_ci	LH = LH + LL;
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	LH >>= (48 - 12 - 32);
29062306a36Sopenharmony_ci	result += LH;
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci	return result;
29362306a36Sopenharmony_ci}
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_ci/*
29762306a36Sopenharmony_ci * straw2
29862306a36Sopenharmony_ci *
29962306a36Sopenharmony_ci * for reference, see:
30062306a36Sopenharmony_ci *
30162306a36Sopenharmony_ci * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
30262306a36Sopenharmony_ci *
30362306a36Sopenharmony_ci */
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_cistatic __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
30662306a36Sopenharmony_ci				     const struct crush_choose_arg *arg,
30762306a36Sopenharmony_ci				     int position)
30862306a36Sopenharmony_ci{
30962306a36Sopenharmony_ci	if (!arg || !arg->weight_set)
31062306a36Sopenharmony_ci		return bucket->item_weights;
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci	if (position >= arg->weight_set_size)
31362306a36Sopenharmony_ci		position = arg->weight_set_size - 1;
31462306a36Sopenharmony_ci	return arg->weight_set[position].weights;
31562306a36Sopenharmony_ci}
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_cistatic __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
31862306a36Sopenharmony_ci				 const struct crush_choose_arg *arg)
31962306a36Sopenharmony_ci{
32062306a36Sopenharmony_ci	if (!arg || !arg->ids)
32162306a36Sopenharmony_ci		return bucket->h.items;
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_ci	return arg->ids;
32462306a36Sopenharmony_ci}
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_cistatic int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
32762306a36Sopenharmony_ci				int x, int r,
32862306a36Sopenharmony_ci				const struct crush_choose_arg *arg,
32962306a36Sopenharmony_ci				int position)
33062306a36Sopenharmony_ci{
33162306a36Sopenharmony_ci	unsigned int i, high = 0;
33262306a36Sopenharmony_ci	unsigned int u;
33362306a36Sopenharmony_ci	__s64 ln, draw, high_draw = 0;
33462306a36Sopenharmony_ci	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
33562306a36Sopenharmony_ci	__s32 *ids = get_choose_arg_ids(bucket, arg);
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	for (i = 0; i < bucket->h.size; i++) {
33862306a36Sopenharmony_ci		dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
33962306a36Sopenharmony_ci		if (weights[i]) {
34062306a36Sopenharmony_ci			u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
34162306a36Sopenharmony_ci			u &= 0xffff;
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_ci			/*
34462306a36Sopenharmony_ci			 * for some reason slightly less than 0x10000 produces
34562306a36Sopenharmony_ci			 * a slightly more accurate distribution... probably a
34662306a36Sopenharmony_ci			 * rounding effect.
34762306a36Sopenharmony_ci			 *
34862306a36Sopenharmony_ci			 * the natural log lookup table maps [0,0xffff]
34962306a36Sopenharmony_ci			 * (corresponding to real numbers [1/0x10000, 1] to
35062306a36Sopenharmony_ci			 * [0, 0xffffffffffff] (corresponding to real numbers
35162306a36Sopenharmony_ci			 * [-11.090355,0]).
35262306a36Sopenharmony_ci			 */
35362306a36Sopenharmony_ci			ln = crush_ln(u) - 0x1000000000000ll;
35462306a36Sopenharmony_ci
35562306a36Sopenharmony_ci			/*
35662306a36Sopenharmony_ci			 * divide by 16.16 fixed-point weight.  note
35762306a36Sopenharmony_ci			 * that the ln value is negative, so a larger
35862306a36Sopenharmony_ci			 * weight means a larger (less negative) value
35962306a36Sopenharmony_ci			 * for draw.
36062306a36Sopenharmony_ci			 */
36162306a36Sopenharmony_ci			draw = div64_s64(ln, weights[i]);
36262306a36Sopenharmony_ci		} else {
36362306a36Sopenharmony_ci			draw = S64_MIN;
36462306a36Sopenharmony_ci		}
36562306a36Sopenharmony_ci
36662306a36Sopenharmony_ci		if (i == 0 || draw > high_draw) {
36762306a36Sopenharmony_ci			high = i;
36862306a36Sopenharmony_ci			high_draw = draw;
36962306a36Sopenharmony_ci		}
37062306a36Sopenharmony_ci	}
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	return bucket->h.items[high];
37362306a36Sopenharmony_ci}
37462306a36Sopenharmony_ci
37562306a36Sopenharmony_ci
37662306a36Sopenharmony_cistatic int crush_bucket_choose(const struct crush_bucket *in,
37762306a36Sopenharmony_ci			       struct crush_work_bucket *work,
37862306a36Sopenharmony_ci			       int x, int r,
37962306a36Sopenharmony_ci			       const struct crush_choose_arg *arg,
38062306a36Sopenharmony_ci			       int position)
38162306a36Sopenharmony_ci{
38262306a36Sopenharmony_ci	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
38362306a36Sopenharmony_ci	BUG_ON(in->size == 0);
38462306a36Sopenharmony_ci	switch (in->alg) {
38562306a36Sopenharmony_ci	case CRUSH_BUCKET_UNIFORM:
38662306a36Sopenharmony_ci		return bucket_uniform_choose(
38762306a36Sopenharmony_ci			(const struct crush_bucket_uniform *)in,
38862306a36Sopenharmony_ci			work, x, r);
38962306a36Sopenharmony_ci	case CRUSH_BUCKET_LIST:
39062306a36Sopenharmony_ci		return bucket_list_choose((const struct crush_bucket_list *)in,
39162306a36Sopenharmony_ci					  x, r);
39262306a36Sopenharmony_ci	case CRUSH_BUCKET_TREE:
39362306a36Sopenharmony_ci		return bucket_tree_choose((const struct crush_bucket_tree *)in,
39462306a36Sopenharmony_ci					  x, r);
39562306a36Sopenharmony_ci	case CRUSH_BUCKET_STRAW:
39662306a36Sopenharmony_ci		return bucket_straw_choose(
39762306a36Sopenharmony_ci			(const struct crush_bucket_straw *)in,
39862306a36Sopenharmony_ci			x, r);
39962306a36Sopenharmony_ci	case CRUSH_BUCKET_STRAW2:
40062306a36Sopenharmony_ci		return bucket_straw2_choose(
40162306a36Sopenharmony_ci			(const struct crush_bucket_straw2 *)in,
40262306a36Sopenharmony_ci			x, r, arg, position);
40362306a36Sopenharmony_ci	default:
40462306a36Sopenharmony_ci		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
40562306a36Sopenharmony_ci		return in->items[0];
40662306a36Sopenharmony_ci	}
40762306a36Sopenharmony_ci}
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci/*
41062306a36Sopenharmony_ci * true if device is marked "out" (failed, fully offloaded)
41162306a36Sopenharmony_ci * of the cluster
41262306a36Sopenharmony_ci */
41362306a36Sopenharmony_cistatic int is_out(const struct crush_map *map,
41462306a36Sopenharmony_ci		  const __u32 *weight, int weight_max,
41562306a36Sopenharmony_ci		  int item, int x)
41662306a36Sopenharmony_ci{
41762306a36Sopenharmony_ci	if (item >= weight_max)
41862306a36Sopenharmony_ci		return 1;
41962306a36Sopenharmony_ci	if (weight[item] >= 0x10000)
42062306a36Sopenharmony_ci		return 0;
42162306a36Sopenharmony_ci	if (weight[item] == 0)
42262306a36Sopenharmony_ci		return 1;
42362306a36Sopenharmony_ci	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
42462306a36Sopenharmony_ci	    < weight[item])
42562306a36Sopenharmony_ci		return 0;
42662306a36Sopenharmony_ci	return 1;
42762306a36Sopenharmony_ci}
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci/**
43062306a36Sopenharmony_ci * crush_choose_firstn - choose numrep distinct items of given type
43162306a36Sopenharmony_ci * @map: the crush_map
43262306a36Sopenharmony_ci * @bucket: the bucket we are choose an item from
43362306a36Sopenharmony_ci * @x: crush input value
43462306a36Sopenharmony_ci * @numrep: the number of items to choose
43562306a36Sopenharmony_ci * @type: the type of item to choose
43662306a36Sopenharmony_ci * @out: pointer to output vector
43762306a36Sopenharmony_ci * @outpos: our position in that vector
43862306a36Sopenharmony_ci * @out_size: size of the out vector
43962306a36Sopenharmony_ci * @tries: number of attempts to make
44062306a36Sopenharmony_ci * @recurse_tries: number of attempts to have recursive chooseleaf make
44162306a36Sopenharmony_ci * @local_retries: localized retries
44262306a36Sopenharmony_ci * @local_fallback_retries: localized fallback retries
44362306a36Sopenharmony_ci * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
44462306a36Sopenharmony_ci * @stable: stable mode starts rep=0 in the recursive call for all replicas
44562306a36Sopenharmony_ci * @vary_r: pass r to recursive calls
44662306a36Sopenharmony_ci * @out2: second output vector for leaf items (if @recurse_to_leaf)
44762306a36Sopenharmony_ci * @parent_r: r value passed from the parent
44862306a36Sopenharmony_ci */
44962306a36Sopenharmony_cistatic int crush_choose_firstn(const struct crush_map *map,
45062306a36Sopenharmony_ci			       struct crush_work *work,
45162306a36Sopenharmony_ci			       const struct crush_bucket *bucket,
45262306a36Sopenharmony_ci			       const __u32 *weight, int weight_max,
45362306a36Sopenharmony_ci			       int x, int numrep, int type,
45462306a36Sopenharmony_ci			       int *out, int outpos,
45562306a36Sopenharmony_ci			       int out_size,
45662306a36Sopenharmony_ci			       unsigned int tries,
45762306a36Sopenharmony_ci			       unsigned int recurse_tries,
45862306a36Sopenharmony_ci			       unsigned int local_retries,
45962306a36Sopenharmony_ci			       unsigned int local_fallback_retries,
46062306a36Sopenharmony_ci			       int recurse_to_leaf,
46162306a36Sopenharmony_ci			       unsigned int vary_r,
46262306a36Sopenharmony_ci			       unsigned int stable,
46362306a36Sopenharmony_ci			       int *out2,
46462306a36Sopenharmony_ci			       int parent_r,
46562306a36Sopenharmony_ci			       const struct crush_choose_arg *choose_args)
46662306a36Sopenharmony_ci{
46762306a36Sopenharmony_ci	int rep;
46862306a36Sopenharmony_ci	unsigned int ftotal, flocal;
46962306a36Sopenharmony_ci	int retry_descent, retry_bucket, skip_rep;
47062306a36Sopenharmony_ci	const struct crush_bucket *in = bucket;
47162306a36Sopenharmony_ci	int r;
47262306a36Sopenharmony_ci	int i;
47362306a36Sopenharmony_ci	int item = 0;
47462306a36Sopenharmony_ci	int itemtype;
47562306a36Sopenharmony_ci	int collide, reject;
47662306a36Sopenharmony_ci	int count = out_size;
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_ci	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
47962306a36Sopenharmony_ci		recurse_to_leaf ? "_LEAF" : "",
48062306a36Sopenharmony_ci		bucket->id, x, outpos, numrep,
48162306a36Sopenharmony_ci		tries, recurse_tries, local_retries, local_fallback_retries,
48262306a36Sopenharmony_ci		parent_r, stable);
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
48562306a36Sopenharmony_ci		/* keep trying until we get a non-out, non-colliding item */
48662306a36Sopenharmony_ci		ftotal = 0;
48762306a36Sopenharmony_ci		skip_rep = 0;
48862306a36Sopenharmony_ci		do {
48962306a36Sopenharmony_ci			retry_descent = 0;
49062306a36Sopenharmony_ci			in = bucket;               /* initial bucket */
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci			/* choose through intervening buckets */
49362306a36Sopenharmony_ci			flocal = 0;
49462306a36Sopenharmony_ci			do {
49562306a36Sopenharmony_ci				collide = 0;
49662306a36Sopenharmony_ci				retry_bucket = 0;
49762306a36Sopenharmony_ci				r = rep + parent_r;
49862306a36Sopenharmony_ci				/* r' = r + f_total */
49962306a36Sopenharmony_ci				r += ftotal;
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci				/* bucket choose */
50262306a36Sopenharmony_ci				if (in->size == 0) {
50362306a36Sopenharmony_ci					reject = 1;
50462306a36Sopenharmony_ci					goto reject;
50562306a36Sopenharmony_ci				}
50662306a36Sopenharmony_ci				if (local_fallback_retries > 0 &&
50762306a36Sopenharmony_ci				    flocal >= (in->size>>1) &&
50862306a36Sopenharmony_ci				    flocal > local_fallback_retries)
50962306a36Sopenharmony_ci					item = bucket_perm_choose(
51062306a36Sopenharmony_ci						in, work->work[-1-in->id],
51162306a36Sopenharmony_ci						x, r);
51262306a36Sopenharmony_ci				else
51362306a36Sopenharmony_ci					item = crush_bucket_choose(
51462306a36Sopenharmony_ci						in, work->work[-1-in->id],
51562306a36Sopenharmony_ci						x, r,
51662306a36Sopenharmony_ci						(choose_args ?
51762306a36Sopenharmony_ci						 &choose_args[-1-in->id] : NULL),
51862306a36Sopenharmony_ci						outpos);
51962306a36Sopenharmony_ci				if (item >= map->max_devices) {
52062306a36Sopenharmony_ci					dprintk("   bad item %d\n", item);
52162306a36Sopenharmony_ci					skip_rep = 1;
52262306a36Sopenharmony_ci					break;
52362306a36Sopenharmony_ci				}
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci				/* desired type? */
52662306a36Sopenharmony_ci				if (item < 0)
52762306a36Sopenharmony_ci					itemtype = map->buckets[-1-item]->type;
52862306a36Sopenharmony_ci				else
52962306a36Sopenharmony_ci					itemtype = 0;
53062306a36Sopenharmony_ci				dprintk("  item %d type %d\n", item, itemtype);
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_ci				/* keep going? */
53362306a36Sopenharmony_ci				if (itemtype != type) {
53462306a36Sopenharmony_ci					if (item >= 0 ||
53562306a36Sopenharmony_ci					    (-1-item) >= map->max_buckets) {
53662306a36Sopenharmony_ci						dprintk("   bad item type %d\n", type);
53762306a36Sopenharmony_ci						skip_rep = 1;
53862306a36Sopenharmony_ci						break;
53962306a36Sopenharmony_ci					}
54062306a36Sopenharmony_ci					in = map->buckets[-1-item];
54162306a36Sopenharmony_ci					retry_bucket = 1;
54262306a36Sopenharmony_ci					continue;
54362306a36Sopenharmony_ci				}
54462306a36Sopenharmony_ci
54562306a36Sopenharmony_ci				/* collision? */
54662306a36Sopenharmony_ci				for (i = 0; i < outpos; i++) {
54762306a36Sopenharmony_ci					if (out[i] == item) {
54862306a36Sopenharmony_ci						collide = 1;
54962306a36Sopenharmony_ci						break;
55062306a36Sopenharmony_ci					}
55162306a36Sopenharmony_ci				}
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci				reject = 0;
55462306a36Sopenharmony_ci				if (!collide && recurse_to_leaf) {
55562306a36Sopenharmony_ci					if (item < 0) {
55662306a36Sopenharmony_ci						int sub_r;
55762306a36Sopenharmony_ci						if (vary_r)
55862306a36Sopenharmony_ci							sub_r = r >> (vary_r-1);
55962306a36Sopenharmony_ci						else
56062306a36Sopenharmony_ci							sub_r = 0;
56162306a36Sopenharmony_ci						if (crush_choose_firstn(
56262306a36Sopenharmony_ci							    map,
56362306a36Sopenharmony_ci							    work,
56462306a36Sopenharmony_ci							    map->buckets[-1-item],
56562306a36Sopenharmony_ci							    weight, weight_max,
56662306a36Sopenharmony_ci							    x, stable ? 1 : outpos+1, 0,
56762306a36Sopenharmony_ci							    out2, outpos, count,
56862306a36Sopenharmony_ci							    recurse_tries, 0,
56962306a36Sopenharmony_ci							    local_retries,
57062306a36Sopenharmony_ci							    local_fallback_retries,
57162306a36Sopenharmony_ci							    0,
57262306a36Sopenharmony_ci							    vary_r,
57362306a36Sopenharmony_ci							    stable,
57462306a36Sopenharmony_ci							    NULL,
57562306a36Sopenharmony_ci							    sub_r,
57662306a36Sopenharmony_ci							    choose_args) <= outpos)
57762306a36Sopenharmony_ci							/* didn't get leaf */
57862306a36Sopenharmony_ci							reject = 1;
57962306a36Sopenharmony_ci					} else {
58062306a36Sopenharmony_ci						/* we already have a leaf! */
58162306a36Sopenharmony_ci						out2[outpos] = item;
58262306a36Sopenharmony_ci					}
58362306a36Sopenharmony_ci				}
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci				if (!reject && !collide) {
58662306a36Sopenharmony_ci					/* out? */
58762306a36Sopenharmony_ci					if (itemtype == 0)
58862306a36Sopenharmony_ci						reject = is_out(map, weight,
58962306a36Sopenharmony_ci								weight_max,
59062306a36Sopenharmony_ci								item, x);
59162306a36Sopenharmony_ci				}
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_cireject:
59462306a36Sopenharmony_ci				if (reject || collide) {
59562306a36Sopenharmony_ci					ftotal++;
59662306a36Sopenharmony_ci					flocal++;
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci					if (collide && flocal <= local_retries)
59962306a36Sopenharmony_ci						/* retry locally a few times */
60062306a36Sopenharmony_ci						retry_bucket = 1;
60162306a36Sopenharmony_ci					else if (local_fallback_retries > 0 &&
60262306a36Sopenharmony_ci						 flocal <= in->size + local_fallback_retries)
60362306a36Sopenharmony_ci						/* exhaustive bucket search */
60462306a36Sopenharmony_ci						retry_bucket = 1;
60562306a36Sopenharmony_ci					else if (ftotal < tries)
60662306a36Sopenharmony_ci						/* then retry descent */
60762306a36Sopenharmony_ci						retry_descent = 1;
60862306a36Sopenharmony_ci					else
60962306a36Sopenharmony_ci						/* else give up */
61062306a36Sopenharmony_ci						skip_rep = 1;
61162306a36Sopenharmony_ci					dprintk("  reject %d  collide %d  "
61262306a36Sopenharmony_ci						"ftotal %u  flocal %u\n",
61362306a36Sopenharmony_ci						reject, collide, ftotal,
61462306a36Sopenharmony_ci						flocal);
61562306a36Sopenharmony_ci				}
61662306a36Sopenharmony_ci			} while (retry_bucket);
61762306a36Sopenharmony_ci		} while (retry_descent);
61862306a36Sopenharmony_ci
61962306a36Sopenharmony_ci		if (skip_rep) {
62062306a36Sopenharmony_ci			dprintk("skip rep\n");
62162306a36Sopenharmony_ci			continue;
62262306a36Sopenharmony_ci		}
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_ci		dprintk("CHOOSE got %d\n", item);
62562306a36Sopenharmony_ci		out[outpos] = item;
62662306a36Sopenharmony_ci		outpos++;
62762306a36Sopenharmony_ci		count--;
62862306a36Sopenharmony_ci#ifndef __KERNEL__
62962306a36Sopenharmony_ci		if (map->choose_tries && ftotal <= map->choose_total_tries)
63062306a36Sopenharmony_ci			map->choose_tries[ftotal]++;
63162306a36Sopenharmony_ci#endif
63262306a36Sopenharmony_ci	}
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	dprintk("CHOOSE returns %d\n", outpos);
63562306a36Sopenharmony_ci	return outpos;
63662306a36Sopenharmony_ci}
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci
63962306a36Sopenharmony_ci/**
64062306a36Sopenharmony_ci * crush_choose_indep: alternative breadth-first positionally stable mapping
64162306a36Sopenharmony_ci *
64262306a36Sopenharmony_ci */
64362306a36Sopenharmony_cistatic void crush_choose_indep(const struct crush_map *map,
64462306a36Sopenharmony_ci			       struct crush_work *work,
64562306a36Sopenharmony_ci			       const struct crush_bucket *bucket,
64662306a36Sopenharmony_ci			       const __u32 *weight, int weight_max,
64762306a36Sopenharmony_ci			       int x, int left, int numrep, int type,
64862306a36Sopenharmony_ci			       int *out, int outpos,
64962306a36Sopenharmony_ci			       unsigned int tries,
65062306a36Sopenharmony_ci			       unsigned int recurse_tries,
65162306a36Sopenharmony_ci			       int recurse_to_leaf,
65262306a36Sopenharmony_ci			       int *out2,
65362306a36Sopenharmony_ci			       int parent_r,
65462306a36Sopenharmony_ci			       const struct crush_choose_arg *choose_args)
65562306a36Sopenharmony_ci{
65662306a36Sopenharmony_ci	const struct crush_bucket *in = bucket;
65762306a36Sopenharmony_ci	int endpos = outpos + left;
65862306a36Sopenharmony_ci	int rep;
65962306a36Sopenharmony_ci	unsigned int ftotal;
66062306a36Sopenharmony_ci	int r;
66162306a36Sopenharmony_ci	int i;
66262306a36Sopenharmony_ci	int item = 0;
66362306a36Sopenharmony_ci	int itemtype;
66462306a36Sopenharmony_ci	int collide;
66562306a36Sopenharmony_ci
66662306a36Sopenharmony_ci	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
66762306a36Sopenharmony_ci		bucket->id, x, outpos, numrep);
66862306a36Sopenharmony_ci
66962306a36Sopenharmony_ci	/* initially my result is undefined */
67062306a36Sopenharmony_ci	for (rep = outpos; rep < endpos; rep++) {
67162306a36Sopenharmony_ci		out[rep] = CRUSH_ITEM_UNDEF;
67262306a36Sopenharmony_ci		if (out2)
67362306a36Sopenharmony_ci			out2[rep] = CRUSH_ITEM_UNDEF;
67462306a36Sopenharmony_ci	}
67562306a36Sopenharmony_ci
67662306a36Sopenharmony_ci	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
67762306a36Sopenharmony_ci#ifdef DEBUG_INDEP
67862306a36Sopenharmony_ci		if (out2 && ftotal) {
67962306a36Sopenharmony_ci			dprintk("%u %d a: ", ftotal, left);
68062306a36Sopenharmony_ci			for (rep = outpos; rep < endpos; rep++) {
68162306a36Sopenharmony_ci				dprintk(" %d", out[rep]);
68262306a36Sopenharmony_ci			}
68362306a36Sopenharmony_ci			dprintk("\n");
68462306a36Sopenharmony_ci			dprintk("%u %d b: ", ftotal, left);
68562306a36Sopenharmony_ci			for (rep = outpos; rep < endpos; rep++) {
68662306a36Sopenharmony_ci				dprintk(" %d", out2[rep]);
68762306a36Sopenharmony_ci			}
68862306a36Sopenharmony_ci			dprintk("\n");
68962306a36Sopenharmony_ci		}
69062306a36Sopenharmony_ci#endif
69162306a36Sopenharmony_ci		for (rep = outpos; rep < endpos; rep++) {
69262306a36Sopenharmony_ci			if (out[rep] != CRUSH_ITEM_UNDEF)
69362306a36Sopenharmony_ci				continue;
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci			in = bucket;  /* initial bucket */
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci			/* choose through intervening buckets */
69862306a36Sopenharmony_ci			for (;;) {
69962306a36Sopenharmony_ci				/* note: we base the choice on the position
70062306a36Sopenharmony_ci				 * even in the nested call.  that means that
70162306a36Sopenharmony_ci				 * if the first layer chooses the same bucket
70262306a36Sopenharmony_ci				 * in a different position, we will tend to
70362306a36Sopenharmony_ci				 * choose a different item in that bucket.
70462306a36Sopenharmony_ci				 * this will involve more devices in data
70562306a36Sopenharmony_ci				 * movement and tend to distribute the load.
70662306a36Sopenharmony_ci				 */
70762306a36Sopenharmony_ci				r = rep + parent_r;
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci				/* be careful */
71062306a36Sopenharmony_ci				if (in->alg == CRUSH_BUCKET_UNIFORM &&
71162306a36Sopenharmony_ci				    in->size % numrep == 0)
71262306a36Sopenharmony_ci					/* r'=r+(n+1)*f_total */
71362306a36Sopenharmony_ci					r += (numrep+1) * ftotal;
71462306a36Sopenharmony_ci				else
71562306a36Sopenharmony_ci					/* r' = r + n*f_total */
71662306a36Sopenharmony_ci					r += numrep * ftotal;
71762306a36Sopenharmony_ci
71862306a36Sopenharmony_ci				/* bucket choose */
71962306a36Sopenharmony_ci				if (in->size == 0) {
72062306a36Sopenharmony_ci					dprintk("   empty bucket\n");
72162306a36Sopenharmony_ci					break;
72262306a36Sopenharmony_ci				}
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci				item = crush_bucket_choose(
72562306a36Sopenharmony_ci					in, work->work[-1-in->id],
72662306a36Sopenharmony_ci					x, r,
72762306a36Sopenharmony_ci					(choose_args ?
72862306a36Sopenharmony_ci					 &choose_args[-1-in->id] : NULL),
72962306a36Sopenharmony_ci					outpos);
73062306a36Sopenharmony_ci				if (item >= map->max_devices) {
73162306a36Sopenharmony_ci					dprintk("   bad item %d\n", item);
73262306a36Sopenharmony_ci					out[rep] = CRUSH_ITEM_NONE;
73362306a36Sopenharmony_ci					if (out2)
73462306a36Sopenharmony_ci						out2[rep] = CRUSH_ITEM_NONE;
73562306a36Sopenharmony_ci					left--;
73662306a36Sopenharmony_ci					break;
73762306a36Sopenharmony_ci				}
73862306a36Sopenharmony_ci
73962306a36Sopenharmony_ci				/* desired type? */
74062306a36Sopenharmony_ci				if (item < 0)
74162306a36Sopenharmony_ci					itemtype = map->buckets[-1-item]->type;
74262306a36Sopenharmony_ci				else
74362306a36Sopenharmony_ci					itemtype = 0;
74462306a36Sopenharmony_ci				dprintk("  item %d type %d\n", item, itemtype);
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_ci				/* keep going? */
74762306a36Sopenharmony_ci				if (itemtype != type) {
74862306a36Sopenharmony_ci					if (item >= 0 ||
74962306a36Sopenharmony_ci					    (-1-item) >= map->max_buckets) {
75062306a36Sopenharmony_ci						dprintk("   bad item type %d\n", type);
75162306a36Sopenharmony_ci						out[rep] = CRUSH_ITEM_NONE;
75262306a36Sopenharmony_ci						if (out2)
75362306a36Sopenharmony_ci							out2[rep] =
75462306a36Sopenharmony_ci								CRUSH_ITEM_NONE;
75562306a36Sopenharmony_ci						left--;
75662306a36Sopenharmony_ci						break;
75762306a36Sopenharmony_ci					}
75862306a36Sopenharmony_ci					in = map->buckets[-1-item];
75962306a36Sopenharmony_ci					continue;
76062306a36Sopenharmony_ci				}
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci				/* collision? */
76362306a36Sopenharmony_ci				collide = 0;
76462306a36Sopenharmony_ci				for (i = outpos; i < endpos; i++) {
76562306a36Sopenharmony_ci					if (out[i] == item) {
76662306a36Sopenharmony_ci						collide = 1;
76762306a36Sopenharmony_ci						break;
76862306a36Sopenharmony_ci					}
76962306a36Sopenharmony_ci				}
77062306a36Sopenharmony_ci				if (collide)
77162306a36Sopenharmony_ci					break;
77262306a36Sopenharmony_ci
77362306a36Sopenharmony_ci				if (recurse_to_leaf) {
77462306a36Sopenharmony_ci					if (item < 0) {
77562306a36Sopenharmony_ci						crush_choose_indep(
77662306a36Sopenharmony_ci							map,
77762306a36Sopenharmony_ci							work,
77862306a36Sopenharmony_ci							map->buckets[-1-item],
77962306a36Sopenharmony_ci							weight, weight_max,
78062306a36Sopenharmony_ci							x, 1, numrep, 0,
78162306a36Sopenharmony_ci							out2, rep,
78262306a36Sopenharmony_ci							recurse_tries, 0,
78362306a36Sopenharmony_ci							0, NULL, r,
78462306a36Sopenharmony_ci							choose_args);
78562306a36Sopenharmony_ci						if (out2[rep] == CRUSH_ITEM_NONE) {
78662306a36Sopenharmony_ci							/* placed nothing; no leaf */
78762306a36Sopenharmony_ci							break;
78862306a36Sopenharmony_ci						}
78962306a36Sopenharmony_ci					} else {
79062306a36Sopenharmony_ci						/* we already have a leaf! */
79162306a36Sopenharmony_ci						out2[rep] = item;
79262306a36Sopenharmony_ci					}
79362306a36Sopenharmony_ci				}
79462306a36Sopenharmony_ci
79562306a36Sopenharmony_ci				/* out? */
79662306a36Sopenharmony_ci				if (itemtype == 0 &&
79762306a36Sopenharmony_ci				    is_out(map, weight, weight_max, item, x))
79862306a36Sopenharmony_ci					break;
79962306a36Sopenharmony_ci
80062306a36Sopenharmony_ci				/* yay! */
80162306a36Sopenharmony_ci				out[rep] = item;
80262306a36Sopenharmony_ci				left--;
80362306a36Sopenharmony_ci				break;
80462306a36Sopenharmony_ci			}
80562306a36Sopenharmony_ci		}
80662306a36Sopenharmony_ci	}
80762306a36Sopenharmony_ci	for (rep = outpos; rep < endpos; rep++) {
80862306a36Sopenharmony_ci		if (out[rep] == CRUSH_ITEM_UNDEF) {
80962306a36Sopenharmony_ci			out[rep] = CRUSH_ITEM_NONE;
81062306a36Sopenharmony_ci		}
81162306a36Sopenharmony_ci		if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
81262306a36Sopenharmony_ci			out2[rep] = CRUSH_ITEM_NONE;
81362306a36Sopenharmony_ci		}
81462306a36Sopenharmony_ci	}
81562306a36Sopenharmony_ci#ifndef __KERNEL__
81662306a36Sopenharmony_ci	if (map->choose_tries && ftotal <= map->choose_total_tries)
81762306a36Sopenharmony_ci		map->choose_tries[ftotal]++;
81862306a36Sopenharmony_ci#endif
81962306a36Sopenharmony_ci#ifdef DEBUG_INDEP
82062306a36Sopenharmony_ci	if (out2) {
82162306a36Sopenharmony_ci		dprintk("%u %d a: ", ftotal, left);
82262306a36Sopenharmony_ci		for (rep = outpos; rep < endpos; rep++) {
82362306a36Sopenharmony_ci			dprintk(" %d", out[rep]);
82462306a36Sopenharmony_ci		}
82562306a36Sopenharmony_ci		dprintk("\n");
82662306a36Sopenharmony_ci		dprintk("%u %d b: ", ftotal, left);
82762306a36Sopenharmony_ci		for (rep = outpos; rep < endpos; rep++) {
82862306a36Sopenharmony_ci			dprintk(" %d", out2[rep]);
82962306a36Sopenharmony_ci		}
83062306a36Sopenharmony_ci		dprintk("\n");
83162306a36Sopenharmony_ci	}
83262306a36Sopenharmony_ci#endif
83362306a36Sopenharmony_ci}
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci/*
83762306a36Sopenharmony_ci * This takes a chunk of memory and sets it up to be a shiny new
83862306a36Sopenharmony_ci * working area for a CRUSH placement computation. It must be called
83962306a36Sopenharmony_ci * on any newly allocated memory before passing it in to
84062306a36Sopenharmony_ci * crush_do_rule. It may be used repeatedly after that, so long as the
84162306a36Sopenharmony_ci * map has not changed. If the map /has/ changed, you must make sure
84262306a36Sopenharmony_ci * the working size is no smaller than what was allocated and re-run
84362306a36Sopenharmony_ci * crush_init_workspace.
84462306a36Sopenharmony_ci *
84562306a36Sopenharmony_ci * If you do retain the working space between calls to crush, make it
84662306a36Sopenharmony_ci * thread-local.
84762306a36Sopenharmony_ci */
84862306a36Sopenharmony_civoid crush_init_workspace(const struct crush_map *map, void *v)
84962306a36Sopenharmony_ci{
85062306a36Sopenharmony_ci	struct crush_work *w = v;
85162306a36Sopenharmony_ci	__s32 b;
85262306a36Sopenharmony_ci
85362306a36Sopenharmony_ci	/*
85462306a36Sopenharmony_ci	 * We work by moving through the available space and setting
85562306a36Sopenharmony_ci	 * values and pointers as we go.
85662306a36Sopenharmony_ci	 *
85762306a36Sopenharmony_ci	 * It's a bit like Forth's use of the 'allot' word since we
85862306a36Sopenharmony_ci	 * set the pointer first and then reserve the space for it to
85962306a36Sopenharmony_ci	 * point to by incrementing the point.
86062306a36Sopenharmony_ci	 */
86162306a36Sopenharmony_ci	v += sizeof(struct crush_work);
86262306a36Sopenharmony_ci	w->work = v;
86362306a36Sopenharmony_ci	v += map->max_buckets * sizeof(struct crush_work_bucket *);
86462306a36Sopenharmony_ci	for (b = 0; b < map->max_buckets; ++b) {
86562306a36Sopenharmony_ci		if (!map->buckets[b])
86662306a36Sopenharmony_ci			continue;
86762306a36Sopenharmony_ci
86862306a36Sopenharmony_ci		w->work[b] = v;
86962306a36Sopenharmony_ci		switch (map->buckets[b]->alg) {
87062306a36Sopenharmony_ci		default:
87162306a36Sopenharmony_ci			v += sizeof(struct crush_work_bucket);
87262306a36Sopenharmony_ci			break;
87362306a36Sopenharmony_ci		}
87462306a36Sopenharmony_ci		w->work[b]->perm_x = 0;
87562306a36Sopenharmony_ci		w->work[b]->perm_n = 0;
87662306a36Sopenharmony_ci		w->work[b]->perm = v;
87762306a36Sopenharmony_ci		v += map->buckets[b]->size * sizeof(__u32);
87862306a36Sopenharmony_ci	}
87962306a36Sopenharmony_ci	BUG_ON(v - (void *)w != map->working_size);
88062306a36Sopenharmony_ci}
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci/**
88362306a36Sopenharmony_ci * crush_do_rule - calculate a mapping with the given input and rule
88462306a36Sopenharmony_ci * @map: the crush_map
88562306a36Sopenharmony_ci * @ruleno: the rule id
88662306a36Sopenharmony_ci * @x: hash input
88762306a36Sopenharmony_ci * @result: pointer to result vector
88862306a36Sopenharmony_ci * @result_max: maximum result size
88962306a36Sopenharmony_ci * @weight: weight vector (for map leaves)
89062306a36Sopenharmony_ci * @weight_max: size of weight vector
89162306a36Sopenharmony_ci * @cwin: pointer to at least crush_work_size() bytes of memory
89262306a36Sopenharmony_ci * @choose_args: weights and ids for each known bucket
89362306a36Sopenharmony_ci */
89462306a36Sopenharmony_ciint crush_do_rule(const struct crush_map *map,
89562306a36Sopenharmony_ci		  int ruleno, int x, int *result, int result_max,
89662306a36Sopenharmony_ci		  const __u32 *weight, int weight_max,
89762306a36Sopenharmony_ci		  void *cwin, const struct crush_choose_arg *choose_args)
89862306a36Sopenharmony_ci{
89962306a36Sopenharmony_ci	int result_len;
90062306a36Sopenharmony_ci	struct crush_work *cw = cwin;
90162306a36Sopenharmony_ci	int *a = cwin + map->working_size;
90262306a36Sopenharmony_ci	int *b = a + result_max;
90362306a36Sopenharmony_ci	int *c = b + result_max;
90462306a36Sopenharmony_ci	int *w = a;
90562306a36Sopenharmony_ci	int *o = b;
90662306a36Sopenharmony_ci	int recurse_to_leaf;
90762306a36Sopenharmony_ci	int wsize = 0;
90862306a36Sopenharmony_ci	int osize;
90962306a36Sopenharmony_ci	const struct crush_rule *rule;
91062306a36Sopenharmony_ci	__u32 step;
91162306a36Sopenharmony_ci	int i, j;
91262306a36Sopenharmony_ci	int numrep;
91362306a36Sopenharmony_ci	int out_size;
91462306a36Sopenharmony_ci	/*
91562306a36Sopenharmony_ci	 * the original choose_total_tries value was off by one (it
91662306a36Sopenharmony_ci	 * counted "retries" and not "tries").  add one.
91762306a36Sopenharmony_ci	 */
91862306a36Sopenharmony_ci	int choose_tries = map->choose_total_tries + 1;
91962306a36Sopenharmony_ci	int choose_leaf_tries = 0;
92062306a36Sopenharmony_ci	/*
92162306a36Sopenharmony_ci	 * the local tries values were counted as "retries", though,
92262306a36Sopenharmony_ci	 * and need no adjustment
92362306a36Sopenharmony_ci	 */
92462306a36Sopenharmony_ci	int choose_local_retries = map->choose_local_tries;
92562306a36Sopenharmony_ci	int choose_local_fallback_retries = map->choose_local_fallback_tries;
92662306a36Sopenharmony_ci
92762306a36Sopenharmony_ci	int vary_r = map->chooseleaf_vary_r;
92862306a36Sopenharmony_ci	int stable = map->chooseleaf_stable;
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_ci	if ((__u32)ruleno >= map->max_rules) {
93162306a36Sopenharmony_ci		dprintk(" bad ruleno %d\n", ruleno);
93262306a36Sopenharmony_ci		return 0;
93362306a36Sopenharmony_ci	}
93462306a36Sopenharmony_ci
93562306a36Sopenharmony_ci	rule = map->rules[ruleno];
93662306a36Sopenharmony_ci	result_len = 0;
93762306a36Sopenharmony_ci
93862306a36Sopenharmony_ci	for (step = 0; step < rule->len; step++) {
93962306a36Sopenharmony_ci		int firstn = 0;
94062306a36Sopenharmony_ci		const struct crush_rule_step *curstep = &rule->steps[step];
94162306a36Sopenharmony_ci
94262306a36Sopenharmony_ci		switch (curstep->op) {
94362306a36Sopenharmony_ci		case CRUSH_RULE_TAKE:
94462306a36Sopenharmony_ci			if ((curstep->arg1 >= 0 &&
94562306a36Sopenharmony_ci			     curstep->arg1 < map->max_devices) ||
94662306a36Sopenharmony_ci			    (-1-curstep->arg1 >= 0 &&
94762306a36Sopenharmony_ci			     -1-curstep->arg1 < map->max_buckets &&
94862306a36Sopenharmony_ci			     map->buckets[-1-curstep->arg1])) {
94962306a36Sopenharmony_ci				w[0] = curstep->arg1;
95062306a36Sopenharmony_ci				wsize = 1;
95162306a36Sopenharmony_ci			} else {
95262306a36Sopenharmony_ci				dprintk(" bad take value %d\n", curstep->arg1);
95362306a36Sopenharmony_ci			}
95462306a36Sopenharmony_ci			break;
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSE_TRIES:
95762306a36Sopenharmony_ci			if (curstep->arg1 > 0)
95862306a36Sopenharmony_ci				choose_tries = curstep->arg1;
95962306a36Sopenharmony_ci			break;
96062306a36Sopenharmony_ci
96162306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
96262306a36Sopenharmony_ci			if (curstep->arg1 > 0)
96362306a36Sopenharmony_ci				choose_leaf_tries = curstep->arg1;
96462306a36Sopenharmony_ci			break;
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
96762306a36Sopenharmony_ci			if (curstep->arg1 >= 0)
96862306a36Sopenharmony_ci				choose_local_retries = curstep->arg1;
96962306a36Sopenharmony_ci			break;
97062306a36Sopenharmony_ci
97162306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
97262306a36Sopenharmony_ci			if (curstep->arg1 >= 0)
97362306a36Sopenharmony_ci				choose_local_fallback_retries = curstep->arg1;
97462306a36Sopenharmony_ci			break;
97562306a36Sopenharmony_ci
97662306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
97762306a36Sopenharmony_ci			if (curstep->arg1 >= 0)
97862306a36Sopenharmony_ci				vary_r = curstep->arg1;
97962306a36Sopenharmony_ci			break;
98062306a36Sopenharmony_ci
98162306a36Sopenharmony_ci		case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
98262306a36Sopenharmony_ci			if (curstep->arg1 >= 0)
98362306a36Sopenharmony_ci				stable = curstep->arg1;
98462306a36Sopenharmony_ci			break;
98562306a36Sopenharmony_ci
98662306a36Sopenharmony_ci		case CRUSH_RULE_CHOOSELEAF_FIRSTN:
98762306a36Sopenharmony_ci		case CRUSH_RULE_CHOOSE_FIRSTN:
98862306a36Sopenharmony_ci			firstn = 1;
98962306a36Sopenharmony_ci			fallthrough;
99062306a36Sopenharmony_ci		case CRUSH_RULE_CHOOSELEAF_INDEP:
99162306a36Sopenharmony_ci		case CRUSH_RULE_CHOOSE_INDEP:
99262306a36Sopenharmony_ci			if (wsize == 0)
99362306a36Sopenharmony_ci				break;
99462306a36Sopenharmony_ci
99562306a36Sopenharmony_ci			recurse_to_leaf =
99662306a36Sopenharmony_ci				curstep->op ==
99762306a36Sopenharmony_ci				 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
99862306a36Sopenharmony_ci				curstep->op ==
99962306a36Sopenharmony_ci				CRUSH_RULE_CHOOSELEAF_INDEP;
100062306a36Sopenharmony_ci
100162306a36Sopenharmony_ci			/* reset output */
100262306a36Sopenharmony_ci			osize = 0;
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci			for (i = 0; i < wsize; i++) {
100562306a36Sopenharmony_ci				int bno;
100662306a36Sopenharmony_ci				numrep = curstep->arg1;
100762306a36Sopenharmony_ci				if (numrep <= 0) {
100862306a36Sopenharmony_ci					numrep += result_max;
100962306a36Sopenharmony_ci					if (numrep <= 0)
101062306a36Sopenharmony_ci						continue;
101162306a36Sopenharmony_ci				}
101262306a36Sopenharmony_ci				j = 0;
101362306a36Sopenharmony_ci				/* make sure bucket id is valid */
101462306a36Sopenharmony_ci				bno = -1 - w[i];
101562306a36Sopenharmony_ci				if (bno < 0 || bno >= map->max_buckets) {
101662306a36Sopenharmony_ci					/* w[i] is probably CRUSH_ITEM_NONE */
101762306a36Sopenharmony_ci					dprintk("  bad w[i] %d\n", w[i]);
101862306a36Sopenharmony_ci					continue;
101962306a36Sopenharmony_ci				}
102062306a36Sopenharmony_ci				if (firstn) {
102162306a36Sopenharmony_ci					int recurse_tries;
102262306a36Sopenharmony_ci					if (choose_leaf_tries)
102362306a36Sopenharmony_ci						recurse_tries =
102462306a36Sopenharmony_ci							choose_leaf_tries;
102562306a36Sopenharmony_ci					else if (map->chooseleaf_descend_once)
102662306a36Sopenharmony_ci						recurse_tries = 1;
102762306a36Sopenharmony_ci					else
102862306a36Sopenharmony_ci						recurse_tries = choose_tries;
102962306a36Sopenharmony_ci					osize += crush_choose_firstn(
103062306a36Sopenharmony_ci						map,
103162306a36Sopenharmony_ci						cw,
103262306a36Sopenharmony_ci						map->buckets[bno],
103362306a36Sopenharmony_ci						weight, weight_max,
103462306a36Sopenharmony_ci						x, numrep,
103562306a36Sopenharmony_ci						curstep->arg2,
103662306a36Sopenharmony_ci						o+osize, j,
103762306a36Sopenharmony_ci						result_max-osize,
103862306a36Sopenharmony_ci						choose_tries,
103962306a36Sopenharmony_ci						recurse_tries,
104062306a36Sopenharmony_ci						choose_local_retries,
104162306a36Sopenharmony_ci						choose_local_fallback_retries,
104262306a36Sopenharmony_ci						recurse_to_leaf,
104362306a36Sopenharmony_ci						vary_r,
104462306a36Sopenharmony_ci						stable,
104562306a36Sopenharmony_ci						c+osize,
104662306a36Sopenharmony_ci						0,
104762306a36Sopenharmony_ci						choose_args);
104862306a36Sopenharmony_ci				} else {
104962306a36Sopenharmony_ci					out_size = ((numrep < (result_max-osize)) ?
105062306a36Sopenharmony_ci						    numrep : (result_max-osize));
105162306a36Sopenharmony_ci					crush_choose_indep(
105262306a36Sopenharmony_ci						map,
105362306a36Sopenharmony_ci						cw,
105462306a36Sopenharmony_ci						map->buckets[bno],
105562306a36Sopenharmony_ci						weight, weight_max,
105662306a36Sopenharmony_ci						x, out_size, numrep,
105762306a36Sopenharmony_ci						curstep->arg2,
105862306a36Sopenharmony_ci						o+osize, j,
105962306a36Sopenharmony_ci						choose_tries,
106062306a36Sopenharmony_ci						choose_leaf_tries ?
106162306a36Sopenharmony_ci						   choose_leaf_tries : 1,
106262306a36Sopenharmony_ci						recurse_to_leaf,
106362306a36Sopenharmony_ci						c+osize,
106462306a36Sopenharmony_ci						0,
106562306a36Sopenharmony_ci						choose_args);
106662306a36Sopenharmony_ci					osize += out_size;
106762306a36Sopenharmony_ci				}
106862306a36Sopenharmony_ci			}
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ci			if (recurse_to_leaf)
107162306a36Sopenharmony_ci				/* copy final _leaf_ values to output set */
107262306a36Sopenharmony_ci				memcpy(o, c, osize*sizeof(*o));
107362306a36Sopenharmony_ci
107462306a36Sopenharmony_ci			/* swap o and w arrays */
107562306a36Sopenharmony_ci			swap(o, w);
107662306a36Sopenharmony_ci			wsize = osize;
107762306a36Sopenharmony_ci			break;
107862306a36Sopenharmony_ci
107962306a36Sopenharmony_ci
108062306a36Sopenharmony_ci		case CRUSH_RULE_EMIT:
108162306a36Sopenharmony_ci			for (i = 0; i < wsize && result_len < result_max; i++) {
108262306a36Sopenharmony_ci				result[result_len] = w[i];
108362306a36Sopenharmony_ci				result_len++;
108462306a36Sopenharmony_ci			}
108562306a36Sopenharmony_ci			wsize = 0;
108662306a36Sopenharmony_ci			break;
108762306a36Sopenharmony_ci
108862306a36Sopenharmony_ci		default:
108962306a36Sopenharmony_ci			dprintk(" unknown op %d at step %d\n",
109062306a36Sopenharmony_ci				curstep->op, step);
109162306a36Sopenharmony_ci			break;
109262306a36Sopenharmony_ci		}
109362306a36Sopenharmony_ci	}
109462306a36Sopenharmony_ci
109562306a36Sopenharmony_ci	return result_len;
109662306a36Sopenharmony_ci}
1097