162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci-*- linux-c -*-
462306a36Sopenharmony_ci   drbd_receiver.c
562306a36Sopenharmony_ci   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
662306a36Sopenharmony_ci
762306a36Sopenharmony_ci   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
862306a36Sopenharmony_ci   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
962306a36Sopenharmony_ci   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
1062306a36Sopenharmony_ci
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci#ifndef _DRBD_VLI_H
1462306a36Sopenharmony_ci#define _DRBD_VLI_H
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci/*
1762306a36Sopenharmony_ci * At a granularity of 4KiB storage represented per bit,
1862306a36Sopenharmony_ci * and stroage sizes of several TiB,
1962306a36Sopenharmony_ci * and possibly small-bandwidth replication,
2062306a36Sopenharmony_ci * the bitmap transfer time can take much too long,
2162306a36Sopenharmony_ci * if transmitted in plain text.
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci * We try to reduce the transferred bitmap information
2462306a36Sopenharmony_ci * by encoding runlengths of bit polarity.
2562306a36Sopenharmony_ci *
2662306a36Sopenharmony_ci * We never actually need to encode a "zero" (runlengths are positive).
2762306a36Sopenharmony_ci * But then we have to store the value of the first bit.
2862306a36Sopenharmony_ci * The first bit of information thus shall encode if the first runlength
2962306a36Sopenharmony_ci * gives the number of set or unset bits.
3062306a36Sopenharmony_ci *
3162306a36Sopenharmony_ci * We assume that large areas are either completely set or unset,
3262306a36Sopenharmony_ci * which gives good compression with any runlength method,
3362306a36Sopenharmony_ci * even when encoding the runlength as fixed size 32bit/64bit integers.
3462306a36Sopenharmony_ci *
3562306a36Sopenharmony_ci * Still, there may be areas where the polarity flips every few bits,
3662306a36Sopenharmony_ci * and encoding the runlength sequence of those areas with fix size
3762306a36Sopenharmony_ci * integers would be much worse than plaintext.
3862306a36Sopenharmony_ci *
3962306a36Sopenharmony_ci * We want to encode small runlength values with minimum code length,
4062306a36Sopenharmony_ci * while still being able to encode a Huge run of all zeros.
4162306a36Sopenharmony_ci *
4262306a36Sopenharmony_ci * Thus we need a Variable Length Integer encoding, VLI.
4362306a36Sopenharmony_ci *
4462306a36Sopenharmony_ci * For some cases, we produce more code bits than plaintext input.
4562306a36Sopenharmony_ci * We need to send incompressible chunks as plaintext, skip over them
4662306a36Sopenharmony_ci * and then see if the next chunk compresses better.
4762306a36Sopenharmony_ci *
4862306a36Sopenharmony_ci * We don't care too much about "excellent" compression ratio for large
4962306a36Sopenharmony_ci * runlengths (all set/all clear): whether we achieve a factor of 100
5062306a36Sopenharmony_ci * or 1000 is not that much of an issue.
5162306a36Sopenharmony_ci * We do not want to waste too much on short runlengths in the "noisy"
5262306a36Sopenharmony_ci * parts of the bitmap, though.
5362306a36Sopenharmony_ci *
5462306a36Sopenharmony_ci * There are endless variants of VLI, we experimented with:
5562306a36Sopenharmony_ci *  * simple byte-based
5662306a36Sopenharmony_ci *  * various bit based with different code word length.
5762306a36Sopenharmony_ci *
5862306a36Sopenharmony_ci * To avoid yet an other configuration parameter (choice of bitmap compression
5962306a36Sopenharmony_ci * algorithm) which was difficult to explain and tune, we just chose the one
6062306a36Sopenharmony_ci * variant that turned out best in all test cases.
6162306a36Sopenharmony_ci * Based on real world usage patterns, with device sizes ranging from a few GiB
6262306a36Sopenharmony_ci * to several TiB, file server/mailserver/webserver/mysql/postgress,
6362306a36Sopenharmony_ci * mostly idle to really busy, the all time winner (though sometimes only
6462306a36Sopenharmony_ci * marginally better) is:
6562306a36Sopenharmony_ci */
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci/*
6862306a36Sopenharmony_ci * encoding is "visualised" as
6962306a36Sopenharmony_ci * __little endian__ bitstream, least significant bit first (left most)
7062306a36Sopenharmony_ci *
7162306a36Sopenharmony_ci * this particular encoding is chosen so that the prefix code
7262306a36Sopenharmony_ci * starts as unary encoding the level, then modified so that
7362306a36Sopenharmony_ci * 10 levels can be described in 8bit, with minimal overhead
7462306a36Sopenharmony_ci * for the smaller levels.
7562306a36Sopenharmony_ci *
7662306a36Sopenharmony_ci * Number of data bits follow fibonacci sequence, with the exception of the
7762306a36Sopenharmony_ci * last level (+1 data bit, so it makes 64bit total).  The only worse code when
7862306a36Sopenharmony_ci * encoding bit polarity runlength is 1 plain bits => 2 code bits.
7962306a36Sopenharmony_ciprefix    data bits                                    max val  Nº data bits
8062306a36Sopenharmony_ci0 x                                                         0x2            1
8162306a36Sopenharmony_ci10 x                                                        0x4            1
8262306a36Sopenharmony_ci110 xx                                                      0x8            2
8362306a36Sopenharmony_ci1110 xxx                                                   0x10            3
8462306a36Sopenharmony_ci11110 xxx xx                                               0x30            5
8562306a36Sopenharmony_ci111110 xx xxxxxx                                          0x130            8
8662306a36Sopenharmony_ci11111100  xxxxxxxx xxxxx                                 0x2130           13
8762306a36Sopenharmony_ci11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
8862306a36Sopenharmony_ci11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
8962306a36Sopenharmony_ci11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
9062306a36Sopenharmony_ci * maximum encodable value: 0x100000400202130 == 2**56 + some */
9162306a36Sopenharmony_ci
9262306a36Sopenharmony_ci/* compression "table":
9362306a36Sopenharmony_ci transmitted   x                                0.29
9462306a36Sopenharmony_ci as plaintext x                                  ........................
9562306a36Sopenharmony_ci             x                                   ........................
9662306a36Sopenharmony_ci            x                                    ........................
9762306a36Sopenharmony_ci           x    0.59                         0.21........................
9862306a36Sopenharmony_ci          x      ........................................................
9962306a36Sopenharmony_ci         x       .. c ...................................................
10062306a36Sopenharmony_ci        x    0.44.. o ...................................................
10162306a36Sopenharmony_ci       x .......... d ...................................................
10262306a36Sopenharmony_ci      x  .......... e ...................................................
10362306a36Sopenharmony_ci     X.............   ...................................................
10462306a36Sopenharmony_ci    x.............. b ...................................................
10562306a36Sopenharmony_ci2.0x............... i ...................................................
10662306a36Sopenharmony_ci #X................ t ...................................................
10762306a36Sopenharmony_ci #................. s ...........................  plain bits  ..........
10862306a36Sopenharmony_ci-+-----------------------------------------------------------------------
10962306a36Sopenharmony_ci 1             16              32                              64
11062306a36Sopenharmony_ci*/
11162306a36Sopenharmony_ci
11262306a36Sopenharmony_ci/* LEVEL: (total bits, prefix bits, prefix value),
11362306a36Sopenharmony_ci * sorted ascending by number of total bits.
11462306a36Sopenharmony_ci * The rest of the code table is calculated at compiletime from this. */
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci/* fibonacci data 1, 1, ... */
11762306a36Sopenharmony_ci#define VLI_L_1_1() do { \
11862306a36Sopenharmony_ci	LEVEL( 2, 1, 0x00); \
11962306a36Sopenharmony_ci	LEVEL( 3, 2, 0x01); \
12062306a36Sopenharmony_ci	LEVEL( 5, 3, 0x03); \
12162306a36Sopenharmony_ci	LEVEL( 7, 4, 0x07); \
12262306a36Sopenharmony_ci	LEVEL(10, 5, 0x0f); \
12362306a36Sopenharmony_ci	LEVEL(14, 6, 0x1f); \
12462306a36Sopenharmony_ci	LEVEL(21, 8, 0x3f); \
12562306a36Sopenharmony_ci	LEVEL(29, 8, 0x7f); \
12662306a36Sopenharmony_ci	LEVEL(42, 8, 0xbf); \
12762306a36Sopenharmony_ci	LEVEL(64, 8, 0xff); \
12862306a36Sopenharmony_ci	} while (0)
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_ci/* finds a suitable level to decode the least significant part of in.
13162306a36Sopenharmony_ci * returns number of bits consumed.
13262306a36Sopenharmony_ci *
13362306a36Sopenharmony_ci * BUG() for bad input, as that would mean a buggy code table. */
13462306a36Sopenharmony_cistatic inline int vli_decode_bits(u64 *out, const u64 in)
13562306a36Sopenharmony_ci{
13662306a36Sopenharmony_ci	u64 adj = 1;
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci#define LEVEL(t,b,v)					\
13962306a36Sopenharmony_ci	do {						\
14062306a36Sopenharmony_ci		if ((in & ((1 << b) -1)) == v) {	\
14162306a36Sopenharmony_ci			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
14262306a36Sopenharmony_ci			return t;			\
14362306a36Sopenharmony_ci		}					\
14462306a36Sopenharmony_ci		adj += 1ULL << (t - b);			\
14562306a36Sopenharmony_ci	} while (0)
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	VLI_L_1_1();
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
15062306a36Sopenharmony_ci	BUG();
15162306a36Sopenharmony_ci#undef LEVEL
15262306a36Sopenharmony_ci}
15362306a36Sopenharmony_ci
15462306a36Sopenharmony_ci/* return number of code bits needed,
15562306a36Sopenharmony_ci * or negative error number */
15662306a36Sopenharmony_cistatic inline int __vli_encode_bits(u64 *out, const u64 in)
15762306a36Sopenharmony_ci{
15862306a36Sopenharmony_ci	u64 max = 0;
15962306a36Sopenharmony_ci	u64 adj = 1;
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	if (in == 0)
16262306a36Sopenharmony_ci		return -EINVAL;
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci#define LEVEL(t,b,v) do {		\
16562306a36Sopenharmony_ci		max += 1ULL << (t - b);	\
16662306a36Sopenharmony_ci		if (in <= max) {	\
16762306a36Sopenharmony_ci			if (out)	\
16862306a36Sopenharmony_ci				*out = ((in - adj) << b) | v;	\
16962306a36Sopenharmony_ci			return t;	\
17062306a36Sopenharmony_ci		}			\
17162306a36Sopenharmony_ci		adj = max + 1;		\
17262306a36Sopenharmony_ci	} while (0)
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci	VLI_L_1_1();
17562306a36Sopenharmony_ci
17662306a36Sopenharmony_ci	return -EOVERFLOW;
17762306a36Sopenharmony_ci#undef LEVEL
17862306a36Sopenharmony_ci}
17962306a36Sopenharmony_ci
18062306a36Sopenharmony_ci#undef VLI_L_1_1
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci/* code from here down is independend of actually used bit code */
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci/*
18562306a36Sopenharmony_ci * Code length is determined by some unique (e.g. unary) prefix.
18662306a36Sopenharmony_ci * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
18762306a36Sopenharmony_ci * not a byte stream.
18862306a36Sopenharmony_ci */
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci/* for the bitstream, we need a cursor */
19162306a36Sopenharmony_cistruct bitstream_cursor {
19262306a36Sopenharmony_ci	/* the current byte */
19362306a36Sopenharmony_ci	u8 *b;
19462306a36Sopenharmony_ci	/* the current bit within *b, nomalized: 0..7 */
19562306a36Sopenharmony_ci	unsigned int bit;
19662306a36Sopenharmony_ci};
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci/* initialize cursor to point to first bit of stream */
19962306a36Sopenharmony_cistatic inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
20062306a36Sopenharmony_ci{
20162306a36Sopenharmony_ci	cur->b = s;
20262306a36Sopenharmony_ci	cur->bit = 0;
20362306a36Sopenharmony_ci}
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci/* advance cursor by that many bits; maximum expected input value: 64,
20662306a36Sopenharmony_ci * but depending on VLI implementation, it may be more. */
20762306a36Sopenharmony_cistatic inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
20862306a36Sopenharmony_ci{
20962306a36Sopenharmony_ci	bits += cur->bit;
21062306a36Sopenharmony_ci	cur->b = cur->b + (bits >> 3);
21162306a36Sopenharmony_ci	cur->bit = bits & 7;
21262306a36Sopenharmony_ci}
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci/* the bitstream itself knows its length */
21562306a36Sopenharmony_cistruct bitstream {
21662306a36Sopenharmony_ci	struct bitstream_cursor cur;
21762306a36Sopenharmony_ci	unsigned char *buf;
21862306a36Sopenharmony_ci	size_t buf_len;		/* in bytes */
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	/* for input stream:
22162306a36Sopenharmony_ci	 * number of trailing 0 bits for padding
22262306a36Sopenharmony_ci	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
22362306a36Sopenharmony_ci	unsigned int pad_bits;
22462306a36Sopenharmony_ci};
22562306a36Sopenharmony_ci
22662306a36Sopenharmony_cistatic inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
22762306a36Sopenharmony_ci{
22862306a36Sopenharmony_ci	bs->buf = s;
22962306a36Sopenharmony_ci	bs->buf_len = len;
23062306a36Sopenharmony_ci	bs->pad_bits = pad_bits;
23162306a36Sopenharmony_ci	bitstream_cursor_reset(&bs->cur, bs->buf);
23262306a36Sopenharmony_ci}
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_cistatic inline void bitstream_rewind(struct bitstream *bs)
23562306a36Sopenharmony_ci{
23662306a36Sopenharmony_ci	bitstream_cursor_reset(&bs->cur, bs->buf);
23762306a36Sopenharmony_ci	memset(bs->buf, 0, bs->buf_len);
23862306a36Sopenharmony_ci}
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
24162306a36Sopenharmony_ci * Ignores "pad_bits".
24262306a36Sopenharmony_ci * Returns zero if bits == 0 (nothing to do).
24362306a36Sopenharmony_ci * Returns number of bits used if successful.
24462306a36Sopenharmony_ci *
24562306a36Sopenharmony_ci * If there is not enough room left in bitstream,
24662306a36Sopenharmony_ci * leaves bitstream unchanged and returns -ENOBUFS.
24762306a36Sopenharmony_ci */
24862306a36Sopenharmony_cistatic inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
24962306a36Sopenharmony_ci{
25062306a36Sopenharmony_ci	unsigned char *b = bs->cur.b;
25162306a36Sopenharmony_ci	unsigned int tmp;
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci	if (bits == 0)
25462306a36Sopenharmony_ci		return 0;
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
25762306a36Sopenharmony_ci		return -ENOBUFS;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	/* paranoia: strip off hi bits; they should not be set anyways. */
26062306a36Sopenharmony_ci	if (bits < 64)
26162306a36Sopenharmony_ci		val &= ~0ULL >> (64 - bits);
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ci	*b++ |= (val & 0xff) << bs->cur.bit;
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
26662306a36Sopenharmony_ci		*b++ |= (val >> tmp) & 0xff;
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	bitstream_cursor_advance(&bs->cur, bits);
26962306a36Sopenharmony_ci	return bits;
27062306a36Sopenharmony_ci}
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
27362306a36Sopenharmony_ci *
27462306a36Sopenharmony_ci * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
27562306a36Sopenharmony_ci *
27662306a36Sopenharmony_ci * If there are less than the requested number of valid bits left in the
27762306a36Sopenharmony_ci * bitstream, still fetches all available bits.
27862306a36Sopenharmony_ci *
27962306a36Sopenharmony_ci * Returns number of actually fetched bits.
28062306a36Sopenharmony_ci */
28162306a36Sopenharmony_cistatic inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
28262306a36Sopenharmony_ci{
28362306a36Sopenharmony_ci	u64 val;
28462306a36Sopenharmony_ci	unsigned int n;
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	if (bits > 64)
28762306a36Sopenharmony_ci		return -EINVAL;
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
29062306a36Sopenharmony_ci		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
29162306a36Sopenharmony_ci			- bs->cur.bit - bs->pad_bits;
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	if (bits == 0) {
29462306a36Sopenharmony_ci		*out = 0;
29562306a36Sopenharmony_ci		return 0;
29662306a36Sopenharmony_ci	}
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci	/* get the high bits */
29962306a36Sopenharmony_ci	val = 0;
30062306a36Sopenharmony_ci	n = (bs->cur.bit + bits + 7) >> 3;
30162306a36Sopenharmony_ci	/* n may be at most 9, if cur.bit + bits > 64 */
30262306a36Sopenharmony_ci	/* which means this copies at most 8 byte */
30362306a36Sopenharmony_ci	if (n) {
30462306a36Sopenharmony_ci		memcpy(&val, bs->cur.b+1, n - 1);
30562306a36Sopenharmony_ci		val = le64_to_cpu(val) << (8 - bs->cur.bit);
30662306a36Sopenharmony_ci	}
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	/* we still need the low bits */
30962306a36Sopenharmony_ci	val |= bs->cur.b[0] >> bs->cur.bit;
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	/* and mask out bits we don't want */
31262306a36Sopenharmony_ci	val &= ~0ULL >> (64 - bits);
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci	bitstream_cursor_advance(&bs->cur, bits);
31562306a36Sopenharmony_ci	*out = val;
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_ci	return bits;
31862306a36Sopenharmony_ci}
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci/* encodes @in as vli into @bs;
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci * return values
32362306a36Sopenharmony_ci *  > 0: number of bits successfully stored in bitstream
32462306a36Sopenharmony_ci * -ENOBUFS @bs is full
32562306a36Sopenharmony_ci * -EINVAL input zero (invalid)
32662306a36Sopenharmony_ci * -EOVERFLOW input too large for this vli code (invalid)
32762306a36Sopenharmony_ci */
32862306a36Sopenharmony_cistatic inline int vli_encode_bits(struct bitstream *bs, u64 in)
32962306a36Sopenharmony_ci{
33062306a36Sopenharmony_ci	u64 code;
33162306a36Sopenharmony_ci	int bits = __vli_encode_bits(&code, in);
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci	if (bits <= 0)
33462306a36Sopenharmony_ci		return bits;
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	return bitstream_put_bits(bs, code, bits);
33762306a36Sopenharmony_ci}
33862306a36Sopenharmony_ci
33962306a36Sopenharmony_ci#endif
340