162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */ 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci-*- linux-c -*- 462306a36Sopenharmony_ci drbd_receiver.c 562306a36Sopenharmony_ci This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 862306a36Sopenharmony_ci Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 962306a36Sopenharmony_ci Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 1062306a36Sopenharmony_ci 1162306a36Sopenharmony_ci */ 1262306a36Sopenharmony_ci 1362306a36Sopenharmony_ci#ifndef _DRBD_VLI_H 1462306a36Sopenharmony_ci#define _DRBD_VLI_H 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci/* 1762306a36Sopenharmony_ci * At a granularity of 4KiB storage represented per bit, 1862306a36Sopenharmony_ci * and stroage sizes of several TiB, 1962306a36Sopenharmony_ci * and possibly small-bandwidth replication, 2062306a36Sopenharmony_ci * the bitmap transfer time can take much too long, 2162306a36Sopenharmony_ci * if transmitted in plain text. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * We try to reduce the transferred bitmap information 2462306a36Sopenharmony_ci * by encoding runlengths of bit polarity. 2562306a36Sopenharmony_ci * 2662306a36Sopenharmony_ci * We never actually need to encode a "zero" (runlengths are positive). 2762306a36Sopenharmony_ci * But then we have to store the value of the first bit. 2862306a36Sopenharmony_ci * The first bit of information thus shall encode if the first runlength 2962306a36Sopenharmony_ci * gives the number of set or unset bits. 3062306a36Sopenharmony_ci * 3162306a36Sopenharmony_ci * We assume that large areas are either completely set or unset, 3262306a36Sopenharmony_ci * which gives good compression with any runlength method, 3362306a36Sopenharmony_ci * even when encoding the runlength as fixed size 32bit/64bit integers. 3462306a36Sopenharmony_ci * 3562306a36Sopenharmony_ci * Still, there may be areas where the polarity flips every few bits, 3662306a36Sopenharmony_ci * and encoding the runlength sequence of those areas with fix size 3762306a36Sopenharmony_ci * integers would be much worse than plaintext. 3862306a36Sopenharmony_ci * 3962306a36Sopenharmony_ci * We want to encode small runlength values with minimum code length, 4062306a36Sopenharmony_ci * while still being able to encode a Huge run of all zeros. 4162306a36Sopenharmony_ci * 4262306a36Sopenharmony_ci * Thus we need a Variable Length Integer encoding, VLI. 4362306a36Sopenharmony_ci * 4462306a36Sopenharmony_ci * For some cases, we produce more code bits than plaintext input. 4562306a36Sopenharmony_ci * We need to send incompressible chunks as plaintext, skip over them 4662306a36Sopenharmony_ci * and then see if the next chunk compresses better. 4762306a36Sopenharmony_ci * 4862306a36Sopenharmony_ci * We don't care too much about "excellent" compression ratio for large 4962306a36Sopenharmony_ci * runlengths (all set/all clear): whether we achieve a factor of 100 5062306a36Sopenharmony_ci * or 1000 is not that much of an issue. 5162306a36Sopenharmony_ci * We do not want to waste too much on short runlengths in the "noisy" 5262306a36Sopenharmony_ci * parts of the bitmap, though. 5362306a36Sopenharmony_ci * 5462306a36Sopenharmony_ci * There are endless variants of VLI, we experimented with: 5562306a36Sopenharmony_ci * * simple byte-based 5662306a36Sopenharmony_ci * * various bit based with different code word length. 5762306a36Sopenharmony_ci * 5862306a36Sopenharmony_ci * To avoid yet an other configuration parameter (choice of bitmap compression 5962306a36Sopenharmony_ci * algorithm) which was difficult to explain and tune, we just chose the one 6062306a36Sopenharmony_ci * variant that turned out best in all test cases. 6162306a36Sopenharmony_ci * Based on real world usage patterns, with device sizes ranging from a few GiB 6262306a36Sopenharmony_ci * to several TiB, file server/mailserver/webserver/mysql/postgress, 6362306a36Sopenharmony_ci * mostly idle to really busy, the all time winner (though sometimes only 6462306a36Sopenharmony_ci * marginally better) is: 6562306a36Sopenharmony_ci */ 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_ci/* 6862306a36Sopenharmony_ci * encoding is "visualised" as 6962306a36Sopenharmony_ci * __little endian__ bitstream, least significant bit first (left most) 7062306a36Sopenharmony_ci * 7162306a36Sopenharmony_ci * this particular encoding is chosen so that the prefix code 7262306a36Sopenharmony_ci * starts as unary encoding the level, then modified so that 7362306a36Sopenharmony_ci * 10 levels can be described in 8bit, with minimal overhead 7462306a36Sopenharmony_ci * for the smaller levels. 7562306a36Sopenharmony_ci * 7662306a36Sopenharmony_ci * Number of data bits follow fibonacci sequence, with the exception of the 7762306a36Sopenharmony_ci * last level (+1 data bit, so it makes 64bit total). The only worse code when 7862306a36Sopenharmony_ci * encoding bit polarity runlength is 1 plain bits => 2 code bits. 7962306a36Sopenharmony_ciprefix data bits max val Nº data bits 8062306a36Sopenharmony_ci0 x 0x2 1 8162306a36Sopenharmony_ci10 x 0x4 1 8262306a36Sopenharmony_ci110 xx 0x8 2 8362306a36Sopenharmony_ci1110 xxx 0x10 3 8462306a36Sopenharmony_ci11110 xxx xx 0x30 5 8562306a36Sopenharmony_ci111110 xx xxxxxx 0x130 8 8662306a36Sopenharmony_ci11111100 xxxxxxxx xxxxx 0x2130 13 8762306a36Sopenharmony_ci11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 8862306a36Sopenharmony_ci11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 8962306a36Sopenharmony_ci11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 9062306a36Sopenharmony_ci * maximum encodable value: 0x100000400202130 == 2**56 + some */ 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_ci/* compression "table": 9362306a36Sopenharmony_ci transmitted x 0.29 9462306a36Sopenharmony_ci as plaintext x ........................ 9562306a36Sopenharmony_ci x ........................ 9662306a36Sopenharmony_ci x ........................ 9762306a36Sopenharmony_ci x 0.59 0.21........................ 9862306a36Sopenharmony_ci x ........................................................ 9962306a36Sopenharmony_ci x .. c ................................................... 10062306a36Sopenharmony_ci x 0.44.. o ................................................... 10162306a36Sopenharmony_ci x .......... d ................................................... 10262306a36Sopenharmony_ci x .......... e ................................................... 10362306a36Sopenharmony_ci X............. ................................................... 10462306a36Sopenharmony_ci x.............. b ................................................... 10562306a36Sopenharmony_ci2.0x............... i ................................................... 10662306a36Sopenharmony_ci #X................ t ................................................... 10762306a36Sopenharmony_ci #................. s ........................... plain bits .......... 10862306a36Sopenharmony_ci-+----------------------------------------------------------------------- 10962306a36Sopenharmony_ci 1 16 32 64 11062306a36Sopenharmony_ci*/ 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_ci/* LEVEL: (total bits, prefix bits, prefix value), 11362306a36Sopenharmony_ci * sorted ascending by number of total bits. 11462306a36Sopenharmony_ci * The rest of the code table is calculated at compiletime from this. */ 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci/* fibonacci data 1, 1, ... */ 11762306a36Sopenharmony_ci#define VLI_L_1_1() do { \ 11862306a36Sopenharmony_ci LEVEL( 2, 1, 0x00); \ 11962306a36Sopenharmony_ci LEVEL( 3, 2, 0x01); \ 12062306a36Sopenharmony_ci LEVEL( 5, 3, 0x03); \ 12162306a36Sopenharmony_ci LEVEL( 7, 4, 0x07); \ 12262306a36Sopenharmony_ci LEVEL(10, 5, 0x0f); \ 12362306a36Sopenharmony_ci LEVEL(14, 6, 0x1f); \ 12462306a36Sopenharmony_ci LEVEL(21, 8, 0x3f); \ 12562306a36Sopenharmony_ci LEVEL(29, 8, 0x7f); \ 12662306a36Sopenharmony_ci LEVEL(42, 8, 0xbf); \ 12762306a36Sopenharmony_ci LEVEL(64, 8, 0xff); \ 12862306a36Sopenharmony_ci } while (0) 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci/* finds a suitable level to decode the least significant part of in. 13162306a36Sopenharmony_ci * returns number of bits consumed. 13262306a36Sopenharmony_ci * 13362306a36Sopenharmony_ci * BUG() for bad input, as that would mean a buggy code table. */ 13462306a36Sopenharmony_cistatic inline int vli_decode_bits(u64 *out, const u64 in) 13562306a36Sopenharmony_ci{ 13662306a36Sopenharmony_ci u64 adj = 1; 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci#define LEVEL(t,b,v) \ 13962306a36Sopenharmony_ci do { \ 14062306a36Sopenharmony_ci if ((in & ((1 << b) -1)) == v) { \ 14162306a36Sopenharmony_ci *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 14262306a36Sopenharmony_ci return t; \ 14362306a36Sopenharmony_ci } \ 14462306a36Sopenharmony_ci adj += 1ULL << (t - b); \ 14562306a36Sopenharmony_ci } while (0) 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci VLI_L_1_1(); 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 15062306a36Sopenharmony_ci BUG(); 15162306a36Sopenharmony_ci#undef LEVEL 15262306a36Sopenharmony_ci} 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci/* return number of code bits needed, 15562306a36Sopenharmony_ci * or negative error number */ 15662306a36Sopenharmony_cistatic inline int __vli_encode_bits(u64 *out, const u64 in) 15762306a36Sopenharmony_ci{ 15862306a36Sopenharmony_ci u64 max = 0; 15962306a36Sopenharmony_ci u64 adj = 1; 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci if (in == 0) 16262306a36Sopenharmony_ci return -EINVAL; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci#define LEVEL(t,b,v) do { \ 16562306a36Sopenharmony_ci max += 1ULL << (t - b); \ 16662306a36Sopenharmony_ci if (in <= max) { \ 16762306a36Sopenharmony_ci if (out) \ 16862306a36Sopenharmony_ci *out = ((in - adj) << b) | v; \ 16962306a36Sopenharmony_ci return t; \ 17062306a36Sopenharmony_ci } \ 17162306a36Sopenharmony_ci adj = max + 1; \ 17262306a36Sopenharmony_ci } while (0) 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci VLI_L_1_1(); 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_ci return -EOVERFLOW; 17762306a36Sopenharmony_ci#undef LEVEL 17862306a36Sopenharmony_ci} 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci#undef VLI_L_1_1 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci/* code from here down is independend of actually used bit code */ 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci/* 18562306a36Sopenharmony_ci * Code length is determined by some unique (e.g. unary) prefix. 18662306a36Sopenharmony_ci * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 18762306a36Sopenharmony_ci * not a byte stream. 18862306a36Sopenharmony_ci */ 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci/* for the bitstream, we need a cursor */ 19162306a36Sopenharmony_cistruct bitstream_cursor { 19262306a36Sopenharmony_ci /* the current byte */ 19362306a36Sopenharmony_ci u8 *b; 19462306a36Sopenharmony_ci /* the current bit within *b, nomalized: 0..7 */ 19562306a36Sopenharmony_ci unsigned int bit; 19662306a36Sopenharmony_ci}; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci/* initialize cursor to point to first bit of stream */ 19962306a36Sopenharmony_cistatic inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 20062306a36Sopenharmony_ci{ 20162306a36Sopenharmony_ci cur->b = s; 20262306a36Sopenharmony_ci cur->bit = 0; 20362306a36Sopenharmony_ci} 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci/* advance cursor by that many bits; maximum expected input value: 64, 20662306a36Sopenharmony_ci * but depending on VLI implementation, it may be more. */ 20762306a36Sopenharmony_cistatic inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 20862306a36Sopenharmony_ci{ 20962306a36Sopenharmony_ci bits += cur->bit; 21062306a36Sopenharmony_ci cur->b = cur->b + (bits >> 3); 21162306a36Sopenharmony_ci cur->bit = bits & 7; 21262306a36Sopenharmony_ci} 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci/* the bitstream itself knows its length */ 21562306a36Sopenharmony_cistruct bitstream { 21662306a36Sopenharmony_ci struct bitstream_cursor cur; 21762306a36Sopenharmony_ci unsigned char *buf; 21862306a36Sopenharmony_ci size_t buf_len; /* in bytes */ 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci /* for input stream: 22162306a36Sopenharmony_ci * number of trailing 0 bits for padding 22262306a36Sopenharmony_ci * total number of valid bits in stream: buf_len * 8 - pad_bits */ 22362306a36Sopenharmony_ci unsigned int pad_bits; 22462306a36Sopenharmony_ci}; 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_cistatic inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 22762306a36Sopenharmony_ci{ 22862306a36Sopenharmony_ci bs->buf = s; 22962306a36Sopenharmony_ci bs->buf_len = len; 23062306a36Sopenharmony_ci bs->pad_bits = pad_bits; 23162306a36Sopenharmony_ci bitstream_cursor_reset(&bs->cur, bs->buf); 23262306a36Sopenharmony_ci} 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_cistatic inline void bitstream_rewind(struct bitstream *bs) 23562306a36Sopenharmony_ci{ 23662306a36Sopenharmony_ci bitstream_cursor_reset(&bs->cur, bs->buf); 23762306a36Sopenharmony_ci memset(bs->buf, 0, bs->buf_len); 23862306a36Sopenharmony_ci} 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 24162306a36Sopenharmony_ci * Ignores "pad_bits". 24262306a36Sopenharmony_ci * Returns zero if bits == 0 (nothing to do). 24362306a36Sopenharmony_ci * Returns number of bits used if successful. 24462306a36Sopenharmony_ci * 24562306a36Sopenharmony_ci * If there is not enough room left in bitstream, 24662306a36Sopenharmony_ci * leaves bitstream unchanged and returns -ENOBUFS. 24762306a36Sopenharmony_ci */ 24862306a36Sopenharmony_cistatic inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 24962306a36Sopenharmony_ci{ 25062306a36Sopenharmony_ci unsigned char *b = bs->cur.b; 25162306a36Sopenharmony_ci unsigned int tmp; 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci if (bits == 0) 25462306a36Sopenharmony_ci return 0; 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 25762306a36Sopenharmony_ci return -ENOBUFS; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci /* paranoia: strip off hi bits; they should not be set anyways. */ 26062306a36Sopenharmony_ci if (bits < 64) 26162306a36Sopenharmony_ci val &= ~0ULL >> (64 - bits); 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci *b++ |= (val & 0xff) << bs->cur.bit; 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 26662306a36Sopenharmony_ci *b++ |= (val >> tmp) & 0xff; 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci bitstream_cursor_advance(&bs->cur, bits); 26962306a36Sopenharmony_ci return bits; 27062306a36Sopenharmony_ci} 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 27362306a36Sopenharmony_ci * 27462306a36Sopenharmony_ci * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 27562306a36Sopenharmony_ci * 27662306a36Sopenharmony_ci * If there are less than the requested number of valid bits left in the 27762306a36Sopenharmony_ci * bitstream, still fetches all available bits. 27862306a36Sopenharmony_ci * 27962306a36Sopenharmony_ci * Returns number of actually fetched bits. 28062306a36Sopenharmony_ci */ 28162306a36Sopenharmony_cistatic inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 28262306a36Sopenharmony_ci{ 28362306a36Sopenharmony_ci u64 val; 28462306a36Sopenharmony_ci unsigned int n; 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci if (bits > 64) 28762306a36Sopenharmony_ci return -EINVAL; 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 29062306a36Sopenharmony_ci bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 29162306a36Sopenharmony_ci - bs->cur.bit - bs->pad_bits; 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci if (bits == 0) { 29462306a36Sopenharmony_ci *out = 0; 29562306a36Sopenharmony_ci return 0; 29662306a36Sopenharmony_ci } 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci /* get the high bits */ 29962306a36Sopenharmony_ci val = 0; 30062306a36Sopenharmony_ci n = (bs->cur.bit + bits + 7) >> 3; 30162306a36Sopenharmony_ci /* n may be at most 9, if cur.bit + bits > 64 */ 30262306a36Sopenharmony_ci /* which means this copies at most 8 byte */ 30362306a36Sopenharmony_ci if (n) { 30462306a36Sopenharmony_ci memcpy(&val, bs->cur.b+1, n - 1); 30562306a36Sopenharmony_ci val = le64_to_cpu(val) << (8 - bs->cur.bit); 30662306a36Sopenharmony_ci } 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci /* we still need the low bits */ 30962306a36Sopenharmony_ci val |= bs->cur.b[0] >> bs->cur.bit; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci /* and mask out bits we don't want */ 31262306a36Sopenharmony_ci val &= ~0ULL >> (64 - bits); 31362306a36Sopenharmony_ci 31462306a36Sopenharmony_ci bitstream_cursor_advance(&bs->cur, bits); 31562306a36Sopenharmony_ci *out = val; 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_ci return bits; 31862306a36Sopenharmony_ci} 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci/* encodes @in as vli into @bs; 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_ci * return values 32362306a36Sopenharmony_ci * > 0: number of bits successfully stored in bitstream 32462306a36Sopenharmony_ci * -ENOBUFS @bs is full 32562306a36Sopenharmony_ci * -EINVAL input zero (invalid) 32662306a36Sopenharmony_ci * -EOVERFLOW input too large for this vli code (invalid) 32762306a36Sopenharmony_ci */ 32862306a36Sopenharmony_cistatic inline int vli_encode_bits(struct bitstream *bs, u64 in) 32962306a36Sopenharmony_ci{ 33062306a36Sopenharmony_ci u64 code; 33162306a36Sopenharmony_ci int bits = __vli_encode_bits(&code, in); 33262306a36Sopenharmony_ci 33362306a36Sopenharmony_ci if (bits <= 0) 33462306a36Sopenharmony_ci return bits; 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_ci return bitstream_put_bits(bs, code, bits); 33762306a36Sopenharmony_ci} 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci#endif 340