1 static inline void
enc_loop_avx512_inner(const uint8_t **s, uint8_t **o)2 enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
3 {
4 	// Load input.
5 	__m512i src = _mm512_loadu_si512((__m512i *) *s);
6 
7 	// Reshuffle, translate, store.
8 	src = enc_reshuffle_translate(src);
9 	_mm512_storeu_si512((__m512i *) *o, src);
10 
11 	*s += 48;
12 	*o += 64;
13 }
14 
// Run the AVX-512 encoding rounds over the input.
//
//   s, slen: input cursor and count of input bytes left. The cursor moves
//            forward by 48 bytes per round and *slen shrinks to match.
//   o, olen: output cursor and running output byte count. The cursor moves
//            forward by 64 bytes per round and *olen grows to match.
//
// Nothing is touched when fewer than 64 input bytes are available. Inputs
// of 64 to 71 bytes pass that check but still yield zero rounds. Whatever
// is not consumed here stays counted in *slen.
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too short for this loop; leave everything as it is.
	if (*slen < 64) {
		return;
	}

	// Every round consumes 48 input bytes but loads 64 of them. Holding
	// back 24 bytes when counting the rounds keeps the load of the final
	// round from reaching past the end of the input buffer.
	size_t todo = (*slen - 24) / 48;

	// Settle the length bookkeeping up front; the cursors themselves are
	// advanced round by round inside enc_loop_avx512_inner().
	*slen -= todo * 48;
	*olen += todo * 64;

	// Manually unrolled: as many batches of eight rounds as fit ...
	for (; todo >= 8; todo -= 8) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
	}

	// ... then at most one batch each of four, two and one round(s).
	if (todo >= 4) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		todo -= 4;
	}
	if (todo >= 2) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		todo -= 2;
	}
	if (todo >= 1) {
		enc_loop_avx512_inner(s, o);
	}
}
62