1 static inline void
enc_loop_avx512_inner(const uint8_t **s, uint8_t **o)2 enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
3 {
4 // Load input.
5 __m512i src = _mm512_loadu_si512((__m512i *) *s);
6
7 // Reshuffle, translate, store.
8 src = enc_reshuffle_translate(src);
9 _mm512_storeu_si512((__m512i *) *o, src);
10
11 *s += 48;
12 *o += 64;
13 }
14
// Bulk encoding loop: consumes input in 48-byte rounds, producing 64
// output bytes per round via enc_loop_avx512_inner().
//
//   s    - in/out: input cursor, advanced by 48 bytes per round
//   slen - in/out: remaining input length, reduced by 48 per round
//   o    - in/out: output cursor, advanced by 64 bytes per round
//   olen - in/out: output length so far, increased by 64 per round
//
// Returns without touching anything when fewer than 64 input bytes are
// available. Any input left over after the last round stays in *slen.
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too short for this loop. This guard also keeps the unsigned
	// subtraction below from wrapping around.
	if (*slen < 64) {
		return;
	}

	// Each round consumes 48 bytes but loads 64. Holding back 24 bytes
	// when counting rounds leaves at least 24 bytes of input after the
	// last round, so the final 64-byte load stays inside the input
	// buffer.
	const size_t total = (*slen - 24) / 48;
	size_t left = total;

	// Account for all rounds up front; the cursors themselves are moved
	// by the inner function as each round runs.
	*slen -= total * 48;	// 48 bytes consumed per round
	*olen += total * 64;	// 64 bytes produced per round

	// Main body: groups of eight rounds for as long as they fit.
	for (; left >= 8; left -= 8) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
	}

	// Tail: at most seven rounds remain, handled as 4 + 2 + 1.
	if (left >= 4) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		left -= 4;
	}
	if (left >= 2) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		left -= 2;
	}
	if (left > 0) {
		enc_loop_avx512_inner(s, o);
	}
}
62