/* adler32_simd.c
 *
 * Copyright 2017 The Chromium Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 *
 * Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
 * the sum of N input data bytes D1 ... DN,
 *
 *     A = A0 + D1 + D2 + ... + DN
 *
 * where A0 is the initial value.
 *
 * SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
 * for example) and accumulating the byte sums can use SSE shuffle-adds (see
 * the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
 * similar instructions.
 *
 * The adler32 B value (aka s2) sums the A values from each step:
 *
 *     B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
 *
 *     B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
 *
 * B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
 *
 *     B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
 *
 * Adjacent blocks of 32 input bytes can be iterated with the expressions to
 * compute the adler32 s1 s2 of M >> 32 input bytes [1].
 *
 * As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
 * overflow the precision of their integer representation (bad). However, s1
 * and s2 also need to be computed modulo the adler BASE value (reduced). If
 * at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
 * a uint32_t type (the NMAX constraint) [2].
 *
 * [1] the iterative equations for s2 contain constant factors; these can be
 * hoisted from the n-blocks do loop of the SIMD code.
 *
 * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
 * of the adler s1 s2 of uint32_t type (see adler32.c).
 */

#include "adler32_simd.h"

/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552

#if defined(ADLER32_SIMD_SSSE3)

#include <tmmintrin.h>

uint32_t ZLIB_INTERNAL adler32_simd_(  /* SSSE3 */
    uint32_t adler,
    const unsigned char *buf,
    z_size_t len)
{
    /*
     * Split Adler-32 into component sums.
     */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    /*
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;

    z_size_t blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks)
    {
        unsigned n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
        if (n > blocks)
            n = (unsigned) blocks;
        blocks -= n;

        const __m128i tap1 =
            _mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
        const __m128i tap2 =
            _mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        const __m128i zero =
            _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        const __m128i ones =
            _mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
        __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
        __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);

        do {
            /*
             * Load 32 input bytes.
             */
            const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
            const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));

            /*
             * Add previous block byte sum to v_ps.
             */
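            /*
             * v_ps accumulates the s1 value entering each 32-byte block
             * (A0 in the header comment); the _mm_slli_epi32(v_ps, 5)
             * after the loop multiplies it by 32 and folds the per-block
             * 32.A0 terms into v_s2.
             */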
            v_ps = _mm_add_epi32(v_ps, v_s1);

            /*
             * Horizontally add the bytes for s1; multiply-add the
             * bytes by [ 32, 31, 30, ... ] for s2.
             */
            v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
            const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
            v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));

            v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
            const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
            v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));

            buf += BLOCK_SIZE;

        } while (--n);

        v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));

        /*
         * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
         */

#define S23O1 _MM_SHUFFLE(2,3,0,1)  /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2)  /* A B C D -> C D A B */

        v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
        v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));

        s1 += _mm_cvtsi128_si32(v_s1);

        v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
        v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));

        s2 = _mm_cvtsi128_si32(v_s2);

#undef S23O1
#undef S1O32

        /*
         * Reduce.
         */
        s1 %= BASE;
        s2 %= BASE;
    }

    /*
     * Handle leftover data.
     */
    if (len) {
        if (len >= 16) {
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            len -= 16;
        }

        while (len--) {
            s2 += (s1 += *buf++);
        }

        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Return the recombined sums.
     */
    return s1 | (s2 << 16);
}

#elif defined(ADLER32_SIMD_NEON)

#include <arm_neon.h>

uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
    uint32_t adler,
    const unsigned char *buf,
    z_size_t len)
{
    /*
     * Split Adler-32 into component sums.
     */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    /*
     * Serially compute s1 & s2, until the data is 16-byte aligned.
     */
    if ((uintptr_t)buf & 15) {
        while ((uintptr_t)buf & 15) {
            s2 += (s1 += *buf++);
            --len;
        }

        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;

    z_size_t blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks)
    {
        unsigned n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
        if (n > blocks)
            n = (unsigned) blocks;
        blocks -= n;

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
        uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };

        uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_4 = vdupq_n_u16(0);

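        /*
         * Each iteration folds 32 bytes into v_s1 (horizontal byte sums)
         * and into four 16-bit column-sum vectors; the constant tap
         * weights [ 32, 31, 30, ... 1 ] are applied once after the loop,
         * hoisting the multiplies out of the inner loop (see [1] above).
         */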
        do {
            /*
             * Load 32 input bytes.
             */
            const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
            const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));

            /*
             * Add previous block byte sum to v_s2.
             */
            v_s2 = vaddq_u32(v_s2, v_s1);

            /*
             * Horizontally add the bytes for s1.
             */
            v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));

            /*
             * Vertically add the bytes for s2.
             */
            v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
            v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
            v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
            v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));

            buf += BLOCK_SIZE;

        } while (--n);

        v_s2 = vshlq_n_u32(v_s2, 5);

        /*
         * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
         */
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
                         (uint16x4_t) { 32, 31, 30, 29 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
                         (uint16x4_t) { 28, 27, 26, 25 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
                         (uint16x4_t) { 24, 23, 22, 21 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
                         (uint16x4_t) { 20, 19, 18, 17 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
                         (uint16x4_t) { 16, 15, 14, 13 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
                         (uint16x4_t) { 12, 11, 10,  9 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
                         (uint16x4_t) {  8,  7,  6,  5 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
                         (uint16x4_t) {  4,  3,  2,  1 });

        /*
         * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
         */
        uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
        uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
        uint32x2_t s1s2 = vpadd_u32(sum1, sum2);

        s1 += vget_lane_u32(s1s2, 0);
        s2 += vget_lane_u32(s1s2, 1);

        /*
         * Reduce.
         */
        s1 %= BASE;
        s2 %= BASE;
    }

    /*
     * Handle leftover data.
     */
    if (len) {
        if (len >= 16) {
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            len -= 16;
        }

        while (len--) {
            s2 += (s1 += *buf++);
        }

        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Return the recombined sums.
     */
    return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_SSSE3 */
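
/*
 * Illustrative scalar reference: a minimal sketch of the A and B
 * recurrences from the header comment, useful for cross-checking
 * adler32_simd_() on small inputs. The ADLER32_SIMD_REFERENCE_CHECK
 * guard and adler32_scalar_reference() name are hypothetical (not part
 * of zlib or the Chromium build); enable only for local testing.
 */
#if defined(ADLER32_SIMD_REFERENCE_CHECK)

static uint32_t adler32_scalar_reference(uint32_t adler,
                                         const unsigned char *buf,
                                         z_size_t len)
{
    uint32_t s1 = adler & 0xffff;  /* A: running sum of input bytes. */
    uint32_t s2 = adler >> 16;     /* B: running sum of the A values. */

    while (len--) {
        s1 += *buf++;  /* A = A0 + D1 + ... + DN */
        s2 += s1;      /* B accumulates every intermediate A. */
        s1 %= BASE;    /* Reduce every byte: simple, but far slower than */
        s2 %= BASE;    /* the NMAX-batched reduction used above. */
    }

    return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_REFERENCE_CHECK */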