11cb0ef41Sopenharmony_ci/* adler32_simd.c 21cb0ef41Sopenharmony_ci * 31cb0ef41Sopenharmony_ci * Copyright 2017 The Chromium Authors 41cb0ef41Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 51cb0ef41Sopenharmony_ci * found in the Chromium source repository LICENSE file. 61cb0ef41Sopenharmony_ci * 71cb0ef41Sopenharmony_ci * Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is 81cb0ef41Sopenharmony_ci * the sum of N input data bytes D1 ... DN, 91cb0ef41Sopenharmony_ci * 101cb0ef41Sopenharmony_ci * A = A0 + D1 + D2 + ... + DN 111cb0ef41Sopenharmony_ci * 121cb0ef41Sopenharmony_ci * where A0 is the initial value. 131cb0ef41Sopenharmony_ci * 141cb0ef41Sopenharmony_ci * SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD, 151cb0ef41Sopenharmony_ci * for example) and accumulating the byte sums can use SSE shuffle-adds (see 161cb0ef41Sopenharmony_ci * the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has 171cb0ef41Sopenharmony_ci * similar instructions. 181cb0ef41Sopenharmony_ci * 191cb0ef41Sopenharmony_ci * The adler32 B value (aka s2) sums the A values from each step: 201cb0ef41Sopenharmony_ci * 211cb0ef41Sopenharmony_ci * B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or 221cb0ef41Sopenharmony_ci * 231cb0ef41Sopenharmony_ci * B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN 241cb0ef41Sopenharmony_ci * 251cb0ef41Sopenharmony_ci * B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD): 261cb0ef41Sopenharmony_ci * 271cb0ef41Sopenharmony_ci * B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1]. 281cb0ef41Sopenharmony_ci * 291cb0ef41Sopenharmony_ci * Adjacent blocks of 32 input bytes can be iterated with the expressions to 301cb0ef41Sopenharmony_ci * compute the adler32 s1 s2 of M >> 32 input bytes [1]. 311cb0ef41Sopenharmony_ci * 321cb0ef41Sopenharmony_ci * As M grows, the s1 s2 sums grow. If left unchecked, they would eventually 331cb0ef41Sopenharmony_ci * overflow the precision of their integer representation (bad). However, s1 341cb0ef41Sopenharmony_ci * and s2 also need to be computed modulo the adler BASE value (reduced). If 351cb0ef41Sopenharmony_ci * at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow 361cb0ef41Sopenharmony_ci * a uint32_t type (the NMAX constraint) [2]. 371cb0ef41Sopenharmony_ci * 381cb0ef41Sopenharmony_ci * [1] the iterative equations for s2 contain constant factors; these can be 391cb0ef41Sopenharmony_ci * hoisted from the n-blocks do loop of the SIMD code. 401cb0ef41Sopenharmony_ci * 411cb0ef41Sopenharmony_ci * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates 421cb0ef41Sopenharmony_ci * of the adler s1 s2 of uint32_t type (see adler32.c). 431cb0ef41Sopenharmony_ci */ 441cb0ef41Sopenharmony_ci 451cb0ef41Sopenharmony_ci#include "adler32_simd.h" 461cb0ef41Sopenharmony_ci 471cb0ef41Sopenharmony_ci/* Definitions from adler32.c: largest prime smaller than 65536 */ 481cb0ef41Sopenharmony_ci#define BASE 65521U 491cb0ef41Sopenharmony_ci/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ 501cb0ef41Sopenharmony_ci#define NMAX 5552 511cb0ef41Sopenharmony_ci 521cb0ef41Sopenharmony_ci#if defined(ADLER32_SIMD_SSSE3) 531cb0ef41Sopenharmony_ci 541cb0ef41Sopenharmony_ci#include <tmmintrin.h> 551cb0ef41Sopenharmony_ci 561cb0ef41Sopenharmony_ciuint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */ 571cb0ef41Sopenharmony_ci uint32_t adler, 581cb0ef41Sopenharmony_ci const unsigned char *buf, 591cb0ef41Sopenharmony_ci z_size_t len) 601cb0ef41Sopenharmony_ci{ 611cb0ef41Sopenharmony_ci /* 621cb0ef41Sopenharmony_ci * Split Adler-32 into component sums. 631cb0ef41Sopenharmony_ci */ 641cb0ef41Sopenharmony_ci uint32_t s1 = adler & 0xffff; 651cb0ef41Sopenharmony_ci uint32_t s2 = adler >> 16; 661cb0ef41Sopenharmony_ci 671cb0ef41Sopenharmony_ci /* 681cb0ef41Sopenharmony_ci * Process the data in blocks. 691cb0ef41Sopenharmony_ci */ 701cb0ef41Sopenharmony_ci const unsigned BLOCK_SIZE = 1 << 5; 711cb0ef41Sopenharmony_ci 721cb0ef41Sopenharmony_ci z_size_t blocks = len / BLOCK_SIZE; 731cb0ef41Sopenharmony_ci len -= blocks * BLOCK_SIZE; 741cb0ef41Sopenharmony_ci 751cb0ef41Sopenharmony_ci while (blocks) 761cb0ef41Sopenharmony_ci { 771cb0ef41Sopenharmony_ci unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ 781cb0ef41Sopenharmony_ci if (n > blocks) 791cb0ef41Sopenharmony_ci n = (unsigned) blocks; 801cb0ef41Sopenharmony_ci blocks -= n; 811cb0ef41Sopenharmony_ci 821cb0ef41Sopenharmony_ci const __m128i tap1 = 831cb0ef41Sopenharmony_ci _mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17); 841cb0ef41Sopenharmony_ci const __m128i tap2 = 851cb0ef41Sopenharmony_ci _mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1); 861cb0ef41Sopenharmony_ci const __m128i zero = 871cb0ef41Sopenharmony_ci _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 881cb0ef41Sopenharmony_ci const __m128i ones = 891cb0ef41Sopenharmony_ci _mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1); 901cb0ef41Sopenharmony_ci 911cb0ef41Sopenharmony_ci /* 921cb0ef41Sopenharmony_ci * Process n blocks of data. At most NMAX data bytes can be 931cb0ef41Sopenharmony_ci * processed before s2 must be reduced modulo BASE. 941cb0ef41Sopenharmony_ci */ 951cb0ef41Sopenharmony_ci __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); 961cb0ef41Sopenharmony_ci __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); 971cb0ef41Sopenharmony_ci __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); 981cb0ef41Sopenharmony_ci 991cb0ef41Sopenharmony_ci do { 1001cb0ef41Sopenharmony_ci /* 1011cb0ef41Sopenharmony_ci * Load 32 input bytes. 1021cb0ef41Sopenharmony_ci */ 1031cb0ef41Sopenharmony_ci const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf)); 1041cb0ef41Sopenharmony_ci const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16)); 1051cb0ef41Sopenharmony_ci 1061cb0ef41Sopenharmony_ci /* 1071cb0ef41Sopenharmony_ci * Add previous block byte sum to v_ps. 1081cb0ef41Sopenharmony_ci */ 1091cb0ef41Sopenharmony_ci v_ps = _mm_add_epi32(v_ps, v_s1); 1101cb0ef41Sopenharmony_ci 1111cb0ef41Sopenharmony_ci /* 1121cb0ef41Sopenharmony_ci * Horizontally add the bytes for s1, multiply-adds the 1131cb0ef41Sopenharmony_ci * bytes by [ 32, 31, 30, ... ] for s2. 1141cb0ef41Sopenharmony_ci */ 1151cb0ef41Sopenharmony_ci v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); 1161cb0ef41Sopenharmony_ci const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); 1171cb0ef41Sopenharmony_ci v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); 1181cb0ef41Sopenharmony_ci 1191cb0ef41Sopenharmony_ci v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); 1201cb0ef41Sopenharmony_ci const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); 1211cb0ef41Sopenharmony_ci v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); 1221cb0ef41Sopenharmony_ci 1231cb0ef41Sopenharmony_ci buf += BLOCK_SIZE; 1241cb0ef41Sopenharmony_ci 1251cb0ef41Sopenharmony_ci } while (--n); 1261cb0ef41Sopenharmony_ci 1271cb0ef41Sopenharmony_ci v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_ci /* 1301cb0ef41Sopenharmony_ci * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). 1311cb0ef41Sopenharmony_ci */ 1321cb0ef41Sopenharmony_ci 1331cb0ef41Sopenharmony_ci#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */ 1341cb0ef41Sopenharmony_ci#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */ 1351cb0ef41Sopenharmony_ci 1361cb0ef41Sopenharmony_ci v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); 1371cb0ef41Sopenharmony_ci v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); 1381cb0ef41Sopenharmony_ci 1391cb0ef41Sopenharmony_ci s1 += _mm_cvtsi128_si32(v_s1); 1401cb0ef41Sopenharmony_ci 1411cb0ef41Sopenharmony_ci v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); 1421cb0ef41Sopenharmony_ci v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); 1431cb0ef41Sopenharmony_ci 1441cb0ef41Sopenharmony_ci s2 = _mm_cvtsi128_si32(v_s2); 1451cb0ef41Sopenharmony_ci 1461cb0ef41Sopenharmony_ci#undef S23O1 1471cb0ef41Sopenharmony_ci#undef S1O32 1481cb0ef41Sopenharmony_ci 1491cb0ef41Sopenharmony_ci /* 1501cb0ef41Sopenharmony_ci * Reduce. 1511cb0ef41Sopenharmony_ci */ 1521cb0ef41Sopenharmony_ci s1 %= BASE; 1531cb0ef41Sopenharmony_ci s2 %= BASE; 1541cb0ef41Sopenharmony_ci } 1551cb0ef41Sopenharmony_ci 1561cb0ef41Sopenharmony_ci /* 1571cb0ef41Sopenharmony_ci * Handle leftover data. 1581cb0ef41Sopenharmony_ci */ 1591cb0ef41Sopenharmony_ci if (len) { 1601cb0ef41Sopenharmony_ci if (len >= 16) { 1611cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1621cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1631cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1641cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1651cb0ef41Sopenharmony_ci 1661cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1671cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1681cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1691cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1701cb0ef41Sopenharmony_ci 1711cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1721cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1731cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1741cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1751cb0ef41Sopenharmony_ci 1761cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1771cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1781cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1791cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1801cb0ef41Sopenharmony_ci 1811cb0ef41Sopenharmony_ci len -= 16; 1821cb0ef41Sopenharmony_ci } 1831cb0ef41Sopenharmony_ci 1841cb0ef41Sopenharmony_ci while (len--) { 1851cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 1861cb0ef41Sopenharmony_ci } 1871cb0ef41Sopenharmony_ci 1881cb0ef41Sopenharmony_ci if (s1 >= BASE) 1891cb0ef41Sopenharmony_ci s1 -= BASE; 1901cb0ef41Sopenharmony_ci s2 %= BASE; 1911cb0ef41Sopenharmony_ci } 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ci /* 1941cb0ef41Sopenharmony_ci * Return the recombined sums. 1951cb0ef41Sopenharmony_ci */ 1961cb0ef41Sopenharmony_ci return s1 | (s2 << 16); 1971cb0ef41Sopenharmony_ci} 1981cb0ef41Sopenharmony_ci 1991cb0ef41Sopenharmony_ci#elif defined(ADLER32_SIMD_NEON) 2001cb0ef41Sopenharmony_ci 2011cb0ef41Sopenharmony_ci#include <arm_neon.h> 2021cb0ef41Sopenharmony_ci 2031cb0ef41Sopenharmony_ciuint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ 2041cb0ef41Sopenharmony_ci uint32_t adler, 2051cb0ef41Sopenharmony_ci const unsigned char *buf, 2061cb0ef41Sopenharmony_ci z_size_t len) 2071cb0ef41Sopenharmony_ci{ 2081cb0ef41Sopenharmony_ci /* 2091cb0ef41Sopenharmony_ci * Split Adler-32 into component sums. 2101cb0ef41Sopenharmony_ci */ 2111cb0ef41Sopenharmony_ci uint32_t s1 = adler & 0xffff; 2121cb0ef41Sopenharmony_ci uint32_t s2 = adler >> 16; 2131cb0ef41Sopenharmony_ci 2141cb0ef41Sopenharmony_ci /* 2151cb0ef41Sopenharmony_ci * Serially compute s1 & s2, until the data is 16-byte aligned. 2161cb0ef41Sopenharmony_ci */ 2171cb0ef41Sopenharmony_ci if ((uintptr_t)buf & 15) { 2181cb0ef41Sopenharmony_ci while ((uintptr_t)buf & 15) { 2191cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 2201cb0ef41Sopenharmony_ci --len; 2211cb0ef41Sopenharmony_ci } 2221cb0ef41Sopenharmony_ci 2231cb0ef41Sopenharmony_ci if (s1 >= BASE) 2241cb0ef41Sopenharmony_ci s1 -= BASE; 2251cb0ef41Sopenharmony_ci s2 %= BASE; 2261cb0ef41Sopenharmony_ci } 2271cb0ef41Sopenharmony_ci 2281cb0ef41Sopenharmony_ci /* 2291cb0ef41Sopenharmony_ci * Process the data in blocks. 2301cb0ef41Sopenharmony_ci */ 2311cb0ef41Sopenharmony_ci const unsigned BLOCK_SIZE = 1 << 5; 2321cb0ef41Sopenharmony_ci 2331cb0ef41Sopenharmony_ci z_size_t blocks = len / BLOCK_SIZE; 2341cb0ef41Sopenharmony_ci len -= blocks * BLOCK_SIZE; 2351cb0ef41Sopenharmony_ci 2361cb0ef41Sopenharmony_ci while (blocks) 2371cb0ef41Sopenharmony_ci { 2381cb0ef41Sopenharmony_ci unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ 2391cb0ef41Sopenharmony_ci if (n > blocks) 2401cb0ef41Sopenharmony_ci n = (unsigned) blocks; 2411cb0ef41Sopenharmony_ci blocks -= n; 2421cb0ef41Sopenharmony_ci 2431cb0ef41Sopenharmony_ci /* 2441cb0ef41Sopenharmony_ci * Process n blocks of data. At most NMAX data bytes can be 2451cb0ef41Sopenharmony_ci * processed before s2 must be reduced modulo BASE. 2461cb0ef41Sopenharmony_ci */ 2471cb0ef41Sopenharmony_ci uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n }; 2481cb0ef41Sopenharmony_ci uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; 2491cb0ef41Sopenharmony_ci 2501cb0ef41Sopenharmony_ci uint16x8_t v_column_sum_1 = vdupq_n_u16(0); 2511cb0ef41Sopenharmony_ci uint16x8_t v_column_sum_2 = vdupq_n_u16(0); 2521cb0ef41Sopenharmony_ci uint16x8_t v_column_sum_3 = vdupq_n_u16(0); 2531cb0ef41Sopenharmony_ci uint16x8_t v_column_sum_4 = vdupq_n_u16(0); 2541cb0ef41Sopenharmony_ci 2551cb0ef41Sopenharmony_ci do { 2561cb0ef41Sopenharmony_ci /* 2571cb0ef41Sopenharmony_ci * Load 32 input bytes. 2581cb0ef41Sopenharmony_ci */ 2591cb0ef41Sopenharmony_ci const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf)); 2601cb0ef41Sopenharmony_ci const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16)); 2611cb0ef41Sopenharmony_ci 2621cb0ef41Sopenharmony_ci /* 2631cb0ef41Sopenharmony_ci * Add previous block byte sum to v_s2. 2641cb0ef41Sopenharmony_ci */ 2651cb0ef41Sopenharmony_ci v_s2 = vaddq_u32(v_s2, v_s1); 2661cb0ef41Sopenharmony_ci 2671cb0ef41Sopenharmony_ci /* 2681cb0ef41Sopenharmony_ci * Horizontally add the bytes for s1. 2691cb0ef41Sopenharmony_ci */ 2701cb0ef41Sopenharmony_ci v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); 2711cb0ef41Sopenharmony_ci 2721cb0ef41Sopenharmony_ci /* 2731cb0ef41Sopenharmony_ci * Vertically add the bytes for s2. 2741cb0ef41Sopenharmony_ci */ 2751cb0ef41Sopenharmony_ci v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1)); 2761cb0ef41Sopenharmony_ci v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); 2771cb0ef41Sopenharmony_ci v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2)); 2781cb0ef41Sopenharmony_ci v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); 2791cb0ef41Sopenharmony_ci 2801cb0ef41Sopenharmony_ci buf += BLOCK_SIZE; 2811cb0ef41Sopenharmony_ci 2821cb0ef41Sopenharmony_ci } while (--n); 2831cb0ef41Sopenharmony_ci 2841cb0ef41Sopenharmony_ci v_s2 = vshlq_n_u32(v_s2, 5); 2851cb0ef41Sopenharmony_ci 2861cb0ef41Sopenharmony_ci /* 2871cb0ef41Sopenharmony_ci * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. 2881cb0ef41Sopenharmony_ci */ 2891cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1), 2901cb0ef41Sopenharmony_ci (uint16x4_t) { 32, 31, 30, 29 }); 2911cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), 2921cb0ef41Sopenharmony_ci (uint16x4_t) { 28, 27, 26, 25 }); 2931cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2), 2941cb0ef41Sopenharmony_ci (uint16x4_t) { 24, 23, 22, 21 }); 2951cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), 2961cb0ef41Sopenharmony_ci (uint16x4_t) { 20, 19, 18, 17 }); 2971cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3), 2981cb0ef41Sopenharmony_ci (uint16x4_t) { 16, 15, 14, 13 }); 2991cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), 3001cb0ef41Sopenharmony_ci (uint16x4_t) { 12, 11, 10, 9 }); 3011cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4), 3021cb0ef41Sopenharmony_ci (uint16x4_t) { 8, 7, 6, 5 }); 3031cb0ef41Sopenharmony_ci v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), 3041cb0ef41Sopenharmony_ci (uint16x4_t) { 4, 3, 2, 1 }); 3051cb0ef41Sopenharmony_ci 3061cb0ef41Sopenharmony_ci /* 3071cb0ef41Sopenharmony_ci * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). 3081cb0ef41Sopenharmony_ci */ 3091cb0ef41Sopenharmony_ci uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); 3101cb0ef41Sopenharmony_ci uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); 3111cb0ef41Sopenharmony_ci uint32x2_t s1s2 = vpadd_u32(sum1, sum2); 3121cb0ef41Sopenharmony_ci 3131cb0ef41Sopenharmony_ci s1 += vget_lane_u32(s1s2, 0); 3141cb0ef41Sopenharmony_ci s2 += vget_lane_u32(s1s2, 1); 3151cb0ef41Sopenharmony_ci 3161cb0ef41Sopenharmony_ci /* 3171cb0ef41Sopenharmony_ci * Reduce. 3181cb0ef41Sopenharmony_ci */ 3191cb0ef41Sopenharmony_ci s1 %= BASE; 3201cb0ef41Sopenharmony_ci s2 %= BASE; 3211cb0ef41Sopenharmony_ci } 3221cb0ef41Sopenharmony_ci 3231cb0ef41Sopenharmony_ci /* 3241cb0ef41Sopenharmony_ci * Handle leftover data. 3251cb0ef41Sopenharmony_ci */ 3261cb0ef41Sopenharmony_ci if (len) { 3271cb0ef41Sopenharmony_ci if (len >= 16) { 3281cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3291cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3301cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3311cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3321cb0ef41Sopenharmony_ci 3331cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3341cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3351cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3361cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3371cb0ef41Sopenharmony_ci 3381cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3391cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3401cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3411cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3421cb0ef41Sopenharmony_ci 3431cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3441cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3451cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3461cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3471cb0ef41Sopenharmony_ci 3481cb0ef41Sopenharmony_ci len -= 16; 3491cb0ef41Sopenharmony_ci } 3501cb0ef41Sopenharmony_ci 3511cb0ef41Sopenharmony_ci while (len--) { 3521cb0ef41Sopenharmony_ci s2 += (s1 += *buf++); 3531cb0ef41Sopenharmony_ci } 3541cb0ef41Sopenharmony_ci 3551cb0ef41Sopenharmony_ci if (s1 >= BASE) 3561cb0ef41Sopenharmony_ci s1 -= BASE; 3571cb0ef41Sopenharmony_ci s2 %= BASE; 3581cb0ef41Sopenharmony_ci } 3591cb0ef41Sopenharmony_ci 3601cb0ef41Sopenharmony_ci /* 3611cb0ef41Sopenharmony_ci * Return the recombined sums. 3621cb0ef41Sopenharmony_ci */ 3631cb0ef41Sopenharmony_ci return s1 | (s2 << 16); 3641cb0ef41Sopenharmony_ci} 3651cb0ef41Sopenharmony_ci 3661cb0ef41Sopenharmony_ci#endif /* ADLER32_SIMD_SSSE3 */ 367