1bbbf1280Sopenharmony_ci/* 2bbbf1280Sopenharmony_ci * AArch64-specific checksum implementation using NEON 3bbbf1280Sopenharmony_ci * 4bbbf1280Sopenharmony_ci * Copyright (c) 2020, Arm Limited. 5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6bbbf1280Sopenharmony_ci */ 7bbbf1280Sopenharmony_ci 8bbbf1280Sopenharmony_ci#include "networking.h" 9bbbf1280Sopenharmony_ci#include "../chksum_common.h" 10bbbf1280Sopenharmony_ci 11bbbf1280Sopenharmony_ci#ifndef __ARM_NEON 12bbbf1280Sopenharmony_ci#pragma GCC target("+simd") 13bbbf1280Sopenharmony_ci#endif 14bbbf1280Sopenharmony_ci 15bbbf1280Sopenharmony_ci#include <arm_neon.h> 16bbbf1280Sopenharmony_ci 17bbbf1280Sopenharmony_cialways_inline 18bbbf1280Sopenharmony_cistatic inline uint64_t 19bbbf1280Sopenharmony_cislurp_head64(const void **pptr, uint32_t *nbytes) 20bbbf1280Sopenharmony_ci{ 21bbbf1280Sopenharmony_ci Assert(*nbytes >= 8); 22bbbf1280Sopenharmony_ci uint64_t sum = 0; 23bbbf1280Sopenharmony_ci uint32_t off = (uintptr_t) *pptr % 8; 24bbbf1280Sopenharmony_ci if (likely(off != 0)) 25bbbf1280Sopenharmony_ci { 26bbbf1280Sopenharmony_ci /* Get rid of bytes 0..off-1 */ 27bbbf1280Sopenharmony_ci const unsigned char *ptr64 = align_ptr(*pptr, 8); 28bbbf1280Sopenharmony_ci uint64_t mask = ALL_ONES << (CHAR_BIT * off); 29bbbf1280Sopenharmony_ci uint64_t val = load64(ptr64) & mask; 30bbbf1280Sopenharmony_ci /* Fold 64-bit sum to 33 bits */ 31bbbf1280Sopenharmony_ci sum = val >> 32; 32bbbf1280Sopenharmony_ci sum += (uint32_t) val; 33bbbf1280Sopenharmony_ci *pptr = ptr64 + 8; 34bbbf1280Sopenharmony_ci *nbytes -= 8 - off; 35bbbf1280Sopenharmony_ci } 36bbbf1280Sopenharmony_ci return sum; 37bbbf1280Sopenharmony_ci} 38bbbf1280Sopenharmony_ci 39bbbf1280Sopenharmony_cialways_inline 40bbbf1280Sopenharmony_cistatic inline uint64_t 41bbbf1280Sopenharmony_cislurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes) 42bbbf1280Sopenharmony_ci{ 43bbbf1280Sopenharmony_ci Assert(nbytes < 8); 44bbbf1280Sopenharmony_ci if (likely(nbytes != 0)) 45bbbf1280Sopenharmony_ci { 46bbbf1280Sopenharmony_ci /* Get rid of bytes 7..nbytes */ 47bbbf1280Sopenharmony_ci uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes)); 48bbbf1280Sopenharmony_ci Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes); 49bbbf1280Sopenharmony_ci uint64_t val = load64(ptr) & mask; 50bbbf1280Sopenharmony_ci sum += val >> 32; 51bbbf1280Sopenharmony_ci sum += (uint32_t) val; 52bbbf1280Sopenharmony_ci nbytes = 0; 53bbbf1280Sopenharmony_ci } 54bbbf1280Sopenharmony_ci Assert(nbytes == 0); 55bbbf1280Sopenharmony_ci return sum; 56bbbf1280Sopenharmony_ci} 57bbbf1280Sopenharmony_ci 58bbbf1280Sopenharmony_ciunsigned short 59bbbf1280Sopenharmony_ci__chksum_aarch64_simd(const void *ptr, unsigned int nbytes) 60bbbf1280Sopenharmony_ci{ 61bbbf1280Sopenharmony_ci bool swap = (uintptr_t) ptr & 1; 62bbbf1280Sopenharmony_ci uint64_t sum; 63bbbf1280Sopenharmony_ci 64bbbf1280Sopenharmony_ci if (unlikely(nbytes < 50)) 65bbbf1280Sopenharmony_ci { 66bbbf1280Sopenharmony_ci sum = slurp_small(ptr, nbytes); 67bbbf1280Sopenharmony_ci swap = false; 68bbbf1280Sopenharmony_ci goto fold; 69bbbf1280Sopenharmony_ci } 70bbbf1280Sopenharmony_ci 71bbbf1280Sopenharmony_ci /* 8-byte align pointer */ 72bbbf1280Sopenharmony_ci Assert(nbytes >= 8); 73bbbf1280Sopenharmony_ci sum = slurp_head64(&ptr, &nbytes); 74bbbf1280Sopenharmony_ci Assert(((uintptr_t) ptr & 7) == 0); 75bbbf1280Sopenharmony_ci 76bbbf1280Sopenharmony_ci const uint32_t *may_alias ptr32 = ptr; 77bbbf1280Sopenharmony_ci 78bbbf1280Sopenharmony_ci uint64x2_t vsum0 = { 0, 0 }; 79bbbf1280Sopenharmony_ci uint64x2_t vsum1 = { 0, 0 }; 80bbbf1280Sopenharmony_ci uint64x2_t vsum2 = { 0, 0 }; 81bbbf1280Sopenharmony_ci uint64x2_t vsum3 = { 0, 0 }; 82bbbf1280Sopenharmony_ci 83bbbf1280Sopenharmony_ci /* Sum groups of 64 bytes */ 84bbbf1280Sopenharmony_ci for (uint32_t i = 0; i < nbytes / 64; i++) 85bbbf1280Sopenharmony_ci { 86bbbf1280Sopenharmony_ci uint32x4_t vtmp0 = vld1q_u32(ptr32); 87bbbf1280Sopenharmony_ci uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4); 88bbbf1280Sopenharmony_ci uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8); 89bbbf1280Sopenharmony_ci uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12); 90bbbf1280Sopenharmony_ci vsum0 = vpadalq_u32(vsum0, vtmp0); 91bbbf1280Sopenharmony_ci vsum1 = vpadalq_u32(vsum1, vtmp1); 92bbbf1280Sopenharmony_ci vsum2 = vpadalq_u32(vsum2, vtmp2); 93bbbf1280Sopenharmony_ci vsum3 = vpadalq_u32(vsum3, vtmp3); 94bbbf1280Sopenharmony_ci ptr32 += 16; 95bbbf1280Sopenharmony_ci } 96bbbf1280Sopenharmony_ci nbytes %= 64; 97bbbf1280Sopenharmony_ci 98bbbf1280Sopenharmony_ci /* Fold vsum2 and vsum3 into vsum0 and vsum1 */ 99bbbf1280Sopenharmony_ci vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2)); 100bbbf1280Sopenharmony_ci vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3)); 101bbbf1280Sopenharmony_ci 102bbbf1280Sopenharmony_ci /* Add any trailing group of 32 bytes */ 103bbbf1280Sopenharmony_ci if (nbytes & 32) 104bbbf1280Sopenharmony_ci { 105bbbf1280Sopenharmony_ci uint32x4_t vtmp0 = vld1q_u32(ptr32); 106bbbf1280Sopenharmony_ci uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4); 107bbbf1280Sopenharmony_ci vsum0 = vpadalq_u32(vsum0, vtmp0); 108bbbf1280Sopenharmony_ci vsum1 = vpadalq_u32(vsum1, vtmp1); 109bbbf1280Sopenharmony_ci ptr32 += 8; 110bbbf1280Sopenharmony_ci nbytes -= 32; 111bbbf1280Sopenharmony_ci } 112bbbf1280Sopenharmony_ci Assert(nbytes < 32); 113bbbf1280Sopenharmony_ci 114bbbf1280Sopenharmony_ci /* Fold vsum1 into vsum0 */ 115bbbf1280Sopenharmony_ci vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1)); 116bbbf1280Sopenharmony_ci 117bbbf1280Sopenharmony_ci /* Add any trailing group of 16 bytes */ 118bbbf1280Sopenharmony_ci if (nbytes & 16) 119bbbf1280Sopenharmony_ci { 120bbbf1280Sopenharmony_ci uint32x4_t vtmp = vld1q_u32(ptr32); 121bbbf1280Sopenharmony_ci vsum0 = vpadalq_u32(vsum0, vtmp); 122bbbf1280Sopenharmony_ci ptr32 += 4; 123bbbf1280Sopenharmony_ci nbytes -= 16; 124bbbf1280Sopenharmony_ci } 125bbbf1280Sopenharmony_ci Assert(nbytes < 16); 126bbbf1280Sopenharmony_ci 127bbbf1280Sopenharmony_ci /* Add any trailing group of 8 bytes */ 128bbbf1280Sopenharmony_ci if (nbytes & 8) 129bbbf1280Sopenharmony_ci { 130bbbf1280Sopenharmony_ci uint32x2_t vtmp = vld1_u32(ptr32); 131bbbf1280Sopenharmony_ci vsum0 = vaddw_u32(vsum0, vtmp); 132bbbf1280Sopenharmony_ci ptr32 += 2; 133bbbf1280Sopenharmony_ci nbytes -= 8; 134bbbf1280Sopenharmony_ci } 135bbbf1280Sopenharmony_ci Assert(nbytes < 8); 136bbbf1280Sopenharmony_ci 137bbbf1280Sopenharmony_ci uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0)); 138bbbf1280Sopenharmony_ci sum += val >> 32; 139bbbf1280Sopenharmony_ci sum += (uint32_t) val; 140bbbf1280Sopenharmony_ci 141bbbf1280Sopenharmony_ci /* Handle any trailing 0..7 bytes */ 142bbbf1280Sopenharmony_ci sum = slurp_tail64(sum, ptr32, nbytes); 143bbbf1280Sopenharmony_ci 144bbbf1280Sopenharmony_cifold: 145bbbf1280Sopenharmony_ci return fold_and_swap(sum, swap); 146bbbf1280Sopenharmony_ci} 147