1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * Copyright 2021 Google LLC 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 5cb93a386Sopenharmony_ci * found in the LICENSE file. 6cb93a386Sopenharmony_ci */ 7cb93a386Sopenharmony_ci 8cb93a386Sopenharmony_ci#ifndef QMath_DEFINED 9cb93a386Sopenharmony_ci#define QMath_DEFINED 10cb93a386Sopenharmony_ci 11cb93a386Sopenharmony_citemplate <int N, typename T> using V = T __attribute__((ext_vector_type(N))); 12cb93a386Sopenharmony_ci 13cb93a386Sopenharmony_ci#if !defined(__clang__) 14cb93a386Sopenharmony_cistatic_assert(false, "This only works on clang."); 15cb93a386Sopenharmony_ci#endif 16cb93a386Sopenharmony_ci 17cb93a386Sopenharmony_ci#if defined(__SSSE3__) 18cb93a386Sopenharmony_ci#include <immintrin.h> 19cb93a386Sopenharmony_ci#endif 20cb93a386Sopenharmony_ci 21cb93a386Sopenharmony_ci#if defined(__ARM_NEON) 22cb93a386Sopenharmony_ci// From section 5.5.5 of the ARM C Language Extensions (ACLE) 23cb93a386Sopenharmony_ci #include <arm_neon.h> 24cb93a386Sopenharmony_ci#endif 25cb93a386Sopenharmony_ci 26cb93a386Sopenharmony_ci#include <cassert> 27cb93a386Sopenharmony_ci#include <cstdint> 28cb93a386Sopenharmony_ci 29cb93a386Sopenharmony_ciusing Q15 = V<8, uint16_t>; 30cb93a386Sopenharmony_ciusing I16 = V<8, int16_t>; 31cb93a386Sopenharmony_ciusing U16 = V<8, uint16_t>; 32cb93a386Sopenharmony_ci 33cb93a386Sopenharmony_ci 34cb93a386Sopenharmony_cistatic inline U16 constrained_add(I16 a, U16 b) { 35cb93a386Sopenharmony_cifor (size_t i = 0; i < 8; i++) { 36cb93a386Sopenharmony_ci // Ensure that a + b is on the interval [0, UINT16_MAX] 37cb93a386Sopenharmony_ci assert(-b[i] <= a[i] && a[i] <= UINT16_MAX - b[i]); 38cb93a386Sopenharmony_ci} 39cb93a386Sopenharmony_ci U16 answer = b + a; 40cb93a386Sopenharmony_ci return answer; 41cb93a386Sopenharmony_ci} 42cb93a386Sopenharmony_ci 43cb93a386Sopenharmony_ci// A pure C version of the ssse3 intrinsic mm_mulhrs_epi16; 44cb93a386Sopenharmony_cistatic inline I16 simulate_ssse3_mm_mulhrs_epi16(I16 a, I16 b) { 45cb93a386Sopenharmony_ci I16 result; 46cb93a386Sopenharmony_ci auto m = [](int16_t r, int16_t s) { 47cb93a386Sopenharmony_ci const int32_t rounding = 1 << 14; 48cb93a386Sopenharmony_ci int32_t temp = (int32_t)r * (int32_t)s + rounding; 49cb93a386Sopenharmony_ci return (int16_t)(temp >> 15); 50cb93a386Sopenharmony_ci }; 51cb93a386Sopenharmony_ci for (int i = 0; i < 8; i++) { 52cb93a386Sopenharmony_ci result[i] = m(a[i], b[i]); 53cb93a386Sopenharmony_ci } 54cb93a386Sopenharmony_ci return result; 55cb93a386Sopenharmony_ci} 56cb93a386Sopenharmony_ci 57cb93a386Sopenharmony_ci// A pure C version of the neon intrinsic vqrdmulhq_s16; 58cb93a386Sopenharmony_cistatic inline Q15 simulate_neon_vqrdmulhq_s16(Q15 a, Q15 b) { 59cb93a386Sopenharmony_ci Q15 result; 60cb93a386Sopenharmony_ci const int esize = 16; 61cb93a386Sopenharmony_ci auto m = [](int16_t r, int16_t s) { 62cb93a386Sopenharmony_ci const int64_t rounding = 1 << (esize - 1); 63cb93a386Sopenharmony_ci int64_t product = 2LL * (int64_t)r * (int64_t)s + rounding; 64cb93a386Sopenharmony_ci int64_t result = product >> esize; 65cb93a386Sopenharmony_ci 66cb93a386Sopenharmony_ci // Saturate the result 67cb93a386Sopenharmony_ci if (int64_t limit = (1LL << (esize - 1)) - 1; result > limit) { result = limit; } 68cb93a386Sopenharmony_ci if (int64_t limit = -(1LL << (esize - 1)) ; result < limit) { result = limit; } 69cb93a386Sopenharmony_ci return result; 70cb93a386Sopenharmony_ci }; 71cb93a386Sopenharmony_ci for (int i = 0; i < 8; i++) { 72cb93a386Sopenharmony_ci result[i] = m(a[i], b[i]); 73cb93a386Sopenharmony_ci } 74cb93a386Sopenharmony_ci return result; 75cb93a386Sopenharmony_ci} 76cb93a386Sopenharmony_ci 77cb93a386Sopenharmony_ci#endif // QMath_DEFINED 78