1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2021 Google LLC
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci
8cb93a386Sopenharmony_ci#ifndef QMath_DEFINED
9cb93a386Sopenharmony_ci#define QMath_DEFINED
10cb93a386Sopenharmony_ci
11cb93a386Sopenharmony_citemplate <int N, typename T> using V = T __attribute__((ext_vector_type(N)));
12cb93a386Sopenharmony_ci
13cb93a386Sopenharmony_ci#if !defined(__clang__)
14cb93a386Sopenharmony_cistatic_assert(false, "This only works on clang.");
15cb93a386Sopenharmony_ci#endif
16cb93a386Sopenharmony_ci
17cb93a386Sopenharmony_ci#if defined(__SSSE3__)
18cb93a386Sopenharmony_ci#include <immintrin.h>
19cb93a386Sopenharmony_ci#endif
20cb93a386Sopenharmony_ci
21cb93a386Sopenharmony_ci#if defined(__ARM_NEON)
22cb93a386Sopenharmony_ci// From section 5.5.5 of the ARM C Language Extensions (ACLE)
23cb93a386Sopenharmony_ci    #include <arm_neon.h>
24cb93a386Sopenharmony_ci#endif
25cb93a386Sopenharmony_ci
26cb93a386Sopenharmony_ci#include <cassert>
27cb93a386Sopenharmony_ci#include <cstdint>
28cb93a386Sopenharmony_ci
29cb93a386Sopenharmony_ciusing Q15 = V<8, uint16_t>;
30cb93a386Sopenharmony_ciusing I16 = V<8, int16_t>;
31cb93a386Sopenharmony_ciusing U16 = V<8, uint16_t>;
32cb93a386Sopenharmony_ci
33cb93a386Sopenharmony_ci
34cb93a386Sopenharmony_cistatic inline U16 constrained_add(I16 a, U16 b) {
35cb93a386Sopenharmony_cifor (size_t i = 0; i < 8; i++) {
36cb93a386Sopenharmony_ci    // Ensure that a + b is on the interval [0, UINT16_MAX]
37cb93a386Sopenharmony_ci    assert(-b[i] <= a[i] && a[i] <= UINT16_MAX - b[i]);
38cb93a386Sopenharmony_ci}
39cb93a386Sopenharmony_ci    U16 answer = b + a;
40cb93a386Sopenharmony_ci    return answer;
41cb93a386Sopenharmony_ci}
42cb93a386Sopenharmony_ci
43cb93a386Sopenharmony_ci// A pure C version of the ssse3 intrinsic mm_mulhrs_epi16;
44cb93a386Sopenharmony_cistatic inline I16 simulate_ssse3_mm_mulhrs_epi16(I16 a, I16 b) {
45cb93a386Sopenharmony_ci    I16 result;
46cb93a386Sopenharmony_ci    auto m = [](int16_t r, int16_t s) {
47cb93a386Sopenharmony_ci        const int32_t rounding = 1 << 14;
48cb93a386Sopenharmony_ci        int32_t temp = (int32_t)r * (int32_t)s + rounding;
49cb93a386Sopenharmony_ci        return (int16_t)(temp >> 15);
50cb93a386Sopenharmony_ci    };
51cb93a386Sopenharmony_ci    for (int i = 0; i < 8; i++) {
52cb93a386Sopenharmony_ci        result[i] = m(a[i], b[i]);
53cb93a386Sopenharmony_ci    }
54cb93a386Sopenharmony_ci    return result;
55cb93a386Sopenharmony_ci}
56cb93a386Sopenharmony_ci
57cb93a386Sopenharmony_ci// A pure C version of the neon intrinsic vqrdmulhq_s16;
58cb93a386Sopenharmony_cistatic inline Q15 simulate_neon_vqrdmulhq_s16(Q15 a, Q15 b) {
59cb93a386Sopenharmony_ci    Q15 result;
60cb93a386Sopenharmony_ci    const int esize = 16;
61cb93a386Sopenharmony_ci    auto m = [](int16_t r, int16_t s) {
62cb93a386Sopenharmony_ci        const int64_t rounding = 1 << (esize - 1);
63cb93a386Sopenharmony_ci        int64_t product = 2LL * (int64_t)r * (int64_t)s + rounding;
64cb93a386Sopenharmony_ci        int64_t result = product >> esize;
65cb93a386Sopenharmony_ci
66cb93a386Sopenharmony_ci        // Saturate the result
67cb93a386Sopenharmony_ci        if (int64_t limit =  (1LL << (esize - 1)) - 1; result > limit) { result = limit; }
68cb93a386Sopenharmony_ci        if (int64_t limit = -(1LL << (esize - 1))    ; result < limit) { result = limit; }
69cb93a386Sopenharmony_ci        return result;
70cb93a386Sopenharmony_ci    };
71cb93a386Sopenharmony_ci    for (int i = 0; i < 8; i++) {
72cb93a386Sopenharmony_ci        result[i] = m(a[i], b[i]);
73cb93a386Sopenharmony_ci    }
74cb93a386Sopenharmony_ci    return result;
75cb93a386Sopenharmony_ci}
76cb93a386Sopenharmony_ci
77cb93a386Sopenharmony_ci#endif  // QMath_DEFINED
78