1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2021 Google LLC
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include "experimental/lowp-basic/QMath.h"
12cb93a386Sopenharmony_ci
13cb93a386Sopenharmony_ci// Compile for x86_64 + ssse3 with:
14cb93a386Sopenharmony_ci//     c++ -O3 --std=c++17 -mssse3 experimental/lowp-basic/lowp_experiments.cpp -o lowp
15cb93a386Sopenharmony_ci//
// Compile for aarch64 with (macOS):
17cb93a386Sopenharmony_ci//    c++ -O3 --std=c++17 -arch arm64 experimental/lowp-basic/lowp_experiments.cpp  -o lowp
18cb93a386Sopenharmony_ci//
19cb93a386Sopenharmony_ci// View assembly:
//    objdump -d lowp | less
21cb93a386Sopenharmony_ci
22cb93a386Sopenharmony_citemplate <int N, typename T> using V = T __attribute__((ext_vector_type(N)));
23cb93a386Sopenharmony_ci
24cb93a386Sopenharmony_ci#if !defined(__clang__)
25cb93a386Sopenharmony_ci    static_assert(false, "This only works on clang.");
26cb93a386Sopenharmony_ci#endif
27cb93a386Sopenharmony_ci
28cb93a386Sopenharmony_ci#if defined(__SSSE3__)
29cb93a386Sopenharmony_ci    #include <immintrin.h>
30cb93a386Sopenharmony_ci#endif
31cb93a386Sopenharmony_ci
32cb93a386Sopenharmony_ci#if defined(__ARM_NEON)
33cb93a386Sopenharmony_ci    // From section 5.5.5 of the ARM C Language Extensions (ACLE)
34cb93a386Sopenharmony_ci    #include <arm_neon.h>
35cb93a386Sopenharmony_ci#endif
36cb93a386Sopenharmony_ci
37cb93a386Sopenharmony_ciusing Q15 = V<8, uint16_t>;
38cb93a386Sopenharmony_ci
39cb93a386Sopenharmony_ci#if defined(__SSSE3__)
40cb93a386Sopenharmony_cistatic void test_mm_mulhrs_epi16_simulation() {
41cb93a386Sopenharmony_ci    for (int i = -32768; i < 32768; i++) {
42cb93a386Sopenharmony_ci        for (int j = -32768; j < 32768; j++) {
43cb93a386Sopenharmony_ci            Q15 a(i);
44cb93a386Sopenharmony_ci            Q15 b(j);
45cb93a386Sopenharmony_ci            Q15 simResult = simulate_ssse3_mm_mulhrs_epi16(a, b);
46cb93a386Sopenharmony_ci            Q15 intrinsicResult = _mm_mulhrs_epi16(a, b);
47cb93a386Sopenharmony_ci            for (int i = 0; i < 8; i++) {
48cb93a386Sopenharmony_ci                if (simResult[i] != intrinsicResult[i]) {
49cb93a386Sopenharmony_ci                    printf("simulate_ssse3_mm_mulhrs_epi16 broken\n");
50cb93a386Sopenharmony_ci                    printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
51cb93a386Sopenharmony_ci                           i, a[i], b[i], intrinsicResult[i], simResult[i]);
52cb93a386Sopenharmony_ci                    exit(1);
53cb93a386Sopenharmony_ci                }
54cb93a386Sopenharmony_ci            }
55cb93a386Sopenharmony_ci        }
56cb93a386Sopenharmony_ci    }
57cb93a386Sopenharmony_ci}
58cb93a386Sopenharmony_ci
59cb93a386Sopenharmony_ci// Use ssse3 to simulate saturating multiply on arm.
60cb93a386Sopenharmony_cistatic Q15 ssse3_vqrdmulhq_s16(Q15 a, Q15 b) {
61cb93a386Sopenharmony_ci    constexpr Q15 limit(0x8000);
62cb93a386Sopenharmony_ci    const Q15 product = _mm_mulhrs_epi16(a, b);
63cb93a386Sopenharmony_ci    const Q15 eq = _mm_cmpeq_epi16(product, limit);
64cb93a386Sopenharmony_ci    return _mm_xor_si128(eq, product);
65cb93a386Sopenharmony_ci}
66cb93a386Sopenharmony_ci
67cb93a386Sopenharmony_cistatic void test_ssse3_vqrdmulhq_s16() {
68cb93a386Sopenharmony_ci    for (int i = -32768; i < 32768; i++) {
69cb93a386Sopenharmony_ci        for (int j = -32768; j < 32768; j++) {
70cb93a386Sopenharmony_ci            Q15 a(i);
71cb93a386Sopenharmony_ci            Q15 b(j);
72cb93a386Sopenharmony_ci            Q15 simResult = ssse3_vqrdmulhq_s16(a, b);
73cb93a386Sopenharmony_ci            Q15 realVqrdmulhqS16 = simulate_neon_vqrdmulhq_s16(a, b);
74cb93a386Sopenharmony_ci            for (int i = 0; i < 8; i++) {
75cb93a386Sopenharmony_ci                if (simResult[i] != realVqrdmulhqS16[i]) {
76cb93a386Sopenharmony_ci                    printf("simulating vqrdmulhq_s16 with ssse3 broken\n");
77cb93a386Sopenharmony_ci                    printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
78cb93a386Sopenharmony_ci                           i, a[i], b[i], realVqrdmulhqS16[i], simResult[i]);
79cb93a386Sopenharmony_ci                    exit(1);
80cb93a386Sopenharmony_ci                }
81cb93a386Sopenharmony_ci            }
82cb93a386Sopenharmony_ci        }
83cb93a386Sopenharmony_ci    }
84cb93a386Sopenharmony_ci}
85cb93a386Sopenharmony_ci
86cb93a386Sopenharmony_ci#endif
87cb93a386Sopenharmony_ci
88cb93a386Sopenharmony_ci#if defined(__ARM_NEON)
89cb93a386Sopenharmony_cistatic void test_neon_vqrdmulhq_s16_simulation() {
90cb93a386Sopenharmony_ci    for (int i = -32768; i < 32768; i++) {
91cb93a386Sopenharmony_ci        for (int j = -32768; j < 32768; j++) {
92cb93a386Sopenharmony_ci            Q15 a(i);
93cb93a386Sopenharmony_ci            Q15 b(j);
94cb93a386Sopenharmony_ci            Q15 simResult = simulate_neon_vqrdmulhq_s16(a, b);
95cb93a386Sopenharmony_ci            Q15 intrinsicResult = vqrdmulhq_s16(a, b);
96cb93a386Sopenharmony_ci            for (int i = 0; i < 8; i++) {
97cb93a386Sopenharmony_ci                if (simResult[i] != intrinsicResult[i]) {
98cb93a386Sopenharmony_ci                    printf("simulate_neon_vqrdmulhq_s16 broken\n");
99cb93a386Sopenharmony_ci                    printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n",
100cb93a386Sopenharmony_ci                           i, a[i], b[i], intrinsicResult[i], simResult[i]);
101cb93a386Sopenharmony_ci                    exit(1);
102cb93a386Sopenharmony_ci                }
103cb93a386Sopenharmony_ci            }
104cb93a386Sopenharmony_ci        }
105cb93a386Sopenharmony_ci    }
106cb93a386Sopenharmony_ci}
107cb93a386Sopenharmony_ci#endif
108cb93a386Sopenharmony_ci
// Entry point. Runs the exhaustive checks that the target instruction set
// makes available, then prints "Done." and returns 0. A failing check exits
// the process with status 1 before the final print is reached.
int main() {
    #ifdef __SSSE3__
        // The _mm_mulhrs_epi16 simulation check is currently commented out.
        //test_mm_mulhrs_epi16_simulation();
        test_ssse3_vqrdmulhq_s16();
    #endif

    #ifdef __ARM_NEON
        test_neon_vqrdmulhq_s16_simulation();
    #endif

    printf("Done.\n");
    return 0;
}
120