1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * Copyright 2021 Google LLC 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 5cb93a386Sopenharmony_ci * found in the LICENSE file. 6cb93a386Sopenharmony_ci */ 7cb93a386Sopenharmony_ci 8cb93a386Sopenharmony_ci#include <cassert> 9cb93a386Sopenharmony_ci#include <cstdio> 10cb93a386Sopenharmony_ci#include <cstdint> 11cb93a386Sopenharmony_ci#include "experimental/lowp-basic/QMath.h" 12cb93a386Sopenharmony_ci 13cb93a386Sopenharmony_ci// Compile for x86_64 + ssse3 with: 14cb93a386Sopenharmony_ci// c++ -O3 --std=c++17 -mssse3 experimental/lowp-basic/lowp_experiments.cpp -o lowp 15cb93a386Sopenharmony_ci// 16cb93a386Sopenharmony_ci// Compile for aarch64 with (Mac os): 17cb93a386Sopenharmony_ci// c++ -O3 --std=c++17 -arch arm64 experimental/lowp-basic/lowp_experiments.cpp -o lowp 18cb93a386Sopenharmony_ci// 19cb93a386Sopenharmony_ci// View assembly: 20cb93a386Sopenharmony_ci// dumpobj -d lowp | less 21cb93a386Sopenharmony_ci 22cb93a386Sopenharmony_citemplate <int N, typename T> using V = T __attribute__((ext_vector_type(N))); 23cb93a386Sopenharmony_ci 24cb93a386Sopenharmony_ci#if !defined(__clang__) 25cb93a386Sopenharmony_ci static_assert(false, "This only works on clang."); 26cb93a386Sopenharmony_ci#endif 27cb93a386Sopenharmony_ci 28cb93a386Sopenharmony_ci#if defined(__SSSE3__) 29cb93a386Sopenharmony_ci #include <immintrin.h> 30cb93a386Sopenharmony_ci#endif 31cb93a386Sopenharmony_ci 32cb93a386Sopenharmony_ci#if defined(__ARM_NEON) 33cb93a386Sopenharmony_ci // From section 5.5.5 of the ARM C Language Extensions (ACLE) 34cb93a386Sopenharmony_ci #include <arm_neon.h> 35cb93a386Sopenharmony_ci#endif 36cb93a386Sopenharmony_ci 37cb93a386Sopenharmony_ciusing Q15 = V<8, uint16_t>; 38cb93a386Sopenharmony_ci 39cb93a386Sopenharmony_ci#if defined(__SSSE3__) 40cb93a386Sopenharmony_cistatic void test_mm_mulhrs_epi16_simulation() { 41cb93a386Sopenharmony_ci for (int i = -32768; i < 32768; i++) { 42cb93a386Sopenharmony_ci for (int j = -32768; j < 32768; j++) { 43cb93a386Sopenharmony_ci Q15 a(i); 44cb93a386Sopenharmony_ci Q15 b(j); 45cb93a386Sopenharmony_ci Q15 simResult = simulate_ssse3_mm_mulhrs_epi16(a, b); 46cb93a386Sopenharmony_ci Q15 intrinsicResult = _mm_mulhrs_epi16(a, b); 47cb93a386Sopenharmony_ci for (int i = 0; i < 8; i++) { 48cb93a386Sopenharmony_ci if (simResult[i] != intrinsicResult[i]) { 49cb93a386Sopenharmony_ci printf("simulate_ssse3_mm_mulhrs_epi16 broken\n"); 50cb93a386Sopenharmony_ci printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n", 51cb93a386Sopenharmony_ci i, a[i], b[i], intrinsicResult[i], simResult[i]); 52cb93a386Sopenharmony_ci exit(1); 53cb93a386Sopenharmony_ci } 54cb93a386Sopenharmony_ci } 55cb93a386Sopenharmony_ci } 56cb93a386Sopenharmony_ci } 57cb93a386Sopenharmony_ci} 58cb93a386Sopenharmony_ci 59cb93a386Sopenharmony_ci// Use ssse3 to simulate saturating multiply on arm. 60cb93a386Sopenharmony_cistatic Q15 ssse3_vqrdmulhq_s16(Q15 a, Q15 b) { 61cb93a386Sopenharmony_ci constexpr Q15 limit(0x8000); 62cb93a386Sopenharmony_ci const Q15 product = _mm_mulhrs_epi16(a, b); 63cb93a386Sopenharmony_ci const Q15 eq = _mm_cmpeq_epi16(product, limit); 64cb93a386Sopenharmony_ci return _mm_xor_si128(eq, product); 65cb93a386Sopenharmony_ci} 66cb93a386Sopenharmony_ci 67cb93a386Sopenharmony_cistatic void test_ssse3_vqrdmulhq_s16() { 68cb93a386Sopenharmony_ci for (int i = -32768; i < 32768; i++) { 69cb93a386Sopenharmony_ci for (int j = -32768; j < 32768; j++) { 70cb93a386Sopenharmony_ci Q15 a(i); 71cb93a386Sopenharmony_ci Q15 b(j); 72cb93a386Sopenharmony_ci Q15 simResult = ssse3_vqrdmulhq_s16(a, b); 73cb93a386Sopenharmony_ci Q15 realVqrdmulhqS16 = simulate_neon_vqrdmulhq_s16(a, b); 74cb93a386Sopenharmony_ci for (int i = 0; i < 8; i++) { 75cb93a386Sopenharmony_ci if (simResult[i] != realVqrdmulhqS16[i]) { 76cb93a386Sopenharmony_ci printf("simulating vqrdmulhq_s16 with ssse3 broken\n"); 77cb93a386Sopenharmony_ci printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n", 78cb93a386Sopenharmony_ci i, a[i], b[i], realVqrdmulhqS16[i], simResult[i]); 79cb93a386Sopenharmony_ci exit(1); 80cb93a386Sopenharmony_ci } 81cb93a386Sopenharmony_ci } 82cb93a386Sopenharmony_ci } 83cb93a386Sopenharmony_ci } 84cb93a386Sopenharmony_ci} 85cb93a386Sopenharmony_ci 86cb93a386Sopenharmony_ci#endif 87cb93a386Sopenharmony_ci 88cb93a386Sopenharmony_ci#if defined(__ARM_NEON) 89cb93a386Sopenharmony_cistatic void test_neon_vqrdmulhq_s16_simulation() { 90cb93a386Sopenharmony_ci for (int i = -32768; i < 32768; i++) { 91cb93a386Sopenharmony_ci for (int j = -32768; j < 32768; j++) { 92cb93a386Sopenharmony_ci Q15 a(i); 93cb93a386Sopenharmony_ci Q15 b(j); 94cb93a386Sopenharmony_ci Q15 simResult = simulate_neon_vqrdmulhq_s16(a, b); 95cb93a386Sopenharmony_ci Q15 intrinsicResult = vqrdmulhq_s16(a, b); 96cb93a386Sopenharmony_ci for (int i = 0; i < 8; i++) { 97cb93a386Sopenharmony_ci if (simResult[i] != intrinsicResult[i]) { 98cb93a386Sopenharmony_ci printf("simulate_neon_vqrdmulhq_s16 broken\n"); 99cb93a386Sopenharmony_ci printf("i: %d, a: %hx b: %hx, intrinsic: %hx, sim: %hx\n", 100cb93a386Sopenharmony_ci i, a[i], b[i], intrinsicResult[i], simResult[i]); 101cb93a386Sopenharmony_ci exit(1); 102cb93a386Sopenharmony_ci } 103cb93a386Sopenharmony_ci } 104cb93a386Sopenharmony_ci } 105cb93a386Sopenharmony_ci } 106cb93a386Sopenharmony_ci} 107cb93a386Sopenharmony_ci#endif 108cb93a386Sopenharmony_ci 109cb93a386Sopenharmony_ciint main() { 110cb93a386Sopenharmony_ci #if defined(__SSSE3__) 111cb93a386Sopenharmony_ci //test_mm_mulhrs_epi16_simulation(); 112cb93a386Sopenharmony_ci test_ssse3_vqrdmulhq_s16(); 113cb93a386Sopenharmony_ci #endif 114cb93a386Sopenharmony_ci #if defined(__ARM_NEON) 115cb93a386Sopenharmony_ci test_neon_vqrdmulhq_s16_simulation(); 116cb93a386Sopenharmony_ci #endif 117cb93a386Sopenharmony_ci printf("Done.\n"); 118cb93a386Sopenharmony_ci return 0; 119cb93a386Sopenharmony_ci} 120