1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * Copyright 2015 Google Inc. 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 5cb93a386Sopenharmony_ci * found in the LICENSE file. 6cb93a386Sopenharmony_ci */ 7cb93a386Sopenharmony_ci 8cb93a386Sopenharmony_ci#ifndef SkNx_neon_DEFINED 9cb93a386Sopenharmony_ci#define SkNx_neon_DEFINED 10cb93a386Sopenharmony_ci 11cb93a386Sopenharmony_ci#include <arm_neon.h> 12cb93a386Sopenharmony_ci 13cb93a386Sopenharmony_cinamespace { // NOLINT(google-build-namespaces) 14cb93a386Sopenharmony_ci 15cb93a386Sopenharmony_ci// ARMv8 has vrndm(q)_f32 to floor floats. Here we emulate it: 16cb93a386Sopenharmony_ci// - roundtrip through integers via truncation 17cb93a386Sopenharmony_ci// - subtract 1 if that's too big (possible for negative values). 18cb93a386Sopenharmony_ci// This restricts the domain of our inputs to a maximum somehwere around 2^31. Seems plenty big. 19cb93a386Sopenharmony_ciAI static float32x4_t emulate_vrndmq_f32(float32x4_t v) { 20cb93a386Sopenharmony_ci auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); 21cb93a386Sopenharmony_ci auto too_big = vcgtq_f32(roundtrip, v); 22cb93a386Sopenharmony_ci return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); 23cb93a386Sopenharmony_ci} 24cb93a386Sopenharmony_ciAI static float32x2_t emulate_vrndm_f32(float32x2_t v) { 25cb93a386Sopenharmony_ci auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v)); 26cb93a386Sopenharmony_ci auto too_big = vcgt_f32(roundtrip, v); 27cb93a386Sopenharmony_ci return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1))); 28cb93a386Sopenharmony_ci} 29cb93a386Sopenharmony_ci 30cb93a386Sopenharmony_citemplate <> 31cb93a386Sopenharmony_ciclass SkNx<2, float> { 32cb93a386Sopenharmony_cipublic: 33cb93a386Sopenharmony_ci AI SkNx(float32x2_t vec) : fVec(vec) {} 34cb93a386Sopenharmony_ci 35cb93a386Sopenharmony_ci AI SkNx() {} 36cb93a386Sopenharmony_ci AI SkNx(float val) : fVec(vdup_n_f32(val)) {} 37cb93a386Sopenharmony_ci AI SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } 38cb93a386Sopenharmony_ci 39cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } 40cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1_f32((float*)ptr, fVec); } 41cb93a386Sopenharmony_ci 42cb93a386Sopenharmony_ci AI static void Load2(const void* ptr, SkNx* x, SkNx* y) { 43cb93a386Sopenharmony_ci float32x2x2_t xy = vld2_f32((const float*) ptr); 44cb93a386Sopenharmony_ci *x = xy.val[0]; 45cb93a386Sopenharmony_ci *y = xy.val[1]; 46cb93a386Sopenharmony_ci } 47cb93a386Sopenharmony_ci 48cb93a386Sopenharmony_ci AI static void Store2(void* dst, const SkNx& a, const SkNx& b) { 49cb93a386Sopenharmony_ci float32x2x2_t ab = {{ 50cb93a386Sopenharmony_ci a.fVec, 51cb93a386Sopenharmony_ci b.fVec, 52cb93a386Sopenharmony_ci }}; 53cb93a386Sopenharmony_ci vst2_f32((float*) dst, ab); 54cb93a386Sopenharmony_ci } 55cb93a386Sopenharmony_ci 56cb93a386Sopenharmony_ci AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) { 57cb93a386Sopenharmony_ci float32x2x3_t abc = {{ 58cb93a386Sopenharmony_ci a.fVec, 59cb93a386Sopenharmony_ci b.fVec, 60cb93a386Sopenharmony_ci c.fVec, 61cb93a386Sopenharmony_ci }}; 62cb93a386Sopenharmony_ci vst3_f32((float*) dst, abc); 63cb93a386Sopenharmony_ci } 64cb93a386Sopenharmony_ci 65cb93a386Sopenharmony_ci AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) { 66cb93a386Sopenharmony_ci float32x2x4_t abcd = {{ 67cb93a386Sopenharmony_ci a.fVec, 68cb93a386Sopenharmony_ci b.fVec, 69cb93a386Sopenharmony_ci c.fVec, 70cb93a386Sopenharmony_ci d.fVec, 71cb93a386Sopenharmony_ci }}; 72cb93a386Sopenharmony_ci vst4_f32((float*) dst, abcd); 73cb93a386Sopenharmony_ci } 74cb93a386Sopenharmony_ci 75cb93a386Sopenharmony_ci AI SkNx operator - () const { return vneg_f32(fVec); } 76cb93a386Sopenharmony_ci 77cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); } 78cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); } 79cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); } 80cb93a386Sopenharmony_ci AI SkNx operator / (const SkNx& o) const { 81cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 82cb93a386Sopenharmony_ci return vdiv_f32(fVec, o.fVec); 83cb93a386Sopenharmony_ci #else 84cb93a386Sopenharmony_ci float32x2_t est0 = vrecpe_f32(o.fVec), 85cb93a386Sopenharmony_ci est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0), 86cb93a386Sopenharmony_ci est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1); 87cb93a386Sopenharmony_ci return vmul_f32(fVec, est2); 88cb93a386Sopenharmony_ci #endif 89cb93a386Sopenharmony_ci } 90cb93a386Sopenharmony_ci 91cb93a386Sopenharmony_ci AI SkNx operator==(const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); } 92cb93a386Sopenharmony_ci AI SkNx operator <(const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); } 93cb93a386Sopenharmony_ci AI SkNx operator >(const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); } 94cb93a386Sopenharmony_ci AI SkNx operator<=(const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); } 95cb93a386Sopenharmony_ci AI SkNx operator>=(const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); } 96cb93a386Sopenharmony_ci AI SkNx operator!=(const SkNx& o) const { 97cb93a386Sopenharmony_ci return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec))); 98cb93a386Sopenharmony_ci } 99cb93a386Sopenharmony_ci 100cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); } 101cb93a386Sopenharmony_ci AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); } 102cb93a386Sopenharmony_ci 103cb93a386Sopenharmony_ci AI SkNx abs() const { return vabs_f32(fVec); } 104cb93a386Sopenharmony_ci AI SkNx floor() const { 105cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 106cb93a386Sopenharmony_ci return vrndm_f32(fVec); 107cb93a386Sopenharmony_ci #else 108cb93a386Sopenharmony_ci return emulate_vrndm_f32(fVec); 109cb93a386Sopenharmony_ci #endif 110cb93a386Sopenharmony_ci } 111cb93a386Sopenharmony_ci 112cb93a386Sopenharmony_ci AI SkNx sqrt() const { 113cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 114cb93a386Sopenharmony_ci return vsqrt_f32(fVec); 115cb93a386Sopenharmony_ci #else 116cb93a386Sopenharmony_ci float32x2_t est0 = vrsqrte_f32(fVec), 117cb93a386Sopenharmony_ci est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0), 118cb93a386Sopenharmony_ci est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1); 119cb93a386Sopenharmony_ci return vmul_f32(fVec, est2); 120cb93a386Sopenharmony_ci #endif 121cb93a386Sopenharmony_ci } 122cb93a386Sopenharmony_ci 123cb93a386Sopenharmony_ci AI float operator[](int k) const { 124cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 2); 125cb93a386Sopenharmony_ci union { float32x2_t v; float fs[2]; } pun = {fVec}; 126cb93a386Sopenharmony_ci return pun.fs[k&1]; 127cb93a386Sopenharmony_ci } 128cb93a386Sopenharmony_ci 129cb93a386Sopenharmony_ci AI bool allTrue() const { 130cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 131cb93a386Sopenharmony_ci return 0 != vminv_u32(vreinterpret_u32_f32(fVec)); 132cb93a386Sopenharmony_ci #else 133cb93a386Sopenharmony_ci auto v = vreinterpret_u32_f32(fVec); 134cb93a386Sopenharmony_ci return vget_lane_u32(v,0) && vget_lane_u32(v,1); 135cb93a386Sopenharmony_ci #endif 136cb93a386Sopenharmony_ci } 137cb93a386Sopenharmony_ci AI bool anyTrue() const { 138cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 139cb93a386Sopenharmony_ci return 0 != vmaxv_u32(vreinterpret_u32_f32(fVec)); 140cb93a386Sopenharmony_ci #else 141cb93a386Sopenharmony_ci auto v = vreinterpret_u32_f32(fVec); 142cb93a386Sopenharmony_ci return vget_lane_u32(v,0) || vget_lane_u32(v,1); 143cb93a386Sopenharmony_ci #endif 144cb93a386Sopenharmony_ci } 145cb93a386Sopenharmony_ci 146cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 147cb93a386Sopenharmony_ci return vbsl_f32(vreinterpret_u32_f32(fVec), t.fVec, e.fVec); 148cb93a386Sopenharmony_ci } 149cb93a386Sopenharmony_ci 150cb93a386Sopenharmony_ci float32x2_t fVec; 151cb93a386Sopenharmony_ci}; 152cb93a386Sopenharmony_ci 153cb93a386Sopenharmony_citemplate <> 154cb93a386Sopenharmony_ciclass SkNx<4, float> { 155cb93a386Sopenharmony_cipublic: 156cb93a386Sopenharmony_ci AI SkNx(float32x4_t vec) : fVec(vec) {} 157cb93a386Sopenharmony_ci 158cb93a386Sopenharmony_ci AI SkNx() {} 159cb93a386Sopenharmony_ci AI SkNx(float val) : fVec(vdupq_n_f32(val)) {} 160cb93a386Sopenharmony_ci AI SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } 161cb93a386Sopenharmony_ci 162cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } 163cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } 164cb93a386Sopenharmony_ci 165cb93a386Sopenharmony_ci AI static void Load2(const void* ptr, SkNx* x, SkNx* y) { 166cb93a386Sopenharmony_ci float32x4x2_t xy = vld2q_f32((const float*) ptr); 167cb93a386Sopenharmony_ci *x = xy.val[0]; 168cb93a386Sopenharmony_ci *y = xy.val[1]; 169cb93a386Sopenharmony_ci } 170cb93a386Sopenharmony_ci 171cb93a386Sopenharmony_ci AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) { 172cb93a386Sopenharmony_ci float32x4x4_t rgba = vld4q_f32((const float*) ptr); 173cb93a386Sopenharmony_ci *r = rgba.val[0]; 174cb93a386Sopenharmony_ci *g = rgba.val[1]; 175cb93a386Sopenharmony_ci *b = rgba.val[2]; 176cb93a386Sopenharmony_ci *a = rgba.val[3]; 177cb93a386Sopenharmony_ci } 178cb93a386Sopenharmony_ci AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { 179cb93a386Sopenharmony_ci float32x4x4_t rgba = {{ 180cb93a386Sopenharmony_ci r.fVec, 181cb93a386Sopenharmony_ci g.fVec, 182cb93a386Sopenharmony_ci b.fVec, 183cb93a386Sopenharmony_ci a.fVec, 184cb93a386Sopenharmony_ci }}; 185cb93a386Sopenharmony_ci vst4q_f32((float*) dst, rgba); 186cb93a386Sopenharmony_ci } 187cb93a386Sopenharmony_ci 188cb93a386Sopenharmony_ci AI SkNx operator - () const { return vnegq_f32(fVec); } 189cb93a386Sopenharmony_ci 190cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); } 191cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); } 192cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); } 193cb93a386Sopenharmony_ci AI SkNx operator / (const SkNx& o) const { 194cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 195cb93a386Sopenharmony_ci return vdivq_f32(fVec, o.fVec); 196cb93a386Sopenharmony_ci #else 197cb93a386Sopenharmony_ci float32x4_t est0 = vrecpeq_f32(o.fVec), 198cb93a386Sopenharmony_ci est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0), 199cb93a386Sopenharmony_ci est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1); 200cb93a386Sopenharmony_ci return vmulq_f32(fVec, est2); 201cb93a386Sopenharmony_ci #endif 202cb93a386Sopenharmony_ci } 203cb93a386Sopenharmony_ci 204cb93a386Sopenharmony_ci AI SkNx operator==(const SkNx& o) const {return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec));} 205cb93a386Sopenharmony_ci AI SkNx operator <(const SkNx& o) const {return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec));} 206cb93a386Sopenharmony_ci AI SkNx operator >(const SkNx& o) const {return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec));} 207cb93a386Sopenharmony_ci AI SkNx operator<=(const SkNx& o) const {return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec));} 208cb93a386Sopenharmony_ci AI SkNx operator>=(const SkNx& o) const {return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec));} 209cb93a386Sopenharmony_ci AI SkNx operator!=(const SkNx& o) const { 210cb93a386Sopenharmony_ci return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); 211cb93a386Sopenharmony_ci } 212cb93a386Sopenharmony_ci 213cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); } 214cb93a386Sopenharmony_ci AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); } 215cb93a386Sopenharmony_ci 216cb93a386Sopenharmony_ci AI SkNx abs() const { return vabsq_f32(fVec); } 217cb93a386Sopenharmony_ci AI SkNx floor() const { 218cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 219cb93a386Sopenharmony_ci return vrndmq_f32(fVec); 220cb93a386Sopenharmony_ci #else 221cb93a386Sopenharmony_ci return emulate_vrndmq_f32(fVec); 222cb93a386Sopenharmony_ci #endif 223cb93a386Sopenharmony_ci } 224cb93a386Sopenharmony_ci 225cb93a386Sopenharmony_ci 226cb93a386Sopenharmony_ci AI SkNx sqrt() const { 227cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 228cb93a386Sopenharmony_ci return vsqrtq_f32(fVec); 229cb93a386Sopenharmony_ci #else 230cb93a386Sopenharmony_ci float32x4_t est0 = vrsqrteq_f32(fVec), 231cb93a386Sopenharmony_ci est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0), 232cb93a386Sopenharmony_ci est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1); 233cb93a386Sopenharmony_ci return vmulq_f32(fVec, est2); 234cb93a386Sopenharmony_ci #endif 235cb93a386Sopenharmony_ci } 236cb93a386Sopenharmony_ci 237cb93a386Sopenharmony_ci AI float operator[](int k) const { 238cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 4); 239cb93a386Sopenharmony_ci union { float32x4_t v; float fs[4]; } pun = {fVec}; 240cb93a386Sopenharmony_ci return pun.fs[k&3]; 241cb93a386Sopenharmony_ci } 242cb93a386Sopenharmony_ci 243cb93a386Sopenharmony_ci AI float min() const { 244cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 245cb93a386Sopenharmony_ci return vminvq_f32(fVec); 246cb93a386Sopenharmony_ci #else 247cb93a386Sopenharmony_ci SkNx min = Min(*this, vrev64q_f32(fVec)); 248cb93a386Sopenharmony_ci return std::min(min[0], min[2]); 249cb93a386Sopenharmony_ci #endif 250cb93a386Sopenharmony_ci } 251cb93a386Sopenharmony_ci 252cb93a386Sopenharmony_ci AI float max() const { 253cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 254cb93a386Sopenharmony_ci return vmaxvq_f32(fVec); 255cb93a386Sopenharmony_ci #else 256cb93a386Sopenharmony_ci SkNx max = Max(*this, vrev64q_f32(fVec)); 257cb93a386Sopenharmony_ci return std::max(max[0], max[2]); 258cb93a386Sopenharmony_ci #endif 259cb93a386Sopenharmony_ci } 260cb93a386Sopenharmony_ci 261cb93a386Sopenharmony_ci AI bool allTrue() const { 262cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 263cb93a386Sopenharmony_ci return 0 != vminvq_u32(vreinterpretq_u32_f32(fVec)); 264cb93a386Sopenharmony_ci #else 265cb93a386Sopenharmony_ci auto v = vreinterpretq_u32_f32(fVec); 266cb93a386Sopenharmony_ci return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) 267cb93a386Sopenharmony_ci && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); 268cb93a386Sopenharmony_ci #endif 269cb93a386Sopenharmony_ci } 270cb93a386Sopenharmony_ci AI bool anyTrue() const { 271cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 272cb93a386Sopenharmony_ci return 0 != vmaxvq_u32(vreinterpretq_u32_f32(fVec)); 273cb93a386Sopenharmony_ci #else 274cb93a386Sopenharmony_ci auto v = vreinterpretq_u32_f32(fVec); 275cb93a386Sopenharmony_ci return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) 276cb93a386Sopenharmony_ci || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); 277cb93a386Sopenharmony_ci #endif 278cb93a386Sopenharmony_ci } 279cb93a386Sopenharmony_ci 280cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 281cb93a386Sopenharmony_ci return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec); 282cb93a386Sopenharmony_ci } 283cb93a386Sopenharmony_ci 284cb93a386Sopenharmony_ci float32x4_t fVec; 285cb93a386Sopenharmony_ci}; 286cb93a386Sopenharmony_ci 287cb93a386Sopenharmony_ci#if defined(SK_CPU_ARM64) 288cb93a386Sopenharmony_ci AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) { 289cb93a386Sopenharmony_ci return vfmaq_f32(a.fVec, f.fVec, m.fVec); 290cb93a386Sopenharmony_ci } 291cb93a386Sopenharmony_ci#endif 292cb93a386Sopenharmony_ci 293cb93a386Sopenharmony_ci// It's possible that for our current use cases, representing this as 294cb93a386Sopenharmony_ci// half a uint16x8_t might be better than representing it as a uint16x4_t. 295cb93a386Sopenharmony_ci// It'd make conversion to Sk4b one step simpler. 296cb93a386Sopenharmony_citemplate <> 297cb93a386Sopenharmony_ciclass SkNx<4, uint16_t> { 298cb93a386Sopenharmony_cipublic: 299cb93a386Sopenharmony_ci AI SkNx(const uint16x4_t& vec) : fVec(vec) {} 300cb93a386Sopenharmony_ci 301cb93a386Sopenharmony_ci AI SkNx() {} 302cb93a386Sopenharmony_ci AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {} 303cb93a386Sopenharmony_ci AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { 304cb93a386Sopenharmony_ci fVec = (uint16x4_t) { a,b,c,d }; 305cb93a386Sopenharmony_ci } 306cb93a386Sopenharmony_ci 307cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } 308cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } 309cb93a386Sopenharmony_ci 310cb93a386Sopenharmony_ci AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) { 311cb93a386Sopenharmony_ci uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr); 312cb93a386Sopenharmony_ci *r = rgba.val[0]; 313cb93a386Sopenharmony_ci *g = rgba.val[1]; 314cb93a386Sopenharmony_ci *b = rgba.val[2]; 315cb93a386Sopenharmony_ci *a = rgba.val[3]; 316cb93a386Sopenharmony_ci } 317cb93a386Sopenharmony_ci AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) { 318cb93a386Sopenharmony_ci uint16x4x3_t rgba = vld3_u16((const uint16_t*)ptr); 319cb93a386Sopenharmony_ci *r = rgba.val[0]; 320cb93a386Sopenharmony_ci *g = rgba.val[1]; 321cb93a386Sopenharmony_ci *b = rgba.val[2]; 322cb93a386Sopenharmony_ci } 323cb93a386Sopenharmony_ci AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { 324cb93a386Sopenharmony_ci uint16x4x4_t rgba = {{ 325cb93a386Sopenharmony_ci r.fVec, 326cb93a386Sopenharmony_ci g.fVec, 327cb93a386Sopenharmony_ci b.fVec, 328cb93a386Sopenharmony_ci a.fVec, 329cb93a386Sopenharmony_ci }}; 330cb93a386Sopenharmony_ci vst4_u16((uint16_t*) dst, rgba); 331cb93a386Sopenharmony_ci } 332cb93a386Sopenharmony_ci 333cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } 334cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } 335cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } 336cb93a386Sopenharmony_ci AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); } 337cb93a386Sopenharmony_ci AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); } 338cb93a386Sopenharmony_ci 339cb93a386Sopenharmony_ci AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } 340cb93a386Sopenharmony_ci AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } 341cb93a386Sopenharmony_ci 342cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } 343cb93a386Sopenharmony_ci 344cb93a386Sopenharmony_ci AI uint16_t operator[](int k) const { 345cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 4); 346cb93a386Sopenharmony_ci union { uint16x4_t v; uint16_t us[4]; } pun = {fVec}; 347cb93a386Sopenharmony_ci return pun.us[k&3]; 348cb93a386Sopenharmony_ci } 349cb93a386Sopenharmony_ci 350cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 351cb93a386Sopenharmony_ci return vbsl_u16(fVec, t.fVec, e.fVec); 352cb93a386Sopenharmony_ci } 353cb93a386Sopenharmony_ci 354cb93a386Sopenharmony_ci uint16x4_t fVec; 355cb93a386Sopenharmony_ci}; 356cb93a386Sopenharmony_ci 357cb93a386Sopenharmony_citemplate <> 358cb93a386Sopenharmony_ciclass SkNx<8, uint16_t> { 359cb93a386Sopenharmony_cipublic: 360cb93a386Sopenharmony_ci AI SkNx(const uint16x8_t& vec) : fVec(vec) {} 361cb93a386Sopenharmony_ci 362cb93a386Sopenharmony_ci AI SkNx() {} 363cb93a386Sopenharmony_ci AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {} 364cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } 365cb93a386Sopenharmony_ci 366cb93a386Sopenharmony_ci AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, 367cb93a386Sopenharmony_ci uint16_t e, uint16_t f, uint16_t g, uint16_t h) { 368cb93a386Sopenharmony_ci fVec = (uint16x8_t) { a,b,c,d, e,f,g,h }; 369cb93a386Sopenharmony_ci } 370cb93a386Sopenharmony_ci 371cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } 372cb93a386Sopenharmony_ci 373cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } 374cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } 375cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } 376cb93a386Sopenharmony_ci AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); } 377cb93a386Sopenharmony_ci AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); } 378cb93a386Sopenharmony_ci 379cb93a386Sopenharmony_ci AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } 380cb93a386Sopenharmony_ci AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } 381cb93a386Sopenharmony_ci 382cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } 383cb93a386Sopenharmony_ci 384cb93a386Sopenharmony_ci AI uint16_t operator[](int k) const { 385cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 8); 386cb93a386Sopenharmony_ci union { uint16x8_t v; uint16_t us[8]; } pun = {fVec}; 387cb93a386Sopenharmony_ci return pun.us[k&7]; 388cb93a386Sopenharmony_ci } 389cb93a386Sopenharmony_ci 390cb93a386Sopenharmony_ci AI SkNx mulHi(const SkNx& m) const { 391cb93a386Sopenharmony_ci uint32x4_t hi = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec)); 392cb93a386Sopenharmony_ci uint32x4_t lo = vmull_u16( vget_low_u16(fVec), vget_low_u16(m.fVec)); 393cb93a386Sopenharmony_ci 394cb93a386Sopenharmony_ci return { vcombine_u16(vshrn_n_u32(lo,16), vshrn_n_u32(hi,16)) }; 395cb93a386Sopenharmony_ci } 396cb93a386Sopenharmony_ci 397cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 398cb93a386Sopenharmony_ci return vbslq_u16(fVec, t.fVec, e.fVec); 399cb93a386Sopenharmony_ci } 400cb93a386Sopenharmony_ci 401cb93a386Sopenharmony_ci uint16x8_t fVec; 402cb93a386Sopenharmony_ci}; 403cb93a386Sopenharmony_ci 404cb93a386Sopenharmony_citemplate <> 405cb93a386Sopenharmony_ciclass SkNx<4, uint8_t> { 406cb93a386Sopenharmony_cipublic: 407cb93a386Sopenharmony_ci typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t; 408cb93a386Sopenharmony_ci 409cb93a386Sopenharmony_ci AI SkNx(const uint8x8_t& vec) : fVec(vec) {} 410cb93a386Sopenharmony_ci 411cb93a386Sopenharmony_ci AI SkNx() {} 412cb93a386Sopenharmony_ci AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) { 413cb93a386Sopenharmony_ci fVec = (uint8x8_t){a,b,c,d, 0,0,0,0}; 414cb93a386Sopenharmony_ci } 415cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { 416cb93a386Sopenharmony_ci return (uint8x8_t)vld1_dup_u32((const unaligned_uint32_t*)ptr); 417cb93a386Sopenharmony_ci } 418cb93a386Sopenharmony_ci AI void store(void* ptr) const { 419cb93a386Sopenharmony_ci return vst1_lane_u32((unaligned_uint32_t*)ptr, (uint32x2_t)fVec, 0); 420cb93a386Sopenharmony_ci } 421cb93a386Sopenharmony_ci AI uint8_t operator[](int k) const { 422cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 4); 423cb93a386Sopenharmony_ci union { uint8x8_t v; uint8_t us[8]; } pun = {fVec}; 424cb93a386Sopenharmony_ci return pun.us[k&3]; 425cb93a386Sopenharmony_ci } 426cb93a386Sopenharmony_ci 427cb93a386Sopenharmony_ci // TODO as needed 428cb93a386Sopenharmony_ci 429cb93a386Sopenharmony_ci uint8x8_t fVec; 430cb93a386Sopenharmony_ci}; 431cb93a386Sopenharmony_ci 432cb93a386Sopenharmony_citemplate <> 433cb93a386Sopenharmony_ciclass SkNx<8, uint8_t> { 434cb93a386Sopenharmony_cipublic: 435cb93a386Sopenharmony_ci AI SkNx(const uint8x8_t& vec) : fVec(vec) {} 436cb93a386Sopenharmony_ci 437cb93a386Sopenharmony_ci AI SkNx() {} 438cb93a386Sopenharmony_ci AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {} 439cb93a386Sopenharmony_ci AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, 440cb93a386Sopenharmony_ci uint8_t e, uint8_t f, uint8_t g, uint8_t h) { 441cb93a386Sopenharmony_ci fVec = (uint8x8_t) { a,b,c,d, e,f,g,h }; 442cb93a386Sopenharmony_ci } 443cb93a386Sopenharmony_ci 444cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1_u8((const uint8_t*)ptr); } 445cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1_u8((uint8_t*)ptr, fVec); } 446cb93a386Sopenharmony_ci 447cb93a386Sopenharmony_ci AI uint8_t operator[](int k) const { 448cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 8); 449cb93a386Sopenharmony_ci union { uint8x8_t v; uint8_t us[8]; } pun = {fVec}; 450cb93a386Sopenharmony_ci return pun.us[k&7]; 451cb93a386Sopenharmony_ci } 452cb93a386Sopenharmony_ci 453cb93a386Sopenharmony_ci uint8x8_t fVec; 454cb93a386Sopenharmony_ci}; 455cb93a386Sopenharmony_ci 456cb93a386Sopenharmony_citemplate <> 457cb93a386Sopenharmony_ciclass SkNx<16, uint8_t> { 458cb93a386Sopenharmony_cipublic: 459cb93a386Sopenharmony_ci AI SkNx(const uint8x16_t& vec) : fVec(vec) {} 460cb93a386Sopenharmony_ci 461cb93a386Sopenharmony_ci AI SkNx() {} 462cb93a386Sopenharmony_ci AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} 463cb93a386Sopenharmony_ci AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, 464cb93a386Sopenharmony_ci uint8_t e, uint8_t f, uint8_t g, uint8_t h, 465cb93a386Sopenharmony_ci uint8_t i, uint8_t j, uint8_t k, uint8_t l, 466cb93a386Sopenharmony_ci uint8_t m, uint8_t n, uint8_t o, uint8_t p) { 467cb93a386Sopenharmony_ci fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; 468cb93a386Sopenharmony_ci } 469cb93a386Sopenharmony_ci 470cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); } 471cb93a386Sopenharmony_ci AI void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } 472cb93a386Sopenharmony_ci 473cb93a386Sopenharmony_ci AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } 474cb93a386Sopenharmony_ci 475cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } 476cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } 477cb93a386Sopenharmony_ci AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); } 478cb93a386Sopenharmony_ci 479cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); } 480cb93a386Sopenharmony_ci AI SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } 481cb93a386Sopenharmony_ci 482cb93a386Sopenharmony_ci AI uint8_t operator[](int k) const { 483cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 16); 484cb93a386Sopenharmony_ci union { uint8x16_t v; uint8_t us[16]; } pun = {fVec}; 485cb93a386Sopenharmony_ci return pun.us[k&15]; 486cb93a386Sopenharmony_ci } 487cb93a386Sopenharmony_ci 488cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 489cb93a386Sopenharmony_ci return vbslq_u8(fVec, t.fVec, e.fVec); 490cb93a386Sopenharmony_ci } 491cb93a386Sopenharmony_ci 492cb93a386Sopenharmony_ci uint8x16_t fVec; 493cb93a386Sopenharmony_ci}; 494cb93a386Sopenharmony_ci 495cb93a386Sopenharmony_citemplate <> 496cb93a386Sopenharmony_ciclass SkNx<4, int32_t> { 497cb93a386Sopenharmony_cipublic: 498cb93a386Sopenharmony_ci AI SkNx(const int32x4_t& vec) : fVec(vec) {} 499cb93a386Sopenharmony_ci 500cb93a386Sopenharmony_ci AI SkNx() {} 501cb93a386Sopenharmony_ci AI SkNx(int32_t v) { 502cb93a386Sopenharmony_ci fVec = vdupq_n_s32(v); 503cb93a386Sopenharmony_ci } 504cb93a386Sopenharmony_ci AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) { 505cb93a386Sopenharmony_ci fVec = (int32x4_t){a,b,c,d}; 506cb93a386Sopenharmony_ci } 507cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { 508cb93a386Sopenharmony_ci return vld1q_s32((const int32_t*)ptr); 509cb93a386Sopenharmony_ci } 510cb93a386Sopenharmony_ci AI void store(void* ptr) const { 511cb93a386Sopenharmony_ci return vst1q_s32((int32_t*)ptr, fVec); 512cb93a386Sopenharmony_ci } 513cb93a386Sopenharmony_ci AI int32_t operator[](int k) const { 514cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 4); 515cb93a386Sopenharmony_ci union { int32x4_t v; int32_t is[4]; } pun = {fVec}; 516cb93a386Sopenharmony_ci return pun.is[k&3]; 517cb93a386Sopenharmony_ci } 518cb93a386Sopenharmony_ci 519cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } 520cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } 521cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } 522cb93a386Sopenharmony_ci 523cb93a386Sopenharmony_ci AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); } 524cb93a386Sopenharmony_ci AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); } 525cb93a386Sopenharmony_ci AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); } 526cb93a386Sopenharmony_ci 527cb93a386Sopenharmony_ci AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } 528cb93a386Sopenharmony_ci AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } 529cb93a386Sopenharmony_ci 530cb93a386Sopenharmony_ci AI SkNx operator == (const SkNx& o) const { 531cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec)); 532cb93a386Sopenharmony_ci } 533cb93a386Sopenharmony_ci AI SkNx operator < (const SkNx& o) const { 534cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec)); 535cb93a386Sopenharmony_ci } 536cb93a386Sopenharmony_ci AI SkNx operator > (const SkNx& o) const { 537cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec)); 538cb93a386Sopenharmony_ci } 539cb93a386Sopenharmony_ci 540cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } 541cb93a386Sopenharmony_ci AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); } 542cb93a386Sopenharmony_ci // TODO as needed 543cb93a386Sopenharmony_ci 544cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 545cb93a386Sopenharmony_ci return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec); 546cb93a386Sopenharmony_ci } 547cb93a386Sopenharmony_ci 548cb93a386Sopenharmony_ci AI SkNx abs() const { return vabsq_s32(fVec); } 549cb93a386Sopenharmony_ci 550cb93a386Sopenharmony_ci int32x4_t fVec; 551cb93a386Sopenharmony_ci}; 552cb93a386Sopenharmony_ci 553cb93a386Sopenharmony_citemplate <> 554cb93a386Sopenharmony_ciclass SkNx<4, uint32_t> { 555cb93a386Sopenharmony_cipublic: 556cb93a386Sopenharmony_ci AI SkNx(const uint32x4_t& vec) : fVec(vec) {} 557cb93a386Sopenharmony_ci 558cb93a386Sopenharmony_ci AI SkNx() {} 559cb93a386Sopenharmony_ci AI SkNx(uint32_t v) { 560cb93a386Sopenharmony_ci fVec = vdupq_n_u32(v); 561cb93a386Sopenharmony_ci } 562cb93a386Sopenharmony_ci AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 563cb93a386Sopenharmony_ci fVec = (uint32x4_t){a,b,c,d}; 564cb93a386Sopenharmony_ci } 565cb93a386Sopenharmony_ci AI static SkNx Load(const void* ptr) { 566cb93a386Sopenharmony_ci return vld1q_u32((const uint32_t*)ptr); 567cb93a386Sopenharmony_ci } 568cb93a386Sopenharmony_ci AI void store(void* ptr) const { 569cb93a386Sopenharmony_ci return vst1q_u32((uint32_t*)ptr, fVec); 570cb93a386Sopenharmony_ci } 571cb93a386Sopenharmony_ci AI uint32_t operator[](int k) const { 572cb93a386Sopenharmony_ci SkASSERT(0 <= k && k < 4); 573cb93a386Sopenharmony_ci union { uint32x4_t v; uint32_t us[4]; } pun = {fVec}; 574cb93a386Sopenharmony_ci return pun.us[k&3]; 575cb93a386Sopenharmony_ci } 576cb93a386Sopenharmony_ci 577cb93a386Sopenharmony_ci AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); } 578cb93a386Sopenharmony_ci AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); } 579cb93a386Sopenharmony_ci AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); } 580cb93a386Sopenharmony_ci 581cb93a386Sopenharmony_ci AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); } 582cb93a386Sopenharmony_ci AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); } 583cb93a386Sopenharmony_ci AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); } 584cb93a386Sopenharmony_ci 585cb93a386Sopenharmony_ci AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; } 586cb93a386Sopenharmony_ci AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; } 587cb93a386Sopenharmony_ci 588cb93a386Sopenharmony_ci AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); } 589cb93a386Sopenharmony_ci AI SkNx operator < (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); } 590cb93a386Sopenharmony_ci AI SkNx operator > (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); } 591cb93a386Sopenharmony_ci 592cb93a386Sopenharmony_ci AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); } 593cb93a386Sopenharmony_ci // TODO as needed 594cb93a386Sopenharmony_ci 595cb93a386Sopenharmony_ci AI SkNx mulHi(const SkNx& m) const { 596cb93a386Sopenharmony_ci uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec)); 597cb93a386Sopenharmony_ci uint64x2_t lo = vmull_u32( vget_low_u32(fVec), vget_low_u32(m.fVec)); 598cb93a386Sopenharmony_ci 599cb93a386Sopenharmony_ci return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) }; 600cb93a386Sopenharmony_ci } 601cb93a386Sopenharmony_ci 602cb93a386Sopenharmony_ci AI SkNx thenElse(const SkNx& t, const SkNx& e) const { 603cb93a386Sopenharmony_ci return vbslq_u32(fVec, t.fVec, e.fVec); 604cb93a386Sopenharmony_ci } 605cb93a386Sopenharmony_ci 606cb93a386Sopenharmony_ci uint32x4_t fVec; 607cb93a386Sopenharmony_ci}; 608cb93a386Sopenharmony_ci 609cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) { 610cb93a386Sopenharmony_ci return vcvtq_s32_f32(src.fVec); 611cb93a386Sopenharmony_ci 612cb93a386Sopenharmony_ci} 613cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { 614cb93a386Sopenharmony_ci return vcvtq_f32_s32(src.fVec); 615cb93a386Sopenharmony_ci} 616cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) { 617cb93a386Sopenharmony_ci return SkNx_cast<float>(Sk4i::Load(&src)); 618cb93a386Sopenharmony_ci} 619cb93a386Sopenharmony_ci 620cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { 621cb93a386Sopenharmony_ci return vqmovn_u32(vcvtq_u32_f32(src.fVec)); 622cb93a386Sopenharmony_ci} 623cb93a386Sopenharmony_ci 624cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { 625cb93a386Sopenharmony_ci return vcvtq_f32_u32(vmovl_u16(src.fVec)); 626cb93a386Sopenharmony_ci} 627cb93a386Sopenharmony_ci 628cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { 629cb93a386Sopenharmony_ci uint32x4_t _32 = vcvtq_u32_f32(src.fVec); 630cb93a386Sopenharmony_ci uint16x4_t _16 = vqmovn_u32(_32); 631cb93a386Sopenharmony_ci return vqmovn_u16(vcombine_u16(_16, _16)); 632cb93a386Sopenharmony_ci} 633cb93a386Sopenharmony_ci 634cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) { 635cb93a386Sopenharmony_ci uint16x8_t _16 = vmovl_u8(src.fVec); 636cb93a386Sopenharmony_ci return vmovl_u16(vget_low_u16(_16)); 637cb93a386Sopenharmony_ci} 638cb93a386Sopenharmony_ci 639cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) { 640cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec); 641cb93a386Sopenharmony_ci} 642cb93a386Sopenharmony_ci 643cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { 644cb93a386Sopenharmony_ci return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec); 645cb93a386Sopenharmony_ci} 646cb93a386Sopenharmony_ci 647cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) { 648cb93a386Sopenharmony_ci Sk8f ab, cd; 649cb93a386Sopenharmony_ci SkNx_split(src, &ab, &cd); 650cb93a386Sopenharmony_ci 651cb93a386Sopenharmony_ci Sk4f a,b,c,d; 652cb93a386Sopenharmony_ci SkNx_split(ab, &a, &b); 653cb93a386Sopenharmony_ci SkNx_split(cd, &c, &d); 654cb93a386Sopenharmony_ci return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), 655cb93a386Sopenharmony_ci (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], 656cb93a386Sopenharmony_ci vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), 657cb93a386Sopenharmony_ci (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]; 658cb93a386Sopenharmony_ci} 659cb93a386Sopenharmony_ci 660cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) { 661cb93a386Sopenharmony_ci Sk4i a, b; 662cb93a386Sopenharmony_ci SkNx_split(src, &a, &b); 663cb93a386Sopenharmony_ci uint16x4_t a16 = vqmovun_s32(a.fVec); 664cb93a386Sopenharmony_ci uint16x4_t b16 = vqmovun_s32(b.fVec); 665cb93a386Sopenharmony_ci 666cb93a386Sopenharmony_ci return vqmovn_u16(vcombine_u16(a16, b16)); 667cb93a386Sopenharmony_ci} 668cb93a386Sopenharmony_ci 669cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { 670cb93a386Sopenharmony_ci return vget_low_u16(vmovl_u8(src.fVec)); 671cb93a386Sopenharmony_ci} 672cb93a386Sopenharmony_ci 673cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) { 674cb93a386Sopenharmony_ci return vmovl_u8(src.fVec); 675cb93a386Sopenharmony_ci} 676cb93a386Sopenharmony_ci 677cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { 678cb93a386Sopenharmony_ci return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); 679cb93a386Sopenharmony_ci} 680cb93a386Sopenharmony_ci 681cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) { 682cb93a386Sopenharmony_ci return vqmovn_u16(src.fVec); 683cb93a386Sopenharmony_ci} 684cb93a386Sopenharmony_ci 685cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { 686cb93a386Sopenharmony_ci uint16x4_t _16 = vqmovun_s32(src.fVec); 687cb93a386Sopenharmony_ci return vqmovn_u16(vcombine_u16(_16, _16)); 688cb93a386Sopenharmony_ci} 689cb93a386Sopenharmony_ci 690cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { 691cb93a386Sopenharmony_ci uint16x4_t _16 = vqmovn_u32(src.fVec); 692cb93a386Sopenharmony_ci return vqmovn_u16(vcombine_u16(_16, _16)); 693cb93a386Sopenharmony_ci} 694cb93a386Sopenharmony_ci 695cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) { 696cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(vmovl_u16(src.fVec)); 697cb93a386Sopenharmony_ci} 698cb93a386Sopenharmony_ci 699cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) { 700cb93a386Sopenharmony_ci return vmovn_u32(vreinterpretq_u32_s32(src.fVec)); 701cb93a386Sopenharmony_ci} 702cb93a386Sopenharmony_ci 703cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) { 704cb93a386Sopenharmony_ci return vreinterpretq_s32_u32(src.fVec); 705cb93a386Sopenharmony_ci} 706cb93a386Sopenharmony_ci 707cb93a386Sopenharmony_ciAI static Sk4i Sk4f_round(const Sk4f& x) { 708cb93a386Sopenharmony_ci return vcvtq_s32_f32((x + 0.5f).fVec); 709cb93a386Sopenharmony_ci} 710cb93a386Sopenharmony_ci 711cb93a386Sopenharmony_ci} // namespace 712cb93a386Sopenharmony_ci 713cb93a386Sopenharmony_ci#endif//SkNx_neon_DEFINED 714