1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2015 Google Inc.
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci
8cb93a386Sopenharmony_ci#ifndef SkNx_neon_DEFINED
9cb93a386Sopenharmony_ci#define SkNx_neon_DEFINED
10cb93a386Sopenharmony_ci
11cb93a386Sopenharmony_ci#include <arm_neon.h>
12cb93a386Sopenharmony_ci
13cb93a386Sopenharmony_cinamespace {  // NOLINT(google-build-namespaces)
14cb93a386Sopenharmony_ci
// ARMv8 has vrndm(q)_f32 to floor floats.  Here we emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
// Floors four packed floats without using the ARMv8 vrndmq_f32 instruction.
AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {
    // Truncate toward zero by converting to int32 and back to float.
    const float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(v));
    // All-ones in every lane where truncation landed above the input (possible for negatives).
    const uint32x4_t overshot = vcgtq_f32(truncated, v);
    // Masking the bit pattern of 1.0f yields 1.0f in overshot lanes and 0.0f elsewhere.
    const uint32x4_t oneBits = vreinterpretq_u32_f32(vdupq_n_f32(1));
    const float32x4_t adjust = vreinterpretq_f32_u32(vandq_u32(overshot, oneBits));
    return vsubq_f32(truncated, adjust);
}
// Floors two packed floats without using the ARMv8 vrndm_f32 instruction.
AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {
    // Truncate toward zero by converting to int32 and back to float.
    const float32x2_t truncated = vcvt_f32_s32(vcvt_s32_f32(v));
    // All-ones in every lane where truncation landed above the input (possible for negatives).
    const uint32x2_t overshot = vcgt_f32(truncated, v);
    // Masking the bit pattern of 1.0f yields 1.0f in overshot lanes and 0.0f elsewhere.
    const uint32x2_t oneBits = vreinterpret_u32_f32(vdup_n_f32(1));
    const float32x2_t adjust = vreinterpret_f32_u32(vand_u32(overshot, oneBits));
    return vsub_f32(truncated, adjust);
}
29cb93a386Sopenharmony_ci
// SkNx specialization holding two floats in a NEON float32x2_t.
// Comparison operators return the per-lane masks produced by the vc* intrinsics, reinterpreted
// as floats; allTrue(), anyTrue() and thenElse() read those bits back as uint32 lanes.
template <>
class SkNx<2, float> {
public:
    AI SkNx(float32x2_t vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to both lanes.
    AI SkNx(float val) : fVec(vdup_n_f32(val)) {}
    AI SkNx(float a, float b) {
        const float32x2_t lanes = { a, b };
        fVec = lanes;
    }

    // Reads / writes two consecutive floats at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_f32(static_cast<const float*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_f32(static_cast<float*>(ptr), fVec);
    }

    // De-interleaving load of four floats: x0 y0 x1 y1 -> *x = {x0,x1}, *y = {y0,y1}.
    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        const float32x2x2_t planes = vld2_f32(static_cast<const float*>(ptr));
        *x = planes.val[0];
        *y = planes.val[1];
    }

    // Interleaving stores: lane i of every argument is written together, in argument order.
    AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
        float32x2x2_t planes;
        planes.val[0] = a.fVec;
        planes.val[1] = b.fVec;
        vst2_f32(static_cast<float*>(dst), planes);
    }

    AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
        float32x2x3_t planes;
        planes.val[0] = a.fVec;
        planes.val[1] = b.fVec;
        planes.val[2] = c.fVec;
        vst3_f32(static_cast<float*>(dst), planes);
    }

    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
        float32x2x4_t planes;
        planes.val[0] = a.fVec;
        planes.val[1] = b.fVec;
        planes.val[2] = c.fVec;
        planes.val[3] = d.fVec;
        vst4_f32(static_cast<float*>(dst), planes);
    }

    AI SkNx operator - () const { return vneg_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const {
    #ifdef SK_CPU_ARM64
        return vdiv_f32(fVec, o.fVec);
    #else
        // No divide instruction here: start from the reciprocal estimate of o, refine it with
        // two vrecps steps, then multiply by the numerator.
        float32x2_t recip = vrecpe_f32(o.fVec);
        recip = vmul_f32(vrecps_f32(recip, o.fVec), recip);
        recip = vmul_f32(vrecps_f32(recip, o.fVec), recip);
        return vmul_f32(fVec, recip);
    #endif
    }

    AI SkNx operator==(const SkNx& o) const {
        const uint32x2_t mask = vceq_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator <(const SkNx& o) const {
        const uint32x2_t mask = vclt_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator >(const SkNx& o) const {
        const uint32x2_t mask = vcgt_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator<=(const SkNx& o) const {
        const uint32x2_t mask = vcle_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator>=(const SkNx& o) const {
        const uint32x2_t mask = vcge_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(mask);
    }
    AI SkNx operator!=(const SkNx& o) const {
        // Bitwise complement of the equality mask.
        const uint32x2_t equal = vceq_f32(fVec, o.fVec);
        return vreinterpret_f32_u32(vmvn_u32(equal));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabs_f32(fVec); }
    AI SkNx floor() const {
    #ifdef SK_CPU_ARM64
        return vrndm_f32(fVec);
    #else
        return emulate_vrndm_f32(fVec);
    #endif
    }

    AI SkNx sqrt() const {
    #ifdef SK_CPU_ARM64
        return vsqrt_f32(fVec);
    #else
        // Reciprocal-square-root estimate refined by two vrsqrts steps; multiplying by the
        // input turns 1/sqrt(x) into sqrt(x).
        float32x2_t rsqrt = vrsqrte_f32(fVec);
        rsqrt = vmul_f32(vrsqrts_f32(fVec, vmul_f32(rsqrt, rsqrt)), rsqrt);
        rsqrt = vmul_f32(vrsqrts_f32(fVec, vmul_f32(rsqrt, rsqrt)), rsqrt);
        return vmul_f32(fVec, rsqrt);
    #endif
    }

    // Returns lane k; k is asserted to be in [0, 2) and masked into range.
    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { float32x2_t vec; float lane[2]; } view = {fVec};
        return view.lane[k & 1];
    }

    // True when every lane has at least one bit set.
    AI bool allTrue() const {
    #ifdef SK_CPU_ARM64
        return vminv_u32(vreinterpret_u32_f32(fVec)) != 0;
    #else
        const uint32x2_t bits = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(bits, 0) != 0 && vget_lane_u32(bits, 1) != 0;
    #endif
    }
    // True when any lane has at least one bit set.
    AI bool anyTrue() const {
    #ifdef SK_CPU_ARM64
        return vmaxv_u32(vreinterpret_u32_f32(fVec)) != 0;
    #else
        const uint32x2_t bits = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(bits, 0) != 0 || vget_lane_u32(bits, 1) != 0;
    #endif
    }

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x2_t selector = vreinterpret_u32_f32(fVec);
        return vbsl_f32(selector, t.fVec, e.fVec);
    }

    float32x2_t fVec;
};
152cb93a386Sopenharmony_ci
// SkNx specialization holding four floats in a NEON float32x4_t.
// Comparison operators return the per-lane masks produced by the vc* intrinsics, reinterpreted
// as floats; allTrue(), anyTrue() and thenElse() read those bits back as uint32 lanes.
template <>
class SkNx<4, float> {
public:
    AI SkNx(float32x4_t vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to all four lanes.
    AI SkNx(float val) : fVec(vdupq_n_f32(val)) {}
    AI SkNx(float a, float b, float c, float d) {
        const float32x4_t lanes = { a, b, c, d };
        fVec = lanes;
    }

    // Reads / writes four consecutive floats at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_f32(static_cast<const float*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_f32(static_cast<float*>(ptr), fVec);
    }

    // De-interleaving load of eight floats into two vectors (even elements, odd elements).
    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        const float32x4x2_t planes = vld2q_f32(static_cast<const float*>(ptr));
        *x = planes.val[0];
        *y = planes.val[1];
    }

    // De-interleaving load of sixteen floats into four vectors, one per channel.
    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        const float32x4x4_t planes = vld4q_f32(static_cast<const float*>(ptr));
        *r = planes.val[0];
        *g = planes.val[1];
        *b = planes.val[2];
        *a = planes.val[3];
    }
    // Interleaving store: lane i of r, g, b, a is written together, in that order.
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        float32x4x4_t planes;
        planes.val[0] = r.fVec;
        planes.val[1] = g.fVec;
        planes.val[2] = b.fVec;
        planes.val[3] = a.fVec;
        vst4q_f32(static_cast<float*>(dst), planes);
    }

    AI SkNx operator - () const { return vnegq_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const {
    #ifdef SK_CPU_ARM64
        return vdivq_f32(fVec, o.fVec);
    #else
        // No divide instruction here: start from the reciprocal estimate of o, refine it with
        // two vrecps steps, then multiply by the numerator.
        float32x4_t recip = vrecpeq_f32(o.fVec);
        recip = vmulq_f32(vrecpsq_f32(recip, o.fVec), recip);
        recip = vmulq_f32(vrecpsq_f32(recip, o.fVec), recip);
        return vmulq_f32(fVec, recip);
    #endif
    }

    AI SkNx operator==(const SkNx& o) const {
        const uint32x4_t mask = vceqq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator <(const SkNx& o) const {
        const uint32x4_t mask = vcltq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator >(const SkNx& o) const {
        const uint32x4_t mask = vcgtq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator<=(const SkNx& o) const {
        const uint32x4_t mask = vcleq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator>=(const SkNx& o) const {
        const uint32x4_t mask = vcgeq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(mask);
    }
    AI SkNx operator!=(const SkNx& o) const {
        // Bitwise complement of the equality mask.
        const uint32x4_t equal = vceqq_f32(fVec, o.fVec);
        return vreinterpretq_f32_u32(vmvnq_u32(equal));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabsq_f32(fVec); }
    AI SkNx floor() const {
    #ifdef SK_CPU_ARM64
        return vrndmq_f32(fVec);
    #else
        return emulate_vrndmq_f32(fVec);
    #endif
    }

    AI SkNx sqrt() const {
    #ifdef SK_CPU_ARM64
        return vsqrtq_f32(fVec);
    #else
        // Reciprocal-square-root estimate refined by two vrsqrts steps; multiplying by the
        // input turns 1/sqrt(x) into sqrt(x).
        float32x4_t rsqrt = vrsqrteq_f32(fVec);
        rsqrt = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(rsqrt, rsqrt)), rsqrt);
        rsqrt = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(rsqrt, rsqrt)), rsqrt);
        return vmulq_f32(fVec, rsqrt);
    #endif
    }

    // Returns lane k; k is asserted to be in [0, 4) and masked into range.
    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { float32x4_t vec; float lane[4]; } view = {fVec};
        return view.lane[k & 3];
    }

    // Smallest of the four lanes.
    AI float min() const {
    #ifdef SK_CPU_ARM64
        return vminvq_f32(fVec);
    #else
        // Swapping neighbours inside each 64-bit half reduces every pair; lanes 0 and 2 then
        // hold the two pair results.
        const SkNx pairwise = Min(*this, vrev64q_f32(fVec));
        return std::min(pairwise[0], pairwise[2]);
    #endif
    }

    // Largest of the four lanes.
    AI float max() const {
    #ifdef SK_CPU_ARM64
        return vmaxvq_f32(fVec);
    #else
        // Swapping neighbours inside each 64-bit half reduces every pair; lanes 0 and 2 then
        // hold the two pair results.
        const SkNx pairwise = Max(*this, vrev64q_f32(fVec));
        return std::max(pairwise[0], pairwise[2]);
    #endif
    }

    // True when every lane has at least one bit set.
    AI bool allTrue() const {
    #ifdef SK_CPU_ARM64
        return vminvq_u32(vreinterpretq_u32_f32(fVec)) != 0;
    #else
        const uint32x4_t bits = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(bits, 0) != 0 && vgetq_lane_u32(bits, 1) != 0
            && vgetq_lane_u32(bits, 2) != 0 && vgetq_lane_u32(bits, 3) != 0;
    #endif
    }
    // True when any lane has at least one bit set.
    AI bool anyTrue() const {
    #ifdef SK_CPU_ARM64
        return vmaxvq_u32(vreinterpretq_u32_f32(fVec)) != 0;
    #else
        const uint32x4_t bits = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(bits, 0) != 0 || vgetq_lane_u32(bits, 1) != 0
            || vgetq_lane_u32(bits, 2) != 0 || vgetq_lane_u32(bits, 3) != 0;
    #endif
    }

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x4_t selector = vreinterpretq_u32_f32(fVec);
        return vbslq_f32(selector, t.fVec, e.fVec);
    }

    float32x4_t fVec;
};
286cb93a386Sopenharmony_ci
#ifdef SK_CPU_ARM64
    // Per-lane fused multiply-add, f*m + a, via vfmaq_f32 (only provided on ARM64 builds).
    AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
        const float32x4_t fused = vfmaq_f32(a.fVec, f.fVec, m.fVec);
        return fused;
    }
#endif
292cb93a386Sopenharmony_ci
293cb93a386Sopenharmony_ci// It's possible that for our current use cases, representing this as
294cb93a386Sopenharmony_ci// half a uint16x8_t might be better than representing it as a uint16x4_t.
295cb93a386Sopenharmony_ci// It'd make conversion to Sk4b one step simpler.
// SkNx specialization holding four uint16_t lanes in a NEON uint16x4_t.
template <>
class SkNx<4, uint16_t> {
public:
    AI SkNx(const uint16x4_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to all four lanes.
    AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
        const uint16x4_t lanes = { a, b, c, d };
        fVec = lanes;
    }

    // Reads / writes four consecutive uint16_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_u16(static_cast<const uint16_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_u16(static_cast<uint16_t*>(ptr), fVec);
    }

    // De-interleaving load of sixteen values into four vectors, one per channel.
    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        const uint16x4x4_t planes = vld4_u16(static_cast<const uint16_t*>(ptr));
        *r = planes.val[0];
        *g = planes.val[1];
        *b = planes.val[2];
        *a = planes.val[3];
    }
    // De-interleaving load of twelve values into three vectors, one per channel.
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        const uint16x4x3_t rgb = vld3_u16(static_cast<const uint16_t*>(ptr));
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    // Interleaving store: lane i of r, g, b, a is written together, in that order.
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        uint16x4x4_t planes;
        planes.val[0] = r.fVec;
        planes.val[1] = g.fVec;
        planes.val[2] = b.fVec;
        planes.val[3] = a.fVec;
        vst4_u16(static_cast<uint16_t*>(dst), planes);
    }

    AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); }

    // Shift every lane by the same amount: bits is broadcast, then the vector shift applies.
    AI SkNx operator << (int bits) const {
        const SkNx amount(bits);
        return fVec << amount.fVec;
    }
    AI SkNx operator >> (int bits) const {
        const SkNx amount(bits);
        return fVec >> amount.fVec;
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }

    // Returns lane k; k is asserted to be in [0, 4) and masked into range.
    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint16x4_t vec; uint16_t lane[4]; } view = {fVec};
        return view.lane[k & 3];
    }

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint16x4_t& selector = fVec;
        return vbsl_u16(selector, t.fVec, e.fVec);
    }

    uint16x4_t fVec;
};
356cb93a386Sopenharmony_ci
// SkNx specialization holding eight uint16_t lanes in a NEON uint16x8_t.
template <>
class SkNx<8, uint16_t> {
public:
    AI SkNx(const uint16x8_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to all eight lanes.
    AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
            uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
        const uint16x8_t lanes = { a, b, c, d, e, f, g, h };
        fVec = lanes;
    }

    // Reads / writes eight consecutive uint16_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_u16(static_cast<const uint16_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_u16(static_cast<uint16_t*>(ptr), fVec);
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); }

    // Shift every lane by the same amount: bits is broadcast, then the vector shift applies.
    AI SkNx operator << (int bits) const {
        const SkNx amount(bits);
        return fVec << amount.fVec;
    }
    AI SkNx operator >> (int bits) const {
        const SkNx amount(bits);
        return fVec >> amount.fVec;
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }

    // Returns lane k; k is asserted to be in [0, 8) and masked into range.
    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint16x8_t vec; uint16_t lane[8]; } view = {fVec};
        return view.lane[k & 7];
    }

    // Per lane, the upper 16 bits of the 32-bit product of this lane and m's lane.
    AI SkNx mulHi(const SkNx& m) const {
        // Widening multiplies of the upper and lower four lanes.
        const uint32x4_t wideHi = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec));
        const uint32x4_t wideLo = vmull_u16(vget_low_u16(fVec), vget_low_u16(m.fVec));

        // Narrowing shift keeps the top half of each product; recombine low lanes first.
        const uint16x4_t topLo = vshrn_n_u32(wideLo, 16);
        const uint16x4_t topHi = vshrn_n_u32(wideHi, 16);
        return { vcombine_u16(topLo, topHi) };
    }

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint16x8_t& selector = fVec;
        return vbslq_u16(selector, t.fVec, e.fVec);
    }

    uint16x8_t fVec;
};
403cb93a386Sopenharmony_ci
// SkNx specialization for four uint8_t values, stored in the low four lanes of a uint8x8_t.
template <>
class SkNx<4, uint8_t> {
public:
    // 32-bit integer type with 1-byte alignment, used for the 4-byte load and store below.
    typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;

    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // The four unused upper lanes are zeroed.
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
        const uint8x8_t lanes = { a, b, c, d, 0, 0, 0, 0 };
        fVec = lanes;
    }
    // Loads 4 bytes as one 32-bit value duplicated into both halves of the register.
    AI static SkNx Load(const void* ptr) {
        const uint32x2_t word = vld1_dup_u32(static_cast<const unaligned_uint32_t*>(ptr));
        return vreinterpret_u8_u32(word);
    }
    // Stores only the low 32 bits (the four meaningful lanes).
    AI void store(void* ptr) const {
        vst1_lane_u32(static_cast<unaligned_uint32_t*>(ptr), vreinterpret_u32_u8(fVec), 0);
    }
    // Returns lane k; k is asserted to be in [0, 4) and masked into range.
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint8x8_t vec; uint8_t lane[8]; } view = {fVec};
        return view.lane[k & 3];
    }

    // TODO as needed

    uint8x8_t fVec;
};
431cb93a386Sopenharmony_ci
// SkNx specialization holding eight uint8_t lanes in a NEON uint8x8_t.
template <>
class SkNx<8, uint8_t> {
public:
    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to all eight lanes.
    AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h) {
        const uint8x8_t lanes = { a, b, c, d, e, f, g, h };
        fVec = lanes;
    }

    // Reads / writes eight consecutive bytes at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1_u8(static_cast<const uint8_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1_u8(static_cast<uint8_t*>(ptr), fVec);
    }

    // Returns lane k; k is asserted to be in [0, 8) and masked into range.
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint8x8_t vec; uint8_t lane[8]; } view = {fVec};
        return view.lane[k & 7];
    }

    uint8x8_t fVec;
};
455cb93a386Sopenharmony_ci
// SkNx specialization holding sixteen uint8_t lanes in a NEON uint8x16_t.
template <>
class SkNx<16, uint8_t> {
public:
    AI SkNx(const uint8x16_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts val to all sixteen lanes.
    AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
            uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
        const uint8x16_t lanes = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p };
        fVec = lanes;
    }

    // Reads / writes sixteen consecutive bytes at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_u8(static_cast<const uint8_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_u8(static_cast<uint8_t*>(ptr), fVec);
    }

    // Per-lane addition that saturates instead of wrapping.
    AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
    // Returns the per-lane mask produced by vcltq_u8.
    AI SkNx operator < (const SkNx& o) const {
        const uint8x16_t mask = vcltq_u8(fVec, o.fVec);
        return mask;
    }

    // Returns lane k; k is asserted to be in [0, 16) and masked into range.
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { uint8x16_t vec; uint8_t lane[16]; } view = {fVec};
        return view.lane[k & 15];
    }

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint8x16_t& selector = fVec;
        return vbslq_u8(selector, t.fVec, e.fVec);
    }

    uint8x16_t fVec;
};
494cb93a386Sopenharmony_ci
// SkNx specialization holding four int32_t lanes in a NEON int32x4_t.
// Comparison operators return the per-lane masks produced by the vc* intrinsics,
// reinterpreted as signed lanes.
template <>
class SkNx<4, int32_t> {
public:
    AI SkNx(const int32x4_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    AI SkNx() {}
    // Broadcasts v to all four lanes.
    AI SkNx(int32_t v) : fVec(vdupq_n_s32(v)) {}
    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
        const int32x4_t lanes = { a, b, c, d };
        fVec = lanes;
    }
    // Reads / writes four consecutive int32_t values at ptr.
    AI static SkNx Load(const void* ptr) {
        return vld1q_s32(static_cast<const int32_t*>(ptr));
    }
    AI void store(void* ptr) const {
        vst1q_s32(static_cast<int32_t*>(ptr), fVec);
    }
    // Returns lane k; k is asserted to be in [0, 4) and masked into range.
    AI int32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { int32x4_t vec; int32_t lane[4]; } view = {fVec};
        return view.lane[k & 3];
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }

    // Shift every lane by the same amount: bits is broadcast, then the vector shift applies.
    AI SkNx operator << (int bits) const {
        const SkNx amount(bits);
        return fVec << amount.fVec;
    }
    AI SkNx operator >> (int bits) const {
        const SkNx amount(bits);
        return fVec >> amount.fVec;
    }

    AI SkNx operator == (const SkNx& o) const {
        const uint32x4_t mask = vceqq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }
    AI SkNx operator <  (const SkNx& o) const {
        const uint32x4_t mask = vcltq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }
    AI SkNx operator >  (const SkNx& o) const {
        const uint32x4_t mask = vcgtq_s32(fVec, o.fVec);
        return vreinterpretq_s32_u32(mask);
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
    AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); }
    // TODO as needed

    // Bitwise select: takes bits from t where this vector's bits are set, from e otherwise.
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        const uint32x4_t selector = vreinterpretq_u32_s32(fVec);
        return vbslq_s32(selector, t.fVec, e.fVec);
    }

    AI SkNx abs() const { return vabsq_s32(fVec); }

    int32x4_t fVec;
};
552cb93a386Sopenharmony_ci
553cb93a386Sopenharmony_citemplate <>
554cb93a386Sopenharmony_ciclass SkNx<4, uint32_t> {
555cb93a386Sopenharmony_cipublic:
556cb93a386Sopenharmony_ci    AI SkNx(const uint32x4_t& vec) : fVec(vec) {}
557cb93a386Sopenharmony_ci
558cb93a386Sopenharmony_ci    AI SkNx() {}
559cb93a386Sopenharmony_ci    AI SkNx(uint32_t v) {
560cb93a386Sopenharmony_ci        fVec = vdupq_n_u32(v);
561cb93a386Sopenharmony_ci    }
562cb93a386Sopenharmony_ci    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
563cb93a386Sopenharmony_ci        fVec = (uint32x4_t){a,b,c,d};
564cb93a386Sopenharmony_ci    }
565cb93a386Sopenharmony_ci    AI static SkNx Load(const void* ptr) {
566cb93a386Sopenharmony_ci        return vld1q_u32((const uint32_t*)ptr);
567cb93a386Sopenharmony_ci    }
568cb93a386Sopenharmony_ci    AI void store(void* ptr) const {
569cb93a386Sopenharmony_ci        return vst1q_u32((uint32_t*)ptr, fVec);
570cb93a386Sopenharmony_ci    }
571cb93a386Sopenharmony_ci    AI uint32_t operator[](int k) const {
572cb93a386Sopenharmony_ci        SkASSERT(0 <= k && k < 4);
573cb93a386Sopenharmony_ci        union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
574cb93a386Sopenharmony_ci        return pun.us[k&3];
575cb93a386Sopenharmony_ci    }
576cb93a386Sopenharmony_ci
577cb93a386Sopenharmony_ci    AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
578cb93a386Sopenharmony_ci    AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
579cb93a386Sopenharmony_ci    AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }
580cb93a386Sopenharmony_ci
581cb93a386Sopenharmony_ci    AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
582cb93a386Sopenharmony_ci    AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
583cb93a386Sopenharmony_ci    AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }
584cb93a386Sopenharmony_ci
585cb93a386Sopenharmony_ci    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
586cb93a386Sopenharmony_ci    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }
587cb93a386Sopenharmony_ci
588cb93a386Sopenharmony_ci    AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
589cb93a386Sopenharmony_ci    AI SkNx operator <  (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
590cb93a386Sopenharmony_ci    AI SkNx operator >  (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }
591cb93a386Sopenharmony_ci
592cb93a386Sopenharmony_ci    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
593cb93a386Sopenharmony_ci    // TODO as needed
594cb93a386Sopenharmony_ci
595cb93a386Sopenharmony_ci    AI SkNx mulHi(const SkNx& m) const {
596cb93a386Sopenharmony_ci        uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
597cb93a386Sopenharmony_ci        uint64x2_t lo = vmull_u32( vget_low_u32(fVec),  vget_low_u32(m.fVec));
598cb93a386Sopenharmony_ci
599cb93a386Sopenharmony_ci        return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) };
600cb93a386Sopenharmony_ci    }
601cb93a386Sopenharmony_ci
602cb93a386Sopenharmony_ci    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
603cb93a386Sopenharmony_ci        return vbslq_u32(fVec, t.fVec, e.fVec);
604cb93a386Sopenharmony_ci    }
605cb93a386Sopenharmony_ci
606cb93a386Sopenharmony_ci    uint32x4_t fVec;
607cb93a386Sopenharmony_ci};
608cb93a386Sopenharmony_ci
609cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
610cb93a386Sopenharmony_ci    return vcvtq_s32_f32(src.fVec);
611cb93a386Sopenharmony_ci
612cb93a386Sopenharmony_ci}
613cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
614cb93a386Sopenharmony_ci    return vcvtq_f32_s32(src.fVec);
615cb93a386Sopenharmony_ci}
616cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
617cb93a386Sopenharmony_ci    return SkNx_cast<float>(Sk4i::Load(&src));
618cb93a386Sopenharmony_ci}
619cb93a386Sopenharmony_ci
620cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
621cb93a386Sopenharmony_ci    return vqmovn_u32(vcvtq_u32_f32(src.fVec));
622cb93a386Sopenharmony_ci}
623cb93a386Sopenharmony_ci
624cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
625cb93a386Sopenharmony_ci    return vcvtq_f32_u32(vmovl_u16(src.fVec));
626cb93a386Sopenharmony_ci}
627cb93a386Sopenharmony_ci
628cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
629cb93a386Sopenharmony_ci    uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
630cb93a386Sopenharmony_ci    uint16x4_t _16 = vqmovn_u32(_32);
631cb93a386Sopenharmony_ci    return vqmovn_u16(vcombine_u16(_16, _16));
632cb93a386Sopenharmony_ci}
633cb93a386Sopenharmony_ci
634cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
635cb93a386Sopenharmony_ci    uint16x8_t _16 = vmovl_u8(src.fVec);
636cb93a386Sopenharmony_ci    return vmovl_u16(vget_low_u16(_16));
637cb93a386Sopenharmony_ci}
638cb93a386Sopenharmony_ci
639cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
640cb93a386Sopenharmony_ci    return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec);
641cb93a386Sopenharmony_ci}
642cb93a386Sopenharmony_ci
643cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
644cb93a386Sopenharmony_ci    return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec);
645cb93a386Sopenharmony_ci}
646cb93a386Sopenharmony_ci
647cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
648cb93a386Sopenharmony_ci    Sk8f ab, cd;
649cb93a386Sopenharmony_ci    SkNx_split(src, &ab, &cd);
650cb93a386Sopenharmony_ci
651cb93a386Sopenharmony_ci    Sk4f a,b,c,d;
652cb93a386Sopenharmony_ci    SkNx_split(ab, &a, &b);
653cb93a386Sopenharmony_ci    SkNx_split(cd, &c, &d);
654cb93a386Sopenharmony_ci    return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
655cb93a386Sopenharmony_ci                             (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
656cb93a386Sopenharmony_ci                    vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
657cb93a386Sopenharmony_ci                             (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0];
658cb93a386Sopenharmony_ci}
659cb93a386Sopenharmony_ci
660cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
661cb93a386Sopenharmony_ci    Sk4i a, b;
662cb93a386Sopenharmony_ci    SkNx_split(src, &a, &b);
663cb93a386Sopenharmony_ci    uint16x4_t a16 = vqmovun_s32(a.fVec);
664cb93a386Sopenharmony_ci    uint16x4_t b16 = vqmovun_s32(b.fVec);
665cb93a386Sopenharmony_ci
666cb93a386Sopenharmony_ci    return vqmovn_u16(vcombine_u16(a16, b16));
667cb93a386Sopenharmony_ci}
668cb93a386Sopenharmony_ci
669cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
670cb93a386Sopenharmony_ci    return vget_low_u16(vmovl_u8(src.fVec));
671cb93a386Sopenharmony_ci}
672cb93a386Sopenharmony_ci
673cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
674cb93a386Sopenharmony_ci    return vmovl_u8(src.fVec);
675cb93a386Sopenharmony_ci}
676cb93a386Sopenharmony_ci
677cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
678cb93a386Sopenharmony_ci    return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
679cb93a386Sopenharmony_ci}
680cb93a386Sopenharmony_ci
681cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
682cb93a386Sopenharmony_ci    return vqmovn_u16(src.fVec);
683cb93a386Sopenharmony_ci}
684cb93a386Sopenharmony_ci
685cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
686cb93a386Sopenharmony_ci    uint16x4_t _16 = vqmovun_s32(src.fVec);
687cb93a386Sopenharmony_ci    return vqmovn_u16(vcombine_u16(_16, _16));
688cb93a386Sopenharmony_ci}
689cb93a386Sopenharmony_ci
690cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
691cb93a386Sopenharmony_ci    uint16x4_t _16 = vqmovn_u32(src.fVec);
692cb93a386Sopenharmony_ci    return vqmovn_u16(vcombine_u16(_16, _16));
693cb93a386Sopenharmony_ci}
694cb93a386Sopenharmony_ci
695cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
696cb93a386Sopenharmony_ci    return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
697cb93a386Sopenharmony_ci}
698cb93a386Sopenharmony_ci
699cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
700cb93a386Sopenharmony_ci    return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
701cb93a386Sopenharmony_ci}
702cb93a386Sopenharmony_ci
703cb93a386Sopenharmony_citemplate<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
704cb93a386Sopenharmony_ci    return vreinterpretq_s32_u32(src.fVec);
705cb93a386Sopenharmony_ci}
706cb93a386Sopenharmony_ci
707cb93a386Sopenharmony_ciAI static Sk4i Sk4f_round(const Sk4f& x) {
708cb93a386Sopenharmony_ci    return vcvtq_s32_f32((x + 0.5f).fVec);
709cb93a386Sopenharmony_ci}
710cb93a386Sopenharmony_ci
711cb93a386Sopenharmony_ci}  // namespace
712cb93a386Sopenharmony_ci
713cb93a386Sopenharmony_ci#endif//SkNx_neon_DEFINED
714