1 /* auto-generated on 2023-12-01 13:59:01 -0500. Do not edit! */
2 /* begin file src/simdutf.cpp */
3 #include "simdutf.h"
4 /* begin file src/implementation.cpp */
5 #include <initializer_list>
6 #include <climits>
7 
8 // Useful for debugging purposes
9 namespace simdutf {
10 namespace {
11 
/**
 * Renders the bits of an integral value as text, most significant bit first.
 *
 * Debugging helper. The returned string always holds sizeof(T) * CHAR_BIT
 * characters, each of them '0' or '1'.
 *
 * The bit position is walked with an index instead of a mask of type T: for a
 * signed T the mask T(1) << (width - 1) is negative, so a `mask > 0` loop
 * never executes and the function would return an empty string.
 *
 * @param b value whose bits are rendered
 * @return the binary representation of b
 */
template <typename T>
std::string toBinaryString(T b) {
  constexpr size_t bit_count = sizeof(T) * CHAR_BIT;
  std::string binary;
  binary.reserve(bit_count);
  for (size_t pos = bit_count; pos-- > 0;) {
    // For negative values the right shift is arithmetic on the supported
    // compilers (and required by C++20), so the lowest bit of the shifted
    // value is bit `pos` of b.
    binary += (((b >> pos) & 1) == 0) ? '0' : '1';
  }
  return binary;
}
22 }
23 }
24 
25 // Implementations
26 // The best choice should always come first!
27 /* begin file src/simdutf/arm64.h */
28 #ifndef SIMDUTF_ARM64_H
29 #define SIMDUTF_ARM64_H
30 
31 #ifdef SIMDUTF_FALLBACK_H
32 #error "arm64.h must be included before fallback.h"
33 #endif
34 
35 
36 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
37 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
38 #endif
// Parenthesized so the macro always expands as a single operand: without the
// parentheses `!SIMDUTF_CAN_ALWAYS_RUN_ARM64` would negate only the first term.
#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 (SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64)
40 
41 
42 
43 #if SIMDUTF_IMPLEMENTATION_ARM64
44 
45 namespace simdutf {
46 /**
47  * Implementation for NEON (ARMv8).
48  */
49 namespace arm64 {
50 } // namespace arm64
51 } // namespace simdutf
52 
53 /* begin file src/simdutf/arm64/implementation.h */
54 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
55 #define SIMDUTF_ARM64_IMPLEMENTATION_H
56 
57 
58 namespace simdutf {
59 namespace arm64 {
60 
61 namespace {
62 using namespace simdutf;
63 }
64 
65 class implementation final : public simdutf::implementation {
66 public:
implementation()67   simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
68   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
69   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
70   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
71   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
72   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
73   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
74   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
75   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
76   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
77   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
78   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
79   simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
80   simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
81   simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
82   simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
83   simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
84   simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
85   simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
86   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
87   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
88   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
89   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
90   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
91   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
92   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
93   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
94   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
95   simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
96   simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
97   simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
98   simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
99   simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
100   simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
101   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
102   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
103   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
104   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
105   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
106   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
107   simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
108   simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
109   simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
110   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
111   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
112   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
113   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
114   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
115   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
116   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
117   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
118   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
119   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
120   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
121   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
122   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
123   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
124   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
125   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
126   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
127   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
128   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
129   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
130   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
131   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
132   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
133   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
134   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
135   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
136   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
137   simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
138   simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
139   simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
140   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
141   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
142   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
143 
144 };
145 
146 } // namespace arm64
147 } // namespace simdutf
148 
149 #endif // SIMDUTF_ARM64_IMPLEMENTATION_H
150 /* end file src/simdutf/arm64/implementation.h */
151 
152 /* begin file src/simdutf/arm64/begin.h */
153 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
154 // #define SIMDUTF_IMPLEMENTATION arm64
155 /* end file src/simdutf/arm64/begin.h */
156 
157 // Declarations
158 /* begin file src/simdutf/arm64/intrinsics.h */
159 #ifndef SIMDUTF_ARM64_INTRINSICS_H
160 #define SIMDUTF_ARM64_INTRINSICS_H
161 
162 
163 // This should be the correct header whether
164 // you use visual studio or other compilers.
165 #include <arm_neon.h>
166 
167 #endif //  SIMDUTF_ARM64_INTRINSICS_H
168 /* end file src/simdutf/arm64/intrinsics.h */
169 /* begin file src/simdutf/arm64/bitmanipulation.h */
170 #ifndef SIMDUTF_ARM64_BITMANIPULATION_H
171 #define SIMDUTF_ARM64_BITMANIPULATION_H
172 
173 namespace simdutf {
174 namespace arm64 {
175 namespace {
176 
177 /* result might be undefined when input_num is zero */
count_ones(uint64_t input_num)178 simdutf_really_inline int count_ones(uint64_t input_num) {
179    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
180 }
181 
182 } // unnamed namespace
183 } // namespace arm64
184 } // namespace simdutf
185 
186 #endif // SIMDUTF_ARM64_BITMANIPULATION_H
187 /* end file src/simdutf/arm64/bitmanipulation.h */
188 /* begin file src/simdutf/arm64/simd.h */
189 #ifndef SIMDUTF_ARM64_SIMD_H
190 #define SIMDUTF_ARM64_SIMD_H
191 
192 #include <type_traits>
193 
194 
195 namespace simdutf {
196 namespace arm64 {
197 namespace {
198 namespace simd {
199 
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround
//
// Each simdutf_make_<type> macro builds a NEON vector from individual lane
// values: an immediately-invoked lambda copies the values into a local array
// and loads that array with the matching vld1 intrinsic. Elsewhere in this
// file the same vectors are written with brace initialization when
// SIMDUTF_REGULAR_VISUAL_STUDIO is not defined.
// NOTE(review): presumably the brace-initialized form is not accepted by that
// compiler - confirm before removing the workaround.
//
// The lambda captures by copy ([=]) and its array is named `array`, so a macro
// argument must not itself be an expression that refers to a variable called
// `array`.

// 16 x uint8_t -> uint8x16_t
#ifndef simdutf_make_uint8x16_t
#define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                             x13, x14, x15, x16)                                   \
   ([=]() {                                                                        \
     uint8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                    \
                                 x9, x10, x11, x12, x13, x14, x15, x16};           \
     return vld1q_u8(array);                                                       \
   }())
#endif
// 16 x int8_t -> int8x16_t
#ifndef simdutf_make_int8x16_t
#define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                             x13, x14, x15, x16)                                  \
   ([=]() {                                                                       \
     int8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                    \
                                 x9, x10, x11, x12, x13, x14, x15, x16};          \
     return vld1q_s8(array);                                                      \
   }())
#endif

// 8 x uint8_t -> uint8x8_t
#ifndef simdutf_make_uint8x8_t
#define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8)                \
   ([=]() {                                                                   \
     uint8_t array[8] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8};               \
     return vld1_u8(array);                                                   \
   }())
#endif
// 8 x int8_t -> int8x8_t
#ifndef simdutf_make_int8x8_t
#define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8)                 \
   ([=]() {                                                                   \
     int8_t array[8] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8};                \
     return vld1_s8(array);                                                   \
   }())
#endif
// 8 x uint16_t -> uint16x8_t
#ifndef simdutf_make_uint16x8_t
#define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8)                \
   ([=]() {                                                                    \
     uint16_t array[8] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8};               \
     return vld1q_u16(array);                                                  \
   }())
#endif
// 8 x int16_t -> int16x8_t
#ifndef simdutf_make_int16x8_t
#define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8)                 \
   ([=]() {                                                                    \
     int16_t array[8] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8};                \
     return vld1q_s16(array);                                                  \
   }())
#endif


// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
256 
257 
258   template<typename T>
259   struct simd8;
260 
261   //
262   // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
263   //
264   template<typename T, typename Mask=simd8<bool>>
265   struct base_u8 {
266     uint8x16_t value;
267     static const int SIZE = sizeof(value);
268 
269     // Conversion from/to SIMD register
base_u8simdutf::arm64::__anon13834::simd::base_u8270     simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
operator const uint8x16_t&simdutf::arm64::__anon13834::simd::base_u8271     simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
operator uint8x16_t&simdutf::arm64::__anon13834::simd::base_u8272     simdutf_really_inline operator uint8x16_t&() { return this->value; }
firstsimdutf::arm64::__anon13834::simd::base_u8273     simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
lastsimdutf::arm64::__anon13834::simd::base_u8274     simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
275 
276     // Bit operations
operator |simdutf::arm64::__anon13834::simd::base_u8277     simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
operator &simdutf::arm64::__anon13834::simd::base_u8278     simdutf_really_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
operator ^simdutf::arm64::__anon13834::simd::base_u8279     simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
bit_andnotsimdutf::arm64::__anon13834::simd::base_u8280     simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
operator ~simdutf::arm64::__anon13834::simd::base_u8281     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anon13834::simd::base_u8282     simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anon13834::simd::base_u8283     simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anon13834::simd::base_u8284     simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
285 
operator ==simdutf::arm64::__anon13834::simd::base_u8286     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
287 
288     template<int N=1>
prevsimdutf::arm64::__anon13834::simd::base_u8289     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
290       return vextq_u8(prev_chunk, *this, 16 - N);
291     }
292   };
293 
294   // SIMD byte mask type (returned by things like eq and gt)
295   template<>
296   struct simd8<bool>: base_u8<bool> {
297     typedef uint16_t bitmask_t;
298     typedef uint32_t bitmask2_t;
299 
splatsimdutf::arm64::__anon13834::simd::simd8300     static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
301 
simd8simdutf::arm64::__anon13834::simd::simd8302     simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
303     // False constructor
simd8simdutf::arm64::__anon13834::simd::simd8304     simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
305     // Splat constructor
simd8simdutf::arm64::__anon13834::simd::simd8306     simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
storesimdutf::arm64::__anon13834::simd::simd8307     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
308 
309     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
310     // purposes (cutting it down to uint16_t costs performance in some compilers).
to_bitmasksimdutf::arm64::__anon13834::simd::simd8311     simdutf_really_inline uint32_t to_bitmask() const {
312 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
313       const uint8x16_t bit_mask =  simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
314                                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
315 #else
316       const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
317                                     0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
318 #endif
319       auto minput = *this & bit_mask;
320       uint8x16_t tmp = vpaddq_u8(minput, minput);
321       tmp = vpaddq_u8(tmp, tmp);
322       tmp = vpaddq_u8(tmp, tmp);
323       return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
324     }
325 
326     // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
327     // result it is 64 bit.
328     // This method is expected to be faster than none() and is equivalent
329     // when the vector register is the result of a comparison, with byte
330     // values 0xff and 0x00.
to_bitmask64simdutf::arm64::__anon13834::simd::simd8331     simdutf_really_inline uint64_t to_bitmask64() const {
332       return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
333     }
334 
anysimdutf::arm64::__anon13834::simd::simd8335     simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
nonesimdutf::arm64::__anon13834::simd::simd8336     simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
allsimdutf::arm64::__anon13834::simd::simd8337     simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
338 
339 
340   };
341 
342   // Unsigned bytes
343   template<>
344   struct simd8<uint8_t>: base_u8<uint8_t> {
splatsimdutf::arm64::__anon13834::simd::simd8345     static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
zerosimdutf::arm64::__anon13834::simd::simd8346     static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
loadsimdutf::arm64::__anon13834::simd::simd8347     static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
simd8simdutf::arm64::__anon13834::simd::simd8348     simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
349     // Zero constructor
simd8simdutf::arm64::__anon13834::simd::simd8350     simdutf_really_inline simd8() : simd8(zero()) {}
351     // Array constructor
simd8simdutf::arm64::__anon13834::simd::simd8352     simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
353     // Splat constructor
simd8simdutf::arm64::__anon13834::simd::simd8354     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
355     // Member-by-member initialization
356 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anon13834::simd::simd8357     simdutf_really_inline simd8(
358       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
359       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
360     ) : simd8(simdutf_make_uint8x16_t(
361       v0, v1, v2, v3, v4, v5, v6, v7,
362       v8, v9, v10,v11,v12,v13,v14,v15
363     )) {}
364 #else
simd8simdutf::arm64::__anon13834::simd::simd8365     simdutf_really_inline simd8(
366       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
367       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
368     ) : simd8(uint8x16_t{
369       v0, v1, v2, v3, v4, v5, v6, v7,
370       v8, v9, v10,v11,v12,v13,v14,v15
371     }) {}
372 #endif
373 
374     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anon13834::simd::simd8375     simdutf_really_inline static simd8<uint8_t> repeat_16(
376       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
377       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
378     ) {
379       return simd8<uint8_t>(
380         v0, v1, v2, v3, v4, v5, v6, v7,
381         v8, v9, v10,v11,v12,v13,v14,v15
382       );
383     }
384 
385     // Store to array
storesimdutf::arm64::__anon13834::simd::simd8386     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
387 
388     // Saturated math
saturating_addsimdutf::arm64::__anon13834::simd::simd8389     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
saturating_subsimdutf::arm64::__anon13834::simd::simd8390     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
391 
392     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anon13834::simd::simd8393     simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anon13834::simd::simd8394     simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anon13834::simd::simd8395     simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anon13834::simd::simd8396     simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
397 
398     // Order-specific operations
max_valsimdutf::arm64::__anon13834::simd::simd8399     simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
min_valsimdutf::arm64::__anon13834::simd::simd8400     simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); }
max_valsimdutf::arm64::__anon13834::simd::simd8401     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
min_valsimdutf::arm64::__anon13834::simd::simd8402     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
operator <=simdutf::arm64::__anon13834::simd::simd8403     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
operator >=simdutf::arm64::__anon13834::simd::simd8404     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
operator <simdutf::arm64::__anon13834::simd::simd8405     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
operator >simdutf::arm64::__anon13834::simd::simd8406     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
407     // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
gt_bitssimdutf::arm64::__anon13834::simd::simd8408     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
409     // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
lt_bitssimdutf::arm64::__anon13834::simd::simd8410     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
411 
412     // Bit-specific operations
any_bits_setsimdutf::arm64::__anon13834::simd::simd8413     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
is_asciisimdutf::arm64::__anon13834::simd::simd8414     simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; }
415 
any_bits_set_anywheresimdutf::arm64::__anon13834::simd::simd8416     simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
any_bits_set_anywheresimdutf::arm64::__anon13834::simd::simd8417     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
418     template<int N>
shrsimdutf::arm64::__anon13834::simd::simd8419     simdutf_really_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
420     template<int N>
shlsimdutf::arm64::__anon13834::simd::simd8421     simdutf_really_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
422 
423     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
424     template<typename L>
lookup_16simdutf::arm64::__anon13834::simd::simd8425     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
426       return lookup_table.apply_lookup_16_to(*this);
427     }
428 
429 
430     template<typename L>
lookup_16simdutf::arm64::__anon13834::simd::simd8431     simdutf_really_inline simd8<L> lookup_16(
432         L replace0,  L replace1,  L replace2,  L replace3,
433         L replace4,  L replace5,  L replace6,  L replace7,
434         L replace8,  L replace9,  L replace10, L replace11,
435         L replace12, L replace13, L replace14, L replace15) const {
436       return lookup_16(simd8<L>::repeat_16(
437         replace0,  replace1,  replace2,  replace3,
438         replace4,  replace5,  replace6,  replace7,
439         replace8,  replace9,  replace10, replace11,
440         replace12, replace13, replace14, replace15
441       ));
442     }
443 
444     template<typename T>
apply_lookup_16_tosimdutf::arm64::__anon13834::simd::simd8445     simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
446       return vqtbl1q_u8(*this, simd8<uint8_t>(original));
447     }
448   };
449 
450   // Signed bytes
451   template<>
452   struct simd8<int8_t> {
453     int8x16_t value;
454 
splatsimdutf::arm64::__anon13834::simd::simd8455     static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
zerosimdutf::arm64::__anon13834::simd::simd8456     static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
loadsimdutf::arm64::__anon13834::simd::simd8457     static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
458 
459     // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is actually a USHLL #0,
460     // and shifting in NEON is actually quite slow.
461     //
462     // While this needs the registers to be in a specific order, bigger cores can interleave
463     // these with no overhead, and it still performs decently on little cores.
464     //    movi  v1.3d, #0
465     //      mov   v0.16b, value[0]
466     //    st2   {v0.16b, v1.16b}, [ptr], #32
467     //      mov   v0.16b, value[1]
468     //    st2   {v0.16b, v1.16b}, [ptr], #32
469     //    ...
470     template <endianness big_endian>
store_ascii_as_utf16simdutf::arm64::__anon13834::simd::simd8471     simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
472       int8x16x2_t pair = match_system(big_endian)
473           ? int8x16x2_t{{this->value, vmovq_n_s8(0)}}
474           : int8x16x2_t{{vmovq_n_s8(0), this->value}};
475       vst2q_s8(reinterpret_cast<int8_t *>(p), pair);
476     }
477 
478     // currently unused
479     // Technically this could be done with ST4 like in store_ascii_as_utf16, but it is
480     // very much not worth it, as explicitly mentioned in the ARM Cortex-X1 Core Software
481     // Optimization Guide:
482     //   4.18 Complex ASIMD instructions
483     //     The bandwidth of [ST4 with element size less than 64b] is limited by decode
484     //     constraints and it is advisable to avoid them when high performing code is desired.
485     // Instead, it is better to use ZIP1+ZIP2 and two ST2.
store_ascii_as_utf32simdutf::arm64::__anon13834::simd::simd8486     simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
487       const uint16x8_t low = vreinterpretq_u16_s8(vzip1q_s8(this->value, vmovq_n_s8(0)));
488       const uint16x8_t high = vreinterpretq_u16_s8(vzip2q_s8(this->value, vmovq_n_s8(0)));
489       const uint16x8x2_t low_pair{{ low, vmovq_n_u16(0) }};
490       vst2q_u16(reinterpret_cast<uint16_t *>(p), low_pair);
491       const uint16x8x2_t high_pair{{ high, vmovq_n_u16(0) }};
492       vst2q_u16(reinterpret_cast<uint16_t *>(p + 8), high_pair);
493     }
494 
495     // In places where the table can be reused, which is most uses in simdutf, it is worth it to do
496     // 4 table lookups, as there is no direct zero extension from u8 to u32.
store_ascii_as_utf32_tblsimdutf::arm64::__anon13834::simd::simd8497     simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t * p) const {
498       const simd8<uint8_t> tb1{  0,255,255,255,  1,255,255,255,  2,255,255,255,  3,255,255,255 };
499       const simd8<uint8_t> tb2{  4,255,255,255,  5,255,255,255,  6,255,255,255,  7,255,255,255 };
500       const simd8<uint8_t> tb3{  8,255,255,255,  9,255,255,255, 10,255,255,255, 11,255,255,255 };
501       const simd8<uint8_t> tb4{ 12,255,255,255, 13,255,255,255, 14,255,255,255, 15,255,255,255 };
502 
503       // encourage store pairing and interleaving
504       const auto shuf1 = this->apply_lookup_16_to(tb1);
505       const auto shuf2 = this->apply_lookup_16_to(tb2);
506       shuf1.store(reinterpret_cast<int8_t *>(p));
507       shuf2.store(reinterpret_cast<int8_t *>(p + 4));
508 
509       const auto shuf3 = this->apply_lookup_16_to(tb3);
510       const auto shuf4 = this->apply_lookup_16_to(tb4);
511       shuf3.store(reinterpret_cast<int8_t *>(p + 8));
512       shuf4.store(reinterpret_cast<int8_t *>(p + 12));
513     }
514     // Conversion from/to SIMD register
simd8simdutf::arm64::__anon13834::simd::simd8515     simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
operator const int8x16_t&simdutf::arm64::__anon13834::simd::simd8516     simdutf_really_inline operator const int8x16_t&() const { return this->value; }
517 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
operator const uint8x16_tsimdutf::arm64::__anon13834::simd::simd8518     simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
519 #endif
operator int8x16_t&simdutf::arm64::__anon13834::simd::simd8520     simdutf_really_inline operator int8x16_t&() { return this->value; }
521 
522     // Zero constructor
simd8simdutf::arm64::__anon13834::simd::simd8523     simdutf_really_inline simd8() : simd8(zero()) {}
524     // Splat constructor
simd8simdutf::arm64::__anon13834::simd::simd8525     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
526     // Array constructor
simd8simdutf::arm64::__anon13834::simd::simd8527     simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
528     // Member-by-member initialization
529 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anon13834::simd::simd8530     simdutf_really_inline simd8(
531       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
532       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
533     ) : simd8(simdutf_make_int8x16_t(
534       v0, v1, v2, v3, v4, v5, v6, v7,
535       v8, v9, v10,v11,v12,v13,v14,v15
536     )) {}
537 #else
simd8simdutf::arm64::__anon13834::simd::simd8538     simdutf_really_inline simd8(
539       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
540       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
541     ) : simd8(int8x16_t{
542       v0, v1, v2, v3, v4, v5, v6, v7,
543       v8, v9, v10,v11,v12,v13,v14,v15
544     }) {}
545 #endif
546     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::arm64::__anon13834::simd::simd8547     simdutf_really_inline static simd8<int8_t> repeat_16(
548       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
549       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
550     ) {
551       return simd8<int8_t>(
552         v0, v1, v2, v3, v4, v5, v6, v7,
553         v8, v9, v10,v11,v12,v13,v14,v15
554       );
555     }
556 
557     // Store to array
storesimdutf::arm64::__anon13834::simd::simd8558     simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); }
559     // Explicit conversion to/from unsigned
560     //
561     // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
562     // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
563     // and relatively ugly and hard to read.
564 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd8simdutf::arm64::__anon13834::simd::simd8565     simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
566 #endif
operator simd8<uint8_t>simdutf::arm64::__anon13834::simd::simd8567     simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
568 
operator |simdutf::arm64::__anon13834::simd::simd8569     simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
operator &simdutf::arm64::__anon13834::simd::simd8570     simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
operator ^simdutf::arm64::__anon13834::simd::simd8571     simdutf_really_inline simd8<int8_t> operator^(const simd8<int8_t> other) const { return veorq_s8(value, other.value); }
bit_andnotsimdutf::arm64::__anon13834::simd::simd8572     simdutf_really_inline simd8<int8_t> bit_andnot(const simd8<int8_t> other) const { return vbicq_s8(value, other.value); }
573 
574     // Math
operator +simdutf::arm64::__anon13834::simd::simd8575     simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
operator -simdutf::arm64::__anon13834::simd::simd8576     simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
operator +=simdutf::arm64::__anon13834::simd::simd8577     simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
operator -=simdutf::arm64::__anon13834::simd::simd8578     simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
579 
max_valsimdutf::arm64::__anon13834::simd::simd8580     simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
min_valsimdutf::arm64::__anon13834::simd::simd8581     simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
is_asciisimdutf::arm64::__anon13834::simd::simd8582     simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }
583 
584     // Order-sensitive comparisons
max_valsimdutf::arm64::__anon13834::simd::simd8585     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(value, other.value); }
min_valsimdutf::arm64::__anon13834::simd::simd8586     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(value, other.value); }
operator >simdutf::arm64::__anon13834::simd::simd8587     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(value, other.value); }
operator <simdutf::arm64::__anon13834::simd::simd8588     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
operator ==simdutf::arm64::__anon13834::simd::simd8589     simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
590 
591     template<int N=1>
prevsimdutf::arm64::__anon13834::simd::simd8592     simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
593       return vextq_s8(prev_chunk, *this, 16 - N);
594     }
595 
596     // Perform a lookup assuming no value is larger than 16
597     template<typename L>
lookup_16simdutf::arm64::__anon13834::simd::simd8598     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
599       return lookup_table.apply_lookup_16_to(*this);
600     }
601     template<typename L>
lookup_16simdutf::arm64::__anon13834::simd::simd8602     simdutf_really_inline simd8<L> lookup_16(
603         L replace0,  L replace1,  L replace2,  L replace3,
604         L replace4,  L replace5,  L replace6,  L replace7,
605         L replace8,  L replace9,  L replace10, L replace11,
606         L replace12, L replace13, L replace14, L replace15) const {
607       return lookup_16(simd8<L>::repeat_16(
608         replace0,  replace1,  replace2,  replace3,
609         replace4,  replace5,  replace6,  replace7,
610         replace8,  replace9,  replace10, replace11,
611         replace12, replace13, replace14, replace15
612       ));
613     }
614 
615     template<typename T>
apply_lookup_16_tosimdutf::arm64::__anon13834::simd::simd8616     simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) const {
617       return vqtbl1q_s8(*this, simd8<uint8_t>(original));
618     }
619   };
620 
621   template<typename T>
622   struct simd8x64 {
623     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
624     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
625     simd8<T> chunks[NUM_CHUNKS];
626 
627     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
628     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
629     simd8x64() = delete; // no default constructor allowed
630 
simd8x64simdutf::arm64::__anon13834::simd::simd8x64631     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::arm64::__anon13834::simd::simd8x64632     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
633 
storesimdutf::arm64::__anon13834::simd::simd8x64634     simdutf_really_inline void store(T* ptr) const {
635       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
636       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
637       this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
638       this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
639     }
640 
641 
operator |=simdutf::arm64::__anon13834::simd::simd8x64642     simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
643       this->chunks[0] |= other.chunks[0];
644       this->chunks[1] |= other.chunks[1];
645       this->chunks[2] |= other.chunks[2];
646       this->chunks[3] |= other.chunks[3];
647       return *this;
648     }
649 
reduce_orsimdutf::arm64::__anon13834::simd::simd8x64650     simdutf_really_inline simd8<T> reduce_or() const {
651       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
652     }
653 
is_asciisimdutf::arm64::__anon13834::simd::simd8x64654     simdutf_really_inline bool is_ascii() const {
655       return reduce_or().is_ascii();
656     }
657 
658     template <endianness endian>
store_ascii_as_utf16simdutf::arm64::__anon13834::simd::simd8x64659     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
660       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
661       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
662       this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
663       this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
664     }
665 
store_ascii_as_utf32simdutf::arm64::__anon13834::simd::simd8x64666     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
667       this->chunks[0].store_ascii_as_utf32_tbl(ptr+sizeof(simd8<T>)*0);
668       this->chunks[1].store_ascii_as_utf32_tbl(ptr+sizeof(simd8<T>)*1);
669       this->chunks[2].store_ascii_as_utf32_tbl(ptr+sizeof(simd8<T>)*2);
670       this->chunks[3].store_ascii_as_utf32_tbl(ptr+sizeof(simd8<T>)*3);
671     }
672 
to_bitmasksimdutf::arm64::__anon13834::simd::simd8x64673     simdutf_really_inline uint64_t to_bitmask() const {
674 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
675       const uint8x16_t bit_mask = simdutf_make_uint8x16_t(
676         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
677         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
678       );
679 #else
680       const uint8x16_t bit_mask = {
681         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
682         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
683       };
684 #endif
685       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
686       uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
687       uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
688       sum0 = vpaddq_u8(sum0, sum1);
689       sum0 = vpaddq_u8(sum0, sum0);
690       return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
691     }
692 
eqsimdutf::arm64::__anon13834::simd::simd8x64693     simdutf_really_inline uint64_t eq(const T m) const {
694     const simd8<T> mask = simd8<T>::splat(m);
695     return  simd8x64<bool>(
696       this->chunks[0] == mask,
697       this->chunks[1] == mask,
698       this->chunks[2] == mask,
699       this->chunks[3] == mask
700     ).to_bitmask();
701   }
702 
lteqsimdutf::arm64::__anon13834::simd::simd8x64703   simdutf_really_inline uint64_t lteq(const T m) const {
704     const simd8<T> mask = simd8<T>::splat(m);
705     return  simd8x64<bool>(
706       this->chunks[0] <= mask,
707       this->chunks[1] <= mask,
708       this->chunks[2] <= mask,
709       this->chunks[3] <= mask
710     ).to_bitmask();
711   }
712 
in_rangesimdutf::arm64::__anon13834::simd::simd8x64713     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
714       const simd8<T> mask_low = simd8<T>::splat(low);
715       const simd8<T> mask_high = simd8<T>::splat(high);
716 
717       return  simd8x64<bool>(
718         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
719         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
720         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
721         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
722       ).to_bitmask();
723     }
not_in_rangesimdutf::arm64::__anon13834::simd::simd8x64724     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
725       const simd8<T> mask_low = simd8<T>::splat(low);
726       const simd8<T> mask_high = simd8<T>::splat(high);
727       return  simd8x64<bool>(
728         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
729         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
730         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
731         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
732       ).to_bitmask();
733     }
ltsimdutf::arm64::__anon13834::simd::simd8x64734     simdutf_really_inline uint64_t lt(const T m) const {
735       const simd8<T> mask = simd8<T>::splat(m);
736       return  simd8x64<bool>(
737         this->chunks[0] < mask,
738         this->chunks[1] < mask,
739         this->chunks[2] < mask,
740         this->chunks[3] < mask
741       ).to_bitmask();
742     }
gtsimdutf::arm64::__anon13834::simd::simd8x64743     simdutf_really_inline uint64_t gt(const T m) const {
744       const simd8<T> mask = simd8<T>::splat(m);
745       return  simd8x64<bool>(
746         this->chunks[0] > mask,
747         this->chunks[1] > mask,
748         this->chunks[2] > mask,
749         this->chunks[3] > mask
750       ).to_bitmask();
751     }
gteqsimdutf::arm64::__anon13834::simd::simd8x64752     simdutf_really_inline uint64_t gteq(const T m) const {
753       const simd8<T> mask = simd8<T>::splat(m);
754       return  simd8x64<bool>(
755         this->chunks[0] >= mask,
756         this->chunks[1] >= mask,
757         this->chunks[2] >= mask,
758         this->chunks[3] >= mask
759       ).to_bitmask();
760     }
gteq_unsignedsimdutf::arm64::__anon13834::simd::simd8x64761     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
762       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
763       return  simd8x64<bool>(
764         simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
765         simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
766         simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
767         simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
768       ).to_bitmask();
769     }
770   }; // struct simd8x64<T>
771 /* begin file src/simdutf/arm64/simd16-inl.h */
772 template<typename T>
773 struct simd16;
774 
775   template<typename T, typename Mask=simd16<bool>>
776   struct base_u16 {
777     uint16x8_t value;
778     static const int SIZE = sizeof(value);
779 
780     // Conversion from/to SIMD register
781     simdutf_really_inline base_u16() = default;
base_u16simdutf::arm64::__anon13834::simd::base_u16782     simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
operator const uint16x8_t&simdutf::arm64::__anon13834::simd::base_u16783     simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator uint16x8_t&simdutf::arm64::__anon13834::simd::base_u16784     simdutf_really_inline operator uint16x8_t&() { return this->value; }
785     // Bit operations
operator |simdutf::arm64::__anon13834::simd::base_u16786     simdutf_really_inline simd16<T> operator|(const simd16<T> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anon13834::simd::base_u16787     simdutf_really_inline simd16<T> operator&(const simd16<T> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anon13834::simd::base_u16788     simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
bit_andnotsimdutf::arm64::__anon13834::simd::base_u16789     simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
operator ~simdutf::arm64::__anon13834::simd::base_u16790     simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
operator |=simdutf::arm64::__anon13834::simd::base_u16791     simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::arm64::__anon13834::simd::base_u16792     simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::arm64::__anon13834::simd::base_u16793     simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
794 
operator ==simdutf::arm64::__anon13834::simd::base_u16795     friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
796 
797     template<int N=1>
prevsimdutf::arm64::__anon13834::simd::base_u16798     simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
799       return vextq_u18(prev_chunk, *this, 8 - N);
800     }
801   };
802 
803 template<typename T, typename Mask=simd16<bool>>
804 struct base16: base_u16<T> {
805   typedef uint16_t bitmask_t;
806   typedef uint32_t bitmask2_t;
807 
base16simdutf::arm64::__anon13834::simd::base16808   simdutf_really_inline base16() : base_u16<T>() {}
base16simdutf::arm64::__anon13834::simd::base16809   simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
810   template <typename Pointer>
base16simdutf::arm64::__anon13834::simd::base16811   simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
812 
813   static const int SIZE = sizeof(base_u16<T>::value);
814 
815   template<int N=1>
prevsimdutf::arm64::__anon13834::simd::base16816   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
817     return vextq_u18(prev_chunk, *this, 8 - N);
818   }
819 };
820 
821 // SIMD byte mask type (returned by things like eq and gt)
822 template<>
823 struct simd16<bool>: base16<bool> {
splatsimdutf::arm64::__anon13834::simd::simd16824   static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
825 
simd16simdutf::arm64::__anon13834::simd::simd16826   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::arm64::__anon13834::simd::simd16827   simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
828   // Splat constructor
simd16simdutf::arm64::__anon13834::simd::simd16829   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
830 
831 };
832 
833 template<typename T>
834 struct base16_numeric: base16<T> {
splatsimdutf::arm64::__anon13834::simd::base16_numeric835   static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
zerosimdutf::arm64::__anon13834::simd::base16_numeric836   static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
loadsimdutf::arm64::__anon13834::simd::base16_numeric837   static simdutf_really_inline simd16<T> load(const T values[8]) {
838     return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
839   }
840 
base16_numericsimdutf::arm64::__anon13834::simd::base16_numeric841   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::arm64::__anon13834::simd::base16_numeric842   simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
843 
844   // Store to array
storesimdutf::arm64::__anon13834::simd::base16_numeric845   simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
846 
847   // Override to distinguish from bool version
operator ~simdutf::arm64::__anon13834::simd::base16_numeric848   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
849 
850   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::arm64::__anon13834::simd::base16_numeric851   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
operator -simdutf::arm64::__anon13834::simd::base16_numeric852   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
operator +=simdutf::arm64::__anon13834::simd::base16_numeric853   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::arm64::__anon13834::simd::base16_numeric854   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
855 };
856 
857 // Signed code units
858 template<>
859 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::arm64::__anon13834::simd::simd16860   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
861 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
simd16simdutf::arm64::__anon13834::simd::simd16862   simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
863 #endif
simd16simdutf::arm64::__anon13834::simd::simd16864   simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
865 
866   // Splat constructor
simd16simdutf::arm64::__anon13834::simd::simd16867   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
868   // Array constructor
simd16simdutf::arm64::__anon13834::simd::simd16869   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anon13834::simd::simd16870   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
871   simdutf_really_inline operator simd16<uint16_t>() const;
operator const uint16x8_t&simdutf::arm64::__anon13834::simd::simd16872   simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
operator const int16x8_tsimdutf::arm64::__anon13834::simd::simd16873   simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
874 
max_valsimdutf::arm64::__anon13834::simd::simd16875   simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
min_valsimdutf::arm64::__anon13834::simd::simd16876   simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
877   // Order-sensitive comparisons
max_valsimdutf::arm64::__anon13834::simd::simd16878   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
min_valsimdutf::arm64::__anon13834::simd::simd16879   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator >simdutf::arm64::__anon13834::simd::simd16880   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
operator <simdutf::arm64::__anon13834::simd::simd16881   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
882 };
883 
884 
885 
886 
887 // Unsigned code units
888 template<>
889 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::arm64::__anon13834::simd::simd16890   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::arm64::__anon13834::simd::simd16891   simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
892 
893   // Splat constructor
simd16simdutf::arm64::__anon13834::simd::simd16894   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
895   // Array constructor
simd16simdutf::arm64::__anon13834::simd::simd16896   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::arm64::__anon13834::simd::simd16897   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
898 
899 
max_valsimdutf::arm64::__anon13834::simd::simd16900   simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
min_valsimdutf::arm64::__anon13834::simd::simd16901   simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
902   // Saturated math
saturating_addsimdutf::arm64::__anon13834::simd::simd16903   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
saturating_subsimdutf::arm64::__anon13834::simd::simd16904   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
905 
906   // Order-specific operations
max_valsimdutf::arm64::__anon13834::simd::simd16907   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
min_valsimdutf::arm64::__anon13834::simd::simd16908   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
909   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::arm64::__anon13834::simd::simd16910   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
911   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::arm64::__anon13834::simd::simd16912   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::arm64::__anon13834::simd::simd16913   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
operator >=simdutf::arm64::__anon13834::simd::simd16914   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
operator >simdutf::arm64::__anon13834::simd::simd16915   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return  vcgtq_u16(*this, other); }
operator <simdutf::arm64::__anon13834::simd::simd16916   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
917 
918   // Bit-specific operations
bits_not_setsimdutf::arm64::__anon13834::simd::simd16919   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
920   template<int N>
shrsimdutf::arm64::__anon13834::simd::simd16921   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
922   template<int N>
shlsimdutf::arm64::__anon13834::simd::simd16923   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
924 
925   // logical operations
operator |simdutf::arm64::__anon13834::simd::simd16926   simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
operator &simdutf::arm64::__anon13834::simd::simd16927   simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
operator ^simdutf::arm64::__anon13834::simd::simd16928   simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
929 
930   // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
packsimdutf::arm64::__anon13834::simd::simd16931   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
932     return vqmovn_high_u16(vqmovn_u16(v0), v1);
933   }
934 
935   // Change the endianness
swap_bytessimdutf::arm64::__anon13834::simd::simd16936   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
937     return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this)));
938   }
939 };
operator simd16<uint16_t>() const940 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
941 
942 
943   template<typename T>
944   struct simd16x32 {
945     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
946     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
947     simd16<T> chunks[NUM_CHUNKS];
948 
949     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
950     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
951     simd16x32() = delete; // no default constructor allowed
952 
simd16x32simdutf::arm64::__anon13834::simd::simd16x32953     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd16x32simdutf::arm64::__anon13834::simd::simd16x32954     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
955 
storesimdutf::arm64::__anon13834::simd::simd16x32956     simdutf_really_inline void store(T* ptr) const {
957       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
958       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
959       this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
960       this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
961     }
962 
reduce_orsimdutf::arm64::__anon13834::simd::simd16x32963     simdutf_really_inline simd16<T> reduce_or() const {
964       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
965     }
966 
is_asciisimdutf::arm64::__anon13834::simd::simd16x32967     simdutf_really_inline bool is_ascii() const {
968       return reduce_or().is_ascii();
969     }
970 
store_ascii_as_utf16simdutf::arm64::__anon13834::simd::simd16x32971     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
972       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
973       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
974       this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
975       this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
976     }
977 
to_bitmasksimdutf::arm64::__anon13834::simd::simd16x32978     simdutf_really_inline uint64_t to_bitmask() const {
979 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
980       const uint8x16_t bit_mask = simdutf_make_uint8x16_t(
981         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
982         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
983       );
984 #else
985       const uint8x16_t bit_mask = {
986         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
987         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
988       };
989 #endif
990       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
991       uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
992       uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
993       sum0 = vpaddq_u8(sum0, sum1);
994       sum0 = vpaddq_u8(sum0, sum0);
995       return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
996     }
997 
swap_bytessimdutf::arm64::__anon13834::simd::simd16x32998     simdutf_really_inline void swap_bytes() {
999       this->chunks[0] = this->chunks[0].swap_bytes();
1000       this->chunks[1] = this->chunks[1].swap_bytes();
1001       this->chunks[2] = this->chunks[2].swap_bytes();
1002       this->chunks[3] = this->chunks[3].swap_bytes();
1003     }
1004 
eqsimdutf::arm64::__anon13834::simd::simd16x321005     simdutf_really_inline uint64_t eq(const T m) const {
1006     const simd16<T> mask = simd16<T>::splat(m);
1007     return  simd16x32<bool>(
1008       this->chunks[0] == mask,
1009       this->chunks[1] == mask,
1010       this->chunks[2] == mask,
1011       this->chunks[3] == mask
1012     ).to_bitmask();
1013   }
1014 
lteqsimdutf::arm64::__anon13834::simd::simd16x321015   simdutf_really_inline uint64_t lteq(const T m) const {
1016     const simd16<T> mask = simd16<T>::splat(m);
1017     return  simd16x32<bool>(
1018       this->chunks[0] <= mask,
1019       this->chunks[1] <= mask,
1020       this->chunks[2] <= mask,
1021       this->chunks[3] <= mask
1022     ).to_bitmask();
1023   }
1024 
in_rangesimdutf::arm64::__anon13834::simd::simd16x321025     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
1026       const simd16<T> mask_low = simd16<T>::splat(low);
1027       const simd16<T> mask_high = simd16<T>::splat(high);
1028 
1029       return  simd16x32<bool>(
1030         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
1031         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
1032         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
1033         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
1034       ).to_bitmask();
1035     }
not_in_rangesimdutf::arm64::__anon13834::simd::simd16x321036     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
1037       const simd16<T> mask_low = simd16<T>::splat(low);
1038       const simd16<T> mask_high = simd16<T>::splat(high);
1039       return  simd16x32<bool>(
1040         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
1041         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
1042         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
1043         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
1044       ).to_bitmask();
1045     }
ltsimdutf::arm64::__anon13834::simd::simd16x321046     simdutf_really_inline uint64_t lt(const T m) const {
1047       const simd16<T> mask = simd16<T>::splat(m);
1048       return  simd16x32<bool>(
1049         this->chunks[0] < mask,
1050         this->chunks[1] < mask,
1051         this->chunks[2] < mask,
1052         this->chunks[3] < mask
1053       ).to_bitmask();
1054     }
1055 
1056   }; // struct simd16x32<T>
1057   template<>
not_in_range(const uint16_t low, const uint16_t high) const1058   simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
1059       const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
1060       const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
1061       simd16x32<uint16_t> x(
1062         simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
1063         simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
1064         simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
1065         simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
1066       );
1067       return  x.to_bitmask();
1068     }
1069 /* end file src/simdutf/arm64/simd16-inl.h */
1070 } // namespace simd
1071 } // unnamed namespace
1072 } // namespace arm64
1073 } // namespace simdutf
1074 
1075 #endif // SIMDUTF_ARM64_SIMD_H
1076 /* end file src/simdutf/arm64/simd.h */
1077 
1078 /* begin file src/simdutf/arm64/end.h */
1079 /* end file src/simdutf/arm64/end.h */
1080 
1081 #endif // SIMDUTF_IMPLEMENTATION_ARM64
1082 
1083 #endif // SIMDUTF_ARM64_H
1084 /* end file src/simdutf/arm64.h */
1085 /* begin file src/simdutf/icelake.h */
1086 #ifndef SIMDUTF_ICELAKE_H
1087 #define SIMDUTF_ICELAKE_H
1088 
1089 
1090 
// How do we detect that a compiler supports VBMI2? Two signals are used.

// (1) The dedicated intrinsics header can be found. The __has_include test
// stays nested because the operator may only be used where it is defined.
#if defined(__has_include)
#if __has_include(<avx512vbmi2intrin.h>)
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
#endif

// (2) Visual Studio 2022 and up (_MSC_VER >= 1930) support VBMI2 under x64
// even if the header avx512vbmi2intrin.h is not found. Visual Studio 2019
// technically supports VBMI2, but the implementation might be unreliable;
// search for visualstudio2019icelakeissue in our tests.
#if defined(_MSC_VER) && (_MSC_VER >= 1930)
#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
#endif
1109 
// Unless the build already chose a value, icelake is allowed on x64 as long
// as the compiler is known to support VBMI2.
#if !defined(SIMDUTF_IMPLEMENTATION_ICELAKE)
#define SIMDUTF_IMPLEMENTATION_ICELAKE \
  ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
#endif
1114 
// True when the icelake kernels can run unconditionally in this build:
// icelake is enabled, the target is 64-bit x86, and AVX2 plus the AVX-512
// F/DQ/VL/VBMI2 feature macros are all set.
// To see why (__BMI__) && (__LZCNT__) are not part of this condition, see
// https://github.com/simdutf/simdutf/issues/1247
#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE                                        \
  ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) &&   \
   (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL &&    \
    SIMDUTF_HAS_AVX512VBMI2) &&                                               \
   (!SIMDUTF_IS_32BITS))
1121 
1122 #if SIMDUTF_IMPLEMENTATION_ICELAKE
// When the icelake kernels cannot be assumed to run everywhere, they are
// compiled inside an explicit target region; otherwise no attribute is needed
// and the macro expands to nothing.
#if !SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq")
#else
#define SIMDUTF_TARGET_ICELAKE
#endif
1128 
namespace simdutf {
/**
 * Namespace of the icelake (Intel AVX-512) implementation; only declared
 * here, it is populated further down.
 */
namespace icelake {
} // namespace icelake
} // namespace simdutf
1133 
1134 
1135 
1136 //
1137 // These two need to be included outside SIMDUTF_TARGET_REGION
1138 //
1139 /* begin file src/simdutf/icelake/intrinsics.h */
1140 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
1141 #define SIMDUTF_ICELAKE_INTRINSICS_H
1142 
1143 
#if defined(SIMDUTF_VISUAL_STUDIO)
// Visual Studio, or clang running inside Visual Studio; under clang,
// <intrin.h> will in turn include <x86intrin.h>.
#include <intrin.h>
#include <immintrin.h>
#else // every other toolchain

#if SIMDUTF_GCC11ORMORE
// Including <x86intrin.h> should not produce warnings, yet it does under
// some versions of GCC. If the header has uninitialized values that are
// problematic, it is a GCC issue, so we ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
#endif

#include <x86intrin.h>

#if SIMDUTF_GCC11ORMORE
// Restore -Wuninitialized.
SIMDUTF_POP_DISABLE_WARNINGS
#endif

// Provide the single-underscore spelling when it is not already a macro.
#if !defined(_tzcnt_u64)
#define _tzcnt_u64(x) __tzcnt_u64(x)
#endif // _tzcnt_u64
#endif // SIMDUTF_VISUAL_STUDIO
1170 
1171 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1172 /**
1173  * You are not supposed, normally, to include these
1174  * headers directly. Instead you should either include intrin.h
1175  * or x86intrin.h. However, when compiling with clang
1176  * under Windows (i.e., when _MSC_VER is set), these headers
1177  * only get included *if* the corresponding features are detected
1178  * from macros:
1179  * e.g., if __AVX2__ is set... in turn,  we normally set these
1180  * macros by compiling against the corresponding architecture
1181  * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1182  * software with these advanced instructions. In simdutf, we
1183  * want to compile the whole program for a generic target,
1184  * and only target our specific kernels. As a workaround,
1185  * we directly include the needed headers. These headers would
1186  * normally guard against such usage, but we carefully included
1187  * <x86intrin.h>  (or <intrin.h>) before, so the headers
1188  * are fooled.
1189  */
1190 #include <bmiintrin.h>   // for _blsr_u64
1191 #include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
1192 #include <lzcntintrin.h> // for  __lzcnt64
1193 #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
1194 #include <smmintrin.h>
1195 #include <tmmintrin.h>
1196 #include <avxintrin.h>
1197 #include <avx2intrin.h>
1198 // Important: we need the AVX-512 headers:
1199 #include <avx512fintrin.h>
1200 #include <avx512dqintrin.h>
1201 #include <avx512cdintrin.h>
1202 #include <avx512bwintrin.h>
1203 #include <avx512vlintrin.h>
1204 #include <avx512vlbwintrin.h>
1205 #include <avx512vbmiintrin.h>
1206 #include <avx512vbmi2intrin.h>
1207 #include <avx512vpopcntdqintrin.h>
1208 #include <avx512vpopcntdqvlintrin.h>
// Unfortunately, we may not get _blsr_u64 as a function here; clang is
// expected to expose it as a macro. When no such macro exists we roll our
// own: clear the lowest set bit of n.
// Every use of the argument is parenthesized so that compound arguments such
// as `a | b` are treated as a single operand. The argument is still expanded
// twice, so it must be free of side effects.
#ifndef _blsr_u64
#define _blsr_u64(n) (((n) - 1) & (n))
#endif //  _blsr_u64
1215 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1216 
1217 
1218 
// Record whether we are compiling with GCC 8 or GCC 9. clang also defines
// __GNUC__, so it is excluded explicitly.
#if defined(__GNUC__) && !defined(__clang__)

#if (__GNUC__ == 9)
#define SIMDUTF_GCC9 1
#elif (__GNUC__ == 8)
#define SIMDUTF_GCC8 1
#endif // __GNUC__ is 8 or 9

#endif // defined(__GNUC__) && !defined(__clang__)
1228 
#if SIMDUTF_GCC8
#pragma GCC push_options
#pragma GCC target("avx512f")
/**
 * GCC 8 fails to provide _mm512_set_epi8, so we roll our own on top of
 * _mm512_set_epi64.
 *
 * The 64 byte arguments are consumed eight at a time. Within each group the
 * first argument becomes the most significant byte of a 64-bit word, and the
 * eight words are passed to _mm512_set_epi64 in argument order.
 */
inline __m512i _mm512_set_epi8(
    uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7,
    uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15,
    uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23,
    uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31,
    uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
    uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47,
    uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55,
    uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  // The shifted bytes occupy disjoint bit ranges, so OR-ing them together
  // gives the same word as adding them.
  return _mm512_set_epi64(
    (uint64_t(a0) << 56) | (uint64_t(a1) << 48) | (uint64_t(a2) << 40) | (uint64_t(a3) << 32) | (uint64_t(a4) << 24) | (uint64_t(a5) << 16) | (uint64_t(a6) << 8) | uint64_t(a7),
    (uint64_t(a8) << 56) | (uint64_t(a9) << 48) | (uint64_t(a10) << 40) | (uint64_t(a11) << 32) | (uint64_t(a12) << 24) | (uint64_t(a13) << 16) | (uint64_t(a14) << 8) | uint64_t(a15),
    (uint64_t(a16) << 56) | (uint64_t(a17) << 48) | (uint64_t(a18) << 40) | (uint64_t(a19) << 32) | (uint64_t(a20) << 24) | (uint64_t(a21) << 16) | (uint64_t(a22) << 8) | uint64_t(a23),
    (uint64_t(a24) << 56) | (uint64_t(a25) << 48) | (uint64_t(a26) << 40) | (uint64_t(a27) << 32) | (uint64_t(a28) << 24) | (uint64_t(a29) << 16) | (uint64_t(a30) << 8) | uint64_t(a31),
    (uint64_t(a32) << 56) | (uint64_t(a33) << 48) | (uint64_t(a34) << 40) | (uint64_t(a35) << 32) | (uint64_t(a36) << 24) | (uint64_t(a37) << 16) | (uint64_t(a38) << 8) | uint64_t(a39),
    (uint64_t(a40) << 56) | (uint64_t(a41) << 48) | (uint64_t(a42) << 40) | (uint64_t(a43) << 32) | (uint64_t(a44) << 24) | (uint64_t(a45) << 16) | (uint64_t(a46) << 8) | uint64_t(a47),
    (uint64_t(a48) << 56) | (uint64_t(a49) << 48) | (uint64_t(a50) << 40) | (uint64_t(a51) << 32) | (uint64_t(a52) << 24) | (uint64_t(a53) << 16) | (uint64_t(a54) << 8) | uint64_t(a55),
    (uint64_t(a56) << 56) | (uint64_t(a57) << 48) | (uint64_t(a58) << 40) | (uint64_t(a59) << 32) | (uint64_t(a60) << 24) | (uint64_t(a61) << 16) | (uint64_t(a62) << 8) | uint64_t(a63));
}
#pragma GCC pop_options
#endif // SIMDUTF_GCC8
1247 
#endif // SIMDUTF_ICELAKE_INTRINSICS_H
1249 /* end file src/simdutf/icelake/intrinsics.h */
1250 /* begin file src/simdutf/icelake/implementation.h */
1251 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
1252 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
1253 
1254 
1255 namespace simdutf {
1256 namespace icelake {
1257 
1258 namespace {
1259 using namespace simdutf;
1260 }
1261 
1262 class implementation final : public simdutf::implementation {
1263 public:
implementation()1264   simdutf_really_inline implementation() : simdutf::implementation(
1265       "icelake",
1266       "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
1267       internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 | internal::instruction_set::AVX512VPOPCNTDQ ) {}
1268   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1269   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
1270   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
1271   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
1272   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
1273   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
1274   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
1275   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
1276   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
1277   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
1278   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
1279   simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
1280   simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1281   simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1282   simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1283   simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
1284   simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
1285   simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
1286   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1287   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1288   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1289   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
1290   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1291   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1292   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1293   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
1294   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1295   simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1296   simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1297   simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1298   simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1299   simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1300   simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
1301   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1302   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1303   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1304   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1305   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1306   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1307   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1308   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1309   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
1310   simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
1311   simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
1312   simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
1313   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1314   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1315   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1316   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1317   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1318   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
1319   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1320   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1321   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1322   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1323   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1324   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
1325   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
1326   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
1327   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
1328   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
1329   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1330   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1331   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1332   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1333   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1334   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1335   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1336   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1337   simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
1338   simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
1339   simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
1340   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
1341   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
1342   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
1343 };
1344 
1345 } // namespace icelake
1346 } // namespace simdutf
1347 
1348 #endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
1349 /* end file src/simdutf/icelake/implementation.h */
1350 
1351 //
1352 // The rest need to be inside the region
1353 //
1354 /* begin file src/simdutf/icelake/begin.h */
1355 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
1356 // #define SIMDUTF_IMPLEMENTATION icelake
1357 
1358 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1359 // nothing needed.
1360 #else
1361 SIMDUTF_TARGET_ICELAKE
1362 #endif
1363 
1364 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1365 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1366 #endif // end of workaround
1367 /* end file src/simdutf/icelake/begin.h */
1368 // Declarations
1369 /* begin file src/simdutf/icelake/bitmanipulation.h */
1370 #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
1371 #define SIMDUTF_ICELAKE_BITMANIPULATION_H
1372 
1373 namespace simdutf {
1374 namespace icelake {
1375 namespace {
1376 
1377 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1378 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1379   // note: we do not support legacy 32-bit Windows
1380   return __popcnt64(input_num);// Visual Studio wants two underscores
1381 }
1382 #else
1383 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1384   return _popcnt64(input_num);
1385 }
1386 #endif
1387 
1388 } // unnamed namespace
1389 } // namespace icelake
1390 } // namespace simdutf
1391 
1392 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
1393 /* end file src/simdutf/icelake/bitmanipulation.h */
1394 /* begin file src/simdutf/icelake/end.h */
1395 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
1396 // nothing needed.
1397 #else
1398 SIMDUTF_UNTARGET_REGION
1399 #endif
1400 
1401 
1402 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1403 SIMDUTF_POP_DISABLE_WARNINGS
1404 #endif // end of workaround
1405 /* end file src/simdutf/icelake/end.h */
1406 
1407 
1408 
1409 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
1410 #endif // SIMDUTF_ICELAKE_H
1411 /* end file src/simdutf/icelake.h */
1412 /* begin file src/simdutf/haswell.h */
1413 #ifndef SIMDUTF_HASWELL_H
1414 #define SIMDUTF_HASWELL_H
1415 
// haswell.h has to come before the westmere and fallback headers: if either
// of their include guards is already defined, stop the build.
#if defined(SIMDUTF_WESTMERE_H)
#error "haswell.h must be included before westmere.h"
#endif
#if defined(SIMDUTF_FALLBACK_H)
#error "haswell.h must be included before fallback.h"
#endif
1422 
1423 
// Default Haswell to on if this is x86-64. Even if we're not compiled for it,
// it could be selected at runtime. A value supplied by the build is kept.
#if !defined(SIMDUTF_IMPLEMENTATION_HASWELL)
//
// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
// because we want to rely on *runtime dispatch*.
//
#if !SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
#else
// Icelake can always run in this build, so Haswell is switched off.
#define SIMDUTF_IMPLEMENTATION_HASWELL 0
#endif

#endif
// True when the Haswell kernels can run unconditionally in this build:
// Haswell is enabled, the target is 64-bit x86 and AVX2 is on.
// To see why (__BMI__) && (__LZCNT__) are not part of this condition, see
// https://github.com/simdutf/simdutf/issues/1247
#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL \
  ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
1441 
1442 #if SIMDUTF_IMPLEMENTATION_HASWELL
1443 
// Target region used when compiling the Haswell kernels.
#define SIMDUTF_TARGET_HASWELL \
  SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
1445 
namespace simdutf {
/**
 * Implementation for Haswell (Intel AVX2). The namespace is only declared
 * here; it is populated further down.
 */
namespace haswell {
} // namespace haswell
} // namespace simdutf
1453 
1454 //
1455 // These two need to be included outside SIMDUTF_TARGET_REGION
1456 //
1457 /* begin file src/simdutf/haswell/implementation.h */
1458 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
1459 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
1460 
1461 
1462 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
1463 namespace simdutf {
1464 namespace haswell {
1465 
1466 using namespace simdutf;
1467 
/**
 * Haswell kernel's concrete simdutf::implementation.
 *
 * The constructor forwards the name "haswell", the description
 * "Intel/AMD AVX2" and the instruction-set mask AVX2 | BMI1 | BMI2 to the
 * base-class constructor. Everything else in this class is a declaration
 * only; no member function body other than the constructor appears here.
 *
 * NOTE(review): the conversion/validation members are declared `final`,
 * whereas the counting and length members at the end of the class are not.
 * Confirm against the base class whether that difference is intentional.
 */
class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation() : simdutf::implementation(
      "haswell",
      "Intel/AMD AVX2",
      internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
  ) {}

  // --- Encoding detection and validation -----------------------------------
  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;

  // --- Conversions taking Latin1 input -------------------------------------
  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;

  // --- Conversions taking UTF-8 input --------------------------------------
  // Three flavours recur below: plain (returns size_t), `_with_errors`
  // (returns result) and `convert_valid_` (returns size_t).
  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;

  // --- Conversions taking UTF-16 input (to Latin1 and UTF-8) ---------------
  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;

  // --- Conversions taking UTF-32 input -------------------------------------
  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;

  // --- Conversions from UTF-16 to UTF-32 -----------------------------------
  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;

  // --- Endianness swap (the only member here without simdutf_warn_unused) --
  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;

  // --- Counting and length helpers (declared without `final`, see above) ---
  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
  // The next four take only a length, with no input buffer.
  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
  simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
};
1551 
1552 } // namespace haswell
1553 } // namespace simdutf
1554 
1555 #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
1556 /* end file src/simdutf/haswell/implementation.h */
1557 /* begin file src/simdutf/haswell/intrinsics.h */
1558 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
1559 #define SIMDUTF_HASWELL_INTRINSICS_H
1560 
1561 
1562 #ifdef SIMDUTF_VISUAL_STUDIO
1563 // under clang within visual studio, this will include <x86intrin.h>
1564 #include <intrin.h>  // visual studio or clang
1565 #else
1566 
1567 #if SIMDUTF_GCC11ORMORE
1568 // We should not get warnings while including <x86intrin.h> yet we do
1569 // under some versions of GCC.
1570 // If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
1572 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
1573 #endif
1574 
1575 #include <x86intrin.h> // elsewhere
1576 
1577 
1578 #if SIMDUTF_GCC11ORMORE
1579 // cancels the suppression of the -Wuninitialized
1580 SIMDUTF_POP_DISABLE_WARNINGS
1581 #endif
1582 
1583 #endif // SIMDUTF_VISUAL_STUDIO
1584 
1585 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
1586 /**
1587  * You are not supposed, normally, to include these
1588  * headers directly. Instead you should either include intrin.h
1589  * or x86intrin.h. However, when compiling with clang
1590  * under Windows (i.e., when _MSC_VER is set), these headers
1591  * only get included *if* the corresponding features are detected
1592  * from macros:
1593  * e.g., if __AVX2__ is set... in turn,  we normally set these
1594  * macros by compiling against the corresponding architecture
1595  * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
1596  * software with these advanced instructions. In simdutf, we
1597  * want to compile the whole program for a generic target,
1598  * and only target our specific kernels. As a workaround,
1599  * we directly include the needed headers. These headers would
1600  * normally guard against such usage, but we carefully included
1601  * <x86intrin.h>  (or <intrin.h>) before, so the headers
1602  * are fooled.
1603  */
1604 #include <bmiintrin.h>   // for _blsr_u64
1605 #include <lzcntintrin.h> // for  __lzcnt64
1606 #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
1607 #include <smmintrin.h>
1608 #include <tmmintrin.h>
1609 #include <avxintrin.h>
1610 #include <avx2intrin.h>
// Unfortunately, we may not get _blsr_u64 from the headers above but,
// thankfully, clang has it as a macro.
#ifndef _blsr_u64
// We roll our own: (n - 1) & n clears the lowest set bit of n.
// Every use of the argument is parenthesized so that compound arguments such
// as `a | b` expand with the intended precedence. The argument is evaluated
// twice, so it must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif //  _blsr_u64
1617 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
1618 
1619 #endif // SIMDUTF_HASWELL_INTRINSICS_H
1620 /* end file src/simdutf/haswell/intrinsics.h */
1621 
1622 //
1623 // The rest need to be inside the region
1624 //
1625 /* begin file src/simdutf/haswell/begin.h */
1626 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
1627 // #define SIMDUTF_IMPLEMENTATION haswell
1628 
1629 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
1630 // nothing needed.
1631 #else
1632 SIMDUTF_TARGET_HASWELL
1633 #endif
1634 
1635 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
1636 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
1637 #endif // end of workaround
1638 /* end file src/simdutf/haswell/begin.h */
1639 // Declarations
1640 /* begin file src/simdutf/haswell/bitmanipulation.h */
1641 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
1642 #define SIMDUTF_HASWELL_BITMANIPULATION_H
1643 
1644 namespace simdutf {
1645 namespace haswell {
1646 namespace {
1647 
1648 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)1649 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
1650   // note: we do not support legacy 32-bit Windows
1651   return __popcnt64(input_num);// Visual Studio wants two underscores
1652 }
1653 #else
1654 simdutf_really_inline long long int count_ones(uint64_t input_num) {
1655   return _popcnt64(input_num);
1656 }
1657 #endif
1658 
1659 } // unnamed namespace
1660 } // namespace haswell
1661 } // namespace simdutf
1662 
1663 #endif // SIMDUTF_HASWELL_BITMANIPULATION_H
1664 /* end file src/simdutf/haswell/bitmanipulation.h */
1665 /* begin file src/simdutf/haswell/simd.h */
1666 #ifndef SIMDUTF_HASWELL_SIMD_H
1667 #define SIMDUTF_HASWELL_SIMD_H
1668 
1669 
1670 namespace simdutf {
1671 namespace haswell {
1672 namespace {
1673 namespace simd {
1674 
1675   // Forward-declared so they can be used by splat and friends.
1676   template<typename Child>
1677   struct base {
1678     __m256i value;
1679 
1680     // Zero constructor
basesimdutf::haswell::__anon13839::simd::base1681     simdutf_really_inline base() : value{__m256i()} {}
1682 
1683     // Conversion from SIMD register
basesimdutf::haswell::__anon13839::simd::base1684     simdutf_really_inline base(const __m256i _value) : value(_value) {}
1685     // Conversion to SIMD register
operator const __m256i&simdutf::haswell::__anon13839::simd::base1686     simdutf_really_inline operator const __m256i&() const { return this->value; }
operator __m256i&simdutf::haswell::__anon13839::simd::base1687     simdutf_really_inline operator __m256i&() { return this->value; }
1688     template <endianness big_endian>
store_ascii_as_utf16simdutf::haswell::__anon13839::simd::base1689     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1690       __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
1691       __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
1692       if (big_endian) {
1693         const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
1694                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
1695         first = _mm256_shuffle_epi8(first, swap);
1696         second = _mm256_shuffle_epi8(second, swap);
1697       }
1698       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
1699       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
1700     }
store_ascii_as_utf32simdutf::haswell::__anon13839::simd::base1701     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1702       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
1703       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
1704       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
1705       _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
1706     }
1707     // Bit operations
operator |simdutf::haswell::__anon13839::simd::base1708     simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
operator &simdutf::haswell::__anon13839::simd::base1709     simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
operator ^simdutf::haswell::__anon13839::simd::base1710     simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
bit_andnotsimdutf::haswell::__anon13839::simd::base1711     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
operator |=simdutf::haswell::__anon13839::simd::base1712     simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::haswell::__anon13839::simd::base1713     simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::haswell::__anon13839::simd::base1714     simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
1715   };
1716 
  // Forward declaration of the byte-lane vector wrapper. base8 below names
  // simd8<bool> (as its default Mask) and simd8<T> before the explicit
  // specializations are defined, so the template has to be declared first.
  template<typename T>
  struct simd8;
1720 
1721   template<typename T, typename Mask=simd8<bool>>
1722   struct base8: base<simd8<T>> {
1723     typedef uint32_t bitmask_t;
1724     typedef uint64_t bitmask2_t;
1725 
base8simdutf::haswell::__anon13839::simd::base81726     simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::haswell::__anon13839::simd::base81727     simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
firstsimdutf::haswell::__anon13839::simd::base81728     simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
lastsimdutf::haswell::__anon13839::simd::base81729     simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
operator ==simdutf::haswell::__anon13839::simd::base81730     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
1731 
1732     static const int SIZE = sizeof(base<T>::value);
1733 
1734     template<int N=1>
prevsimdutf::haswell::__anon13839::simd::base81735     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
1736       return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
1737     }
1738   };
1739 
1740   // SIMD byte mask type (returned by things like eq and gt)
1741   template<>
1742   struct simd8<bool>: base8<bool> {
splatsimdutf::haswell::__anon13839::simd::simd81743     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
1744 
simd8simdutf::haswell::__anon13839::simd::simd81745     simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::haswell::__anon13839::simd::simd81746     simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
1747     // Splat constructor
simd8simdutf::haswell::__anon13839::simd::simd81748     simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
1749 
to_bitmasksimdutf::haswell::__anon13839::simd::simd81750     simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
anysimdutf::haswell::__anon13839::simd::simd81751     simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
nonesimdutf::haswell::__anon13839::simd::simd81752     simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
allsimdutf::haswell::__anon13839::simd::simd81753     simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
operator ~simdutf::haswell::__anon13839::simd::simd81754     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
1755   };
1756 
1757   template<typename T>
1758   struct base8_numeric: base8<T> {
splatsimdutf::haswell::__anon13839::simd::base8_numeric1759     static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
zerosimdutf::haswell::__anon13839::simd::base8_numeric1760     static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anon13839::simd::base8_numeric1761     static simdutf_really_inline simd8<T> load(const T values[32]) {
1762       return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
1763     }
1764     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anon13839::simd::base8_numeric1765     static simdutf_really_inline simd8<T> repeat_16(
1766       T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
1767       T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
1768     ) {
1769       return simd8<T>(
1770         v0, v1, v2, v3, v4, v5, v6, v7,
1771         v8, v9, v10,v11,v12,v13,v14,v15,
1772         v0, v1, v2, v3, v4, v5, v6, v7,
1773         v8, v9, v10,v11,v12,v13,v14,v15
1774       );
1775     }
1776 
base8_numericsimdutf::haswell::__anon13839::simd::base8_numeric1777     simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::haswell::__anon13839::simd::base8_numeric1778     simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
1779 
1780     // Store to array
storesimdutf::haswell::__anon13839::simd::base8_numeric1781     simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
1782 
1783     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anon13839::simd::base8_numeric1784     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
operator -simdutf::haswell::__anon13839::simd::base8_numeric1785     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
operator +=simdutf::haswell::__anon13839::simd::base8_numeric1786     simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::haswell::__anon13839::simd::base8_numeric1787     simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
1788 
1789     // Override to distinguish from bool version
operator ~simdutf::haswell::__anon13839::simd::base8_numeric1790     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
1791 
1792     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
1793     template<typename L>
lookup_16simdutf::haswell::__anon13839::simd::base8_numeric1794     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
1795       return _mm256_shuffle_epi8(lookup_table, *this);
1796     }
1797 
1798     template<typename L>
lookup_16simdutf::haswell::__anon13839::simd::base8_numeric1799     simdutf_really_inline simd8<L> lookup_16(
1800         L replace0,  L replace1,  L replace2,  L replace3,
1801         L replace4,  L replace5,  L replace6,  L replace7,
1802         L replace8,  L replace9,  L replace10, L replace11,
1803         L replace12, L replace13, L replace14, L replace15) const {
1804       return lookup_16(simd8<L>::repeat_16(
1805         replace0,  replace1,  replace2,  replace3,
1806         replace4,  replace5,  replace6,  replace7,
1807         replace8,  replace9,  replace10, replace11,
1808         replace12, replace13, replace14, replace15
1809       ));
1810     }
1811   };
1812 
1813 
1814   // Signed bytes
1815   template<>
1816   struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::haswell::__anon13839::simd::simd81817     simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::haswell::__anon13839::simd::simd81818     simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
1819 
1820     // Splat constructor
simd8simdutf::haswell::__anon13839::simd::simd81821     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
1822     // Array constructor
simd8simdutf::haswell::__anon13839::simd::simd81823     simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
1824     simdutf_really_inline operator simd8<uint8_t>() const;
1825     // Member-by-member initialization
simd8simdutf::haswell::__anon13839::simd::simd81826     simdutf_really_inline simd8(
1827       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
1828       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
1829       int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
1830       int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
1831     ) : simd8(_mm256_setr_epi8(
1832       v0, v1, v2, v3, v4, v5, v6, v7,
1833       v8, v9, v10,v11,v12,v13,v14,v15,
1834       v16,v17,v18,v19,v20,v21,v22,v23,
1835       v24,v25,v26,v27,v28,v29,v30,v31
1836     )) {}
1837     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anon13839::simd::simd81838     simdutf_really_inline static simd8<int8_t> repeat_16(
1839       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
1840       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
1841     ) {
1842       return simd8<int8_t>(
1843         v0, v1, v2, v3, v4, v5, v6, v7,
1844         v8, v9, v10,v11,v12,v13,v14,v15,
1845         v0, v1, v2, v3, v4, v5, v6, v7,
1846         v8, v9, v10,v11,v12,v13,v14,v15
1847       );
1848     }
is_asciisimdutf::haswell::__anon13839::simd::simd81849     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
1850     // Order-sensitive comparisons
max_valsimdutf::haswell::__anon13839::simd::simd81851     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
min_valsimdutf::haswell::__anon13839::simd::simd81852     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
operator >simdutf::haswell::__anon13839::simd::simd81853     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
operator <simdutf::haswell::__anon13839::simd::simd81854     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
1855   };
1856 
1857   // Unsigned bytes
1858   template<>
1859   struct simd8<uint8_t>: base8_numeric<uint8_t> {
simd8simdutf::haswell::__anon13839::simd::simd81860     simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::haswell::__anon13839::simd::simd81861     simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
1862     // Splat constructor
simd8simdutf::haswell::__anon13839::simd::simd81863     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
1864     // Array constructor
simd8simdutf::haswell::__anon13839::simd::simd81865     simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
1866     // Member-by-member initialization
simd8simdutf::haswell::__anon13839::simd::simd81867     simdutf_really_inline simd8(
1868       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
1869       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
1870       uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
1871       uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
1872     ) : simd8(_mm256_setr_epi8(
1873       v0, v1, v2, v3, v4, v5, v6, v7,
1874       v8, v9, v10,v11,v12,v13,v14,v15,
1875       v16,v17,v18,v19,v20,v21,v22,v23,
1876       v24,v25,v26,v27,v28,v29,v30,v31
1877     )) {}
1878     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::haswell::__anon13839::simd::simd81879     simdutf_really_inline static simd8<uint8_t> repeat_16(
1880       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
1881       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
1882     ) {
1883       return simd8<uint8_t>(
1884         v0, v1, v2, v3, v4, v5, v6, v7,
1885         v8, v9, v10,v11,v12,v13,v14,v15,
1886         v0, v1, v2, v3, v4, v5, v6, v7,
1887         v8, v9, v10,v11,v12,v13,v14,v15
1888       );
1889     }
1890 
1891 
1892     // Saturated math
saturating_addsimdutf::haswell::__anon13839::simd::simd81893     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
saturating_subsimdutf::haswell::__anon13839::simd::simd81894     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
1895 
1896     // Order-specific operations
max_valsimdutf::haswell::__anon13839::simd::simd81897     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
min_valsimdutf::haswell::__anon13839::simd::simd81898     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
1899     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anon13839::simd::simd81900     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
1901     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anon13839::simd::simd81902     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anon13839::simd::simd81903     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anon13839::simd::simd81904     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anon13839::simd::simd81905     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anon13839::simd::simd81906     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
1907 
1908     // Bit-specific operations
bits_not_setsimdutf::haswell::__anon13839::simd::simd81909     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::haswell::__anon13839::simd::simd81910     simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anon13839::simd::simd81911     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anon13839::simd::simd81912     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::haswell::__anon13839::simd::simd81913     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
bits_not_set_anywheresimdutf::haswell::__anon13839::simd::simd81914     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anon13839::simd::simd81915     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anon13839::simd::simd81916     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anon13839::simd::simd81917     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
1918     template<int N>
shrsimdutf::haswell::__anon13839::simd::simd81919     simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
1920     template<int N>
shlsimdutf::haswell::__anon13839::simd::simd81921     simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
1922     // Get one of the bits and make a bitmask out of it.
1923     // e.g. value.get_bit<7>() gets the high bit
1924     template<int N>
get_bitsimdutf::haswell::__anon13839::simd::simd81925     simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
1926   };
operator simd8<uint8_t>() const1927   simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
1928 
1929 
1930   template<typename T>
1931   struct simd8x64 {
1932     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
1933     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
1934     simd8<T> chunks[NUM_CHUNKS];
1935 
1936     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
1937     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
1938     simd8x64() = delete; // no default constructor allowed
1939 
simd8x64simdutf::haswell::__anon13839::simd::simd8x641940     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
simd8x64simdutf::haswell::__anon13839::simd::simd8x641941     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
1942 
storesimdutf::haswell::__anon13839::simd::simd8x641943     simdutf_really_inline void store(T* ptr) const {
1944       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
1945       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
1946     }
1947 
to_bitmasksimdutf::haswell::__anon13839::simd::simd8x641948     simdutf_really_inline uint64_t to_bitmask() const {
1949       uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
1950       uint64_t r_hi =                       this->chunks[1].to_bitmask();
1951       return r_lo | (r_hi << 32);
1952     }
1953 
operator |=simdutf::haswell::__anon13839::simd::simd8x641954     simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
1955       this->chunks[0] |= other.chunks[0];
1956       this->chunks[1] |= other.chunks[1];
1957       return *this;
1958     }
1959 
reduce_orsimdutf::haswell::__anon13839::simd::simd8x641960     simdutf_really_inline simd8<T> reduce_or() const {
1961       return this->chunks[0] | this->chunks[1];
1962     }
1963 
is_asciisimdutf::haswell::__anon13839::simd::simd8x641964     simdutf_really_inline bool is_ascii() const {
1965       return this->reduce_or().is_ascii();
1966     }
1967 
1968     template <endianness endian>
store_ascii_as_utf16simdutf::haswell::__anon13839::simd::simd8x641969     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
1970       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
1971       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
1972     }
1973 
store_ascii_as_utf32simdutf::haswell::__anon13839::simd::simd8x641974     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
1975       this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
1976       this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
1977     }
1978 
bit_orsimdutf::haswell::__anon13839::simd::simd8x641979     simdutf_really_inline simd8x64<T> bit_or(const T m) const {
1980       const simd8<T> mask = simd8<T>::splat(m);
1981       return simd8x64<T>(
1982         this->chunks[0] | mask,
1983         this->chunks[1] | mask
1984       );
1985     }
1986 
eqsimdutf::haswell::__anon13839::simd::simd8x641987     simdutf_really_inline uint64_t eq(const T m) const {
1988       const simd8<T> mask = simd8<T>::splat(m);
1989       return  simd8x64<bool>(
1990         this->chunks[0] == mask,
1991         this->chunks[1] == mask
1992       ).to_bitmask();
1993     }
1994 
eqsimdutf::haswell::__anon13839::simd::simd8x641995     simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
1996       return  simd8x64<bool>(
1997         this->chunks[0] == other.chunks[0],
1998         this->chunks[1] == other.chunks[1]
1999       ).to_bitmask();
2000     }
2001 
lteqsimdutf::haswell::__anon13839::simd::simd8x642002     simdutf_really_inline uint64_t lteq(const T m) const {
2003       const simd8<T> mask = simd8<T>::splat(m);
2004       return  simd8x64<bool>(
2005         this->chunks[0] <= mask,
2006         this->chunks[1] <= mask
2007       ).to_bitmask();
2008     }
2009 
in_rangesimdutf::haswell::__anon13839::simd::simd8x642010     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2011       const simd8<T> mask_low = simd8<T>::splat(low);
2012       const simd8<T> mask_high = simd8<T>::splat(high);
2013 
2014       return  simd8x64<bool>(
2015         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2016         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
2017       ).to_bitmask();
2018     }
not_in_rangesimdutf::haswell::__anon13839::simd::simd8x642019     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2020       const simd8<T> mask_low = simd8<T>::splat(low);
2021       const simd8<T> mask_high = simd8<T>::splat(high);
2022       return  simd8x64<bool>(
2023         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
2024         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
2025       ).to_bitmask();
2026     }
ltsimdutf::haswell::__anon13839::simd::simd8x642027     simdutf_really_inline uint64_t lt(const T m) const {
2028       const simd8<T> mask = simd8<T>::splat(m);
2029       return  simd8x64<bool>(
2030         this->chunks[0] < mask,
2031         this->chunks[1] < mask
2032       ).to_bitmask();
2033     }
2034 
gtsimdutf::haswell::__anon13839::simd::simd8x642035     simdutf_really_inline uint64_t gt(const T m) const {
2036       const simd8<T> mask = simd8<T>::splat(m);
2037       return  simd8x64<bool>(
2038         this->chunks[0] > mask,
2039         this->chunks[1] > mask
2040       ).to_bitmask();
2041     }
gteqsimdutf::haswell::__anon13839::simd::simd8x642042     simdutf_really_inline uint64_t gteq(const T m) const {
2043       const simd8<T> mask = simd8<T>::splat(m);
2044       return  simd8x64<bool>(
2045         this->chunks[0] >= mask,
2046         this->chunks[1] >= mask
2047       ).to_bitmask();
2048     }
gteq_unsignedsimdutf::haswell::__anon13839::simd::simd8x642049     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
2050       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
2051       return  simd8x64<bool>(
2052         (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
2053         (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
2054       ).to_bitmask();
2055     }
2056   }; // struct simd8x64<T>
2057 
2058 /* begin file src/simdutf/haswell/simd16-inl.h */
2059 #ifdef __GNUC__
2060 #if __GNUC__ < 8
2061 #define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
2062 #define _mm256_setr_m128i(xmm2, xmm1)  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
2063 #endif
2064 #endif
2065 
2066 template<typename T>
2067 struct simd16;
2068 
2069 template<typename T, typename Mask=simd16<bool>>
2070 struct base16: base<simd16<T>> {
2071   using bitmask_type = uint32_t;
2072 
base16simdutf::haswell::__anon13839::simd::base162073   simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::haswell::__anon13839::simd::base162074   simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
2075   template <typename Pointer>
base16simdutf::haswell::__anon13839::simd::base162076   simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
operator ==simdutf::haswell::__anon13839::simd::base162077   friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
2078 
2079   /// the size of vector in bytes
2080   static const int SIZE = sizeof(base<simd16<T>>::value);
2081 
2082   /// the number of elements of type T a vector can hold
2083   static const int ELEMENTS = SIZE / sizeof(T);
2084 
2085   template<int N=1>
prevsimdutf::haswell::__anon13839::simd::base162086   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
2087     return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
2088   }
2089 };
2090 
2091 // SIMD byte mask type (returned by things like eq and gt)
2092 template<>
2093 struct simd16<bool>: base16<bool> {
splatsimdutf::haswell::__anon13839::simd::simd162094   static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
2095 
simd16simdutf::haswell::__anon13839::simd::simd162096   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::haswell::__anon13839::simd::simd162097   simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
2098   // Splat constructor
simd16simdutf::haswell::__anon13839::simd::simd162099   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
2100 
to_bitmasksimdutf::haswell::__anon13839::simd::simd162101   simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
anysimdutf::haswell::__anon13839::simd::simd162102   simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
operator ~simdutf::haswell::__anon13839::simd::simd162103   simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
2104 };
2105 
2106 template<typename T>
2107 struct base16_numeric: base16<T> {
splatsimdutf::haswell::__anon13839::simd::base16_numeric2108   static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
zerosimdutf::haswell::__anon13839::simd::base16_numeric2109   static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
loadsimdutf::haswell::__anon13839::simd::base16_numeric2110   static simdutf_really_inline simd16<T> load(const T values[8]) {
2111     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
2112   }
2113 
base16_numericsimdutf::haswell::__anon13839::simd::base16_numeric2114   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::haswell::__anon13839::simd::base16_numeric2115   simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
2116 
2117   // Store to array
storesimdutf::haswell::__anon13839::simd::base16_numeric2118   simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
2119 
2120   // Override to distinguish from bool version
operator ~simdutf::haswell::__anon13839::simd::base16_numeric2121   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
2122 
2123   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::haswell::__anon13839::simd::base16_numeric2124   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
operator -simdutf::haswell::__anon13839::simd::base16_numeric2125   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
operator +=simdutf::haswell::__anon13839::simd::base16_numeric2126   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::haswell::__anon13839::simd::base16_numeric2127   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
2128 };
2129 
2130 // Signed code units
2131 template<>
2132 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::haswell::__anon13839::simd::simd162133   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::haswell::__anon13839::simd::simd162134   simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
2135   // Splat constructor
simd16simdutf::haswell::__anon13839::simd::simd162136   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
2137   // Array constructor
simd16simdutf::haswell::__anon13839::simd::simd162138   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anon13839::simd::simd162139   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
2140   // Order-sensitive comparisons
max_valsimdutf::haswell::__anon13839::simd::simd162141   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
min_valsimdutf::haswell::__anon13839::simd::simd162142   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
operator >simdutf::haswell::__anon13839::simd::simd162143   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
operator <simdutf::haswell::__anon13839::simd::simd162144   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
2145 };
2146 
2147 // Unsigned code units
2148 template<>
2149 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::haswell::__anon13839::simd::simd162150   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::haswell::__anon13839::simd::simd162151   simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
2152 
2153   // Splat constructor
simd16simdutf::haswell::__anon13839::simd::simd162154   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
2155   // Array constructor
simd16simdutf::haswell::__anon13839::simd::simd162156   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::haswell::__anon13839::simd::simd162157   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
2158 
2159   // Saturated math
saturating_addsimdutf::haswell::__anon13839::simd::simd162160   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
saturating_subsimdutf::haswell::__anon13839::simd::simd162161   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
2162 
2163   // Order-specific operations
max_valsimdutf::haswell::__anon13839::simd::simd162164   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
min_valsimdutf::haswell::__anon13839::simd::simd162165   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
2166   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::haswell::__anon13839::simd::simd162167   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
2168   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::haswell::__anon13839::simd::simd162169   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::haswell::__anon13839::simd::simd162170   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::haswell::__anon13839::simd::simd162171   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::haswell::__anon13839::simd::simd162172   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::haswell::__anon13839::simd::simd162173   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
2174 
2175   // Bit-specific operations
bits_not_setsimdutf::haswell::__anon13839::simd::simd162176   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
bits_not_setsimdutf::haswell::__anon13839::simd::simd162177   simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::haswell::__anon13839::simd::simd162178   simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::haswell::__anon13839::simd::simd162179   simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
2180 
bits_not_set_anywheresimdutf::haswell::__anon13839::simd::simd162181   simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
any_bits_set_anywheresimdutf::haswell::__anon13839::simd::simd162182   simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::haswell::__anon13839::simd::simd162183   simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
any_bits_set_anywheresimdutf::haswell::__anon13839::simd::simd162184   simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2185   template<int N>
shrsimdutf::haswell::__anon13839::simd::simd162186   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
2187   template<int N>
shlsimdutf::haswell::__anon13839::simd::simd162188   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
2189   // Get one of the bits and make a bitmask out of it.
2190   // e.g. value.get_bit<7>() gets the high bit
2191   template<int N>
get_bitsimdutf::haswell::__anon13839::simd::simd162192   simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
2193 
2194   // Change the endianness
swap_bytessimdutf::haswell::__anon13839::simd::simd162195   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
2196     const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
2197                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
2198     return _mm256_shuffle_epi8(*this, swap);
2199   }
2200 
2201   // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector
packsimdutf::haswell::__anon13839::simd::simd162202   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
2203     // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
2204     //       we have to shuffle lanes in order to produce bytes in the
2205     //       correct order.
2206 
2207     // get the 0th lanes
2208     const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
2209     const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
2210 
2211     // get the 1st lanes
2212     const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
2213     const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
2214 
2215     // build new vectors (shuffle lanes)
2216     const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
2217     const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
2218 
2219     // pack code units in linear order from v0 and v1
2220     return _mm256_packus_epi16(t0, t1);
2221   }
2222 };
2223 
2224 
2225   template<typename T>
2226   struct simd16x32 {
2227     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
2228     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
2229     simd16<T> chunks[NUM_CHUNKS];
2230 
2231     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
2232     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
2233     simd16x32() = delete; // no default constructor allowed
2234 
simd16x32simdutf::haswell::__anon13839::simd::simd16x322235     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
simd16x32simdutf::haswell::__anon13839::simd::simd16x322236     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
2237 
storesimdutf::haswell::__anon13839::simd::simd16x322238     simdutf_really_inline void store(T* ptr) const {
2239       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
2240       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
2241     }
2242 
to_bitmasksimdutf::haswell::__anon13839::simd::simd16x322243     simdutf_really_inline uint64_t to_bitmask() const {
2244       uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
2245       uint64_t r_hi =                       this->chunks[1].to_bitmask();
2246       return r_lo | (r_hi << 32);
2247     }
2248 
reduce_orsimdutf::haswell::__anon13839::simd::simd16x322249     simdutf_really_inline simd16<T> reduce_or() const {
2250       return this->chunks[0] | this->chunks[1];
2251     }
2252 
is_asciisimdutf::haswell::__anon13839::simd::simd16x322253     simdutf_really_inline bool is_ascii() const {
2254       return this->reduce_or().is_ascii();
2255     }
2256 
store_ascii_as_utf16simdutf::haswell::__anon13839::simd::simd16x322257     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2258       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
2259       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
2260     }
2261 
bit_orsimdutf::haswell::__anon13839::simd::simd16x322262     simdutf_really_inline simd16x32<T> bit_or(const T m) const {
2263       const simd16<T> mask = simd16<T>::splat(m);
2264       return simd16x32<T>(
2265         this->chunks[0] | mask,
2266         this->chunks[1] | mask
2267       );
2268     }
2269 
swap_bytessimdutf::haswell::__anon13839::simd::simd16x322270     simdutf_really_inline void swap_bytes() {
2271       this->chunks[0] = this->chunks[0].swap_bytes();
2272       this->chunks[1] = this->chunks[1].swap_bytes();
2273     }
2274 
eqsimdutf::haswell::__anon13839::simd::simd16x322275     simdutf_really_inline uint64_t eq(const T m) const {
2276       const simd16<T> mask = simd16<T>::splat(m);
2277       return  simd16x32<bool>(
2278         this->chunks[0] == mask,
2279         this->chunks[1] == mask
2280       ).to_bitmask();
2281     }
2282 
eqsimdutf::haswell::__anon13839::simd::simd16x322283     simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
2284       return  simd16x32<bool>(
2285         this->chunks[0] == other.chunks[0],
2286         this->chunks[1] == other.chunks[1]
2287       ).to_bitmask();
2288     }
2289 
lteqsimdutf::haswell::__anon13839::simd::simd16x322290     simdutf_really_inline uint64_t lteq(const T m) const {
2291       const simd16<T> mask = simd16<T>::splat(m);
2292       return  simd16x32<bool>(
2293         this->chunks[0] <= mask,
2294         this->chunks[1] <= mask
2295       ).to_bitmask();
2296     }
2297 
in_rangesimdutf::haswell::__anon13839::simd::simd16x322298     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2299       const simd16<T> mask_low = simd16<T>::splat(low);
2300       const simd16<T> mask_high = simd16<T>::splat(high);
2301 
2302       return  simd16x32<bool>(
2303         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2304         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
2305       ).to_bitmask();
2306     }
not_in_rangesimdutf::haswell::__anon13839::simd::simd16x322307     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2308       const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
2309       const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
2310       return simd16x32<bool>(
2311         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2312         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
2313       ).to_bitmask();
2314     }
ltsimdutf::haswell::__anon13839::simd::simd16x322315     simdutf_really_inline uint64_t lt(const T m) const {
2316       const simd16<T> mask = simd16<T>::splat(m);
2317       return  simd16x32<bool>(
2318         this->chunks[0] < mask,
2319         this->chunks[1] < mask
2320       ).to_bitmask();
2321     }
2322   }; // struct simd16x32<T>
2323 /* end file src/simdutf/haswell/simd16-inl.h */
2324 
2325 } // namespace simd
2326 
2327 } // unnamed namespace
2328 } // namespace haswell
2329 } // namespace simdutf
2330 
2331 #endif // SIMDUTF_HASWELL_SIMD_H
2332 /* end file src/simdutf/haswell/simd.h */
2333 
2334 /* begin file src/simdutf/haswell/end.h */
2335 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
2336 // nothing needed.
2337 #else
2338 SIMDUTF_UNTARGET_REGION
2339 #endif
2340 
2341 
2342 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
2343 SIMDUTF_POP_DISABLE_WARNINGS
2344 #endif // end of workaround
2345 /* end file src/simdutf/haswell/end.h */
2346 
2347 #endif // SIMDUTF_IMPLEMENTATION_HASWELL
2348 #endif // SIMDUTF_HASWELL_COMMON_H
2349 /* end file src/simdutf/haswell.h */
2350 /* begin file src/simdutf/westmere.h */
2351 #ifndef SIMDUTF_WESTMERE_H
2352 #define SIMDUTF_WESTMERE_H
2353 
2354 #ifdef SIMDUTF_FALLBACK_H
2355 #error "westmere.h must be included before fallback.h"
2356 #endif
2357 
2358 
2359 // Default Westmere to on if this is x86-64, unless we'll always select Haswell.
2360 #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
2361 //
2362 // You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL)
2363 // because you want to rely on runtime dispatch!
2364 //
2365 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
2366 #define SIMDUTF_IMPLEMENTATION_WESTMERE 0
2367 #else
2368 #define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
2369 #endif
2370 
2371 #endif
2372 
2373 #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
2374 
2375 #if SIMDUTF_IMPLEMENTATION_WESTMERE
2376 
2377 #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
2378 
2379 namespace simdutf {
2380 /**
2381  * Implementation for Westmere (Intel SSE4.2).
2382  */
2383 namespace westmere {
2384 } // namespace westmere
2385 } // namespace simdutf
2386 
2387 //
2388 // These two need to be included outside SIMDUTF_TARGET_REGION
2389 //
2390 /* begin file src/simdutf/westmere/implementation.h */
2391 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
2392 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
2393 
2394 
2395 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
2396 namespace simdutf {
2397 namespace westmere {
2398 
2399 namespace {
2400 using namespace simdutf;
2401 }
2402 
2403 class implementation final : public simdutf::implementation {
2404 public:
implementation()2405   simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {}
2406   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
2407   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
2408   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
2409   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
2410   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
2411   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
2412   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
2413   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
2414   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
2415   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
2416   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
2417   simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
2418   simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2419   simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2420   simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2421   simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
2422   simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
2423   simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
2424   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2425   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2426   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2427   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
2428   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2429   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2430   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2431   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
2432   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2433   simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2434   simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2435   simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2436   simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2437   simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2438   simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
2439   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2440   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2441   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2442   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2443   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2444   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2445   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2446   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2447   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
2448   simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
2449   simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
2450   simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
2451   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2452   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2453   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2454   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2455   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2456   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
2457   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2458   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2459   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2460   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2461   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2462   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
2463   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
2464   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
2465   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
2466   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
2467   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2468   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2469   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2470   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2471   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
2472   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2473   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2474   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
2475   simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
2476   simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
2477   simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
2478   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
2479   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
2480   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
2481 };
2482 
2483 } // namespace westmere
2484 } // namespace simdutf
2485 
2486 #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
2487 /* end file src/simdutf/westmere/implementation.h */
2488 /* begin file src/simdutf/westmere/intrinsics.h */
2489 #ifndef SIMDUTF_WESTMERE_INTRINSICS_H
2490 #define SIMDUTF_WESTMERE_INTRINSICS_H
2491 
2492 #ifdef SIMDUTF_VISUAL_STUDIO
2493 // under clang within visual studio, this will include <x86intrin.h>
2494 #include <intrin.h> // visual studio or clang
2495 #else
2496 
2497 #if SIMDUTF_GCC11ORMORE
2498 // We should not get warnings while including <x86intrin.h> yet we do
2499 // under some versions of GCC.
2500 // If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
2502 SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
2503 #endif
2504 
2505 #include <x86intrin.h> // elsewhere
2506 
2507 
2508 #if SIMDUTF_GCC11ORMORE
2509 // cancels the suppression of the -Wuninitialized
2510 SIMDUTF_POP_DISABLE_WARNINGS
2511 #endif
2512 
2513 #endif // SIMDUTF_VISUAL_STUDIO
2514 
2515 
2516 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
2517 /**
2518  * You are not supposed, normally, to include these
2519  * headers directly. Instead you should either include intrin.h
2520  * or x86intrin.h. However, when compiling with clang
2521  * under Windows (i.e., when _MSC_VER is set), these headers
2522  * only get included *if* the corresponding features are detected
2523  * from macros:
2524  */
2525 #include <smmintrin.h>  // for _mm_alignr_epi8
2526 #endif
2527 
2528 
2529 
2530 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
2531 /* end file src/simdutf/westmere/intrinsics.h */
2532 
2533 //
2534 // The rest need to be inside the region
2535 //
2536 /* begin file src/simdutf/westmere/begin.h */
2537 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
2538 // #define SIMDUTF_IMPLEMENTATION westmere
2539 
2540 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
2541 // nothing needed.
2542 #else
2543 SIMDUTF_TARGET_WESTMERE
2544 #endif
2545 /* end file src/simdutf/westmere/begin.h */
2546 
2547 // Declarations
2548 /* begin file src/simdutf/westmere/bitmanipulation.h */
2549 #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
2550 #define SIMDUTF_WESTMERE_BITMANIPULATION_H
2551 
2552 namespace simdutf {
2553 namespace westmere {
2554 namespace {
2555 
2556 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)2557 simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
2558   // note: we do not support legacy 32-bit Windows
2559   return __popcnt64(input_num);// Visual Studio wants two underscores
2560 }
2561 #else
2562 simdutf_really_inline long long int count_ones(uint64_t input_num) {
2563   return _popcnt64(input_num);
2564 }
2565 #endif
2566 
2567 } // unnamed namespace
2568 } // namespace westmere
2569 } // namespace simdutf
2570 
2571 #endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
2572 /* end file src/simdutf/westmere/bitmanipulation.h */
2573 /* begin file src/simdutf/westmere/simd.h */
2574 #ifndef SIMDUTF_WESTMERE_SIMD_H
2575 #define SIMDUTF_WESTMERE_SIMD_H
2576 
2577 namespace simdutf {
2578 namespace westmere {
2579 namespace {
2580 namespace simd {
2581 
2582   template<typename Child>
2583   struct base {
2584     __m128i value;
2585 
2586     // Zero constructor
basesimdutf::westmere::__anon13842::simd::base2587     simdutf_really_inline base() : value{__m128i()} {}
2588 
2589     // Conversion from SIMD register
basesimdutf::westmere::__anon13842::simd::base2590     simdutf_really_inline base(const __m128i _value) : value(_value) {}
2591     // Conversion to SIMD register
operator const __m128i&simdutf::westmere::__anon13842::simd::base2592     simdutf_really_inline operator const __m128i&() const { return this->value; }
operator __m128i&simdutf::westmere::__anon13842::simd::base2593     simdutf_really_inline operator __m128i&() { return this->value; }
2594     template <endianness big_endian>
store_ascii_as_utf16simdutf::westmere::__anon13842::simd::base2595     simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
2596       __m128i first = _mm_cvtepu8_epi16(*this);
2597       __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
2598       if (big_endian) {
2599         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2600         first = _mm_shuffle_epi8(first, swap);
2601         second = _mm_shuffle_epi8(second, swap);
2602       }
2603       _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
2604       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
2605     }
store_ascii_as_utf32simdutf::westmere::__anon13842::simd::base2606     simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
2607       _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
2608       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
2609       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
2610       _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
2611     }
2612     // Bit operations
operator |simdutf::westmere::__anon13842::simd::base2613     simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
operator &simdutf::westmere::__anon13842::simd::base2614     simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
operator ^simdutf::westmere::__anon13842::simd::base2615     simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
bit_andnotsimdutf::westmere::__anon13842::simd::base2616     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
operator |=simdutf::westmere::__anon13842::simd::base2617     simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
operator &=simdutf::westmere::__anon13842::simd::base2618     simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
operator ^=simdutf::westmere::__anon13842::simd::base2619     simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
2620   };
2621 
2622   // Forward-declared so they can be used by splat and friends.
2623   template<typename T>
2624   struct simd8;
2625 
2626   template<typename T, typename Mask=simd8<bool>>
2627   struct base8: base<simd8<T>> {
2628     typedef uint16_t bitmask_t;
2629     typedef uint32_t bitmask2_t;
2630 
firstsimdutf::westmere::__anon13842::simd::base82631     simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
lastsimdutf::westmere::__anon13842::simd::base82632     simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
base8simdutf::westmere::__anon13842::simd::base82633     simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::westmere::__anon13842::simd::base82634     simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
2635 
operator ==simdutf::westmere::__anon13842::simd::base82636     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
2637 
2638     static const int SIZE = sizeof(base<simd8<T>>::value);
2639 
2640     template<int N=1>
prevsimdutf::westmere::__anon13842::simd::base82641     simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
2642       return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
2643     }
2644   };
2645 
2646   // SIMD byte mask type (returned by things like eq and gt)
2647   template<>
2648   struct simd8<bool>: base8<bool> {
splatsimdutf::westmere::__anon13842::simd::simd82649     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
2650 
simd8simdutf::westmere::__anon13842::simd::simd82651     simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::westmere::__anon13842::simd::simd82652     simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
2653     // Splat constructor
simd8simdutf::westmere::__anon13842::simd::simd82654     simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
2655 
to_bitmasksimdutf::westmere::__anon13842::simd::simd82656     simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anon13842::simd::simd82657     simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
nonesimdutf::westmere::__anon13842::simd::simd82658     simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
allsimdutf::westmere::__anon13842::simd::simd82659     simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
operator ~simdutf::westmere::__anon13842::simd::simd82660     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
2661   };
2662 
2663   template<typename T>
2664   struct base8_numeric: base8<T> {
splatsimdutf::westmere::__anon13842::simd::base8_numeric2665     static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
zerosimdutf::westmere::__anon13842::simd::base8_numeric2666     static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anon13842::simd::base8_numeric2667     static simdutf_really_inline simd8<T> load(const T values[16]) {
2668       return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2669     }
2670     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anon13842::simd::base8_numeric2671     static simdutf_really_inline simd8<T> repeat_16(
2672       T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
2673       T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
2674     ) {
2675       return simd8<T>(
2676         v0, v1, v2, v3, v4, v5, v6, v7,
2677         v8, v9, v10,v11,v12,v13,v14,v15
2678       );
2679     }
2680 
base8_numericsimdutf::westmere::__anon13842::simd::base8_numeric2681     simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::westmere::__anon13842::simd::base8_numeric2682     simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
2683 
2684     // Store to array
storesimdutf::westmere::__anon13842::simd::base8_numeric2685     simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
2686 
2687     // Override to distinguish from bool version
operator ~simdutf::westmere::__anon13842::simd::base8_numeric2688     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
2689 
2690     // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anon13842::simd::base8_numeric2691     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
operator -simdutf::westmere::__anon13842::simd::base8_numeric2692     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
operator +=simdutf::westmere::__anon13842::simd::base8_numeric2693     simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
operator -=simdutf::westmere::__anon13842::simd::base8_numeric2694     simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
2695 
2696     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
2697     template<typename L>
lookup_16simdutf::westmere::__anon13842::simd::base8_numeric2698     simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
2699       return _mm_shuffle_epi8(lookup_table, *this);
2700     }
2701 
2702     template<typename L>
lookup_16simdutf::westmere::__anon13842::simd::base8_numeric2703     simdutf_really_inline simd8<L> lookup_16(
2704         L replace0,  L replace1,  L replace2,  L replace3,
2705         L replace4,  L replace5,  L replace6,  L replace7,
2706         L replace8,  L replace9,  L replace10, L replace11,
2707         L replace12, L replace13, L replace14, L replace15) const {
2708       return lookup_16(simd8<L>::repeat_16(
2709         replace0,  replace1,  replace2,  replace3,
2710         replace4,  replace5,  replace6,  replace7,
2711         replace8,  replace9,  replace10, replace11,
2712         replace12, replace13, replace14, replace15
2713       ));
2714     }
2715   };
2716 
2717   // Signed bytes
2718   template<>
2719   struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::westmere::__anon13842::simd::simd82720     simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::westmere::__anon13842::simd::simd82721     simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
2722     // Splat constructor
simd8simdutf::westmere::__anon13842::simd::simd82723     simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
2724     // Array constructor
simd8simdutf::westmere::__anon13842::simd::simd82725     simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
2726     // Member-by-member initialization
simd8simdutf::westmere::__anon13842::simd::simd82727     simdutf_really_inline simd8(
2728       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
2729       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2730     ) : simd8(_mm_setr_epi8(
2731       v0, v1, v2, v3, v4, v5, v6, v7,
2732       v8, v9, v10,v11,v12,v13,v14,v15
2733     )) {}
2734     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anon13842::simd::simd82735     simdutf_really_inline static simd8<int8_t> repeat_16(
2736       int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
2737       int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
2738     ) {
2739       return simd8<int8_t>(
2740         v0, v1, v2, v3, v4, v5, v6, v7,
2741         v8, v9, v10,v11,v12,v13,v14,v15
2742       );
2743     }
2744     simdutf_really_inline operator simd8<uint8_t>() const;
is_asciisimdutf::westmere::__anon13842::simd::simd82745     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2746 
2747     // Order-sensitive comparisons
max_valsimdutf::westmere::__anon13842::simd::simd82748     simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
min_valsimdutf::westmere::__anon13842::simd::simd82749     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
operator >simdutf::westmere::__anon13842::simd::simd82750     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
operator <simdutf::westmere::__anon13842::simd::simd82751     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
2752   };
2753 
2754   // Unsigned bytes
2755   template<>
2756   struct simd8<uint8_t>: base8_numeric<uint8_t>  {
simd8simdutf::westmere::__anon13842::simd::simd82757     simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::westmere::__anon13842::simd::simd82758     simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
2759 
2760     // Splat constructor
simd8simdutf::westmere::__anon13842::simd::simd82761     simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
2762     // Array constructor
simd8simdutf::westmere::__anon13842::simd::simd82763     simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
2764     // Member-by-member initialization
simd8simdutf::westmere::__anon13842::simd::simd82765     simdutf_really_inline simd8(
2766       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
2767       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2768     ) : simd8(_mm_setr_epi8(
2769       v0, v1, v2, v3, v4, v5, v6, v7,
2770       v8, v9, v10,v11,v12,v13,v14,v15
2771     )) {}
2772     // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anon13842::simd::simd82773     simdutf_really_inline static simd8<uint8_t> repeat_16(
2774       uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
2775       uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
2776     ) {
2777       return simd8<uint8_t>(
2778         v0, v1, v2, v3, v4, v5, v6, v7,
2779         v8, v9, v10,v11,v12,v13,v14,v15
2780       );
2781     }
2782 
2783     // Saturated math
saturating_addsimdutf::westmere::__anon13842::simd::simd82784     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
saturating_subsimdutf::westmere::__anon13842::simd::simd82785     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
2786 
2787     // Order-specific operations
max_valsimdutf::westmere::__anon13842::simd::simd82788     simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
min_valsimdutf::westmere::__anon13842::simd::simd82789     simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
2790     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anon13842::simd::simd82791     simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
2792     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anon13842::simd::simd82793     simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anon13842::simd::simd82794     simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anon13842::simd::simd82795     simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::westmere::__anon13842::simd::simd82796     simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::westmere::__anon13842::simd::simd82797     simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
2798 
2799     // Bit-specific operations
bits_not_setsimdutf::westmere::__anon13842::simd::simd82800     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
bits_not_setsimdutf::westmere::__anon13842::simd::simd82801     simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::westmere::__anon13842::simd::simd82802     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::westmere::__anon13842::simd::simd82803     simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
is_asciisimdutf::westmere::__anon13842::simd::simd82804     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
2805 
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd82806     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd82807     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd82808     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd82809     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
2810     template<int N>
shrsimdutf::westmere::__anon13842::simd::simd82811     simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
2812     template<int N>
shlsimdutf::westmere::__anon13842::simd::simd82813     simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
2814     // Get one of the bits and make a bitmask out of it.
2815     // e.g. value.get_bit<7>() gets the high bit
2816     template<int N>
get_bitsimdutf::westmere::__anon13842::simd::simd82817     simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
2818   };
operator simd8<uint8_t>() const2819   simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
2820 
2821   // Unsigned bytes
2822   template<>
2823   struct simd8<uint16_t>: base<uint16_t> {
splatsimdutf::westmere::__anon13842::simd::simd82824     static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
loadsimdutf::westmere::__anon13842::simd::simd82825     static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
2826       return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2827     }
2828 
simd8simdutf::westmere::__anon13842::simd::simd82829     simdutf_really_inline simd8() : base<uint16_t>() {}
simd8simdutf::westmere::__anon13842::simd::simd82830     simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
2831     // Splat constructor
simd8simdutf::westmere::__anon13842::simd::simd82832     simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
2833     // Array constructor
simd8simdutf::westmere::__anon13842::simd::simd82834     simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
2835     // Member-by-member initialization
simd8simdutf::westmere::__anon13842::simd::simd82836     simdutf_really_inline simd8(
2837       uint16_t v0,  uint16_t v1,  uint16_t v2,  uint16_t v3,  uint16_t v4,  uint16_t v5,  uint16_t v6,  uint16_t v7
2838     ) : simd8(_mm_setr_epi16(
2839       v0, v1, v2, v3, v4, v5, v6, v7
2840     )) {}
2841 
2842     // Saturated math
saturating_addsimdutf::westmere::__anon13842::simd::simd82843     simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anon13842::simd::simd82844     simdutf_really_inline simd8<uint16_t> saturating_sub(const simd8<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
2845 
2846     // Order-specific operations
max_valsimdutf::westmere::__anon13842::simd::simd82847     simdutf_really_inline simd8<uint16_t> max_val(const simd8<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anon13842::simd::simd82848     simdutf_really_inline simd8<uint16_t> min_val(const simd8<uint16_t> other) const { return _mm_min_epu16(*this, other); }
2849     // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anon13842::simd::simd82850     simdutf_really_inline simd8<uint16_t> gt_bits(const simd8<uint16_t> other) const { return this->saturating_sub(other); }
2851     // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anon13842::simd::simd82852     simdutf_really_inline simd8<uint16_t> lt_bits(const simd8<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anon13842::simd::simd82853     simdutf_really_inline simd8<bool> operator<=(const simd8<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anon13842::simd::simd82854     simdutf_really_inline simd8<bool> operator>=(const simd8<uint16_t> other) const { return other.min_val(*this) == other; }
operator ==simdutf::westmere::__anon13842::simd::simd82855     simdutf_really_inline simd8<bool> operator==(const simd8<uint16_t> other) const { return _mm_cmpeq_epi16(*this, other); }
operator &simdutf::westmere::__anon13842::simd::simd82856     simdutf_really_inline simd8<bool> operator&(const simd8<uint16_t> other) const { return _mm_and_si128(*this, other); }
operator |simdutf::westmere::__anon13842::simd::simd82857     simdutf_really_inline simd8<bool> operator|(const simd8<uint16_t> other) const { return _mm_or_si128(*this, other); }
2858 
2859     // Bit-specific operations
bits_not_setsimdutf::westmere::__anon13842::simd::simd82860     simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint16_t(0); }
any_bits_setsimdutf::westmere::__anon13842::simd::simd82861     simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
2862 
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd82863     simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd82864     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd82865     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd82866     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
2867      };
2868   template<typename T>
2869   struct simd8x64 {
2870     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
2871     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
2872     simd8<T> chunks[NUM_CHUNKS];
2873 
2874     simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
2875     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
2876     simd8x64() = delete; // no default constructor allowed
2877 
simd8x64simdutf::westmere::__anon13842::simd::simd8x642878     simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd8x64simdutf::westmere::__anon13842::simd::simd8x642879     simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
2880 
storesimdutf::westmere::__anon13842::simd::simd8x642881     simdutf_really_inline void store(T* ptr) const {
2882       this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
2883       this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
2884       this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
2885       this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
2886     }
2887 
operator |=simdutf::westmere::__anon13842::simd::simd8x642888     simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
2889       this->chunks[0] |= other.chunks[0];
2890       this->chunks[1] |= other.chunks[1];
2891       this->chunks[2] |= other.chunks[2];
2892       this->chunks[3] |= other.chunks[3];
2893       return *this;
2894     }
2895 
reduce_orsimdutf::westmere::__anon13842::simd::simd8x642896     simdutf_really_inline simd8<T> reduce_or() const {
2897       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
2898     }
2899 
is_asciisimdutf::westmere::__anon13842::simd::simd8x642900     simdutf_really_inline bool is_ascii() const {
2901       return this->reduce_or().is_ascii();
2902     }
2903 
2904     template <endianness endian>
store_ascii_as_utf16simdutf::westmere::__anon13842::simd::simd8x642905     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
2906       this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
2907       this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
2908       this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
2909       this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
2910     }
2911 
store_ascii_as_utf32simdutf::westmere::__anon13842::simd::simd8x642912     simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
2913       this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
2914       this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
2915       this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
2916       this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
2917     }
2918 
to_bitmasksimdutf::westmere::__anon13842::simd::simd8x642919     simdutf_really_inline uint64_t to_bitmask() const {
2920       uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
2921       uint64_t r1 =          this->chunks[1].to_bitmask();
2922       uint64_t r2 =          this->chunks[2].to_bitmask();
2923       uint64_t r3 =          this->chunks[3].to_bitmask();
2924       return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
2925     }
2926 
eqsimdutf::westmere::__anon13842::simd::simd8x642927     simdutf_really_inline uint64_t eq(const T m) const {
2928       const simd8<T> mask = simd8<T>::splat(m);
2929       return  simd8x64<bool>(
2930         this->chunks[0] == mask,
2931         this->chunks[1] == mask,
2932         this->chunks[2] == mask,
2933         this->chunks[3] == mask
2934       ).to_bitmask();
2935     }
2936 
eqsimdutf::westmere::__anon13842::simd::simd8x642937     simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
2938       return  simd8x64<bool>(
2939         this->chunks[0] == other.chunks[0],
2940         this->chunks[1] == other.chunks[1],
2941         this->chunks[2] == other.chunks[2],
2942         this->chunks[3] == other.chunks[3]
2943       ).to_bitmask();
2944     }
2945 
lteqsimdutf::westmere::__anon13842::simd::simd8x642946     simdutf_really_inline uint64_t lteq(const T m) const {
2947       const simd8<T> mask = simd8<T>::splat(m);
2948       return  simd8x64<bool>(
2949         this->chunks[0] <= mask,
2950         this->chunks[1] <= mask,
2951         this->chunks[2] <= mask,
2952         this->chunks[3] <= mask
2953       ).to_bitmask();
2954     }
2955 
in_rangesimdutf::westmere::__anon13842::simd::simd8x642956     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
2957       const simd8<T> mask_low = simd8<T>::splat(low);
2958       const simd8<T> mask_high = simd8<T>::splat(high);
2959 
2960       return  simd8x64<bool>(
2961         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
2962         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
2963         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
2964         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
2965       ).to_bitmask();
2966     }
not_in_rangesimdutf::westmere::__anon13842::simd::simd8x642967     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
2968       const simd8<T> mask_low = simd8<T>::splat(low-1);
2969       const simd8<T> mask_high = simd8<T>::splat(high+1);
2970       return simd8x64<bool>(
2971         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
2972         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
2973         (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
2974         (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
2975       ).to_bitmask();
2976     }
ltsimdutf::westmere::__anon13842::simd::simd8x642977     simdutf_really_inline uint64_t lt(const T m) const {
2978       const simd8<T> mask = simd8<T>::splat(m);
2979       return  simd8x64<bool>(
2980         this->chunks[0] < mask,
2981         this->chunks[1] < mask,
2982         this->chunks[2] < mask,
2983         this->chunks[3] < mask
2984       ).to_bitmask();
2985     }
2986 
gtsimdutf::westmere::__anon13842::simd::simd8x642987     simdutf_really_inline uint64_t gt(const T m) const {
2988       const simd8<T> mask = simd8<T>::splat(m);
2989       return  simd8x64<bool>(
2990         this->chunks[0] > mask,
2991         this->chunks[1] > mask,
2992         this->chunks[2] > mask,
2993         this->chunks[3] > mask
2994       ).to_bitmask();
2995     }
gteqsimdutf::westmere::__anon13842::simd::simd8x642996     simdutf_really_inline uint64_t gteq(const T m) const {
2997       const simd8<T> mask = simd8<T>::splat(m);
2998       return  simd8x64<bool>(
2999         this->chunks[0] >= mask,
3000         this->chunks[1] >= mask,
3001         this->chunks[2] >= mask,
3002         this->chunks[3] >= mask
3003       ).to_bitmask();
3004     }
gteq_unsignedsimdutf::westmere::__anon13842::simd::simd8x643005     simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
3006       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
3007       return  simd8x64<bool>(
3008         simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
3009         simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
3010         simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
3011         simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
3012       ).to_bitmask();
3013     }
3014   }; // struct simd8x64<T>
3015 
3016 /* begin file src/simdutf/westmere/simd16-inl.h */
3017 template<typename T>
3018 struct simd16;
3019 
3020 template<typename T, typename Mask=simd16<bool>>
3021 struct base16: base<simd16<T>> {
3022   typedef uint16_t bitmask_t;
3023   typedef uint32_t bitmask2_t;
3024 
base16simdutf::westmere::__anon13842::simd::base163025   simdutf_really_inline base16() : base<simd16<T>>() {}
base16simdutf::westmere::__anon13842::simd::base163026   simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
3027   template <typename Pointer>
base16simdutf::westmere::__anon13842::simd::base163028   simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
3029 
operator ==simdutf::westmere::__anon13842::simd::base163030   friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
3031 
3032   static const int SIZE = sizeof(base<simd16<T>>::value);
3033 
3034   template<int N=1>
prevsimdutf::westmere::__anon13842::simd::base163035   simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
3036     return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
3037   }
3038 };
3039 
3040 // SIMD byte mask type (returned by things like eq and gt)
3041 template<>
3042 struct simd16<bool>: base16<bool> {
splatsimdutf::westmere::__anon13842::simd::simd163043   static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
3044 
simd16simdutf::westmere::__anon13842::simd::simd163045   simdutf_really_inline simd16<bool>() : base16() {}
simd16simdutf::westmere::__anon13842::simd::simd163046   simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
3047   // Splat constructor
simd16simdutf::westmere::__anon13842::simd::simd163048   simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
3049 
to_bitmasksimdutf::westmere::__anon13842::simd::simd163050   simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
anysimdutf::westmere::__anon13842::simd::simd163051   simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
operator ~simdutf::westmere::__anon13842::simd::simd163052   simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
3053 };
3054 
3055 template<typename T>
3056 struct base16_numeric: base16<T> {
splatsimdutf::westmere::__anon13842::simd::base16_numeric3057   static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
zerosimdutf::westmere::__anon13842::simd::base16_numeric3058   static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
loadsimdutf::westmere::__anon13842::simd::base16_numeric3059   static simdutf_really_inline simd16<T> load(const T values[8]) {
3060     return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
3061   }
3062 
base16_numericsimdutf::westmere::__anon13842::simd::base16_numeric3063   simdutf_really_inline base16_numeric() : base16<T>() {}
base16_numericsimdutf::westmere::__anon13842::simd::base16_numeric3064   simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
3065 
3066   // Store to array
storesimdutf::westmere::__anon13842::simd::base16_numeric3067   simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
3068 
3069   // Override to distinguish from bool version
operator ~simdutf::westmere::__anon13842::simd::base16_numeric3070   simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
3071 
3072   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::westmere::__anon13842::simd::base16_numeric3073   simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
operator -simdutf::westmere::__anon13842::simd::base16_numeric3074   simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
operator +=simdutf::westmere::__anon13842::simd::base16_numeric3075   simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
operator -=simdutf::westmere::__anon13842::simd::base16_numeric3076   simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
3077 };
3078 
3079 // Signed code units
3080 template<>
3081 struct simd16<int16_t> : base16_numeric<int16_t> {
simd16simdutf::westmere::__anon13842::simd::simd163082   simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
simd16simdutf::westmere::__anon13842::simd::simd163083   simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
3084   // Splat constructor
simd16simdutf::westmere::__anon13842::simd::simd163085   simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
3086   // Array constructor
simd16simdutf::westmere::__anon13842::simd::simd163087   simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anon13842::simd::simd163088   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
3089   // Member-by-member initialization
simd16simdutf::westmere::__anon13842::simd::simd163090   simdutf_really_inline simd16(
3091     int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
3092     : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3093   simdutf_really_inline operator simd16<uint16_t>() const;
3094 
3095   // Order-sensitive comparisons
max_valsimdutf::westmere::__anon13842::simd::simd163096   simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
min_valsimdutf::westmere::__anon13842::simd::simd163097   simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
operator >simdutf::westmere::__anon13842::simd::simd163098   simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
operator <simdutf::westmere::__anon13842::simd::simd163099   simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
3100 };
3101 
3102 // Unsigned code units
3103 template<>
3104 struct simd16<uint16_t>: base16_numeric<uint16_t>  {
simd16simdutf::westmere::__anon13842::simd::simd163105   simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
simd16simdutf::westmere::__anon13842::simd::simd163106   simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
3107 
3108   // Splat constructor
simd16simdutf::westmere::__anon13842::simd::simd163109   simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
3110   // Array constructor
simd16simdutf::westmere::__anon13842::simd::simd163111   simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
simd16simdutf::westmere::__anon13842::simd::simd163112   simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
3113   // Member-by-member initialization
simd16simdutf::westmere::__anon13842::simd::simd163114   simdutf_really_inline simd16(
3115     uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
3116   : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
3117   // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::westmere::__anon13842::simd::simd163118   simdutf_really_inline static simd16<uint16_t> repeat_16(
3119     uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
3120   ) {
3121     return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
3122   }
3123 
3124   // Saturated math
saturating_addsimdutf::westmere::__anon13842::simd::simd163125   simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
saturating_subsimdutf::westmere::__anon13842::simd::simd163126   simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
3127 
3128   // Order-specific operations
max_valsimdutf::westmere::__anon13842::simd::simd163129   simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
min_valsimdutf::westmere::__anon13842::simd::simd163130   simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
3131   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
gt_bitssimdutf::westmere::__anon13842::simd::simd163132   simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
3133   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
lt_bitssimdutf::westmere::__anon13842::simd::simd163134   simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
operator <=simdutf::westmere::__anon13842::simd::simd163135   simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
operator >=simdutf::westmere::__anon13842::simd::simd163136   simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
operator >simdutf::westmere::__anon13842::simd::simd163137   simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
operator <simdutf::westmere::__anon13842::simd::simd163138   simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
3139 
3140   // Bit-specific operations
bits_not_setsimdutf::westmere::__anon13842::simd::simd163141   simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
bits_not_setsimdutf::westmere::__anon13842::simd::simd163142   simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
any_bits_setsimdutf::westmere::__anon13842::simd::simd163143   simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
any_bits_setsimdutf::westmere::__anon13842::simd::simd163144   simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
3145 
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd163146   simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd163147   simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
bits_not_set_anywheresimdutf::westmere::__anon13842::simd::simd163148   simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
any_bits_set_anywheresimdutf::westmere::__anon13842::simd::simd163149   simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
3150   template<int N>
shrsimdutf::westmere::__anon13842::simd::simd163151   simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
3152   template<int N>
shlsimdutf::westmere::__anon13842::simd::simd163153   simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
3154   // Get one of the bits and make a bitmask out of it.
3155   // e.g. value.get_bit<7>() gets the high bit
3156   template<int N>
get_bitsimdutf::westmere::__anon13842::simd::simd163157   simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
3158 
3159   // Change the endianness
swap_bytessimdutf::westmere::__anon13842::simd::simd163160   simdutf_really_inline simd16<uint16_t> swap_bytes() const {
3161     const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
3162     return _mm_shuffle_epi8(*this, swap);
3163   }
3164 
3165   // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
packsimdutf::westmere::__anon13842::simd::simd163166   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
3167     return _mm_packus_epi16(v0, v1);
3168   }
3169 };
operator simd16<uint16_t>() const3170 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
3171 
3172 template<typename T>
3173   struct simd16x32 {
3174     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
3175     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
3176     simd16<T> chunks[NUM_CHUNKS];
3177 
3178     simd16x32(const simd16x32<T>& o) = delete; // no copy allowed
3179     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
3180     simd16x32() = delete; // no default constructor allowed
3181 
simd16x32simdutf::westmere::__anon13842::simd::simd16x323182     simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
simd16x32simdutf::westmere::__anon13842::simd::simd16x323183     simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
3184 
storesimdutf::westmere::__anon13842::simd::simd16x323185     simdutf_really_inline void store(T* ptr) const {
3186       this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
3187       this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
3188       this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
3189       this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
3190     }
3191 
reduce_orsimdutf::westmere::__anon13842::simd::simd16x323192     simdutf_really_inline simd16<T> reduce_or() const {
3193       return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
3194     }
3195 
is_asciisimdutf::westmere::__anon13842::simd::simd16x323196     simdutf_really_inline bool is_ascii() const {
3197       return this->reduce_or().is_ascii();
3198     }
3199 
store_ascii_as_utf16simdutf::westmere::__anon13842::simd::simd16x323200     simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
3201       this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
3202       this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
3203       this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
3204       this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
3205     }
3206 
to_bitmasksimdutf::westmere::__anon13842::simd::simd16x323207     simdutf_really_inline uint64_t to_bitmask() const {
3208       uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
3209       uint64_t r1 =          this->chunks[1].to_bitmask();
3210       uint64_t r2 =          this->chunks[2].to_bitmask();
3211       uint64_t r3 =          this->chunks[3].to_bitmask();
3212       return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
3213     }
3214 
swap_bytessimdutf::westmere::__anon13842::simd::simd16x323215     simdutf_really_inline void swap_bytes() {
3216       this->chunks[0] = this->chunks[0].swap_bytes();
3217       this->chunks[1] = this->chunks[1].swap_bytes();
3218       this->chunks[2] = this->chunks[2].swap_bytes();
3219       this->chunks[3] = this->chunks[3].swap_bytes();
3220     }
3221 
eqsimdutf::westmere::__anon13842::simd::simd16x323222     simdutf_really_inline uint64_t eq(const T m) const {
3223       const simd16<T> mask = simd16<T>::splat(m);
3224       return  simd16x32<bool>(
3225         this->chunks[0] == mask,
3226         this->chunks[1] == mask,
3227         this->chunks[2] == mask,
3228         this->chunks[3] == mask
3229       ).to_bitmask();
3230     }
3231 
eqsimdutf::westmere::__anon13842::simd::simd16x323232     simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
3233       return  simd16x32<bool>(
3234         this->chunks[0] == other.chunks[0],
3235         this->chunks[1] == other.chunks[1],
3236         this->chunks[2] == other.chunks[2],
3237         this->chunks[3] == other.chunks[3]
3238       ).to_bitmask();
3239     }
3240 
lteqsimdutf::westmere::__anon13842::simd::simd16x323241     simdutf_really_inline uint64_t lteq(const T m) const {
3242       const simd16<T> mask = simd16<T>::splat(m);
3243       return  simd16x32<bool>(
3244         this->chunks[0] <= mask,
3245         this->chunks[1] <= mask,
3246         this->chunks[2] <= mask,
3247         this->chunks[3] <= mask
3248       ).to_bitmask();
3249     }
3250 
in_rangesimdutf::westmere::__anon13842::simd::simd16x323251     simdutf_really_inline uint64_t in_range(const T low, const T high) const {
3252       const simd16<T> mask_low = simd16<T>::splat(low);
3253       const simd16<T> mask_high = simd16<T>::splat(high);
3254 
3255       return  simd16x32<bool>(
3256         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
3257         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
3258         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
3259         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
3260       ).to_bitmask();
3261     }
not_in_rangesimdutf::westmere::__anon13842::simd::simd16x323262     simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
3263       const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
3264       const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
3265       return simd16x32<bool>(
3266         (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
3267         (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
3268         (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
3269         (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
3270       ).to_bitmask();
3271     }
ltsimdutf::westmere::__anon13842::simd::simd16x323272     simdutf_really_inline uint64_t lt(const T m) const {
3273       const simd16<T> mask = simd16<T>::splat(m);
3274       return  simd16x32<bool>(
3275         this->chunks[0] < mask,
3276         this->chunks[1] < mask,
3277         this->chunks[2] < mask,
3278         this->chunks[3] < mask
3279       ).to_bitmask();
3280     }
3281   }; // struct simd16x32<T>
3282 /* end file src/simdutf/westmere/simd16-inl.h */
3283 
3284 } // namespace simd
3285 } // unnamed namespace
3286 } // namespace westmere
3287 } // namespace simdutf
3288 
3289 #endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
3290 /* end file src/simdutf/westmere/simd.h */
3291 
3292 /* begin file src/simdutf/westmere/end.h */
3293 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
3294 // nothing needed.
3295 #else
3296 SIMDUTF_UNTARGET_REGION
3297 #endif
3298 
3299 /* end file src/simdutf/westmere/end.h */
3300 
3301 #endif // SIMDUTF_IMPLEMENTATION_WESTMERE
3302 #endif // SIMDUTF_WESTMERE_COMMON_H
3303 /* end file src/simdutf/westmere.h */
3304 /* begin file src/simdutf/ppc64.h */
3305 #ifndef SIMDUTF_PPC64_H
3306 #define SIMDUTF_PPC64_H
3307 
3308 #ifdef SIMDUTF_FALLBACK_H
3309 #error "ppc64.h must be included before fallback.h"
3310 #endif
3311 
3312 
3313 #ifndef SIMDUTF_IMPLEMENTATION_PPC64
3314 #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
3315 #endif
3316 #define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64
3317 
3318 
3319 
3320 #if SIMDUTF_IMPLEMENTATION_PPC64
3321 
3322 namespace simdutf {
3323 /**
3324  * Implementation for ALTIVEC (PPC64).
3325  */
3326 namespace ppc64 {
3327 } // namespace ppc64
3328 } // namespace simdutf
3329 
3330 /* begin file src/simdutf/ppc64/implementation.h */
3331 #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
3332 #define SIMDUTF_PPC64_IMPLEMENTATION_H
3333 
3334 
3335 namespace simdutf {
3336 namespace ppc64 {
3337 
3338 namespace {
3339 using namespace simdutf;
3340 } // namespace
3341 
3342 class implementation final : public simdutf::implementation {
3343 public:
implementation()3344   simdutf_really_inline implementation()
3345       : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
3346                                  internal::instruction_set::ALTIVEC) {}
3347   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
3348   simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
3349   simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
3350   simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
3351   simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
3352   simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
3353   simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
3354   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
3355   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
3356   simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
3357   simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
3358   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3359   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3360   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3361   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
3362   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3363   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3364   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
3365   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
3366   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3367   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3368   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3369   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3370   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3371   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3372   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3373   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3374   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3375   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
3376   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3377   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3378   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3379   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3380   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3381   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
3382   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3383   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3384   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3385   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3386   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3387   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
3388   void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
3389   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
3390   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
3391   simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
3392   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3393   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3394   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3395   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3396   simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
3397   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3398   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3399   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
3400 };
3401 
3402 } // namespace ppc64
3403 } // namespace simdutf
3404 
3405 #endif // SIMDUTF_PPC64_IMPLEMENTATION_H
3406 /* end file src/simdutf/ppc64/implementation.h */
3407 
3408 /* begin file src/simdutf/ppc64/begin.h */
3409 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
3410 // #define SIMDUTF_IMPLEMENTATION ppc64
3411 /* end file src/simdutf/ppc64/begin.h */
3412 
3413 // Declarations
3414 /* begin file src/simdutf/ppc64/intrinsics.h */
3415 #ifndef SIMDUTF_PPC64_INTRINSICS_H
3416 #define SIMDUTF_PPC64_INTRINSICS_H
3417 
3418 
3419 // This should be the correct header whether
3420 // you use visual studio or other compilers.
3421 #include <altivec.h>
3422 
3423 // These are defined by altivec.h in GCC toolchain, it is safe to undef them.
3424 #ifdef bool
3425 #undef bool
3426 #endif
3427 
3428 #ifdef vector
3429 #undef vector
3430 #endif
3431 
3432 #endif //  SIMDUTF_PPC64_INTRINSICS_H
3433 /* end file src/simdutf/ppc64/intrinsics.h */
3434 /* begin file src/simdutf/ppc64/bitmanipulation.h */
3435 #ifndef SIMDUTF_PPC64_BITMANIPULATION_H
3436 #define SIMDUTF_PPC64_BITMANIPULATION_H
3437 
3438 namespace simdutf {
3439 namespace ppc64 {
3440 namespace {
3441 
3442 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
count_ones(uint64_t input_num)3443 simdutf_really_inline int count_ones(uint64_t input_num) {
3444   // note: we do not support legacy 32-bit Windows
3445   return __popcnt64(input_num); // Visual Studio wants two underscores
3446 }
3447 #else
3448 simdutf_really_inline int count_ones(uint64_t input_num) {
3449   return __builtin_popcountll(input_num);
3450 }
3451 #endif
3452 
3453 } // unnamed namespace
3454 } // namespace ppc64
3455 } // namespace simdutf
3456 
3457 #endif // SIMDUTF_PPC64_BITMANIPULATION_H
3458 /* end file src/simdutf/ppc64/bitmanipulation.h */
3459 /* begin file src/simdutf/ppc64/simd.h */
3460 #ifndef SIMDUTF_PPC64_SIMD_H
3461 #define SIMDUTF_PPC64_SIMD_H
3462 
3463 #include <type_traits>
3464 
3465 namespace simdutf {
3466 namespace ppc64 {
3467 namespace {
3468 namespace simd {
3469 
3470 using __m128i = __vector unsigned char;
3471 
3472 template <typename Child> struct base {
3473   __m128i value;
3474 
3475   // Zero constructor
basesimdutf::ppc64::__anon13845::simd::base3476   simdutf_really_inline base() : value{__m128i()} {}
3477 
3478   // Conversion from SIMD register
basesimdutf::ppc64::__anon13845::simd::base3479   simdutf_really_inline base(const __m128i _value) : value(_value) {}
3480 
3481   // Conversion to SIMD register
operator const __m128i &simdutf::ppc64::__anon13845::simd::base3482   simdutf_really_inline operator const __m128i &() const {
3483     return this->value;
3484   }
operator __m128i &simdutf::ppc64::__anon13845::simd::base3485   simdutf_really_inline operator __m128i &() { return this->value; }
3486 
3487   // Bit operations
operator |simdutf::ppc64::__anon13845::simd::base3488   simdutf_really_inline Child operator|(const Child other) const {
3489     return vec_or(this->value, (__m128i)other);
3490   }
operator &simdutf::ppc64::__anon13845::simd::base3491   simdutf_really_inline Child operator&(const Child other) const {
3492     return vec_and(this->value, (__m128i)other);
3493   }
operator ^simdutf::ppc64::__anon13845::simd::base3494   simdutf_really_inline Child operator^(const Child other) const {
3495     return vec_xor(this->value, (__m128i)other);
3496   }
bit_andnotsimdutf::ppc64::__anon13845::simd::base3497   simdutf_really_inline Child bit_andnot(const Child other) const {
3498     return vec_andc(this->value, (__m128i)other);
3499   }
operator |=simdutf::ppc64::__anon13845::simd::base3500   simdutf_really_inline Child &operator|=(const Child other) {
3501     auto this_cast = static_cast<Child*>(this);
3502     *this_cast = *this_cast | other;
3503     return *this_cast;
3504   }
operator &=simdutf::ppc64::__anon13845::simd::base3505   simdutf_really_inline Child &operator&=(const Child other) {
3506     auto this_cast = static_cast<Child*>(this);
3507     *this_cast = *this_cast & other;
3508     return *this_cast;
3509   }
operator ^=simdutf::ppc64::__anon13845::simd::base3510   simdutf_really_inline Child &operator^=(const Child other) {
3511     auto this_cast = static_cast<Child*>(this);
3512     *this_cast = *this_cast ^ other;
3513     return *this_cast;
3514   }
3515 };
3516 
3517 // Forward-declared so they can be used by splat and friends.
3518 template <typename T> struct simd8;
3519 
3520 template <typename T, typename Mask = simd8<bool>>
3521 struct base8 : base<simd8<T>> {
3522   typedef uint16_t bitmask_t;
3523   typedef uint32_t bitmask2_t;
3524 
base8simdutf::ppc64::__anon13845::simd::base83525   simdutf_really_inline base8() : base<simd8<T>>() {}
base8simdutf::ppc64::__anon13845::simd::base83526   simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
3527 
operator ==simdutf::ppc64::__anon13845::simd::base83528   friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
3529     return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
3530   }
3531 
3532   static const int SIZE = sizeof(base<simd8<T>>::value);
3533 
3534   template <int N = 1>
prevsimdutf::ppc64::__anon13845::simd::base83535   simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
3536     __m128i chunk = this->value;
3537 #ifdef __LITTLE_ENDIAN__
3538     chunk = (__m128i)vec_reve(this->value);
3539     prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
3540 #endif
3541     chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
3542 #ifdef __LITTLE_ENDIAN__
3543     chunk = (__m128i)vec_reve((__m128i)chunk);
3544 #endif
3545     return chunk;
3546   }
3547 };
3548 
3549 // SIMD byte mask type (returned by things like eq and gt)
3550 template <> struct simd8<bool> : base8<bool> {
splatsimdutf::ppc64::__anon13845::simd::simd83551   static simdutf_really_inline simd8<bool> splat(bool _value) {
3552     return (__m128i)vec_splats((unsigned char)(-(!!_value)));
3553   }
3554 
simd8simdutf::ppc64::__anon13845::simd::simd83555   simdutf_really_inline simd8<bool>() : base8() {}
simd8simdutf::ppc64::__anon13845::simd::simd83556   simdutf_really_inline simd8<bool>(const __m128i _value)
3557       : base8<bool>(_value) {}
3558   // Splat constructor
simd8simdutf::ppc64::__anon13845::simd::simd83559   simdutf_really_inline simd8<bool>(bool _value)
3560       : base8<bool>(splat(_value)) {}
3561 
to_bitmasksimdutf::ppc64::__anon13845::simd::simd83562   simdutf_really_inline int to_bitmask() const {
3563     __vector unsigned long long result;
3564     const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
3565                                0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
3566 
3567     result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
3568                                                        (__m128i)perm_mask));
3569 #ifdef __LITTLE_ENDIAN__
3570     return static_cast<int>(result[1]);
3571 #else
3572     return static_cast<int>(result[0]);
3573 #endif
3574   }
anysimdutf::ppc64::__anon13845::simd::simd83575   simdutf_really_inline bool any() const {
3576     return !vec_all_eq(this->value, (__m128i)vec_splats(0));
3577   }
operator ~simdutf::ppc64::__anon13845::simd::simd83578   simdutf_really_inline simd8<bool> operator~() const {
3579     return this->value ^ (__m128i)splat(true);
3580   }
3581 };
3582 
3583 template <typename T> struct base8_numeric : base8<T> {
splatsimdutf::ppc64::__anon13845::simd::base8_numeric3584   static simdutf_really_inline simd8<T> splat(T value) {
3585     (void)value;
3586     return (__m128i)vec_splats(value);
3587   }
zerosimdutf::ppc64::__anon13845::simd::base8_numeric3588   static simdutf_really_inline simd8<T> zero() { return splat(0); }
loadsimdutf::ppc64::__anon13845::simd::base8_numeric3589   static simdutf_really_inline simd8<T> load(const T values[16]) {
3590     return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
3591   }
3592   // Repeat 16 values as many times as necessary (usually for lookup tables)
repeat_16simdutf::ppc64::__anon13845::simd::base8_numeric3593   static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
3594                                                    T v5, T v6, T v7, T v8, T v9,
3595                                                    T v10, T v11, T v12, T v13,
3596                                                    T v14, T v15) {
3597     return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
3598                     v14, v15);
3599   }
3600 
base8_numericsimdutf::ppc64::__anon13845::simd::base8_numeric3601   simdutf_really_inline base8_numeric() : base8<T>() {}
base8_numericsimdutf::ppc64::__anon13845::simd::base8_numeric3602   simdutf_really_inline base8_numeric(const __m128i _value)
3603       : base8<T>(_value) {}
3604 
3605   // Store to array
storesimdutf::ppc64::__anon13845::simd::base8_numeric3606   simdutf_really_inline void store(T dst[16]) const {
3607     vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
3608   }
3609 
3610   // Override to distinguish from bool version
operator ~simdutf::ppc64::__anon13845::simd::base8_numeric3611   simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
3612 
3613   // Addition/subtraction are the same for signed and unsigned
operator +simdutf::ppc64::__anon13845::simd::base8_numeric3614   simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
3615     return (__m128i)((__m128i)this->value + (__m128i)other);
3616   }
operator -simdutf::ppc64::__anon13845::simd::base8_numeric3617   simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
3618     return (__m128i)((__m128i)this->value - (__m128i)other);
3619   }
operator +=simdutf::ppc64::__anon13845::simd::base8_numeric3620   simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
3621     *this = *this + other;
3622     return *static_cast<simd8<T> *>(this);
3623   }
operator -=simdutf::ppc64::__anon13845::simd::base8_numeric3624   simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
3625     *this = *this - other;
3626     return *static_cast<simd8<T> *>(this);
3627   }
3628 
3629   // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
3630   // for out of range values)
3631   template <typename L>
lookup_16simdutf::ppc64::__anon13845::simd::base8_numeric3632   simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
3633     return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
3634   }
3635 
3636   template <typename L>
3637   simdutf_really_inline simd8<L>
lookup_16simdutf::ppc64::__anon13845::simd::base8_numeric3638   lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
3639             L replace5, L replace6, L replace7, L replace8, L replace9,
3640             L replace10, L replace11, L replace12, L replace13, L replace14,
3641             L replace15) const {
3642     return lookup_16(simd8<L>::repeat_16(
3643         replace0, replace1, replace2, replace3, replace4, replace5, replace6,
3644         replace7, replace8, replace9, replace10, replace11, replace12,
3645         replace13, replace14, replace15));
3646   }
3647 };
3648 
3649 // Signed bytes
3650 template <> struct simd8<int8_t> : base8_numeric<int8_t> {
simd8simdutf::ppc64::__anon13845::simd::simd83651   simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
simd8simdutf::ppc64::__anon13845::simd::simd83652   simdutf_really_inline simd8(const __m128i _value)
3653       : base8_numeric<int8_t>(_value) {}
3654 
3655   // Splat constructor
simd8simdutf::ppc64::__anon13845::simd::simd83656   simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
3657   // Array constructor
simd8simdutf::ppc64::__anon13845::simd::simd83658   simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
3659   // Member-by-member initialization
simd8simdutf::ppc64::__anon13845::simd::simd83660   simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
3661                                int8_t v4, int8_t v5, int8_t v6, int8_t v7,
3662                                int8_t v8, int8_t v9, int8_t v10, int8_t v11,
3663                                int8_t v12, int8_t v13, int8_t v14, int8_t v15)
3664       : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
3665                                               v8, v9, v10, v11, v12, v13, v14,
3666                                               v15}) {}
3667   // Repeat 16 values as many times as necessary (usually for lookup tables)
3668   simdutf_really_inline static simd8<int8_t>
repeat_16simdutf::ppc64::__anon13845::simd::simd83669   repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
3670             int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
3671             int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
3672     return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3673                          v13, v14, v15);
3674   }
3675 
3676   // Order-sensitive comparisons
3677   simdutf_really_inline simd8<int8_t>
max_valsimdutf::ppc64::__anon13845::simd::simd83678   max_val(const simd8<int8_t> other) const {
3679     return (__m128i)vec_max((__vector signed char)this->value,
3680                             (__vector signed char)(__m128i)other);
3681   }
3682   simdutf_really_inline simd8<int8_t>
min_valsimdutf::ppc64::__anon13845::simd::simd83683   min_val(const simd8<int8_t> other) const {
3684     return (__m128i)vec_min((__vector signed char)this->value,
3685                             (__vector signed char)(__m128i)other);
3686   }
3687   simdutf_really_inline simd8<bool>
operator >simdutf::ppc64::__anon13845::simd::simd83688   operator>(const simd8<int8_t> other) const {
3689     return (__m128i)vec_cmpgt((__vector signed char)this->value,
3690                               (__vector signed char)(__m128i)other);
3691   }
3692   simdutf_really_inline simd8<bool>
operator <simdutf::ppc64::__anon13845::simd::simd83693   operator<(const simd8<int8_t> other) const {
3694     return (__m128i)vec_cmplt((__vector signed char)this->value,
3695                               (__vector signed char)(__m128i)other);
3696   }
3697 };
3698 
3699 // Unsigned bytes
3700 template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
simd8simdutf::ppc64::__anon13845::simd::simd83701   simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
simd8simdutf::ppc64::__anon13845::simd::simd83702   simdutf_really_inline simd8(const __m128i _value)
3703       : base8_numeric<uint8_t>(_value) {}
3704   // Splat constructor
simd8simdutf::ppc64::__anon13845::simd::simd83705   simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
3706   // Array constructor
simd8simdutf::ppc64::__anon13845::simd::simd83707   simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
3708   // Member-by-member initialization
3709   simdutf_really_inline
simd8simdutf::ppc64::__anon13845::simd::simd83710   simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
3711         uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
3712         uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
3713       : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3714                         v13, v14, v15}) {}
3715   // Repeat 16 values as many times as necessary (usually for lookup tables)
3716   simdutf_really_inline static simd8<uint8_t>
repeat_16simdutf::ppc64::__anon13845::simd::simd83717   repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
3718             uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
3719             uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
3720             uint8_t v15) {
3721     return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3722                           v13, v14, v15);
3723   }
3724 
3725   // Saturated math
3726   simdutf_really_inline simd8<uint8_t>
saturating_addsimdutf::ppc64::__anon13845::simd::simd83727   saturating_add(const simd8<uint8_t> other) const {
3728     return (__m128i)vec_adds(this->value, (__m128i)other);
3729   }
3730   simdutf_really_inline simd8<uint8_t>
saturating_subsimdutf::ppc64::__anon13845::simd::simd83731   saturating_sub(const simd8<uint8_t> other) const {
3732     return (__m128i)vec_subs(this->value, (__m128i)other);
3733   }
3734 
3735   // Order-specific operations
3736   simdutf_really_inline simd8<uint8_t>
max_valsimdutf::ppc64::__anon13845::simd::simd83737   max_val(const simd8<uint8_t> other) const {
3738     return (__m128i)vec_max(this->value, (__m128i)other);
3739   }
3740   simdutf_really_inline simd8<uint8_t>
min_valsimdutf::ppc64::__anon13845::simd::simd83741   min_val(const simd8<uint8_t> other) const {
3742     return (__m128i)vec_min(this->value, (__m128i)other);
3743   }
3744   // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
3745   simdutf_really_inline simd8<uint8_t>
gt_bitssimdutf::ppc64::__anon13845::simd::simd83746   gt_bits(const simd8<uint8_t> other) const {
3747     return this->saturating_sub(other);
3748   }
3749   // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
3750   simdutf_really_inline simd8<uint8_t>
lt_bitssimdutf::ppc64::__anon13845::simd::simd83751   lt_bits(const simd8<uint8_t> other) const {
3752     return other.saturating_sub(*this);
3753   }
3754   simdutf_really_inline simd8<bool>
operator <=simdutf::ppc64::__anon13845::simd::simd83755   operator<=(const simd8<uint8_t> other) const {
3756     return other.max_val(*this) == other;
3757   }
3758   simdutf_really_inline simd8<bool>
operator >=simdutf::ppc64::__anon13845::simd::simd83759   operator>=(const simd8<uint8_t> other) const {
3760     return other.min_val(*this) == other;
3761   }
3762   simdutf_really_inline simd8<bool>
operator >simdutf::ppc64::__anon13845::simd::simd83763   operator>(const simd8<uint8_t> other) const {
3764     return this->gt_bits(other).any_bits_set();
3765   }
3766   simdutf_really_inline simd8<bool>
operator <simdutf::ppc64::__anon13845::simd::simd83767   operator<(const simd8<uint8_t> other) const {
3768     return this->gt_bits(other).any_bits_set();
3769   }
3770 
3771   // Bit-specific operations
bits_not_setsimdutf::ppc64::__anon13845::simd::simd83772   simdutf_really_inline simd8<bool> bits_not_set() const {
3773     return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
3774   }
bits_not_setsimdutf::ppc64::__anon13845::simd::simd83775   simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
3776     return (*this & bits).bits_not_set();
3777   }
any_bits_setsimdutf::ppc64::__anon13845::simd::simd83778   simdutf_really_inline simd8<bool> any_bits_set() const {
3779     return ~this->bits_not_set();
3780   }
any_bits_setsimdutf::ppc64::__anon13845::simd::simd83781   simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
3782     return ~this->bits_not_set(bits);
3783   }
3784 
is_asciisimdutf::ppc64::__anon13845::simd::simd83785   simdutf_really_inline bool is_ascii() const {
3786       return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
3787   }
3788 
bits_not_set_anywheresimdutf::ppc64::__anon13845::simd::simd83789   simdutf_really_inline bool bits_not_set_anywhere() const {
3790     return vec_all_eq(this->value, (__m128i)vec_splats(0));
3791   }
any_bits_set_anywheresimdutf::ppc64::__anon13845::simd::simd83792   simdutf_really_inline bool any_bits_set_anywhere() const {
3793     return !bits_not_set_anywhere();
3794   }
bits_not_set_anywheresimdutf::ppc64::__anon13845::simd::simd83795   simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
3796     return vec_all_eq(vec_and(this->value, (__m128i)bits),
3797                       (__m128i)vec_splats(0));
3798   }
any_bits_set_anywheresimdutf::ppc64::__anon13845::simd::simd83799   simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
3800     return !bits_not_set_anywhere(bits);
3801   }
shrsimdutf::ppc64::__anon13845::simd::simd83802   template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
3803     return simd8<uint8_t>(
3804         (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
3805   }
shlsimdutf::ppc64::__anon13845::simd::simd83806   template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
3807     return simd8<uint8_t>(
3808         (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
3809   }
3810 };
3811 
3812 template <typename T> struct simd8x64 {
3813   static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
3814   static_assert(NUM_CHUNKS == 4,
3815                 "PPC64 kernel should use four registers per 64-byte block.");
3816   simd8<T> chunks[NUM_CHUNKS];
3817 
3818   simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
3819   simd8x64<T> &
3820   operator=(const simd8<T> other) = delete; // no assignment allowed
3821   simd8x64() = delete;                      // no default constructor allowed
3822 
simd8x64simdutf::ppc64::__anon13845::simd::simd8x643823   simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
3824                                   const simd8<T> chunk2, const simd8<T> chunk3)
3825       : chunks{chunk0, chunk1, chunk2, chunk3} {}
3826 
simd8x64simdutf::ppc64::__anon13845::simd::simd8x643827   simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
3828 
storesimdutf::ppc64::__anon13845::simd::simd8x643829   simdutf_really_inline void store(T* ptr) const {
3830     this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
3831     this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
3832     this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
3833     this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
3834   }
3835 
3836 
operator |=simdutf::ppc64::__anon13845::simd::simd8x643837   simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
3838       this->chunks[0] |= other.chunks[0];
3839       this->chunks[1] |= other.chunks[1];
3840       this->chunks[2] |= other.chunks[2];
3841       this->chunks[3] |= other.chunks[3];
3842       return *this;
3843     }
3844 
reduce_orsimdutf::ppc64::__anon13845::simd::simd8x643845   simdutf_really_inline simd8<T> reduce_or() const {
3846     return (this->chunks[0] | this->chunks[1]) |
3847            (this->chunks[2] | this->chunks[3]);
3848   }
3849 
3850 
is_asciisimdutf::ppc64::__anon13845::simd::simd8x643851   simdutf_really_inline bool is_ascii() const {
3852     return input.reduce_or().is_ascii();
3853   }
3854 
to_bitmasksimdutf::ppc64::__anon13845::simd::simd8x643855   simdutf_really_inline uint64_t to_bitmask() const {
3856     uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
3857     uint64_t r1 = this->chunks[1].to_bitmask();
3858     uint64_t r2 = this->chunks[2].to_bitmask();
3859     uint64_t r3 = this->chunks[3].to_bitmask();
3860     return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
3861   }
3862 
eqsimdutf::ppc64::__anon13845::simd::simd8x643863   simdutf_really_inline uint64_t eq(const T m) const {
3864     const simd8<T> mask = simd8<T>::splat(m);
3865     return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
3866                           this->chunks[2] == mask, this->chunks[3] == mask)
3867         .to_bitmask();
3868   }
3869 
eqsimdutf::ppc64::__anon13845::simd::simd8x643870   simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
3871     return simd8x64<bool>(this->chunks[0] == other.chunks[0],
3872                           this->chunks[1] == other.chunks[1],
3873                           this->chunks[2] == other.chunks[2],
3874                           this->chunks[3] == other.chunks[3])
3875         .to_bitmask();
3876   }
3877 
lteqsimdutf::ppc64::__anon13845::simd::simd8x643878   simdutf_really_inline uint64_t lteq(const T m) const {
3879     const simd8<T> mask = simd8<T>::splat(m);
3880     return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
3881                           this->chunks[2] <= mask, this->chunks[3] <= mask)
3882         .to_bitmask();
3883   }
3884 
in_rangesimdutf::ppc64::__anon13845::simd::simd8x643885   simdutf_really_inline uint64_t in_range(const T low, const T high) const {
3886       const simd8<T> mask_low = simd8<T>::splat(low);
3887       const simd8<T> mask_high = simd8<T>::splat(high);
3888 
3889       return  simd8x64<bool>(
3890         (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
3891         (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
3892         (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
3893         (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
3894       ).to_bitmask();
3895   }
not_in_rangesimdutf::ppc64::__anon13845::simd::simd8x643896   simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
3897       const simd8<T> mask_low = simd8<T>::splat(low);
3898       const simd8<T> mask_high = simd8<T>::splat(high);
3899       return  simd8x64<bool>(
3900         (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
3901         (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
3902         (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
3903         (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
3904       ).to_bitmask();
3905   }
ltsimdutf::ppc64::__anon13845::simd::simd8x643906   simdutf_really_inline uint64_t lt(const T m) const {
3907     const simd8<T> mask = simd8<T>::splat(m);
3908     return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
3909                           this->chunks[2] < mask, this->chunks[3] < mask)
3910         .to_bitmask();
3911   }
3912 
gtsimdutf::ppc64::__anon13845::simd::simd8x643913   simdutf_really_inline uint64_t gt(const T m) const {
3914       const simd8<T> mask = simd8<T>::splat(m);
3915       return  simd8x64<bool>(
3916         this->chunks[0] > mask,
3917         this->chunks[1] > mask,
3918         this->chunks[2] > mask,
3919         this->chunks[3] > mask
3920       ).to_bitmask();
3921   }
gteqsimdutf::ppc64::__anon13845::simd::simd8x643922   simdutf_really_inline uint64_t gteq(const T m) const {
3923       const simd8<T> mask = simd8<T>::splat(m);
3924       return  simd8x64<bool>(
3925         this->chunks[0] >= mask,
3926         this->chunks[1] >= mask,
3927         this->chunks[2] >= mask,
3928         this->chunks[3] >= mask
3929       ).to_bitmask();
3930   }
gteq_unsignedsimdutf::ppc64::__anon13845::simd::simd8x643931   simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
3932       const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
3933       return  simd8x64<bool>(
3934         simd8<uint8_t>(this->chunks[0]) >= mask,
3935         simd8<uint8_t>(this->chunks[1]) >= mask,
3936         simd8<uint8_t>(this->chunks[2]) >= mask,
3937         simd8<uint8_t>(this->chunks[3]) >= mask
3938       ).to_bitmask();
3939   }
3940 }; // struct simd8x64<T>
3941 
3942 } // namespace simd
3943 } // unnamed namespace
3944 } // namespace ppc64
3945 } // namespace simdutf
3946 
#endif // SIMDUTF_PPC64_SIMD_H
3948 /* end file src/simdutf/ppc64/simd.h */
3949 
3950 /* begin file src/simdutf/ppc64/end.h */
3951 /* end file src/simdutf/ppc64/end.h */
3952 
3953 #endif // SIMDUTF_IMPLEMENTATION_PPC64
3954 
3955 #endif // SIMDUTF_PPC64_H
3956 /* end file src/simdutf/ppc64.h */
3957 /* begin file src/simdutf/fallback.h */
3958 #ifndef SIMDUTF_FALLBACK_H
3959 #define SIMDUTF_FALLBACK_H
3960 
3961 
3962 // Note that fallback.h is always imported last.
3963 
// Unless the build already set SIMDUTF_IMPLEMENTATION_FALLBACK, enable the
// fallback kernel only when none of the SIMD kernels is flagged as always
// runnable.
#if !defined(SIMDUTF_IMPLEMENTATION_FALLBACK)
#if !(SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64)
#define SIMDUTF_IMPLEMENTATION_FALLBACK 1
#else
#define SIMDUTF_IMPLEMENTATION_FALLBACK 0
#endif
#endif // !defined(SIMDUTF_IMPLEMENTATION_FALLBACK)

// The fallback kernel can always run whenever it is compiled in.
#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
3974 
3975 #if SIMDUTF_IMPLEMENTATION_FALLBACK
3976 
3977 namespace simdutf {
3978 /**
3979  * Fallback implementation (runs on any machine).
3980  */
3981 namespace fallback {
3982 } // namespace fallback
3983 } // namespace simdutf
3984 
3985 /* begin file src/simdutf/fallback/implementation.h */
3986 #ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
3987 #define SIMDUTF_FALLBACK_IMPLEMENTATION_H
3988 
3989 
3990 namespace simdutf {
3991 namespace fallback {
3992 
3993 namespace {
3994 using namespace simdutf;
3995 }
3996 
/**
 * The "fallback" backend: a concrete simdutf::implementation registered under
 * the name "fallback" with the description "Generic fallback implementation".
 *
 * This class only declares the member functions; their definitions are not
 * part of this header.
 */
class implementation final : public simdutf::implementation {
public:
  // Registers the backend's name and description with the base class. The
  // third argument is 0 -- presumably the required-instruction-set mask (other
  // backends pass an internal::instruction_set value there); confirm against
  // the base-class constructor.
  simdutf_really_inline implementation() : simdutf::implementation(
      "fallback",
      "Generic fallback implementation",
      0
  ) {}

  // --- Encoding detection and validation ---
  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;

  // --- Conversions from Latin-1 ---
  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;

  // --- Conversions from UTF-8 ---
  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;

  // --- Conversions from UTF-16 ---
  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;

  // --- Conversions from UTF-32 ---
  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;

  // --- Conversions from UTF-16 to UTF-32 ---
  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;

  // --- Byte-order swap ---
  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;

  // --- Counting and output-length helpers ---
  // NOTE(review): unlike every declaration above, the functions below carry no
  // `final` specifier. Whether they override base-class virtuals cannot be
  // determined from this header alone -- confirm before adding `final`/`override`.
  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
  simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;};
4079 
4080 } // namespace fallback
4081 } // namespace simdutf
4082 
4083 #endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
4084 /* end file src/simdutf/fallback/implementation.h */
4085 
4086 /* begin file src/simdutf/fallback/begin.h */
4087 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
4088 // #define SIMDUTF_IMPLEMENTATION fallback
4089 /* end file src/simdutf/fallback/begin.h */
4090 
4091 // Declarations
4092 /* begin file src/simdutf/fallback/bitmanipulation.h */
4093 #ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
4094 #define SIMDUTF_FALLBACK_BITMANIPULATION_H
4095 
4096 #include <limits>
4097 
4098 namespace simdutf {
4099 namespace fallback {
4100 namespace {
4101 
4102 } // unnamed namespace
4103 } // namespace fallback
4104 } // namespace simdutf
4105 
4106 #endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
4107 /* end file src/simdutf/fallback/bitmanipulation.h */
4108 
4109 /* begin file src/simdutf/fallback/end.h */
4110 /* end file src/simdutf/fallback/end.h */
4111 
4112 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK
4113 #endif // SIMDUTF_FALLBACK_H
4114 /* end file src/simdutf/fallback.h */
4115 
4116 /* begin file src/scalar/utf8.h */
4117 #ifndef SIMDUTF_UTF8_H
4118 #define SIMDUTF_UTF8_H
4119 
4120 namespace simdutf {
4121 namespace scalar {
4122 namespace {
4123 namespace utf8 {
4124 #if SIMDUTF_IMPLEMENTATION_FALLBACK
4125 // only used by the fallback kernel.
4126 // credit: based on code from Google Fuchsia (Apache Licensed)
/**
 * Scalar UTF-8 validation.
 *
 * @param buf  bytes to inspect
 * @param len  number of bytes in buf
 * @return true when the whole buffer is well-formed UTF-8: every multi-byte
 *         sequence is complete, uses proper continuation bytes, is not
 *         overlong, is not a surrogate (U+D800..U+DFFF) and does not exceed
 *         U+10FFFF. An empty buffer is valid.
 */
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t cursor = 0;
  while (cursor < len) {
    // Fast path: when 16 more bytes are available, test them all at once for
    // a set high bit; pure ASCII blocks are skipped wholesale.
    if (cursor + 16 <= len) {
      uint64_t first_half;
      uint64_t second_half;
      std::memcpy(&first_half, bytes + cursor, sizeof(first_half));
      std::memcpy(&second_half, bytes + cursor + sizeof(first_half), sizeof(second_half));
      if (((first_half | second_half) & 0x8080808080808080) == 0) {
        cursor += 16;
        continue;
      }
    }

    // Slow path: walk over ASCII bytes one at a time until a non-ASCII byte
    // shows up (or the input ends).
    unsigned char lead = bytes[cursor];
    while (lead < 0x80) {
      if (++cursor == len) { return true; }
      lead = bytes[cursor];
    }

    // The leading byte announces the length of the sequence.
    uint64_t width;
    if ((lead & 0xE0) == 0xC0) {
      width = 2;
    } else if ((lead & 0xF0) == 0xE0) {
      width = 3;
    } else if ((lead & 0xF8) == 0xF0) {
      width = 4;
    } else {
      // stray continuation byte or invalid leading byte
      return false;
    }

    // The sequence must fit in the buffer ...
    if (cursor + width > len) { return false; }
    // ... and every trailing byte must look like 10xxxxxx.
    for (uint64_t k = 1; k < width; k++) {
      if ((bytes[cursor + k] & 0xC0) != 0x80) { return false; }
    }

    // Decode and range-check the code point.
    uint32_t code_point = 0;
    switch (width) {
      case 2:
        code_point = (lead & 0x1F) << 6 | (bytes[cursor + 1] & 0x3F);
        if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
        break;
      case 3:
        code_point = (lead & 0x0F) << 12 |
                     (bytes[cursor + 1] & 0x3F) << 6 |
                     (bytes[cursor + 2] & 0x3F);
        if ((code_point < 0x800) || (0xffff < code_point)) { return false; }
        if (0xd7ff < code_point && code_point < 0xe000) { return false; }
        break;
      default: // width == 4
        code_point = (lead & 0x07) << 18 |
                     (bytes[cursor + 1] & 0x3F) << 12 |
                     (bytes[cursor + 2] & 0x3F) << 6 |
                     (bytes[cursor + 3] & 0x3F);
        if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
        break;
    }
    cursor += width;
  }
  return true;
}
4191 #endif
4192 
4193 inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
4194   const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
4195   size_t pos = 0;
4196   uint32_t code_point = 0;
4197   while (pos < len) {
4198     // check of the next 16 bytes are ascii.
4199     size_t next_pos = pos + 16;
4200     if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
4201       uint64_t v1;
4202       std::memcpy(&v1, data + pos, sizeof(uint64_t));
4203       uint64_t v2;
4204       std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
4205       uint64_t v{v1 | v2};
4206       if ((v & 0x8080808080808080) == 0) {
4207         pos = next_pos;
4208         continue;
4209       }
4210     }
4211     unsigned char byte = data[pos];
4212 
4213     while (byte < 0b10000000) {
4214       if (++pos == len) { return result(error_code::SUCCESS, len); }
4215       byte = data[pos];
4216     }
4217 
4218     if ((byte & 0b11100000) == 0b11000000) {
4219       next_pos = pos + 2;
4220       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
4221       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4222       // range check
4223       code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
4224       if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); }
4225     } else if ((byte & 0b11110000) == 0b11100000) {
4226       next_pos = pos + 3;
4227       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
4228       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4229       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4230       // range check
4231       code_point = (byte & 0b00001111) << 12 |
4232                    (data[pos + 1] & 0b00111111) << 6 |
4233                    (data[pos + 2] & 0b00111111);
4234       if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
4235       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
4236     } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
4237       next_pos = pos + 4;
4238       if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
4239       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4240       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4241       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
4242       // range check
4243       code_point =
4244           (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
4245           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
4246       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
4247       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
4248     } else {
4249       // we either have too many continuation bytes or an invalid leading byte
4250       if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
4251       else { return result(error_code::HEADER_BITS, pos); }
4252     }
4253     pos = next_pos;
4254   }
4255   return result(error_code::SUCCESS, len);
4256 }
4257 
4258 // Finds the previous leading byte starting backward from buf and validates with errors from there
4259 // Used to pinpoint the location of an error when an invalid chunk is detected
4260 // We assume that the stream starts with a leading byte, and to check that it is the case, we
4261 // ask that you pass a pointer to the start of the stream (start).
4262 inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *start, const char *buf, size_t len) noexcept {
4263     // First check that we start with a leading byte
4264   if ((*start & 0b11000000) == 0b10000000) {
4265     return result(error_code::TOO_LONG, 0);
4266   }
4267   size_t extra_len{0};
4268   // A leading byte cannot be further than 4 bytes away
4269   for(int i = 0; i < 5; i++) {
4270     unsigned char byte = *buf;
4271     if ((byte & 0b11000000) != 0b10000000) {
4272       break;
4273     } else {
4274       buf--;
4275       extra_len++;
4276     }
4277   }
4278 
4279   result res = validate_with_errors(buf, len + extra_len);
4280   res.count -= extra_len;
4281   return res;
4282 }
4283 
/**
 * Counts the code points in a UTF-8 buffer by counting every byte that is not
 * a continuation byte (10xxxxxx). The input is not validated.
 *
 * @param buf  UTF-8 bytes
 * @param len  number of bytes in buf
 * @return number of non-continuation bytes
 */
inline size_t count_code_points(const char* buf, size_t len) {
    const int8_t * cursor = reinterpret_cast<const int8_t *>(buf);
    const int8_t * const stop = cursor + len;
    size_t total = 0;
    while (cursor != stop) {
        // Read as signed bytes, continuation bytes are exactly -128..-65
        // (0b10111111 is -65), so anything above -65 opens a code point.
        total += static_cast<size_t>(*cursor > -65);
        ++cursor;
    }
    return total;
}
4293 
/**
 * Computes how many UTF-16 code units a UTF-8 buffer occupies once
 * transcoded. The input is not validated.
 *
 * @param buf  UTF-8 bytes
 * @param len  number of bytes in buf
 * @return one unit per non-continuation byte, plus one extra unit for every
 *         byte >= 0xF0 (a four-byte lead needs a surrogate pair)
 */
inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
    const int8_t * cursor = reinterpret_cast<const int8_t *>(buf);
    const int8_t * const stop = cursor + len;
    size_t units = 0;
    for (; cursor != stop; ++cursor) {
        const int8_t current = *cursor;
        // Not a continuation byte (signed view: above -65) => new code point.
        units += static_cast<size_t>(current > -65);
        // Leading byte of a four-byte sequence => second UTF-16 unit.
        units += static_cast<size_t>(uint8_t(current) >= 240);
    }
    return units;
}
4303 
/**
 * Returns the length of the longest prefix of input that does not end in the
 * middle of a multi-byte UTF-8 character. Only the last three bytes are
 * examined; the input is not validated.
 *
 * @param input   UTF-8 bytes
 * @param length  number of bytes in input
 * @return length, reduced by 1, 2 or 3 when the buffer ends with an
 *         incomplete character
 */
simdutf_warn_unused inline size_t trim_partial_utf8(const char *input, size_t length) {
  // Last byte is a leading byte (2-, 3- or 4-byte character): only 1 byte of
  // that character is present.
  if (length >= 1 && uint8_t(input[length-1]) >= 0xc0) { return length-1; }
  // Second-to-last byte leads a 3- or 4-byte character: only 2 bytes present.
  if (length >= 2 && uint8_t(input[length-2]) >= 0xe0) { return length-2; }
  // Third-to-last byte leads a 4-byte character: only 3 bytes present.
  if (length >= 3 && uint8_t(input[length-3]) >= 0xf0) { return length-3; }
  return length;
}
4323 
4324 } // utf8 namespace
4325 } // unnamed namespace
4326 } // namespace scalar
4327 } // namespace simdutf
4328 
4329 #endif
4330 /* end file src/scalar/utf8.h */
4331 /* begin file src/scalar/utf16.h */
4332 #ifndef SIMDUTF_UTF16_H
4333 #define SIMDUTF_UTF16_H
4334 
4335 namespace simdutf {
4336 namespace scalar {
4337 namespace {
4338 namespace utf16 {
4339 
/**
 * Exchanges the two bytes of a 16-bit word.
 *
 * @param word  value to convert
 * @return word with its high and low bytes swapped
 */
inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
  const uint16_t old_high_byte = uint16_t(word >> 8);
  const uint16_t old_low_byte_shifted = uint16_t(word << 8);
  return uint16_t(old_high_byte | old_low_byte_shifted);
}
4343 
4344 template <endianness big_endian>
4345 inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
4346   const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
4347   uint64_t pos = 0;
4348   while (pos < len) {
4349     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4350     if((word &0xF800) == 0xD800) {
4351         if(pos + 1 >= len) { return false; }
4352         uint16_t diff = uint16_t(word - 0xD800);
4353         if(diff > 0x3FF) { return false; }
4354         uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
4355         uint16_t diff2 = uint16_t(next_word - 0xDC00);
4356         if(diff2 > 0x3FF) { return false; }
4357         pos += 2;
4358     } else {
4359         pos++;
4360     }
4361   }
4362   return true;
4363 }
4364 
4365 template <endianness big_endian>
4366 inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
4367   const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
4368   size_t pos = 0;
4369   while (pos < len) {
4370     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4371     if((word & 0xF800) == 0xD800) {
4372         if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
4373         uint16_t diff = uint16_t(word - 0xD800);
4374         if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
4375         uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
4376         uint16_t diff2 = uint16_t(next_word - 0xDC00);
4377         if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
4378         pos += 2;
4379     } else {
4380         pos++;
4381     }
4382   }
4383   return result(error_code::SUCCESS, pos);
4384 }
4385 
4386 template <endianness big_endian>
count_code_points(const char16_t* buf, size_t len)4387 inline size_t count_code_points(const char16_t* buf, size_t len) {
4388   // We are not BOM aware.
4389   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
4390   size_t counter{0};
4391   for(size_t i = 0; i < len; i++) {
4392     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4393     counter += ((word & 0xFC00) != 0xDC00);
4394   }
4395   return counter;
4396 }
4397 
4398 template <endianness big_endian>
utf8_length_from_utf16(const char16_t* buf, size_t len)4399 inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
4400   // We are not BOM aware.
4401   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
4402   size_t counter{0};
4403   for(size_t i = 0; i < len; i++) {
4404     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4405     counter++;                                      // ASCII
4406     counter += static_cast<size_t>(word > 0x7F);    // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
4407     counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) || (word >= 0xE000));   // three-byte
4408   }
4409   return counter;
4410 }
4411 
4412 template <endianness big_endian>
utf32_length_from_utf16(const char16_t* buf, size_t len)4413 inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
4414   // We are not BOM aware.
4415   const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
4416   size_t counter{0};
4417   for(size_t i = 0; i < len; i++) {
4418     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4419     counter += ((word & 0xFC00) != 0xDC00);
4420   }
4421   return counter;
4422 }
4423 
4424 
/**
 * Number of Latin-1 bytes produced for len UTF-16 code units: the same
 * count is returned unchanged.
 */
inline size_t latin1_length_from_utf16(size_t len) {
  const size_t latin1_bytes = len;
  return latin1_bytes;
}
4428 
/**
 * Copies size UTF-16 code units from in to out, swapping the two bytes of
 * each unit.
 *
 * @param in    source code units
 * @param size  number of code units to convert
 * @param out   destination; receives size byte-swapped code units
 */
simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
  const uint16_t * source = reinterpret_cast<const uint16_t *>(in);
  uint16_t * destination = reinterpret_cast<uint16_t *>(out);
  for (size_t index = 0; index < size; index++) {
    const uint16_t unit = source[index];
    destination[index] = uint16_t(unit >> 8 | unit << 8);
  }
}
4436 
4437 
4438 template <endianness big_endian>
trim_partial_utf16(const char16_t* input, size_t length)4439 simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t* input, size_t length) {
4440   if (length <= 1) {
4441     return length;
4442   }
4443   uint16_t last_word = uint16_t(input[length-1]);
4444   last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word;
4445   length -= ((last_word & 0xFC00) == 0xD800);
4446   return length;
4447 }
4448 
4449 } // utf16 namespace
4450 } // unnamed namespace
4451 } // namespace scalar
4452 } // namespace simdutf
4453 
4454 #endif
4455 /* end file src/scalar/utf16.h */
4456 
4457 namespace simdutf {
supported_by_runtime_system() const4458 bool implementation::supported_by_runtime_system() const {
4459   uint32_t required_instruction_sets = this->required_instruction_sets();
4460   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
4461   return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
4462 }
4463 
4464 simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
4465     // If there is a BOM, then we trust it.
4466     auto bom_encoding = simdutf::BOM::check_bom(input, length);
4467     if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
4468     // UTF8 is common, it includes ASCII, and is commonly represented
4469     // without a BOM, so if it fits, go with that. Note that it is still
4470     // possible to get it wrong, we are only 'guessing'. If some has UTF-16
4471     // data without a BOM, it could pass as UTF-8.
4472     //
4473     // An interesting twist might be to check for UTF-16 ASCII first (every
4474     // other byte is zero).
4475     if(validate_utf8(input, length)) { return encoding_type::UTF8; }
4476     // The next most common encoding that might appear without BOM is probably
4477     // UTF-16LE, so try that next.
4478     if((length % 2) == 0) {
4479       // important: we need to divide by two
4480       if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
4481     }
4482     if((length % 4) == 0) {
4483       if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
4484     }
4485     return encoding_type::unspecified;
4486 }
4487 
4488 namespace internal {
4489 
4490 // Static array of known implementations. We're hoping these get baked into the executable
4491 // without requiring a static initializer.
4492 
4493 
4494 #if SIMDUTF_IMPLEMENTATION_ICELAKE
get_icelake_singleton()4495 static const icelake::implementation* get_icelake_singleton() {
4496   static const icelake::implementation icelake_singleton{};
4497   return &icelake_singleton;
4498 }
4499 #endif
4500 #if SIMDUTF_IMPLEMENTATION_HASWELL
get_haswell_singleton()4501 static const haswell::implementation* get_haswell_singleton() {
4502   static const haswell::implementation haswell_singleton{};
4503   return &haswell_singleton;
4504 }
4505 #endif
4506 #if SIMDUTF_IMPLEMENTATION_WESTMERE
get_westmere_singleton()4507 static const westmere::implementation* get_westmere_singleton() {
4508   static const westmere::implementation westmere_singleton{};
4509   return &westmere_singleton;
4510 }
4511 #endif
4512 #if SIMDUTF_IMPLEMENTATION_ARM64
get_arm64_singleton()4513 static const arm64::implementation* get_arm64_singleton() {
4514   static const arm64::implementation arm64_singleton{};
4515   return &arm64_singleton;
4516 }
4517 #endif
4518 #if SIMDUTF_IMPLEMENTATION_PPC64
get_ppc64_singleton()4519 static const ppc64::implementation* get_ppc64_singleton() {
4520   static const ppc64::implementation ppc64_singleton{};
4521   return &ppc64_singleton;
4522 }
4523 #endif
4524 #if SIMDUTF_IMPLEMENTATION_FALLBACK
get_fallback_singleton()4525 static const fallback::implementation* get_fallback_singleton() {
4526   static const fallback::implementation fallback_singleton{};
4527   return &fallback_singleton;
4528 }
4529 #endif
4530 
4531 /**
4532  * @private Detects best supported implementation on first use, and sets it
4533  */
4534 class detect_best_supported_implementation_on_first_use final : public implementation {
4535 public:
4536   const std::string &name() const noexcept final { return set_best()->name(); }
4537   const std::string &description() const noexcept final { return set_best()->description(); }
4538   uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
4539 
4540   simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
4541     return set_best()->detect_encodings(input, length);
4542   }
4543 
4544   simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
4545     return set_best()->validate_utf8(buf, len);
4546   }
4547 
4548   simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
4549     return set_best()->validate_utf8_with_errors(buf, len);
4550   }
4551 
4552   simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
4553     return set_best()->validate_ascii(buf, len);
4554   }
4555 
4556   simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
4557     return set_best()->validate_ascii_with_errors(buf, len);
4558   }
4559 
4560   simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
4561     return set_best()->validate_utf16le(buf, len);
4562   }
4563 
4564   simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
4565     return set_best()->validate_utf16be(buf, len);
4566   }
4567 
4568   simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
4569     return set_best()->validate_utf16le_with_errors(buf, len);
4570   }
4571 
4572   simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
4573     return set_best()->validate_utf16be_with_errors(buf, len);
4574   }
4575 
4576   simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
4577     return set_best()->validate_utf32(buf, len);
4578   }
4579 
4580   simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
4581     return set_best()->validate_utf32_with_errors(buf, len);
4582   }
4583 
4584   simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override {
4585     return set_best()->convert_latin1_to_utf8(buf, len,utf8_output);
4586   }
4587 
4588   simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4589     return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
4590   }
4591 
4592   simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4593     return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
4594   }
4595 
4596   simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) const noexcept final override {
4597     return set_best()->convert_latin1_to_utf32(buf, len,latin1_output);
4598   }
4599 
4600   simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override {
4601     return set_best()->convert_utf8_to_latin1(buf, len,latin1_output);
4602   }
4603 
4604   simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept  final override {
4605   return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
4606   }
4607 
4608   simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override {
4609     return set_best()->convert_valid_utf8_to_latin1(buf, len,latin1_output);
4610   }
4611 
4612   simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4613     return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
4614   }
4615 
4616   simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4617     return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
4618   }
4619 
4620   simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4621     return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
4622   }
4623 
4624   simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4625     return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
4626   }
4627 
4628   simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4629     return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
4630   }
4631 
4632   simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4633     return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
4634   }
4635 
4636   simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4637     return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
4638   }
4639 
4640   simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4641     return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
4642   }
4643 
4644   simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4645     return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
4646   }
4647 
4648   simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4649     return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
4650   }
4651 
4652   simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4653     return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
4654   }
4655 
4656   simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4657     return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output);
4658   }
4659 
4660   simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4661     return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output);
4662   }
4663 
4664   simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4665     return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
4666   }
4667 
4668   simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
4669     return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
4670   }
4671 
4672   simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4673     return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
4674   }
4675 
4676   simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4677     return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
4678   }
4679 
4680   simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4681     return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
4682   }
4683 
4684   simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4685     return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
4686   }
4687 
4688   simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4689     return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
4690   }
4691 
4692   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
4693     return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
4694   }
4695 
4696   simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
4697     return set_best()->convert_utf32_to_latin1(buf, len,latin1_output);
4698   }
4699 
4700   simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
4701     return set_best()->convert_utf32_to_latin1_with_errors(buf, len,latin1_output);
4702   }
4703 
4704   simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
4705     return set_best()->convert_utf32_to_latin1(buf, len,latin1_output);
4706   }
4707 
4708   simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4709     return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
4710   }
4711 
4712   simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4713     return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
4714   }
4715 
4716   simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
4717     return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
4718   }
4719 
4720   simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4721     return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
4722   }
4723 
4724   simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4725     return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
4726   }
4727 
4728   simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4729     return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
4730   }
4731 
4732   simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4733     return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
4734   }
4735 
4736   simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4737     return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
4738   }
4739 
4740   simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
4741     return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
4742   }
4743 
4744   simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4745     return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
4746   }
4747 
4748   simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4749     return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
4750   }
4751 
4752   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4753     return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
4754   }
4755 
4756   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4757     return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
4758   }
4759 
4760   simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4761     return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
4762   }
4763 
4764   simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
4765     return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
4766   }
4767 
4768   void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
4769     set_best()->change_endianness_utf16(buf, len, output);
4770   }
4771 
4772   simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
4773     return set_best()->count_utf16le(buf, len);
4774   }
4775 
4776   simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
4777     return set_best()->count_utf16be(buf, len);
4778   }
4779 
4780   simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
4781     return set_best()->count_utf8(buf, len);
4782   }
4783 
4784   simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) const noexcept override {
4785     return set_best()->latin1_length_from_utf8(buf, len);
4786   }
4787 
4788   simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override {
4789     return set_best()->latin1_length_from_utf16(len);
4790   }
4791 
4792   simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override {
4793     return set_best()->latin1_length_from_utf32(len);
4794   }
4795 
4796   simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) const noexcept override {
4797     return set_best()->utf8_length_from_latin1(buf, len);
4798   }
4799 
4800   simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
4801     return set_best()->utf8_length_from_utf16le(buf, len);
4802   }
4803 
4804   simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
4805     return set_best()->utf8_length_from_utf16be(buf, len);
4806   }
4807 
4808   simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override {
4809     return set_best()->utf16_length_from_latin1(len);
4810   }
4811 
4812   simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override {
4813     return set_best()->utf32_length_from_latin1(len);
4814   }
4815 
4816   simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
4817     return set_best()->utf32_length_from_utf16le(buf, len);
4818   }
4819 
4820   simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
4821     return set_best()->utf32_length_from_utf16be(buf, len);
4822   }
4823 
4824   simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
4825     return set_best()->utf16_length_from_utf8(buf, len);
4826   }
4827 
4828   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
4829     return set_best()->utf8_length_from_utf32(buf, len);
4830   }
4831 
4832   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
4833     return set_best()->utf16_length_from_utf32(buf, len);
4834   }
4835 
4836   simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
4837     return set_best()->utf32_length_from_utf8(buf, len);
4838   }
4839 
4840   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
4841 
4842 private:
4843   const implementation *set_best() const noexcept;
4844 };
4845 
get_available_implementation_pointers()4846 static const std::initializer_list<const implementation *>& get_available_implementation_pointers() {
4847   static const std::initializer_list<const implementation *> available_implementation_pointers {
4848 #if SIMDUTF_IMPLEMENTATION_ICELAKE
4849     get_icelake_singleton(),
4850 #endif
4851 #if SIMDUTF_IMPLEMENTATION_HASWELL
4852     get_haswell_singleton(),
4853 #endif
4854 #if SIMDUTF_IMPLEMENTATION_WESTMERE
4855     get_westmere_singleton(),
4856 #endif
4857 #if SIMDUTF_IMPLEMENTATION_ARM64
4858     get_arm64_singleton(),
4859 #endif
4860 #if SIMDUTF_IMPLEMENTATION_PPC64
4861     get_ppc64_singleton(),
4862 #endif
4863 #if SIMDUTF_IMPLEMENTATION_FALLBACK
4864     get_fallback_singleton(),
4865 #endif
4866   }; // available_implementation_pointers
4867   return available_implementation_pointers;
4868 }
4869 
// Placeholder implementation returned when no usable implementation is found:
// detect_best_supported() falls back to it when no listed implementation's
// required instruction sets are all supported, and set_best() selects it when
// the name in SIMDUTF_FORCE_IMPLEMENTATION is not found in the list.
// Every operation reports failure: validators return false, *_with_errors
// methods return result(error_code::OTHER, 0), conversions/counts/length
// queries return 0, and change_endianness_utf16 does nothing.
class unsupported_implementation final : public implementation {
public:
  // Reports encoding_type::unspecified for any input.
  simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
    return encoding_type::unspecified;
  }

  // --- Validation: always rejects ---
  simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
    return false; // Just refuse to validate. Given that we have a fallback implementation
    // it seems unlikely that unsupported_implementation will ever be used. If it is used,
    // then it will flag all strings as invalid. The alternative is to return an error_code
    // from which the user has to figure out whether the string is valid UTF-8... which seems
    // like a lot of work just to handle the very unlikely case that we have an unsupported
    // implementation. And, when it does happen (that we have an unsupported implementation),
    // what are the chances that the programmer has a fallback? Given that *we* provide the
    // fallback, it implies that the programmer would need a fallback for our fallback.
  }

  simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  // --- Conversions: return 0, or result(error_code::OTHER, 0) for *_with_errors ---
  simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *, size_t, char* ) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
    return 0;
  }

  // Intentionally empty: the output buffer is never written.
  void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {

  }

  // --- Counting: always 0 ---
  simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
    return 0;
  }

  // --- Output-length queries: always 0 ---
  simdutf_warn_unused size_t latin1_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override {
    return 0;
  }
  simdutf_warn_unused size_t utf8_length_from_latin1(const char *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
    return 0;
  }

    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }
  simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override {
    return 0;
  }
  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }

  // Registers the object under the name "unsupported" with an instruction-set
  // requirement mask of 0.
  unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
};
5179 
// Shared instance handed out whenever no usable implementation is found
// (returned by detect_best_supported() and by set_best() for an unknown forced name).
const unsupported_implementation unsupported_singleton{};
5181 
5182 size_t available_implementation_list::size() const noexcept {
5183   return internal::get_available_implementation_pointers().size();
5184 }
5185 const implementation * const *available_implementation_list::begin() const noexcept {
5186   return internal::get_available_implementation_pointers().begin();
5187 }
5188 const implementation * const *available_implementation_list::end() const noexcept {
5189   return internal::get_available_implementation_pointers().end();
5190 }
5191 const implementation *available_implementation_list::detect_best_supported() const noexcept {
5192   // They are prelisted in priority order, so we just go down the list
5193   uint32_t supported_instruction_sets = internal::detect_supported_architectures();
5194   for (const implementation *impl : internal::get_available_implementation_pointers()) {
5195     uint32_t required_instruction_sets = impl->required_instruction_sets();
5196     if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
5197   }
5198   return &unsupported_singleton; // this should never happen?
5199 }
5200 
5201 const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
5202   SIMDUTF_PUSH_DISABLE_WARNINGS
5203   SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
5204   char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
5205   SIMDUTF_POP_DISABLE_WARNINGS
5206 
5207   if (force_implementation_name) {
5208     auto force_implementation = get_available_implementations()[force_implementation_name];
5209     if (force_implementation) {
5210       return get_active_implementation() = force_implementation;
5211     } else {
5212       // Note: abort() and stderr usage within the library is forbidden.
5213       return get_active_implementation() = &unsupported_singleton;
5214     }
5215   }
5216   return get_active_implementation() = get_available_implementations().detect_best_supported();
5217 }
5218 
5219 } // namespace internal
5220 
5221 
5222 
5223 /**
5224  * The list of available implementations compiled into simdutf.
5225  */
get_available_implementations()5226 SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
5227   static const internal::available_implementation_list available_implementations{};
5228   return available_implementations;
5229 }
5230 
5231 /**
5232   * The active implementation.
5233   */
get_active_implementation()5234 SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
5235     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
5236     static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
5237     return active_implementation;
5238 }
5239 
// UTF-8 / ASCII validation entry points. Each one passes (buf, len) unchanged
// to the same-named member of the object returned by get_active_implementation().
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf8(buf, len);
}
// Same input as validate_utf8, but yields a `result` instead of a bool.
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf8_with_errors(buf, len);
}
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
  return get_active_implementation()->validate_ascii(buf, len);
}
// Same input as validate_ascii, but yields a `result` instead of a bool.
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
  return get_active_implementation()->validate_ascii_with_errors(buf, len);
}
// Native-endian UTF-8 -> UTF-16 conversion: resolved at compile time to the
// big-endian variant when SIMDUTF_IS_BIG_ENDIAN is non-zero, otherwise to the
// little-endian variant.
simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be(input, length, utf16_output);
  #else
  return convert_utf8_to_utf16le(input, length, utf16_output);
  #endif
}
// Latin-1 -> UTF-8 / UTF-16LE / UTF-16BE entry points. Each one passes its
// arguments unchanged to the same-named member of the active implementation.
simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) noexcept {
  return get_active_implementation()->convert_latin1_to_utf8(buf, len,utf8_output);
}
simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) noexcept {
  return get_active_implementation()->convert_latin1_to_utf16le(buf, len, utf16_output);
}
simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) noexcept{
  return get_active_implementation()->convert_latin1_to_utf16be(buf, len, utf16_output);
}
5268 simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) noexcept {
5269   return get_active_implementation()->convert_latin1_to_utf32(buf, len,latin1_output);
5270 }
// UTF-8 -> Latin-1 entry points (plain, with-errors, and "valid" variants).
// Each one passes its arguments unchanged to the same-named member of the
// active implementation.
simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept {
  return get_active_implementation()->convert_utf8_to_latin1(buf, len,latin1_output);
}
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) noexcept {
  return get_active_implementation()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
}
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept {
  return get_active_implementation()->convert_valid_utf8_to_latin1(buf, len,latin1_output);
}
// UTF-8 -> UTF-16 entry points with explicit endianness. The le/be functions
// pass their arguments unchanged to the same-named member of the active
// implementation.
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
}
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
}
// Native-endian variant: picks the be or le with-errors function at compile
// time from SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
  #else
  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
  #endif
}
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
}
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
}
// UTF-8 -> UTF-32 entry points. Both pass their arguments unchanged to the
// same-named member of the active implementation.
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
}
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
  return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
}
// UTF-16 / UTF-32 validation entry points.
//
// Functions without an le/be suffix are native-endian: they pick the be or le
// sibling at compile time from SIMDUTF_IS_BIG_ENDIAN. All others pass
// (buf, len) unchanged to the same-named member of the active implementation.
simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be(buf, len);
  #else
  return validate_utf16le(buf, len);
  #endif
}
simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf16le(buf, len);
}
simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf16be(buf, len);
}
// Native-endian with-errors variant (compile-time be/le selection).
simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return validate_utf16be_with_errors(buf, len);
  #else
  return validate_utf16le_with_errors(buf, len);
  #endif
}
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf16le_with_errors(buf, len);
}
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf16be_with_errors(buf, len);
}
simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf32(buf, len);
}
simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
  return get_active_implementation()->validate_utf32_with_errors(buf, len);
}
// "convert_valid" UTF-8 -> UTF-16 / UTF-32 entry points.
//
// The unsuffixed UTF-16 function is native-endian (compile-time be/le
// selection via SIMDUTF_IS_BIG_ENDIAN); the others pass their arguments
// unchanged to the same-named member of the active implementation.
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
  #else
  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
  #endif
}
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
}
// Native-endian conversions involving UTF-16. Each function contains no logic
// of its own: it selects the be or le sibling at compile time from
// SIMDUTF_IS_BIG_ENDIAN and passes its arguments through unchanged.
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
  #else
  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
  #endif
}
simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_latin1(buf, len, latin1_buffer);
  #else
  return convert_utf16le_to_latin1(buf, len, latin1_buffer);
  #endif
}
simdutf_warn_unused size_t convert_latin1_to_utf16(const char * buf, size_t len, char16_t* utf16_output) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_latin1_to_utf16be(buf, len, utf16_output);
  #else
  return convert_latin1_to_utf16le(buf, len, utf16_output);
  #endif
}
// UTF-16 (explicit endianness) -> Latin-1 entry points: plain, "valid", and
// with-errors variants. Each one passes its arguments unchanged to the
// same-named member of the active implementation.
simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
}
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
}
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
}
// UTF-16 -> UTF-8 entry points, plus the native-endian with-errors
// dispatchers for UTF-16 -> UTF-8 and UTF-16 -> Latin-1.
//
// le/be functions pass their arguments unchanged to the same-named member of
// the active implementation; unsuffixed functions pick the be or le sibling
// at compile time from SIMDUTF_IS_BIG_ENDIAN.
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
}
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
  #else
  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
  #endif
}
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
  #else
  return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
  #endif
}
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
}
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
}
// "convert_valid" UTF-16 -> UTF-8 / Latin-1 entry points.
//
// Unsuffixed functions are native-endian (compile-time be/le selection via
// SIMDUTF_IS_BIG_ENDIAN); le/be functions pass their arguments unchanged to
// the same-named member of the active implementation.
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
  #else
  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
  #endif
}
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
  #else
  return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
  #endif
}
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
}
// UTF-32 -> UTF-8 entry points (plain, with-errors, "valid"). Each one passes
// its arguments unchanged to the same-named member of the active implementation.
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
}
// UTF-32 -> UTF-16 entry points (plus UTF-32 -> Latin-1).
//
// Functions whose UTF-16 side has no le/be suffix are native-endian: they
// pick the be or le sibling at compile time from SIMDUTF_IS_BIG_ENDIAN.
// All others pass their arguments unchanged to the same-named member of the
// active implementation.
simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
  #else
  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
  #endif
}
simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept {
  return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
}
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
}
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
}
// Native-endian with-errors variant.
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
  #else
  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
  #endif
}
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
}
// Native-endian "valid" variant.
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
  #else
  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
  #endif
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
}
// UTF-16 -> UTF-32 entry points.
//
// Functions whose UTF-16 side has no le/be suffix are native-endian: they
// pick the be or le sibling at compile time from SIMDUTF_IS_BIG_ENDIAN.
// All others pass their arguments unchanged to the same-named member of the
// active implementation.
simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
  #else
  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
  #endif
}
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
}
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
}
// Native-endian with-errors variant.
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
  #else
  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
  #endif
}
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
}
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
}
// Native-endian "valid" variant.
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
  #else
  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
  #endif
}
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
}
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
  return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
}
// Passes (input, length, output) unchanged to change_endianness_utf16 on the
// active implementation. Returns nothing; any result is delivered through
// `output`.
void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
  get_active_implementation()->change_endianness_utf16(input, length, output);
}
// Counting entry points.
//
// count_utf16 is native-endian: it picks count_utf16be or count_utf16le at
// compile time from SIMDUTF_IS_BIG_ENDIAN. The others pass (input, length)
// unchanged to the same-named member of the active implementation.
simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return count_utf16be(input, length);
  #else
  return count_utf16le(input, length);
  #endif
}
simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->count_utf16le(input, length);
}
simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->count_utf16be(input, length);
}
simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
  return get_active_implementation()->count_utf8(input, length);
}
// Length-query entry points involving Latin-1. Each one passes its arguments
// unchanged to the same-named member of the active implementation. Note that
// the from_utf16 / from_utf32 variants take only a length, not a buffer.
simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) noexcept {
  return get_active_implementation()->latin1_length_from_utf8(buf, len);
}
simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept {
  return get_active_implementation()->latin1_length_from_utf16(len);
}
simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept {
  return get_active_implementation()->latin1_length_from_utf32(len);
}
simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) noexcept {
  return get_active_implementation()->utf8_length_from_latin1(buf, len);
}
// Length-query entry points between UTF-8, UTF-16 and UTF-32.
//
// Functions whose UTF-16 input has no le/be suffix are native-endian: they
// pick the be or le sibling at compile time from SIMDUTF_IS_BIG_ENDIAN.
// All others pass their arguments unchanged to the same-named member of the
// active implementation.
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return utf8_length_from_utf16be(input, length);
  #else
  return utf8_length_from_utf16le(input, length);
  #endif
}
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->utf8_length_from_utf16le(input, length);
}
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->utf8_length_from_utf16be(input, length);
}
// Native-endian variant.
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
  #if SIMDUTF_IS_BIG_ENDIAN
  return utf32_length_from_utf16be(input, length);
  #else
  return utf32_length_from_utf16le(input, length);
  #endif
}
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->utf32_length_from_utf16le(input, length);
}
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
  return get_active_implementation()->utf32_length_from_utf16be(input, length);
}
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
  return get_active_implementation()->utf16_length_from_utf8(input, length);
}
// Takes only a length, not a buffer.
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
  return get_active_implementation()->utf16_length_from_latin1(length);
}
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
  return get_active_implementation()->utf8_length_from_utf32(input, length);
}
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
  return get_active_implementation()->utf16_length_from_utf32(input, length);
}
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
  return get_active_implementation()->utf32_length_from_utf8(input, length);
}
// Encoding-detection entry points. Both pass (buf, length) unchanged to the
// same-named member of the active implementation; autodetect_encoding yields
// a single encoding_type, detect_encodings yields an int.
simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
  return get_active_implementation()->autodetect_encoding(buf, length);
}
simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
  return get_active_implementation()->detect_encodings(buf, length);
}
builtin_implementation()5606 const implementation * builtin_implementation() {
5607   static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
5608   return builtin_impl;
5609 }
5610 
trim_partial_utf8(const char *input, size_t length)5611 simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
5612   return scalar::utf8::trim_partial_utf8(input, length);
5613 }
5614 
trim_partial_utf16be(const char16_t* input, size_t length)5615 simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length) {
5616   return scalar::utf16::trim_partial_utf16<BIG>(input, length);
5617 }
5618 
trim_partial_utf16le(const char16_t* input, size_t length)5619 simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length) {
5620   return scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
5621 }
5622 
trim_partial_utf16(const char16_t* input, size_t length)5623 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length) {
5624   #if SIMDUTF_IS_BIG_ENDIAN
5625   return trim_partial_utf16be(input, length);
5626   #else
5627   return trim_partial_utf16le(input, length);
5628   #endif
5629 }
5630 
5631 } // namespace simdutf
5632 
5633 /* end file src/implementation.cpp */
5634 /* begin file src/encoding_types.cpp */
5635 
5636 namespace simdutf {
match_system(endianness e)5637 bool match_system(endianness e) {
5638 #if SIMDUTF_IS_BIG_ENDIAN
5639     return e == endianness::BIG;
5640 #else
5641     return e == endianness::LITTLE;
5642 #endif
5643 }
5644 
to_string(encoding_type bom)5645 std::string to_string(encoding_type bom) {
5646   switch (bom) {
5647       case UTF16_LE:     return "UTF16 little-endian";
5648       case UTF16_BE:     return "UTF16 big-endian";
5649       case UTF32_LE:     return "UTF32 little-endian";
5650       case UTF32_BE:     return "UTF32 big-endian";
5651       case UTF8:         return "UTF8";
5652       case unspecified:  return "unknown";
5653       default:           return "error";
5654   }
5655 }
5656 
5657 namespace BOM {
5658 // Note that BOM for UTF8 is discouraged.
check_bom(const uint8_t* byte, size_t length)5659 encoding_type check_bom(const uint8_t* byte, size_t length) {
5660         if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
5661             if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
5662                 return encoding_type::UTF32_LE;
5663             } else {
5664                 return encoding_type::UTF16_LE;
5665             }
5666         } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
5667             return encoding_type::UTF16_BE;
5668         } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
5669             return encoding_type::UTF32_BE;
5670         } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
5671             return encoding_type::UTF8;
5672         }
5673         return encoding_type::unspecified;
5674     }
5675 
check_bom(const char* byte, size_t length)5676 encoding_type check_bom(const char* byte, size_t length) {
5677       return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
5678  }
5679 
bom_byte_size(encoding_type bom)5680  size_t bom_byte_size(encoding_type bom) {
5681         switch (bom) {
5682             case UTF16_LE:     return 2;
5683             case UTF16_BE:     return 2;
5684             case UTF32_LE:     return 4;
5685             case UTF32_BE:     return 4;
5686             case UTF8:         return 3;
5687             case unspecified:  return 0;
5688             default:           return 0;
5689         }
5690 }
5691 
5692 }
5693 }
5694 /* end file src/encoding_types.cpp */
5695 /* begin file src/error.cpp */
5696 namespace simdutf {
5697 
result()5698   simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {}
5699 
result(error_code _err, size_t _pos)5700   simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {}
5701 
5702 }
5703 /* end file src/error.cpp */
5704 // The large tables should be included once and they
5705 // should not depend on a kernel.
5706 /* begin file src/tables/utf8_to_utf16_tables.h */
5707 #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
5708 #define SIMDUTF_UTF8_TO_UTF16_TABLES_H
5709 #include <cstdint>
5710 
5711 namespace simdutf {
5712 namespace {
5713 namespace tables {
5714 namespace utf8_to_utf16 {
5715 /**
5716  * utf8bigindex uses about 8 kB
5717  * shufutf8 uses about 3344 B
5718  *
5719  * So we use a bit over 11 kB. It would be
5720  * easy to save about 4 kB by only
5721  * storing the index in utf8bigindex, and
5722  * deriving the consumed bytes otherwise.
5723  * However, this may come at a significant (10% to 20%)
5724  * performance penalty.
5725  */
5726 
5727 const uint8_t shufutf8[209][16] =
5728 {	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
5729  	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
5730  	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
5731  	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
5732  	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
5733  	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
5734  	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
5735  	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
5736  	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
5737  	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
5738  	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
5739  	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
5740  	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
5741  	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
5742  	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
5743  	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
5744  	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
5745  	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
5746  	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
5747  	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
5748  	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
5749  	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
5750  	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
5751  	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
5752  	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
5753  	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
5754  	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
5755  	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
5756  	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
5757  	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
5758  	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
5759  	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
5760  	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
5761  	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
5762  	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
5763  	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
5764  	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
5765  	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
5766  	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
5767  	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
5768  	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
5769  	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
5770  	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
5771  	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
5772  	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
5773  	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
5774  	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
5775  	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
5776  	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
5777  	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
5778  	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
5779  	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
5780  	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
5781  	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
5782  	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
5783  	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
5784  	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
5785  	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
5786  	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
5787  	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
5788  	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
5789  	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
5790  	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
5791  	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
5792  	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
5793  	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
5794  	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
5795  	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
5796  	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
5797  	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
5798  	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
5799  	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
5800  	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
5801  	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
5802  	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
5803  	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
5804  	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
5805  	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
5806  	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
5807  	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
5808  	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
5809  	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
5810  	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
5811  	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
5812  	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
5813  	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
5814  	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
5815  	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
5816  	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
5817  	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
5818  	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
5819  	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
5820  	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
5821  	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
5822  	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
5823  	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
5824  	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
5825  	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
5826  	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
5827  	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
5828  	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
5829  	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
5830  	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
5831  	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
5832  	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
5833  	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
5834  	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
5835  	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
5836  	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
5837  	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
5838  	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
5839  	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
5840  	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
5841  	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
5842  	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
5843  	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
5844  	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
5845  	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
5846  	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
5847  	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
5848  	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
5849  	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
5850  	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
5851  	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
5852  	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
5853  	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
5854  	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
5855  	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
5856  	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
5857  	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
5858  	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
5859  	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
5860  	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
5861  	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
5862  	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
5863  	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
5864  	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
5865  	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
5866  	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
5867  	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
5868  	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
5869  	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
5870  	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
5871  	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
5872  	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
5873  	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
5874  	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
5875  	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
5876  	{0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
5877  	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
5878  	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
5879  	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
5880  	{0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
5881  	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
5882  	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
5883  	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
5884  	{0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
5885  	{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
5886  	{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
5887  	{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
5888  	{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
5889  	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
5890  	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
5891  	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
5892  	{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
5893  	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
5894  	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
5895  	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
5896  	{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
5897  	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
5898  	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
5899  	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
5900  	{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
5901  	{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
5902  	{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
5903  	{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
5904  	{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
5905  	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
5906  	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
5907  	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
5908  	{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
5909  	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
5910  	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
5911  	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
5912  	{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
5913  	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
5914  	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
5915  	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
5916  	{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
5917  	{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
5918  	{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
5919  	{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
5920  	{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
5921  	{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
5922  	{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
5923  	{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
5924  	{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
5925  	{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
5926  	{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
5927  	{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
5928  	{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
5929  	{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
5930  	{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
5931  	{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
5932  	{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
5933  	{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
5934  	{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
5935  	{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
5936  	{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
5937 /* number of two bytes : 64 */
5938 /* number of two + three bytes : 145 */
5939 /* number of two + three + four bytes : 209 */
5940 const uint8_t utf8bigindex[4096][2] =
5941 {	{209, 12},
5942  	{209, 12},
5943  	{209, 12},
5944  	{209, 12},
5945  	{209, 12},
5946  	{209, 12},
5947  	{209, 12},
5948  	{145, 3},
5949  	{209, 12},
5950  	{209, 12},
5951  	{209, 12},
5952  	{146, 4},
5953  	{209, 12},
5954  	{149, 4},
5955  	{161, 4},
5956  	{64, 4},
5957  	{209, 12},
5958  	{209, 12},
5959  	{209, 12},
5960  	{147, 5},
5961  	{209, 12},
5962  	{150, 5},
5963  	{162, 5},
5964  	{65, 5},
5965  	{209, 12},
5966  	{153, 5},
5967  	{165, 5},
5968  	{67, 5},
5969  	{177, 5},
5970  	{73, 5},
5971  	{91, 5},
5972  	{64, 4},
5973  	{209, 12},
5974  	{209, 12},
5975  	{209, 12},
5976  	{148, 6},
5977  	{209, 12},
5978  	{151, 6},
5979  	{163, 6},
5980  	{66, 6},
5981  	{209, 12},
5982  	{154, 6},
5983  	{166, 6},
5984  	{68, 6},
5985  	{178, 6},
5986  	{74, 6},
5987  	{92, 6},
5988  	{64, 4},
5989  	{209, 12},
5990  	{157, 6},
5991  	{169, 6},
5992  	{70, 6},
5993  	{181, 6},
5994  	{76, 6},
5995  	{94, 6},
5996  	{65, 5},
5997  	{193, 6},
5998  	{82, 6},
5999  	{100, 6},
6000  	{67, 5},
6001  	{118, 6},
6002  	{73, 5},
6003  	{91, 5},
6004  	{0, 6},
6005  	{209, 12},
6006  	{209, 12},
6007  	{209, 12},
6008  	{209, 12},
6009  	{209, 12},
6010  	{152, 7},
6011  	{164, 7},
6012  	{145, 3},
6013  	{209, 12},
6014  	{155, 7},
6015  	{167, 7},
6016  	{69, 7},
6017  	{179, 7},
6018  	{75, 7},
6019  	{93, 7},
6020  	{64, 4},
6021  	{209, 12},
6022  	{158, 7},
6023  	{170, 7},
6024  	{71, 7},
6025  	{182, 7},
6026  	{77, 7},
6027  	{95, 7},
6028  	{65, 5},
6029  	{194, 7},
6030  	{83, 7},
6031  	{101, 7},
6032  	{67, 5},
6033  	{119, 7},
6034  	{73, 5},
6035  	{91, 5},
6036  	{1, 7},
6037  	{209, 12},
6038  	{209, 12},
6039  	{173, 7},
6040  	{148, 6},
6041  	{185, 7},
6042  	{79, 7},
6043  	{97, 7},
6044  	{66, 6},
6045  	{197, 7},
6046  	{85, 7},
6047  	{103, 7},
6048  	{68, 6},
6049  	{121, 7},
6050  	{74, 6},
6051  	{92, 6},
6052  	{2, 7},
6053  	{209, 12},
6054  	{157, 6},
6055  	{109, 7},
6056  	{70, 6},
6057  	{127, 7},
6058  	{76, 6},
6059  	{94, 6},
6060  	{4, 7},
6061  	{193, 6},
6062  	{82, 6},
6063  	{100, 6},
6064  	{8, 7},
6065  	{118, 6},
6066  	{16, 7},
6067  	{32, 7},
6068  	{0, 6},
6069  	{209, 12},
6070  	{209, 12},
6071  	{209, 12},
6072  	{209, 12},
6073  	{209, 12},
6074  	{209, 12},
6075  	{209, 12},
6076  	{145, 3},
6077  	{209, 12},
6078  	{156, 8},
6079  	{168, 8},
6080  	{146, 4},
6081  	{180, 8},
6082  	{149, 4},
6083  	{161, 4},
6084  	{64, 4},
6085  	{209, 12},
6086  	{159, 8},
6087  	{171, 8},
6088  	{72, 8},
6089  	{183, 8},
6090  	{78, 8},
6091  	{96, 8},
6092  	{65, 5},
6093  	{195, 8},
6094  	{84, 8},
6095  	{102, 8},
6096  	{67, 5},
6097  	{120, 8},
6098  	{73, 5},
6099  	{91, 5},
6100  	{64, 4},
6101  	{209, 12},
6102  	{209, 12},
6103  	{174, 8},
6104  	{148, 6},
6105  	{186, 8},
6106  	{80, 8},
6107  	{98, 8},
6108  	{66, 6},
6109  	{198, 8},
6110  	{86, 8},
6111  	{104, 8},
6112  	{68, 6},
6113  	{122, 8},
6114  	{74, 6},
6115  	{92, 6},
6116  	{3, 8},
6117  	{209, 12},
6118  	{157, 6},
6119  	{110, 8},
6120  	{70, 6},
6121  	{128, 8},
6122  	{76, 6},
6123  	{94, 6},
6124  	{5, 8},
6125  	{193, 6},
6126  	{82, 6},
6127  	{100, 6},
6128  	{9, 8},
6129  	{118, 6},
6130  	{17, 8},
6131  	{33, 8},
6132  	{0, 6},
6133  	{209, 12},
6134  	{209, 12},
6135  	{209, 12},
6136  	{209, 12},
6137  	{189, 8},
6138  	{152, 7},
6139  	{164, 7},
6140  	{145, 3},
6141  	{201, 8},
6142  	{88, 8},
6143  	{106, 8},
6144  	{69, 7},
6145  	{124, 8},
6146  	{75, 7},
6147  	{93, 7},
6148  	{64, 4},
6149  	{209, 12},
6150  	{158, 7},
6151  	{112, 8},
6152  	{71, 7},
6153  	{130, 8},
6154  	{77, 7},
6155  	{95, 7},
6156  	{6, 8},
6157  	{194, 7},
6158  	{83, 7},
6159  	{101, 7},
6160  	{10, 8},
6161  	{119, 7},
6162  	{18, 8},
6163  	{34, 8},
6164  	{1, 7},
6165  	{209, 12},
6166  	{209, 12},
6167  	{173, 7},
6168  	{148, 6},
6169  	{136, 8},
6170  	{79, 7},
6171  	{97, 7},
6172  	{66, 6},
6173  	{197, 7},
6174  	{85, 7},
6175  	{103, 7},
6176  	{12, 8},
6177  	{121, 7},
6178  	{20, 8},
6179  	{36, 8},
6180  	{2, 7},
6181  	{209, 12},
6182  	{157, 6},
6183  	{109, 7},
6184  	{70, 6},
6185  	{127, 7},
6186  	{24, 8},
6187  	{40, 8},
6188  	{4, 7},
6189  	{193, 6},
6190  	{82, 6},
6191  	{48, 8},
6192  	{8, 7},
6193  	{118, 6},
6194  	{16, 7},
6195  	{32, 7},
6196  	{0, 6},
6197  	{209, 12},
6198  	{209, 12},
6199  	{209, 12},
6200  	{209, 12},
6201  	{209, 12},
6202  	{209, 12},
6203  	{209, 12},
6204  	{145, 3},
6205  	{209, 12},
6206  	{209, 12},
6207  	{209, 12},
6208  	{146, 4},
6209  	{209, 12},
6210  	{149, 4},
6211  	{161, 4},
6212  	{64, 4},
6213  	{209, 12},
6214  	{160, 9},
6215  	{172, 9},
6216  	{147, 5},
6217  	{184, 9},
6218  	{150, 5},
6219  	{162, 5},
6220  	{65, 5},
6221  	{196, 9},
6222  	{153, 5},
6223  	{165, 5},
6224  	{67, 5},
6225  	{177, 5},
6226  	{73, 5},
6227  	{91, 5},
6228  	{64, 4},
6229  	{209, 12},
6230  	{209, 12},
6231  	{175, 9},
6232  	{148, 6},
6233  	{187, 9},
6234  	{81, 9},
6235  	{99, 9},
6236  	{66, 6},
6237  	{199, 9},
6238  	{87, 9},
6239  	{105, 9},
6240  	{68, 6},
6241  	{123, 9},
6242  	{74, 6},
6243  	{92, 6},
6244  	{64, 4},
6245  	{209, 12},
6246  	{157, 6},
6247  	{111, 9},
6248  	{70, 6},
6249  	{129, 9},
6250  	{76, 6},
6251  	{94, 6},
6252  	{65, 5},
6253  	{193, 6},
6254  	{82, 6},
6255  	{100, 6},
6256  	{67, 5},
6257  	{118, 6},
6258  	{73, 5},
6259  	{91, 5},
6260  	{0, 6},
6261  	{209, 12},
6262  	{209, 12},
6263  	{209, 12},
6264  	{209, 12},
6265  	{190, 9},
6266  	{152, 7},
6267  	{164, 7},
6268  	{145, 3},
6269  	{202, 9},
6270  	{89, 9},
6271  	{107, 9},
6272  	{69, 7},
6273  	{125, 9},
6274  	{75, 7},
6275  	{93, 7},
6276  	{64, 4},
6277  	{209, 12},
6278  	{158, 7},
6279  	{113, 9},
6280  	{71, 7},
6281  	{131, 9},
6282  	{77, 7},
6283  	{95, 7},
6284  	{7, 9},
6285  	{194, 7},
6286  	{83, 7},
6287  	{101, 7},
6288  	{11, 9},
6289  	{119, 7},
6290  	{19, 9},
6291  	{35, 9},
6292  	{1, 7},
6293  	{209, 12},
6294  	{209, 12},
6295  	{173, 7},
6296  	{148, 6},
6297  	{137, 9},
6298  	{79, 7},
6299  	{97, 7},
6300  	{66, 6},
6301  	{197, 7},
6302  	{85, 7},
6303  	{103, 7},
6304  	{13, 9},
6305  	{121, 7},
6306  	{21, 9},
6307  	{37, 9},
6308  	{2, 7},
6309  	{209, 12},
6310  	{157, 6},
6311  	{109, 7},
6312  	{70, 6},
6313  	{127, 7},
6314  	{25, 9},
6315  	{41, 9},
6316  	{4, 7},
6317  	{193, 6},
6318  	{82, 6},
6319  	{49, 9},
6320  	{8, 7},
6321  	{118, 6},
6322  	{16, 7},
6323  	{32, 7},
6324  	{0, 6},
6325  	{209, 12},
6326  	{209, 12},
6327  	{209, 12},
6328  	{209, 12},
6329  	{209, 12},
6330  	{209, 12},
6331  	{209, 12},
6332  	{145, 3},
6333  	{205, 9},
6334  	{156, 8},
6335  	{168, 8},
6336  	{146, 4},
6337  	{180, 8},
6338  	{149, 4},
6339  	{161, 4},
6340  	{64, 4},
6341  	{209, 12},
6342  	{159, 8},
6343  	{115, 9},
6344  	{72, 8},
6345  	{133, 9},
6346  	{78, 8},
6347  	{96, 8},
6348  	{65, 5},
6349  	{195, 8},
6350  	{84, 8},
6351  	{102, 8},
6352  	{67, 5},
6353  	{120, 8},
6354  	{73, 5},
6355  	{91, 5},
6356  	{64, 4},
6357  	{209, 12},
6358  	{209, 12},
6359  	{174, 8},
6360  	{148, 6},
6361  	{139, 9},
6362  	{80, 8},
6363  	{98, 8},
6364  	{66, 6},
6365  	{198, 8},
6366  	{86, 8},
6367  	{104, 8},
6368  	{14, 9},
6369  	{122, 8},
6370  	{22, 9},
6371  	{38, 9},
6372  	{3, 8},
6373  	{209, 12},
6374  	{157, 6},
6375  	{110, 8},
6376  	{70, 6},
6377  	{128, 8},
6378  	{26, 9},
6379  	{42, 9},
6380  	{5, 8},
6381  	{193, 6},
6382  	{82, 6},
6383  	{50, 9},
6384  	{9, 8},
6385  	{118, 6},
6386  	{17, 8},
6387  	{33, 8},
6388  	{0, 6},
6389  	{209, 12},
6390  	{209, 12},
6391  	{209, 12},
6392  	{209, 12},
6393  	{189, 8},
6394  	{152, 7},
6395  	{164, 7},
6396  	{145, 3},
6397  	{201, 8},
6398  	{88, 8},
6399  	{106, 8},
6400  	{69, 7},
6401  	{124, 8},
6402  	{75, 7},
6403  	{93, 7},
6404  	{64, 4},
6405  	{209, 12},
6406  	{158, 7},
6407  	{112, 8},
6408  	{71, 7},
6409  	{130, 8},
6410  	{28, 9},
6411  	{44, 9},
6412  	{6, 8},
6413  	{194, 7},
6414  	{83, 7},
6415  	{52, 9},
6416  	{10, 8},
6417  	{119, 7},
6418  	{18, 8},
6419  	{34, 8},
6420  	{1, 7},
6421  	{209, 12},
6422  	{209, 12},
6423  	{173, 7},
6424  	{148, 6},
6425  	{136, 8},
6426  	{79, 7},
6427  	{97, 7},
6428  	{66, 6},
6429  	{197, 7},
6430  	{85, 7},
6431  	{56, 9},
6432  	{12, 8},
6433  	{121, 7},
6434  	{20, 8},
6435  	{36, 8},
6436  	{2, 7},
6437  	{209, 12},
6438  	{157, 6},
6439  	{109, 7},
6440  	{70, 6},
6441  	{127, 7},
6442  	{24, 8},
6443  	{40, 8},
6444  	{4, 7},
6445  	{193, 6},
6446  	{82, 6},
6447  	{48, 8},
6448  	{8, 7},
6449  	{118, 6},
6450  	{16, 7},
6451  	{32, 7},
6452  	{0, 6},
6453  	{209, 12},
6454  	{209, 12},
6455  	{209, 12},
6456  	{209, 12},
6457  	{209, 12},
6458  	{209, 12},
6459  	{209, 12},
6460  	{145, 3},
6461  	{209, 12},
6462  	{209, 12},
6463  	{209, 12},
6464  	{146, 4},
6465  	{209, 12},
6466  	{149, 4},
6467  	{161, 4},
6468  	{64, 4},
6469  	{209, 12},
6470  	{209, 12},
6471  	{209, 12},
6472  	{147, 5},
6473  	{209, 12},
6474  	{150, 5},
6475  	{162, 5},
6476  	{65, 5},
6477  	{209, 12},
6478  	{153, 5},
6479  	{165, 5},
6480  	{67, 5},
6481  	{177, 5},
6482  	{73, 5},
6483  	{91, 5},
6484  	{64, 4},
6485  	{209, 12},
6486  	{209, 12},
6487  	{176, 10},
6488  	{148, 6},
6489  	{188, 10},
6490  	{151, 6},
6491  	{163, 6},
6492  	{66, 6},
6493  	{200, 10},
6494  	{154, 6},
6495  	{166, 6},
6496  	{68, 6},
6497  	{178, 6},
6498  	{74, 6},
6499  	{92, 6},
6500  	{64, 4},
6501  	{209, 12},
6502  	{157, 6},
6503  	{169, 6},
6504  	{70, 6},
6505  	{181, 6},
6506  	{76, 6},
6507  	{94, 6},
6508  	{65, 5},
6509  	{193, 6},
6510  	{82, 6},
6511  	{100, 6},
6512  	{67, 5},
6513  	{118, 6},
6514  	{73, 5},
6515  	{91, 5},
6516  	{0, 6},
6517  	{209, 12},
6518  	{209, 12},
6519  	{209, 12},
6520  	{209, 12},
6521  	{191, 10},
6522  	{152, 7},
6523  	{164, 7},
6524  	{145, 3},
6525  	{203, 10},
6526  	{90, 10},
6527  	{108, 10},
6528  	{69, 7},
6529  	{126, 10},
6530  	{75, 7},
6531  	{93, 7},
6532  	{64, 4},
6533  	{209, 12},
6534  	{158, 7},
6535  	{114, 10},
6536  	{71, 7},
6537  	{132, 10},
6538  	{77, 7},
6539  	{95, 7},
6540  	{65, 5},
6541  	{194, 7},
6542  	{83, 7},
6543  	{101, 7},
6544  	{67, 5},
6545  	{119, 7},
6546  	{73, 5},
6547  	{91, 5},
6548  	{1, 7},
6549  	{209, 12},
6550  	{209, 12},
6551  	{173, 7},
6552  	{148, 6},
6553  	{138, 10},
6554  	{79, 7},
6555  	{97, 7},
6556  	{66, 6},
6557  	{197, 7},
6558  	{85, 7},
6559  	{103, 7},
6560  	{68, 6},
6561  	{121, 7},
6562  	{74, 6},
6563  	{92, 6},
6564  	{2, 7},
6565  	{209, 12},
6566  	{157, 6},
6567  	{109, 7},
6568  	{70, 6},
6569  	{127, 7},
6570  	{76, 6},
6571  	{94, 6},
6572  	{4, 7},
6573  	{193, 6},
6574  	{82, 6},
6575  	{100, 6},
6576  	{8, 7},
6577  	{118, 6},
6578  	{16, 7},
6579  	{32, 7},
6580  	{0, 6},
6581  	{209, 12},
6582  	{209, 12},
6583  	{209, 12},
6584  	{209, 12},
6585  	{209, 12},
6586  	{209, 12},
6587  	{209, 12},
6588  	{145, 3},
6589  	{206, 10},
6590  	{156, 8},
6591  	{168, 8},
6592  	{146, 4},
6593  	{180, 8},
6594  	{149, 4},
6595  	{161, 4},
6596  	{64, 4},
6597  	{209, 12},
6598  	{159, 8},
6599  	{116, 10},
6600  	{72, 8},
6601  	{134, 10},
6602  	{78, 8},
6603  	{96, 8},
6604  	{65, 5},
6605  	{195, 8},
6606  	{84, 8},
6607  	{102, 8},
6608  	{67, 5},
6609  	{120, 8},
6610  	{73, 5},
6611  	{91, 5},
6612  	{64, 4},
6613  	{209, 12},
6614  	{209, 12},
6615  	{174, 8},
6616  	{148, 6},
6617  	{140, 10},
6618  	{80, 8},
6619  	{98, 8},
6620  	{66, 6},
6621  	{198, 8},
6622  	{86, 8},
6623  	{104, 8},
6624  	{15, 10},
6625  	{122, 8},
6626  	{23, 10},
6627  	{39, 10},
6628  	{3, 8},
6629  	{209, 12},
6630  	{157, 6},
6631  	{110, 8},
6632  	{70, 6},
6633  	{128, 8},
6634  	{27, 10},
6635  	{43, 10},
6636  	{5, 8},
6637  	{193, 6},
6638  	{82, 6},
6639  	{51, 10},
6640  	{9, 8},
6641  	{118, 6},
6642  	{17, 8},
6643  	{33, 8},
6644  	{0, 6},
6645  	{209, 12},
6646  	{209, 12},
6647  	{209, 12},
6648  	{209, 12},
6649  	{189, 8},
6650  	{152, 7},
6651  	{164, 7},
6652  	{145, 3},
6653  	{201, 8},
6654  	{88, 8},
6655  	{106, 8},
6656  	{69, 7},
6657  	{124, 8},
6658  	{75, 7},
6659  	{93, 7},
6660  	{64, 4},
6661  	{209, 12},
6662  	{158, 7},
6663  	{112, 8},
6664  	{71, 7},
6665  	{130, 8},
6666  	{29, 10},
6667  	{45, 10},
6668  	{6, 8},
6669  	{194, 7},
6670  	{83, 7},
6671  	{53, 10},
6672  	{10, 8},
6673  	{119, 7},
6674  	{18, 8},
6675  	{34, 8},
6676  	{1, 7},
6677  	{209, 12},
6678  	{209, 12},
6679  	{173, 7},
6680  	{148, 6},
6681  	{136, 8},
6682  	{79, 7},
6683  	{97, 7},
6684  	{66, 6},
6685  	{197, 7},
6686  	{85, 7},
6687  	{57, 10},
6688  	{12, 8},
6689  	{121, 7},
6690  	{20, 8},
6691  	{36, 8},
6692  	{2, 7},
6693  	{209, 12},
6694  	{157, 6},
6695  	{109, 7},
6696  	{70, 6},
6697  	{127, 7},
6698  	{24, 8},
6699  	{40, 8},
6700  	{4, 7},
6701  	{193, 6},
6702  	{82, 6},
6703  	{48, 8},
6704  	{8, 7},
6705  	{118, 6},
6706  	{16, 7},
6707  	{32, 7},
6708  	{0, 6},
6709  	{209, 12},
6710  	{209, 12},
6711  	{209, 12},
6712  	{209, 12},
6713  	{209, 12},
6714  	{209, 12},
6715  	{209, 12},
6716  	{145, 3},
6717  	{209, 12},
6718  	{209, 12},
6719  	{209, 12},
6720  	{146, 4},
6721  	{209, 12},
6722  	{149, 4},
6723  	{161, 4},
6724  	{64, 4},
6725  	{209, 12},
6726  	{160, 9},
6727  	{172, 9},
6728  	{147, 5},
6729  	{184, 9},
6730  	{150, 5},
6731  	{162, 5},
6732  	{65, 5},
6733  	{196, 9},
6734  	{153, 5},
6735  	{165, 5},
6736  	{67, 5},
6737  	{177, 5},
6738  	{73, 5},
6739  	{91, 5},
6740  	{64, 4},
6741  	{209, 12},
6742  	{209, 12},
6743  	{175, 9},
6744  	{148, 6},
6745  	{142, 10},
6746  	{81, 9},
6747  	{99, 9},
6748  	{66, 6},
6749  	{199, 9},
6750  	{87, 9},
6751  	{105, 9},
6752  	{68, 6},
6753  	{123, 9},
6754  	{74, 6},
6755  	{92, 6},
6756  	{64, 4},
6757  	{209, 12},
6758  	{157, 6},
6759  	{111, 9},
6760  	{70, 6},
6761  	{129, 9},
6762  	{76, 6},
6763  	{94, 6},
6764  	{65, 5},
6765  	{193, 6},
6766  	{82, 6},
6767  	{100, 6},
6768  	{67, 5},
6769  	{118, 6},
6770  	{73, 5},
6771  	{91, 5},
6772  	{0, 6},
6773  	{209, 12},
6774  	{209, 12},
6775  	{209, 12},
6776  	{209, 12},
6777  	{190, 9},
6778  	{152, 7},
6779  	{164, 7},
6780  	{145, 3},
6781  	{202, 9},
6782  	{89, 9},
6783  	{107, 9},
6784  	{69, 7},
6785  	{125, 9},
6786  	{75, 7},
6787  	{93, 7},
6788  	{64, 4},
6789  	{209, 12},
6790  	{158, 7},
6791  	{113, 9},
6792  	{71, 7},
6793  	{131, 9},
6794  	{30, 10},
6795  	{46, 10},
6796  	{7, 9},
6797  	{194, 7},
6798  	{83, 7},
6799  	{54, 10},
6800  	{11, 9},
6801  	{119, 7},
6802  	{19, 9},
6803  	{35, 9},
6804  	{1, 7},
6805  	{209, 12},
6806  	{209, 12},
6807  	{173, 7},
6808  	{148, 6},
6809  	{137, 9},
6810  	{79, 7},
6811  	{97, 7},
6812  	{66, 6},
6813  	{197, 7},
6814  	{85, 7},
6815  	{58, 10},
6816  	{13, 9},
6817  	{121, 7},
6818  	{21, 9},
6819  	{37, 9},
6820  	{2, 7},
6821  	{209, 12},
6822  	{157, 6},
6823  	{109, 7},
6824  	{70, 6},
6825  	{127, 7},
6826  	{25, 9},
6827  	{41, 9},
6828  	{4, 7},
6829  	{193, 6},
6830  	{82, 6},
6831  	{49, 9},
6832  	{8, 7},
6833  	{118, 6},
6834  	{16, 7},
6835  	{32, 7},
6836  	{0, 6},
6837  	{209, 12},
6838  	{209, 12},
6839  	{209, 12},
6840  	{209, 12},
6841  	{209, 12},
6842  	{209, 12},
6843  	{209, 12},
6844  	{145, 3},
6845  	{205, 9},
6846  	{156, 8},
6847  	{168, 8},
6848  	{146, 4},
6849  	{180, 8},
6850  	{149, 4},
6851  	{161, 4},
6852  	{64, 4},
6853  	{209, 12},
6854  	{159, 8},
6855  	{115, 9},
6856  	{72, 8},
6857  	{133, 9},
6858  	{78, 8},
6859  	{96, 8},
6860  	{65, 5},
6861  	{195, 8},
6862  	{84, 8},
6863  	{102, 8},
6864  	{67, 5},
6865  	{120, 8},
6866  	{73, 5},
6867  	{91, 5},
6868  	{64, 4},
6869  	{209, 12},
6870  	{209, 12},
6871  	{174, 8},
6872  	{148, 6},
6873  	{139, 9},
6874  	{80, 8},
6875  	{98, 8},
6876  	{66, 6},
6877  	{198, 8},
6878  	{86, 8},
6879  	{60, 10},
6880  	{14, 9},
6881  	{122, 8},
6882  	{22, 9},
6883  	{38, 9},
6884  	{3, 8},
6885  	{209, 12},
6886  	{157, 6},
6887  	{110, 8},
6888  	{70, 6},
6889  	{128, 8},
6890  	{26, 9},
6891  	{42, 9},
6892  	{5, 8},
6893  	{193, 6},
6894  	{82, 6},
6895  	{50, 9},
6896  	{9, 8},
6897  	{118, 6},
6898  	{17, 8},
6899  	{33, 8},
6900  	{0, 6},
6901  	{209, 12},
6902  	{209, 12},
6903  	{209, 12},
6904  	{209, 12},
6905  	{189, 8},
6906  	{152, 7},
6907  	{164, 7},
6908  	{145, 3},
6909  	{201, 8},
6910  	{88, 8},
6911  	{106, 8},
6912  	{69, 7},
6913  	{124, 8},
6914  	{75, 7},
6915  	{93, 7},
6916  	{64, 4},
6917  	{209, 12},
6918  	{158, 7},
6919  	{112, 8},
6920  	{71, 7},
6921  	{130, 8},
6922  	{28, 9},
6923  	{44, 9},
6924  	{6, 8},
6925  	{194, 7},
6926  	{83, 7},
6927  	{52, 9},
6928  	{10, 8},
6929  	{119, 7},
6930  	{18, 8},
6931  	{34, 8},
6932  	{1, 7},
6933  	{209, 12},
6934  	{209, 12},
6935  	{173, 7},
6936  	{148, 6},
6937  	{136, 8},
6938  	{79, 7},
6939  	{97, 7},
6940  	{66, 6},
6941  	{197, 7},
6942  	{85, 7},
6943  	{56, 9},
6944  	{12, 8},
6945  	{121, 7},
6946  	{20, 8},
6947  	{36, 8},
6948  	{2, 7},
6949  	{209, 12},
6950  	{157, 6},
6951  	{109, 7},
6952  	{70, 6},
6953  	{127, 7},
6954  	{24, 8},
6955  	{40, 8},
6956  	{4, 7},
6957  	{193, 6},
6958  	{82, 6},
6959  	{48, 8},
6960  	{8, 7},
6961  	{118, 6},
6962  	{16, 7},
6963  	{32, 7},
6964  	{0, 6},
6965  	{209, 12},
6966  	{209, 12},
6967  	{209, 12},
6968  	{209, 12},
6969  	{209, 12},
6970  	{209, 12},
6971  	{209, 12},
6972  	{145, 3},
6973  	{209, 12},
6974  	{209, 12},
6975  	{209, 12},
6976  	{146, 4},
6977  	{209, 12},
6978  	{149, 4},
6979  	{161, 4},
6980  	{64, 4},
6981  	{209, 12},
6982  	{209, 12},
6983  	{209, 12},
6984  	{147, 5},
6985  	{209, 12},
6986  	{150, 5},
6987  	{162, 5},
6988  	{65, 5},
6989  	{209, 12},
6990  	{153, 5},
6991  	{165, 5},
6992  	{67, 5},
6993  	{177, 5},
6994  	{73, 5},
6995  	{91, 5},
6996  	{64, 4},
6997  	{209, 12},
6998  	{209, 12},
6999  	{209, 12},
7000  	{148, 6},
7001  	{209, 12},
7002  	{151, 6},
7003  	{163, 6},
7004  	{66, 6},
7005  	{209, 12},
7006  	{154, 6},
7007  	{166, 6},
7008  	{68, 6},
7009  	{178, 6},
7010  	{74, 6},
7011  	{92, 6},
7012  	{64, 4},
7013  	{209, 12},
7014  	{157, 6},
7015  	{169, 6},
7016  	{70, 6},
7017  	{181, 6},
7018  	{76, 6},
7019  	{94, 6},
7020  	{65, 5},
7021  	{193, 6},
7022  	{82, 6},
7023  	{100, 6},
7024  	{67, 5},
7025  	{118, 6},
7026  	{73, 5},
7027  	{91, 5},
7028  	{0, 6},
7029  	{209, 12},
7030  	{209, 12},
7031  	{209, 12},
7032  	{209, 12},
7033  	{192, 11},
7034  	{152, 7},
7035  	{164, 7},
7036  	{145, 3},
7037  	{204, 11},
7038  	{155, 7},
7039  	{167, 7},
7040  	{69, 7},
7041  	{179, 7},
7042  	{75, 7},
7043  	{93, 7},
7044  	{64, 4},
7045  	{209, 12},
7046  	{158, 7},
7047  	{170, 7},
7048  	{71, 7},
7049  	{182, 7},
7050  	{77, 7},
7051  	{95, 7},
7052  	{65, 5},
7053  	{194, 7},
7054  	{83, 7},
7055  	{101, 7},
7056  	{67, 5},
7057  	{119, 7},
7058  	{73, 5},
7059  	{91, 5},
7060  	{1, 7},
7061  	{209, 12},
7062  	{209, 12},
7063  	{173, 7},
7064  	{148, 6},
7065  	{185, 7},
7066  	{79, 7},
7067  	{97, 7},
7068  	{66, 6},
7069  	{197, 7},
7070  	{85, 7},
7071  	{103, 7},
7072  	{68, 6},
7073  	{121, 7},
7074  	{74, 6},
7075  	{92, 6},
7076  	{2, 7},
7077  	{209, 12},
7078  	{157, 6},
7079  	{109, 7},
7080  	{70, 6},
7081  	{127, 7},
7082  	{76, 6},
7083  	{94, 6},
7084  	{4, 7},
7085  	{193, 6},
7086  	{82, 6},
7087  	{100, 6},
7088  	{8, 7},
7089  	{118, 6},
7090  	{16, 7},
7091  	{32, 7},
7092  	{0, 6},
7093  	{209, 12},
7094  	{209, 12},
7095  	{209, 12},
7096  	{209, 12},
7097  	{209, 12},
7098  	{209, 12},
7099  	{209, 12},
7100  	{145, 3},
7101  	{207, 11},
7102  	{156, 8},
7103  	{168, 8},
7104  	{146, 4},
7105  	{180, 8},
7106  	{149, 4},
7107  	{161, 4},
7108  	{64, 4},
7109  	{209, 12},
7110  	{159, 8},
7111  	{117, 11},
7112  	{72, 8},
7113  	{135, 11},
7114  	{78, 8},
7115  	{96, 8},
7116  	{65, 5},
7117  	{195, 8},
7118  	{84, 8},
7119  	{102, 8},
7120  	{67, 5},
7121  	{120, 8},
7122  	{73, 5},
7123  	{91, 5},
7124  	{64, 4},
7125  	{209, 12},
7126  	{209, 12},
7127  	{174, 8},
7128  	{148, 6},
7129  	{141, 11},
7130  	{80, 8},
7131  	{98, 8},
7132  	{66, 6},
7133  	{198, 8},
7134  	{86, 8},
7135  	{104, 8},
7136  	{68, 6},
7137  	{122, 8},
7138  	{74, 6},
7139  	{92, 6},
7140  	{3, 8},
7141  	{209, 12},
7142  	{157, 6},
7143  	{110, 8},
7144  	{70, 6},
7145  	{128, 8},
7146  	{76, 6},
7147  	{94, 6},
7148  	{5, 8},
7149  	{193, 6},
7150  	{82, 6},
7151  	{100, 6},
7152  	{9, 8},
7153  	{118, 6},
7154  	{17, 8},
7155  	{33, 8},
7156  	{0, 6},
7157  	{209, 12},
7158  	{209, 12},
7159  	{209, 12},
7160  	{209, 12},
7161  	{189, 8},
7162  	{152, 7},
7163  	{164, 7},
7164  	{145, 3},
7165  	{201, 8},
7166  	{88, 8},
7167  	{106, 8},
7168  	{69, 7},
7169  	{124, 8},
7170  	{75, 7},
7171  	{93, 7},
7172  	{64, 4},
7173  	{209, 12},
7174  	{158, 7},
7175  	{112, 8},
7176  	{71, 7},
7177  	{130, 8},
7178  	{77, 7},
7179  	{95, 7},
7180  	{6, 8},
7181  	{194, 7},
7182  	{83, 7},
7183  	{101, 7},
7184  	{10, 8},
7185  	{119, 7},
7186  	{18, 8},
7187  	{34, 8},
7188  	{1, 7},
7189  	{209, 12},
7190  	{209, 12},
7191  	{173, 7},
7192  	{148, 6},
7193  	{136, 8},
7194  	{79, 7},
7195  	{97, 7},
7196  	{66, 6},
7197  	{197, 7},
7198  	{85, 7},
7199  	{103, 7},
7200  	{12, 8},
7201  	{121, 7},
7202  	{20, 8},
7203  	{36, 8},
7204  	{2, 7},
7205  	{209, 12},
7206  	{157, 6},
7207  	{109, 7},
7208  	{70, 6},
7209  	{127, 7},
7210  	{24, 8},
7211  	{40, 8},
7212  	{4, 7},
7213  	{193, 6},
7214  	{82, 6},
7215  	{48, 8},
7216  	{8, 7},
7217  	{118, 6},
7218  	{16, 7},
7219  	{32, 7},
7220  	{0, 6},
7221  	{209, 12},
7222  	{209, 12},
7223  	{209, 12},
7224  	{209, 12},
7225  	{209, 12},
7226  	{209, 12},
7227  	{209, 12},
7228  	{145, 3},
7229  	{209, 12},
7230  	{209, 12},
7231  	{209, 12},
7232  	{146, 4},
7233  	{209, 12},
7234  	{149, 4},
7235  	{161, 4},
7236  	{64, 4},
7237  	{209, 12},
7238  	{160, 9},
7239  	{172, 9},
7240  	{147, 5},
7241  	{184, 9},
7242  	{150, 5},
7243  	{162, 5},
7244  	{65, 5},
7245  	{196, 9},
7246  	{153, 5},
7247  	{165, 5},
7248  	{67, 5},
7249  	{177, 5},
7250  	{73, 5},
7251  	{91, 5},
7252  	{64, 4},
7253  	{209, 12},
7254  	{209, 12},
7255  	{175, 9},
7256  	{148, 6},
7257  	{143, 11},
7258  	{81, 9},
7259  	{99, 9},
7260  	{66, 6},
7261  	{199, 9},
7262  	{87, 9},
7263  	{105, 9},
7264  	{68, 6},
7265  	{123, 9},
7266  	{74, 6},
7267  	{92, 6},
7268  	{64, 4},
7269  	{209, 12},
7270  	{157, 6},
7271  	{111, 9},
7272  	{70, 6},
7273  	{129, 9},
7274  	{76, 6},
7275  	{94, 6},
7276  	{65, 5},
7277  	{193, 6},
7278  	{82, 6},
7279  	{100, 6},
7280  	{67, 5},
7281  	{118, 6},
7282  	{73, 5},
7283  	{91, 5},
7284  	{0, 6},
7285  	{209, 12},
7286  	{209, 12},
7287  	{209, 12},
7288  	{209, 12},
7289  	{190, 9},
7290  	{152, 7},
7291  	{164, 7},
7292  	{145, 3},
7293  	{202, 9},
7294  	{89, 9},
7295  	{107, 9},
7296  	{69, 7},
7297  	{125, 9},
7298  	{75, 7},
7299  	{93, 7},
7300  	{64, 4},
7301  	{209, 12},
7302  	{158, 7},
7303  	{113, 9},
7304  	{71, 7},
7305  	{131, 9},
7306  	{31, 11},
7307  	{47, 11},
7308  	{7, 9},
7309  	{194, 7},
7310  	{83, 7},
7311  	{55, 11},
7312  	{11, 9},
7313  	{119, 7},
7314  	{19, 9},
7315  	{35, 9},
7316  	{1, 7},
7317  	{209, 12},
7318  	{209, 12},
7319  	{173, 7},
7320  	{148, 6},
7321  	{137, 9},
7322  	{79, 7},
7323  	{97, 7},
7324  	{66, 6},
7325  	{197, 7},
7326  	{85, 7},
7327  	{59, 11},
7328  	{13, 9},
7329  	{121, 7},
7330  	{21, 9},
7331  	{37, 9},
7332  	{2, 7},
7333  	{209, 12},
7334  	{157, 6},
7335  	{109, 7},
7336  	{70, 6},
7337  	{127, 7},
7338  	{25, 9},
7339  	{41, 9},
7340  	{4, 7},
7341  	{193, 6},
7342  	{82, 6},
7343  	{49, 9},
7344  	{8, 7},
7345  	{118, 6},
7346  	{16, 7},
7347  	{32, 7},
7348  	{0, 6},
7349  	{209, 12},
7350  	{209, 12},
7351  	{209, 12},
7352  	{209, 12},
7353  	{209, 12},
7354  	{209, 12},
7355  	{209, 12},
7356  	{145, 3},
7357  	{205, 9},
7358  	{156, 8},
7359  	{168, 8},
7360  	{146, 4},
7361  	{180, 8},
7362  	{149, 4},
7363  	{161, 4},
7364  	{64, 4},
7365  	{209, 12},
7366  	{159, 8},
7367  	{115, 9},
7368  	{72, 8},
7369  	{133, 9},
7370  	{78, 8},
7371  	{96, 8},
7372  	{65, 5},
7373  	{195, 8},
7374  	{84, 8},
7375  	{102, 8},
7376  	{67, 5},
7377  	{120, 8},
7378  	{73, 5},
7379  	{91, 5},
7380  	{64, 4},
7381  	{209, 12},
7382  	{209, 12},
7383  	{174, 8},
7384  	{148, 6},
7385  	{139, 9},
7386  	{80, 8},
7387  	{98, 8},
7388  	{66, 6},
7389  	{198, 8},
7390  	{86, 8},
7391  	{61, 11},
7392  	{14, 9},
7393  	{122, 8},
7394  	{22, 9},
7395  	{38, 9},
7396  	{3, 8},
7397  	{209, 12},
7398  	{157, 6},
7399  	{110, 8},
7400  	{70, 6},
7401  	{128, 8},
7402  	{26, 9},
7403  	{42, 9},
7404  	{5, 8},
7405  	{193, 6},
7406  	{82, 6},
7407  	{50, 9},
7408  	{9, 8},
7409  	{118, 6},
7410  	{17, 8},
7411  	{33, 8},
7412  	{0, 6},
7413  	{209, 12},
7414  	{209, 12},
7415  	{209, 12},
7416  	{209, 12},
7417  	{189, 8},
7418  	{152, 7},
7419  	{164, 7},
7420  	{145, 3},
7421  	{201, 8},
7422  	{88, 8},
7423  	{106, 8},
7424  	{69, 7},
7425  	{124, 8},
7426  	{75, 7},
7427  	{93, 7},
7428  	{64, 4},
7429  	{209, 12},
7430  	{158, 7},
7431  	{112, 8},
7432  	{71, 7},
7433  	{130, 8},
7434  	{28, 9},
7435  	{44, 9},
7436  	{6, 8},
7437  	{194, 7},
7438  	{83, 7},
7439  	{52, 9},
7440  	{10, 8},
7441  	{119, 7},
7442  	{18, 8},
7443  	{34, 8},
7444  	{1, 7},
7445  	{209, 12},
7446  	{209, 12},
7447  	{173, 7},
7448  	{148, 6},
7449  	{136, 8},
7450  	{79, 7},
7451  	{97, 7},
7452  	{66, 6},
7453  	{197, 7},
7454  	{85, 7},
7455  	{56, 9},
7456  	{12, 8},
7457  	{121, 7},
7458  	{20, 8},
7459  	{36, 8},
7460  	{2, 7},
7461  	{209, 12},
7462  	{157, 6},
7463  	{109, 7},
7464  	{70, 6},
7465  	{127, 7},
7466  	{24, 8},
7467  	{40, 8},
7468  	{4, 7},
7469  	{193, 6},
7470  	{82, 6},
7471  	{48, 8},
7472  	{8, 7},
7473  	{118, 6},
7474  	{16, 7},
7475  	{32, 7},
7476  	{0, 6},
7477  	{209, 12},
7478  	{209, 12},
7479  	{209, 12},
7480  	{209, 12},
7481  	{209, 12},
7482  	{209, 12},
7483  	{209, 12},
7484  	{145, 3},
7485  	{209, 12},
7486  	{209, 12},
7487  	{209, 12},
7488  	{146, 4},
7489  	{209, 12},
7490  	{149, 4},
7491  	{161, 4},
7492  	{64, 4},
7493  	{209, 12},
7494  	{209, 12},
7495  	{209, 12},
7496  	{147, 5},
7497  	{209, 12},
7498  	{150, 5},
7499  	{162, 5},
7500  	{65, 5},
7501  	{209, 12},
7502  	{153, 5},
7503  	{165, 5},
7504  	{67, 5},
7505  	{177, 5},
7506  	{73, 5},
7507  	{91, 5},
7508  	{64, 4},
7509  	{209, 12},
7510  	{209, 12},
7511  	{176, 10},
7512  	{148, 6},
7513  	{188, 10},
7514  	{151, 6},
7515  	{163, 6},
7516  	{66, 6},
7517  	{200, 10},
7518  	{154, 6},
7519  	{166, 6},
7520  	{68, 6},
7521  	{178, 6},
7522  	{74, 6},
7523  	{92, 6},
7524  	{64, 4},
7525  	{209, 12},
7526  	{157, 6},
7527  	{169, 6},
7528  	{70, 6},
7529  	{181, 6},
7530  	{76, 6},
7531  	{94, 6},
7532  	{65, 5},
7533  	{193, 6},
7534  	{82, 6},
7535  	{100, 6},
7536  	{67, 5},
7537  	{118, 6},
7538  	{73, 5},
7539  	{91, 5},
7540  	{0, 6},
7541  	{209, 12},
7542  	{209, 12},
7543  	{209, 12},
7544  	{209, 12},
7545  	{191, 10},
7546  	{152, 7},
7547  	{164, 7},
7548  	{145, 3},
7549  	{203, 10},
7550  	{90, 10},
7551  	{108, 10},
7552  	{69, 7},
7553  	{126, 10},
7554  	{75, 7},
7555  	{93, 7},
7556  	{64, 4},
7557  	{209, 12},
7558  	{158, 7},
7559  	{114, 10},
7560  	{71, 7},
7561  	{132, 10},
7562  	{77, 7},
7563  	{95, 7},
7564  	{65, 5},
7565  	{194, 7},
7566  	{83, 7},
7567  	{101, 7},
7568  	{67, 5},
7569  	{119, 7},
7570  	{73, 5},
7571  	{91, 5},
7572  	{1, 7},
7573  	{209, 12},
7574  	{209, 12},
7575  	{173, 7},
7576  	{148, 6},
7577  	{138, 10},
7578  	{79, 7},
7579  	{97, 7},
7580  	{66, 6},
7581  	{197, 7},
7582  	{85, 7},
7583  	{103, 7},
7584  	{68, 6},
7585  	{121, 7},
7586  	{74, 6},
7587  	{92, 6},
7588  	{2, 7},
7589  	{209, 12},
7590  	{157, 6},
7591  	{109, 7},
7592  	{70, 6},
7593  	{127, 7},
7594  	{76, 6},
7595  	{94, 6},
7596  	{4, 7},
7597  	{193, 6},
7598  	{82, 6},
7599  	{100, 6},
7600  	{8, 7},
7601  	{118, 6},
7602  	{16, 7},
7603  	{32, 7},
7604  	{0, 6},
7605  	{209, 12},
7606  	{209, 12},
7607  	{209, 12},
7608  	{209, 12},
7609  	{209, 12},
7610  	{209, 12},
7611  	{209, 12},
7612  	{145, 3},
7613  	{206, 10},
7614  	{156, 8},
7615  	{168, 8},
7616  	{146, 4},
7617  	{180, 8},
7618  	{149, 4},
7619  	{161, 4},
7620  	{64, 4},
7621  	{209, 12},
7622  	{159, 8},
7623  	{116, 10},
7624  	{72, 8},
7625  	{134, 10},
7626  	{78, 8},
7627  	{96, 8},
7628  	{65, 5},
7629  	{195, 8},
7630  	{84, 8},
7631  	{102, 8},
7632  	{67, 5},
7633  	{120, 8},
7634  	{73, 5},
7635  	{91, 5},
7636  	{64, 4},
7637  	{209, 12},
7638  	{209, 12},
7639  	{174, 8},
7640  	{148, 6},
7641  	{140, 10},
7642  	{80, 8},
7643  	{98, 8},
7644  	{66, 6},
7645  	{198, 8},
7646  	{86, 8},
7647  	{62, 11},
7648  	{15, 10},
7649  	{122, 8},
7650  	{23, 10},
7651  	{39, 10},
7652  	{3, 8},
7653  	{209, 12},
7654  	{157, 6},
7655  	{110, 8},
7656  	{70, 6},
7657  	{128, 8},
7658  	{27, 10},
7659  	{43, 10},
7660  	{5, 8},
7661  	{193, 6},
7662  	{82, 6},
7663  	{51, 10},
7664  	{9, 8},
7665  	{118, 6},
7666  	{17, 8},
7667  	{33, 8},
7668  	{0, 6},
7669  	{209, 12},
7670  	{209, 12},
7671  	{209, 12},
7672  	{209, 12},
7673  	{189, 8},
7674  	{152, 7},
7675  	{164, 7},
7676  	{145, 3},
7677  	{201, 8},
7678  	{88, 8},
7679  	{106, 8},
7680  	{69, 7},
7681  	{124, 8},
7682  	{75, 7},
7683  	{93, 7},
7684  	{64, 4},
7685  	{209, 12},
7686  	{158, 7},
7687  	{112, 8},
7688  	{71, 7},
7689  	{130, 8},
7690  	{29, 10},
7691  	{45, 10},
7692  	{6, 8},
7693  	{194, 7},
7694  	{83, 7},
7695  	{53, 10},
7696  	{10, 8},
7697  	{119, 7},
7698  	{18, 8},
7699  	{34, 8},
7700  	{1, 7},
7701  	{209, 12},
7702  	{209, 12},
7703  	{173, 7},
7704  	{148, 6},
7705  	{136, 8},
7706  	{79, 7},
7707  	{97, 7},
7708  	{66, 6},
7709  	{197, 7},
7710  	{85, 7},
7711  	{57, 10},
7712  	{12, 8},
7713  	{121, 7},
7714  	{20, 8},
7715  	{36, 8},
7716  	{2, 7},
7717  	{209, 12},
7718  	{157, 6},
7719  	{109, 7},
7720  	{70, 6},
7721  	{127, 7},
7722  	{24, 8},
7723  	{40, 8},
7724  	{4, 7},
7725  	{193, 6},
7726  	{82, 6},
7727  	{48, 8},
7728  	{8, 7},
7729  	{118, 6},
7730  	{16, 7},
7731  	{32, 7},
7732  	{0, 6},
7733  	{209, 12},
7734  	{209, 12},
7735  	{209, 12},
7736  	{209, 12},
7737  	{209, 12},
7738  	{209, 12},
7739  	{209, 12},
7740  	{145, 3},
7741  	{209, 12},
7742  	{209, 12},
7743  	{209, 12},
7744  	{146, 4},
7745  	{209, 12},
7746  	{149, 4},
7747  	{161, 4},
7748  	{64, 4},
7749  	{209, 12},
7750  	{160, 9},
7751  	{172, 9},
7752  	{147, 5},
7753  	{184, 9},
7754  	{150, 5},
7755  	{162, 5},
7756  	{65, 5},
7757  	{196, 9},
7758  	{153, 5},
7759  	{165, 5},
7760  	{67, 5},
7761  	{177, 5},
7762  	{73, 5},
7763  	{91, 5},
7764  	{64, 4},
7765  	{209, 12},
7766  	{209, 12},
7767  	{175, 9},
7768  	{148, 6},
7769  	{142, 10},
7770  	{81, 9},
7771  	{99, 9},
7772  	{66, 6},
7773  	{199, 9},
7774  	{87, 9},
7775  	{105, 9},
7776  	{68, 6},
7777  	{123, 9},
7778  	{74, 6},
7779  	{92, 6},
7780  	{64, 4},
7781  	{209, 12},
7782  	{157, 6},
7783  	{111, 9},
7784  	{70, 6},
7785  	{129, 9},
7786  	{76, 6},
7787  	{94, 6},
7788  	{65, 5},
7789  	{193, 6},
7790  	{82, 6},
7791  	{100, 6},
7792  	{67, 5},
7793  	{118, 6},
7794  	{73, 5},
7795  	{91, 5},
7796  	{0, 6},
7797  	{209, 12},
7798  	{209, 12},
7799  	{209, 12},
7800  	{209, 12},
7801  	{190, 9},
7802  	{152, 7},
7803  	{164, 7},
7804  	{145, 3},
7805  	{202, 9},
7806  	{89, 9},
7807  	{107, 9},
7808  	{69, 7},
7809  	{125, 9},
7810  	{75, 7},
7811  	{93, 7},
7812  	{64, 4},
7813  	{209, 12},
7814  	{158, 7},
7815  	{113, 9},
7816  	{71, 7},
7817  	{131, 9},
7818  	{30, 10},
7819  	{46, 10},
7820  	{7, 9},
7821  	{194, 7},
7822  	{83, 7},
7823  	{54, 10},
7824  	{11, 9},
7825  	{119, 7},
7826  	{19, 9},
7827  	{35, 9},
7828  	{1, 7},
7829  	{209, 12},
7830  	{209, 12},
7831  	{173, 7},
7832  	{148, 6},
7833  	{137, 9},
7834  	{79, 7},
7835  	{97, 7},
7836  	{66, 6},
7837  	{197, 7},
7838  	{85, 7},
7839  	{58, 10},
7840  	{13, 9},
7841  	{121, 7},
7842  	{21, 9},
7843  	{37, 9},
7844  	{2, 7},
7845  	{209, 12},
7846  	{157, 6},
7847  	{109, 7},
7848  	{70, 6},
7849  	{127, 7},
7850  	{25, 9},
7851  	{41, 9},
7852  	{4, 7},
7853  	{193, 6},
7854  	{82, 6},
7855  	{49, 9},
7856  	{8, 7},
7857  	{118, 6},
7858  	{16, 7},
7859  	{32, 7},
7860  	{0, 6},
7861  	{209, 12},
7862  	{209, 12},
7863  	{209, 12},
7864  	{209, 12},
7865  	{209, 12},
7866  	{209, 12},
7867  	{209, 12},
7868  	{145, 3},
7869  	{205, 9},
7870  	{156, 8},
7871  	{168, 8},
7872  	{146, 4},
7873  	{180, 8},
7874  	{149, 4},
7875  	{161, 4},
7876  	{64, 4},
7877  	{209, 12},
7878  	{159, 8},
7879  	{115, 9},
7880  	{72, 8},
7881  	{133, 9},
7882  	{78, 8},
7883  	{96, 8},
7884  	{65, 5},
7885  	{195, 8},
7886  	{84, 8},
7887  	{102, 8},
7888  	{67, 5},
7889  	{120, 8},
7890  	{73, 5},
7891  	{91, 5},
7892  	{64, 4},
7893  	{209, 12},
7894  	{209, 12},
7895  	{174, 8},
7896  	{148, 6},
7897  	{139, 9},
7898  	{80, 8},
7899  	{98, 8},
7900  	{66, 6},
7901  	{198, 8},
7902  	{86, 8},
7903  	{60, 10},
7904  	{14, 9},
7905  	{122, 8},
7906  	{22, 9},
7907  	{38, 9},
7908  	{3, 8},
7909  	{209, 12},
7910  	{157, 6},
7911  	{110, 8},
7912  	{70, 6},
7913  	{128, 8},
7914  	{26, 9},
7915  	{42, 9},
7916  	{5, 8},
7917  	{193, 6},
7918  	{82, 6},
7919  	{50, 9},
7920  	{9, 8},
7921  	{118, 6},
7922  	{17, 8},
7923  	{33, 8},
7924  	{0, 6},
7925  	{209, 12},
7926  	{209, 12},
7927  	{209, 12},
7928  	{209, 12},
7929  	{189, 8},
7930  	{152, 7},
7931  	{164, 7},
7932  	{145, 3},
7933  	{201, 8},
7934  	{88, 8},
7935  	{106, 8},
7936  	{69, 7},
7937  	{124, 8},
7938  	{75, 7},
7939  	{93, 7},
7940  	{64, 4},
7941  	{209, 12},
7942  	{158, 7},
7943  	{112, 8},
7944  	{71, 7},
7945  	{130, 8},
7946  	{28, 9},
7947  	{44, 9},
7948  	{6, 8},
7949  	{194, 7},
7950  	{83, 7},
7951  	{52, 9},
7952  	{10, 8},
7953  	{119, 7},
7954  	{18, 8},
7955  	{34, 8},
7956  	{1, 7},
7957  	{209, 12},
7958  	{209, 12},
7959  	{173, 7},
7960  	{148, 6},
7961  	{136, 8},
7962  	{79, 7},
7963  	{97, 7},
7964  	{66, 6},
7965  	{197, 7},
7966  	{85, 7},
7967  	{56, 9},
7968  	{12, 8},
7969  	{121, 7},
7970  	{20, 8},
7971  	{36, 8},
7972  	{2, 7},
7973  	{209, 12},
7974  	{157, 6},
7975  	{109, 7},
7976  	{70, 6},
7977  	{127, 7},
7978  	{24, 8},
7979  	{40, 8},
7980  	{4, 7},
7981  	{193, 6},
7982  	{82, 6},
7983  	{48, 8},
7984  	{8, 7},
7985  	{118, 6},
7986  	{16, 7},
7987  	{32, 7},
7988  	{0, 6},
7989  	{209, 12},
7990  	{209, 12},
7991  	{209, 12},
7992  	{209, 12},
7993  	{209, 12},
7994  	{209, 12},
7995  	{209, 12},
7996  	{145, 3},
7997  	{209, 12},
7998  	{209, 12},
7999  	{209, 12},
8000  	{146, 4},
8001  	{209, 12},
8002  	{149, 4},
8003  	{161, 4},
8004  	{64, 4},
8005  	{209, 12},
8006  	{209, 12},
8007  	{209, 12},
8008  	{147, 5},
8009  	{209, 12},
8010  	{150, 5},
8011  	{162, 5},
8012  	{65, 5},
8013  	{209, 12},
8014  	{153, 5},
8015  	{165, 5},
8016  	{67, 5},
8017  	{177, 5},
8018  	{73, 5},
8019  	{91, 5},
8020  	{64, 4},
8021  	{209, 12},
8022  	{209, 12},
8023  	{209, 12},
8024  	{148, 6},
8025  	{209, 12},
8026  	{151, 6},
8027  	{163, 6},
8028  	{66, 6},
8029  	{209, 12},
8030  	{154, 6},
8031  	{166, 6},
8032  	{68, 6},
8033  	{178, 6},
8034  	{74, 6},
8035  	{92, 6},
8036  	{64, 4},
8037  	{209, 12},
8038  	{157, 6},
8039  	{169, 6},
8040  	{70, 6},
8041  	{181, 6},
8042  	{76, 6},
8043  	{94, 6},
8044  	{65, 5},
8045  	{193, 6},
8046  	{82, 6},
8047  	{100, 6},
8048  	{67, 5},
8049  	{118, 6},
8050  	{73, 5},
8051  	{91, 5},
8052  	{0, 6},
8053  	{209, 12},
8054  	{209, 12},
8055  	{209, 12},
8056  	{209, 12},
8057  	{209, 12},
8058  	{152, 7},
8059  	{164, 7},
8060  	{145, 3},
8061  	{209, 12},
8062  	{155, 7},
8063  	{167, 7},
8064  	{69, 7},
8065  	{179, 7},
8066  	{75, 7},
8067  	{93, 7},
8068  	{64, 4},
8069  	{209, 12},
8070  	{158, 7},
8071  	{170, 7},
8072  	{71, 7},
8073  	{182, 7},
8074  	{77, 7},
8075  	{95, 7},
8076  	{65, 5},
8077  	{194, 7},
8078  	{83, 7},
8079  	{101, 7},
8080  	{67, 5},
8081  	{119, 7},
8082  	{73, 5},
8083  	{91, 5},
8084  	{1, 7},
8085  	{209, 12},
8086  	{209, 12},
8087  	{173, 7},
8088  	{148, 6},
8089  	{185, 7},
8090  	{79, 7},
8091  	{97, 7},
8092  	{66, 6},
8093  	{197, 7},
8094  	{85, 7},
8095  	{103, 7},
8096  	{68, 6},
8097  	{121, 7},
8098  	{74, 6},
8099  	{92, 6},
8100  	{2, 7},
8101  	{209, 12},
8102  	{157, 6},
8103  	{109, 7},
8104  	{70, 6},
8105  	{127, 7},
8106  	{76, 6},
8107  	{94, 6},
8108  	{4, 7},
8109  	{193, 6},
8110  	{82, 6},
8111  	{100, 6},
8112  	{8, 7},
8113  	{118, 6},
8114  	{16, 7},
8115  	{32, 7},
8116  	{0, 6},
8117  	{209, 12},
8118  	{209, 12},
8119  	{209, 12},
8120  	{209, 12},
8121  	{209, 12},
8122  	{209, 12},
8123  	{209, 12},
8124  	{145, 3},
8125  	{208, 12},
8126  	{156, 8},
8127  	{168, 8},
8128  	{146, 4},
8129  	{180, 8},
8130  	{149, 4},
8131  	{161, 4},
8132  	{64, 4},
8133  	{209, 12},
8134  	{159, 8},
8135  	{171, 8},
8136  	{72, 8},
8137  	{183, 8},
8138  	{78, 8},
8139  	{96, 8},
8140  	{65, 5},
8141  	{195, 8},
8142  	{84, 8},
8143  	{102, 8},
8144  	{67, 5},
8145  	{120, 8},
8146  	{73, 5},
8147  	{91, 5},
8148  	{64, 4},
8149  	{209, 12},
8150  	{209, 12},
8151  	{174, 8},
8152  	{148, 6},
8153  	{186, 8},
8154  	{80, 8},
8155  	{98, 8},
8156  	{66, 6},
8157  	{198, 8},
8158  	{86, 8},
8159  	{104, 8},
8160  	{68, 6},
8161  	{122, 8},
8162  	{74, 6},
8163  	{92, 6},
8164  	{3, 8},
8165  	{209, 12},
8166  	{157, 6},
8167  	{110, 8},
8168  	{70, 6},
8169  	{128, 8},
8170  	{76, 6},
8171  	{94, 6},
8172  	{5, 8},
8173  	{193, 6},
8174  	{82, 6},
8175  	{100, 6},
8176  	{9, 8},
8177  	{118, 6},
8178  	{17, 8},
8179  	{33, 8},
8180  	{0, 6},
8181  	{209, 12},
8182  	{209, 12},
8183  	{209, 12},
8184  	{209, 12},
8185  	{189, 8},
8186  	{152, 7},
8187  	{164, 7},
8188  	{145, 3},
8189  	{201, 8},
8190  	{88, 8},
8191  	{106, 8},
8192  	{69, 7},
8193  	{124, 8},
8194  	{75, 7},
8195  	{93, 7},
8196  	{64, 4},
8197  	{209, 12},
8198  	{158, 7},
8199  	{112, 8},
8200  	{71, 7},
8201  	{130, 8},
8202  	{77, 7},
8203  	{95, 7},
8204  	{6, 8},
8205  	{194, 7},
8206  	{83, 7},
8207  	{101, 7},
8208  	{10, 8},
8209  	{119, 7},
8210  	{18, 8},
8211  	{34, 8},
8212  	{1, 7},
8213  	{209, 12},
8214  	{209, 12},
8215  	{173, 7},
8216  	{148, 6},
8217  	{136, 8},
8218  	{79, 7},
8219  	{97, 7},
8220  	{66, 6},
8221  	{197, 7},
8222  	{85, 7},
8223  	{103, 7},
8224  	{12, 8},
8225  	{121, 7},
8226  	{20, 8},
8227  	{36, 8},
8228  	{2, 7},
8229  	{209, 12},
8230  	{157, 6},
8231  	{109, 7},
8232  	{70, 6},
8233  	{127, 7},
8234  	{24, 8},
8235  	{40, 8},
8236  	{4, 7},
8237  	{193, 6},
8238  	{82, 6},
8239  	{48, 8},
8240  	{8, 7},
8241  	{118, 6},
8242  	{16, 7},
8243  	{32, 7},
8244  	{0, 6},
8245  	{209, 12},
8246  	{209, 12},
8247  	{209, 12},
8248  	{209, 12},
8249  	{209, 12},
8250  	{209, 12},
8251  	{209, 12},
8252  	{145, 3},
8253  	{209, 12},
8254  	{209, 12},
8255  	{209, 12},
8256  	{146, 4},
8257  	{209, 12},
8258  	{149, 4},
8259  	{161, 4},
8260  	{64, 4},
8261  	{209, 12},
8262  	{160, 9},
8263  	{172, 9},
8264  	{147, 5},
8265  	{184, 9},
8266  	{150, 5},
8267  	{162, 5},
8268  	{65, 5},
8269  	{196, 9},
8270  	{153, 5},
8271  	{165, 5},
8272  	{67, 5},
8273  	{177, 5},
8274  	{73, 5},
8275  	{91, 5},
8276  	{64, 4},
8277  	{209, 12},
8278  	{209, 12},
8279  	{175, 9},
8280  	{148, 6},
8281  	{144, 12},
8282  	{81, 9},
8283  	{99, 9},
8284  	{66, 6},
8285  	{199, 9},
8286  	{87, 9},
8287  	{105, 9},
8288  	{68, 6},
8289  	{123, 9},
8290  	{74, 6},
8291  	{92, 6},
8292  	{64, 4},
8293  	{209, 12},
8294  	{157, 6},
8295  	{111, 9},
8296  	{70, 6},
8297  	{129, 9},
8298  	{76, 6},
8299  	{94, 6},
8300  	{65, 5},
8301  	{193, 6},
8302  	{82, 6},
8303  	{100, 6},
8304  	{67, 5},
8305  	{118, 6},
8306  	{73, 5},
8307  	{91, 5},
8308  	{0, 6},
8309  	{209, 12},
8310  	{209, 12},
8311  	{209, 12},
8312  	{209, 12},
8313  	{190, 9},
8314  	{152, 7},
8315  	{164, 7},
8316  	{145, 3},
8317  	{202, 9},
8318  	{89, 9},
8319  	{107, 9},
8320  	{69, 7},
8321  	{125, 9},
8322  	{75, 7},
8323  	{93, 7},
8324  	{64, 4},
8325  	{209, 12},
8326  	{158, 7},
8327  	{113, 9},
8328  	{71, 7},
8329  	{131, 9},
8330  	{77, 7},
8331  	{95, 7},
8332  	{7, 9},
8333  	{194, 7},
8334  	{83, 7},
8335  	{101, 7},
8336  	{11, 9},
8337  	{119, 7},
8338  	{19, 9},
8339  	{35, 9},
8340  	{1, 7},
8341  	{209, 12},
8342  	{209, 12},
8343  	{173, 7},
8344  	{148, 6},
8345  	{137, 9},
8346  	{79, 7},
8347  	{97, 7},
8348  	{66, 6},
8349  	{197, 7},
8350  	{85, 7},
8351  	{103, 7},
8352  	{13, 9},
8353  	{121, 7},
8354  	{21, 9},
8355  	{37, 9},
8356  	{2, 7},
8357  	{209, 12},
8358  	{157, 6},
8359  	{109, 7},
8360  	{70, 6},
8361  	{127, 7},
8362  	{25, 9},
8363  	{41, 9},
8364  	{4, 7},
8365  	{193, 6},
8366  	{82, 6},
8367  	{49, 9},
8368  	{8, 7},
8369  	{118, 6},
8370  	{16, 7},
8371  	{32, 7},
8372  	{0, 6},
8373  	{209, 12},
8374  	{209, 12},
8375  	{209, 12},
8376  	{209, 12},
8377  	{209, 12},
8378  	{209, 12},
8379  	{209, 12},
8380  	{145, 3},
8381  	{205, 9},
8382  	{156, 8},
8383  	{168, 8},
8384  	{146, 4},
8385  	{180, 8},
8386  	{149, 4},
8387  	{161, 4},
8388  	{64, 4},
8389  	{209, 12},
8390  	{159, 8},
8391  	{115, 9},
8392  	{72, 8},
8393  	{133, 9},
8394  	{78, 8},
8395  	{96, 8},
8396  	{65, 5},
8397  	{195, 8},
8398  	{84, 8},
8399  	{102, 8},
8400  	{67, 5},
8401  	{120, 8},
8402  	{73, 5},
8403  	{91, 5},
8404  	{64, 4},
8405  	{209, 12},
8406  	{209, 12},
8407  	{174, 8},
8408  	{148, 6},
8409  	{139, 9},
8410  	{80, 8},
8411  	{98, 8},
8412  	{66, 6},
8413  	{198, 8},
8414  	{86, 8},
8415  	{104, 8},
8416  	{14, 9},
8417  	{122, 8},
8418  	{22, 9},
8419  	{38, 9},
8420  	{3, 8},
8421  	{209, 12},
8422  	{157, 6},
8423  	{110, 8},
8424  	{70, 6},
8425  	{128, 8},
8426  	{26, 9},
8427  	{42, 9},
8428  	{5, 8},
8429  	{193, 6},
8430  	{82, 6},
8431  	{50, 9},
8432  	{9, 8},
8433  	{118, 6},
8434  	{17, 8},
8435  	{33, 8},
8436  	{0, 6},
8437  	{209, 12},
8438  	{209, 12},
8439  	{209, 12},
8440  	{209, 12},
8441  	{189, 8},
8442  	{152, 7},
8443  	{164, 7},
8444  	{145, 3},
8445  	{201, 8},
8446  	{88, 8},
8447  	{106, 8},
8448  	{69, 7},
8449  	{124, 8},
8450  	{75, 7},
8451  	{93, 7},
8452  	{64, 4},
8453  	{209, 12},
8454  	{158, 7},
8455  	{112, 8},
8456  	{71, 7},
8457  	{130, 8},
8458  	{28, 9},
8459  	{44, 9},
8460  	{6, 8},
8461  	{194, 7},
8462  	{83, 7},
8463  	{52, 9},
8464  	{10, 8},
8465  	{119, 7},
8466  	{18, 8},
8467  	{34, 8},
8468  	{1, 7},
8469  	{209, 12},
8470  	{209, 12},
8471  	{173, 7},
8472  	{148, 6},
8473  	{136, 8},
8474  	{79, 7},
8475  	{97, 7},
8476  	{66, 6},
8477  	{197, 7},
8478  	{85, 7},
8479  	{56, 9},
8480  	{12, 8},
8481  	{121, 7},
8482  	{20, 8},
8483  	{36, 8},
8484  	{2, 7},
8485  	{209, 12},
8486  	{157, 6},
8487  	{109, 7},
8488  	{70, 6},
8489  	{127, 7},
8490  	{24, 8},
8491  	{40, 8},
8492  	{4, 7},
8493  	{193, 6},
8494  	{82, 6},
8495  	{48, 8},
8496  	{8, 7},
8497  	{118, 6},
8498  	{16, 7},
8499  	{32, 7},
8500  	{0, 6},
8501  	{209, 12},
8502  	{209, 12},
8503  	{209, 12},
8504  	{209, 12},
8505  	{209, 12},
8506  	{209, 12},
8507  	{209, 12},
8508  	{145, 3},
8509  	{209, 12},
8510  	{209, 12},
8511  	{209, 12},
8512  	{146, 4},
8513  	{209, 12},
8514  	{149, 4},
8515  	{161, 4},
8516  	{64, 4},
8517  	{209, 12},
8518  	{209, 12},
8519  	{209, 12},
8520  	{147, 5},
8521  	{209, 12},
8522  	{150, 5},
8523  	{162, 5},
8524  	{65, 5},
8525  	{209, 12},
8526  	{153, 5},
8527  	{165, 5},
8528  	{67, 5},
8529  	{177, 5},
8530  	{73, 5},
8531  	{91, 5},
8532  	{64, 4},
8533  	{209, 12},
8534  	{209, 12},
8535  	{176, 10},
8536  	{148, 6},
8537  	{188, 10},
8538  	{151, 6},
8539  	{163, 6},
8540  	{66, 6},
8541  	{200, 10},
8542  	{154, 6},
8543  	{166, 6},
8544  	{68, 6},
8545  	{178, 6},
8546  	{74, 6},
8547  	{92, 6},
8548  	{64, 4},
8549  	{209, 12},
8550  	{157, 6},
8551  	{169, 6},
8552  	{70, 6},
8553  	{181, 6},
8554  	{76, 6},
8555  	{94, 6},
8556  	{65, 5},
8557  	{193, 6},
8558  	{82, 6},
8559  	{100, 6},
8560  	{67, 5},
8561  	{118, 6},
8562  	{73, 5},
8563  	{91, 5},
8564  	{0, 6},
8565  	{209, 12},
8566  	{209, 12},
8567  	{209, 12},
8568  	{209, 12},
8569  	{191, 10},
8570  	{152, 7},
8571  	{164, 7},
8572  	{145, 3},
8573  	{203, 10},
8574  	{90, 10},
8575  	{108, 10},
8576  	{69, 7},
8577  	{126, 10},
8578  	{75, 7},
8579  	{93, 7},
8580  	{64, 4},
8581  	{209, 12},
8582  	{158, 7},
8583  	{114, 10},
8584  	{71, 7},
8585  	{132, 10},
8586  	{77, 7},
8587  	{95, 7},
8588  	{65, 5},
8589  	{194, 7},
8590  	{83, 7},
8591  	{101, 7},
8592  	{67, 5},
8593  	{119, 7},
8594  	{73, 5},
8595  	{91, 5},
8596  	{1, 7},
8597  	{209, 12},
8598  	{209, 12},
8599  	{173, 7},
8600  	{148, 6},
8601  	{138, 10},
8602  	{79, 7},
8603  	{97, 7},
8604  	{66, 6},
8605  	{197, 7},
8606  	{85, 7},
8607  	{103, 7},
8608  	{68, 6},
8609  	{121, 7},
8610  	{74, 6},
8611  	{92, 6},
8612  	{2, 7},
8613  	{209, 12},
8614  	{157, 6},
8615  	{109, 7},
8616  	{70, 6},
8617  	{127, 7},
8618  	{76, 6},
8619  	{94, 6},
8620  	{4, 7},
8621  	{193, 6},
8622  	{82, 6},
8623  	{100, 6},
8624  	{8, 7},
8625  	{118, 6},
8626  	{16, 7},
8627  	{32, 7},
8628  	{0, 6},
8629  	{209, 12},
8630  	{209, 12},
8631  	{209, 12},
8632  	{209, 12},
8633  	{209, 12},
8634  	{209, 12},
8635  	{209, 12},
8636  	{145, 3},
8637  	{206, 10},
8638  	{156, 8},
8639  	{168, 8},
8640  	{146, 4},
8641  	{180, 8},
8642  	{149, 4},
8643  	{161, 4},
8644  	{64, 4},
8645  	{209, 12},
8646  	{159, 8},
8647  	{116, 10},
8648  	{72, 8},
8649  	{134, 10},
8650  	{78, 8},
8651  	{96, 8},
8652  	{65, 5},
8653  	{195, 8},
8654  	{84, 8},
8655  	{102, 8},
8656  	{67, 5},
8657  	{120, 8},
8658  	{73, 5},
8659  	{91, 5},
8660  	{64, 4},
8661  	{209, 12},
8662  	{209, 12},
8663  	{174, 8},
8664  	{148, 6},
8665  	{140, 10},
8666  	{80, 8},
8667  	{98, 8},
8668  	{66, 6},
8669  	{198, 8},
8670  	{86, 8},
8671  	{63, 12},
8672  	{15, 10},
8673  	{122, 8},
8674  	{23, 10},
8675  	{39, 10},
8676  	{3, 8},
8677  	{209, 12},
8678  	{157, 6},
8679  	{110, 8},
8680  	{70, 6},
8681  	{128, 8},
8682  	{27, 10},
8683  	{43, 10},
8684  	{5, 8},
8685  	{193, 6},
8686  	{82, 6},
8687  	{51, 10},
8688  	{9, 8},
8689  	{118, 6},
8690  	{17, 8},
8691  	{33, 8},
8692  	{0, 6},
8693  	{209, 12},
8694  	{209, 12},
8695  	{209, 12},
8696  	{209, 12},
8697  	{189, 8},
8698  	{152, 7},
8699  	{164, 7},
8700  	{145, 3},
8701  	{201, 8},
8702  	{88, 8},
8703  	{106, 8},
8704  	{69, 7},
8705  	{124, 8},
8706  	{75, 7},
8707  	{93, 7},
8708  	{64, 4},
8709  	{209, 12},
8710  	{158, 7},
8711  	{112, 8},
8712  	{71, 7},
8713  	{130, 8},
8714  	{29, 10},
8715  	{45, 10},
8716  	{6, 8},
8717  	{194, 7},
8718  	{83, 7},
8719  	{53, 10},
8720  	{10, 8},
8721  	{119, 7},
8722  	{18, 8},
8723  	{34, 8},
8724  	{1, 7},
8725  	{209, 12},
8726  	{209, 12},
8727  	{173, 7},
8728  	{148, 6},
8729  	{136, 8},
8730  	{79, 7},
8731  	{97, 7},
8732  	{66, 6},
8733  	{197, 7},
8734  	{85, 7},
8735  	{57, 10},
8736  	{12, 8},
8737  	{121, 7},
8738  	{20, 8},
8739  	{36, 8},
8740  	{2, 7},
8741  	{209, 12},
8742  	{157, 6},
8743  	{109, 7},
8744  	{70, 6},
8745  	{127, 7},
8746  	{24, 8},
8747  	{40, 8},
8748  	{4, 7},
8749  	{193, 6},
8750  	{82, 6},
8751  	{48, 8},
8752  	{8, 7},
8753  	{118, 6},
8754  	{16, 7},
8755  	{32, 7},
8756  	{0, 6},
8757  	{209, 12},
8758  	{209, 12},
8759  	{209, 12},
8760  	{209, 12},
8761  	{209, 12},
8762  	{209, 12},
8763  	{209, 12},
8764  	{145, 3},
8765  	{209, 12},
8766  	{209, 12},
8767  	{209, 12},
8768  	{146, 4},
8769  	{209, 12},
8770  	{149, 4},
8771  	{161, 4},
8772  	{64, 4},
8773  	{209, 12},
8774  	{160, 9},
8775  	{172, 9},
8776  	{147, 5},
8777  	{184, 9},
8778  	{150, 5},
8779  	{162, 5},
8780  	{65, 5},
8781  	{196, 9},
8782  	{153, 5},
8783  	{165, 5},
8784  	{67, 5},
8785  	{177, 5},
8786  	{73, 5},
8787  	{91, 5},
8788  	{64, 4},
8789  	{209, 12},
8790  	{209, 12},
8791  	{175, 9},
8792  	{148, 6},
8793  	{142, 10},
8794  	{81, 9},
8795  	{99, 9},
8796  	{66, 6},
8797  	{199, 9},
8798  	{87, 9},
8799  	{105, 9},
8800  	{68, 6},
8801  	{123, 9},
8802  	{74, 6},
8803  	{92, 6},
8804  	{64, 4},
8805  	{209, 12},
8806  	{157, 6},
8807  	{111, 9},
8808  	{70, 6},
8809  	{129, 9},
8810  	{76, 6},
8811  	{94, 6},
8812  	{65, 5},
8813  	{193, 6},
8814  	{82, 6},
8815  	{100, 6},
8816  	{67, 5},
8817  	{118, 6},
8818  	{73, 5},
8819  	{91, 5},
8820  	{0, 6},
8821  	{209, 12},
8822  	{209, 12},
8823  	{209, 12},
8824  	{209, 12},
8825  	{190, 9},
8826  	{152, 7},
8827  	{164, 7},
8828  	{145, 3},
8829  	{202, 9},
8830  	{89, 9},
8831  	{107, 9},
8832  	{69, 7},
8833  	{125, 9},
8834  	{75, 7},
8835  	{93, 7},
8836  	{64, 4},
8837  	{209, 12},
8838  	{158, 7},
8839  	{113, 9},
8840  	{71, 7},
8841  	{131, 9},
8842  	{30, 10},
8843  	{46, 10},
8844  	{7, 9},
8845  	{194, 7},
8846  	{83, 7},
8847  	{54, 10},
8848  	{11, 9},
8849  	{119, 7},
8850  	{19, 9},
8851  	{35, 9},
8852  	{1, 7},
8853  	{209, 12},
8854  	{209, 12},
8855  	{173, 7},
8856  	{148, 6},
8857  	{137, 9},
8858  	{79, 7},
8859  	{97, 7},
8860  	{66, 6},
8861  	{197, 7},
8862  	{85, 7},
8863  	{58, 10},
8864  	{13, 9},
8865  	{121, 7},
8866  	{21, 9},
8867  	{37, 9},
8868  	{2, 7},
8869  	{209, 12},
8870  	{157, 6},
8871  	{109, 7},
8872  	{70, 6},
8873  	{127, 7},
8874  	{25, 9},
8875  	{41, 9},
8876  	{4, 7},
8877  	{193, 6},
8878  	{82, 6},
8879  	{49, 9},
8880  	{8, 7},
8881  	{118, 6},
8882  	{16, 7},
8883  	{32, 7},
8884  	{0, 6},
8885  	{209, 12},
8886  	{209, 12},
8887  	{209, 12},
8888  	{209, 12},
8889  	{209, 12},
8890  	{209, 12},
8891  	{209, 12},
8892  	{145, 3},
8893  	{205, 9},
8894  	{156, 8},
8895  	{168, 8},
8896  	{146, 4},
8897  	{180, 8},
8898  	{149, 4},
8899  	{161, 4},
8900  	{64, 4},
8901  	{209, 12},
8902  	{159, 8},
8903  	{115, 9},
8904  	{72, 8},
8905  	{133, 9},
8906  	{78, 8},
8907  	{96, 8},
8908  	{65, 5},
8909  	{195, 8},
8910  	{84, 8},
8911  	{102, 8},
8912  	{67, 5},
8913  	{120, 8},
8914  	{73, 5},
8915  	{91, 5},
8916  	{64, 4},
8917  	{209, 12},
8918  	{209, 12},
8919  	{174, 8},
8920  	{148, 6},
8921  	{139, 9},
8922  	{80, 8},
8923  	{98, 8},
8924  	{66, 6},
8925  	{198, 8},
8926  	{86, 8},
8927  	{60, 10},
8928  	{14, 9},
8929  	{122, 8},
8930  	{22, 9},
8931  	{38, 9},
8932  	{3, 8},
8933  	{209, 12},
8934  	{157, 6},
8935  	{110, 8},
8936  	{70, 6},
8937  	{128, 8},
8938  	{26, 9},
8939  	{42, 9},
8940  	{5, 8},
8941  	{193, 6},
8942  	{82, 6},
8943  	{50, 9},
8944  	{9, 8},
8945  	{118, 6},
8946  	{17, 8},
8947  	{33, 8},
8948  	{0, 6},
8949  	{209, 12},
8950  	{209, 12},
8951  	{209, 12},
8952  	{209, 12},
8953  	{189, 8},
8954  	{152, 7},
8955  	{164, 7},
8956  	{145, 3},
8957  	{201, 8},
8958  	{88, 8},
8959  	{106, 8},
8960  	{69, 7},
8961  	{124, 8},
8962  	{75, 7},
8963  	{93, 7},
8964  	{64, 4},
8965  	{209, 12},
8966  	{158, 7},
8967  	{112, 8},
8968  	{71, 7},
8969  	{130, 8},
8970  	{28, 9},
8971  	{44, 9},
8972  	{6, 8},
8973  	{194, 7},
8974  	{83, 7},
8975  	{52, 9},
8976  	{10, 8},
8977  	{119, 7},
8978  	{18, 8},
8979  	{34, 8},
8980  	{1, 7},
8981  	{209, 12},
8982  	{209, 12},
8983  	{173, 7},
8984  	{148, 6},
8985  	{136, 8},
8986  	{79, 7},
8987  	{97, 7},
8988  	{66, 6},
8989  	{197, 7},
8990  	{85, 7},
8991  	{56, 9},
8992  	{12, 8},
8993  	{121, 7},
8994  	{20, 8},
8995  	{36, 8},
8996  	{2, 7},
8997  	{209, 12},
8998  	{157, 6},
8999  	{109, 7},
9000  	{70, 6},
9001  	{127, 7},
9002  	{24, 8},
9003  	{40, 8},
9004  	{4, 7},
9005  	{193, 6},
9006  	{82, 6},
9007  	{48, 8},
9008  	{8, 7},
9009  	{118, 6},
9010  	{16, 7},
9011  	{32, 7},
9012  	{0, 6},
9013  	{209, 12},
9014  	{209, 12},
9015  	{209, 12},
9016  	{209, 12},
9017  	{209, 12},
9018  	{209, 12},
9019  	{209, 12},
9020  	{145, 3},
9021  	{209, 12},
9022  	{209, 12},
9023  	{209, 12},
9024  	{146, 4},
9025  	{209, 12},
9026  	{149, 4},
9027  	{161, 4},
9028  	{64, 4},
9029  	{209, 12},
9030  	{209, 12},
9031  	{209, 12},
9032  	{147, 5},
9033  	{209, 12},
9034  	{150, 5},
9035  	{162, 5},
9036  	{65, 5},
9037  	{209, 12},
9038  	{153, 5},
9039  	{165, 5},
9040  	{67, 5},
9041  	{177, 5},
9042  	{73, 5},
9043  	{91, 5},
9044  	{64, 4},
9045  	{209, 12},
9046  	{209, 12},
9047  	{209, 12},
9048  	{148, 6},
9049  	{209, 12},
9050  	{151, 6},
9051  	{163, 6},
9052  	{66, 6},
9053  	{209, 12},
9054  	{154, 6},
9055  	{166, 6},
9056  	{68, 6},
9057  	{178, 6},
9058  	{74, 6},
9059  	{92, 6},
9060  	{64, 4},
9061  	{209, 12},
9062  	{157, 6},
9063  	{169, 6},
9064  	{70, 6},
9065  	{181, 6},
9066  	{76, 6},
9067  	{94, 6},
9068  	{65, 5},
9069  	{193, 6},
9070  	{82, 6},
9071  	{100, 6},
9072  	{67, 5},
9073  	{118, 6},
9074  	{73, 5},
9075  	{91, 5},
9076  	{0, 6},
9077  	{209, 12},
9078  	{209, 12},
9079  	{209, 12},
9080  	{209, 12},
9081  	{192, 11},
9082  	{152, 7},
9083  	{164, 7},
9084  	{145, 3},
9085  	{204, 11},
9086  	{155, 7},
9087  	{167, 7},
9088  	{69, 7},
9089  	{179, 7},
9090  	{75, 7},
9091  	{93, 7},
9092  	{64, 4},
9093  	{209, 12},
9094  	{158, 7},
9095  	{170, 7},
9096  	{71, 7},
9097  	{182, 7},
9098  	{77, 7},
9099  	{95, 7},
9100  	{65, 5},
9101  	{194, 7},
9102  	{83, 7},
9103  	{101, 7},
9104  	{67, 5},
9105  	{119, 7},
9106  	{73, 5},
9107  	{91, 5},
9108  	{1, 7},
9109  	{209, 12},
9110  	{209, 12},
9111  	{173, 7},
9112  	{148, 6},
9113  	{185, 7},
9114  	{79, 7},
9115  	{97, 7},
9116  	{66, 6},
9117  	{197, 7},
9118  	{85, 7},
9119  	{103, 7},
9120  	{68, 6},
9121  	{121, 7},
9122  	{74, 6},
9123  	{92, 6},
9124  	{2, 7},
9125  	{209, 12},
9126  	{157, 6},
9127  	{109, 7},
9128  	{70, 6},
9129  	{127, 7},
9130  	{76, 6},
9131  	{94, 6},
9132  	{4, 7},
9133  	{193, 6},
9134  	{82, 6},
9135  	{100, 6},
9136  	{8, 7},
9137  	{118, 6},
9138  	{16, 7},
9139  	{32, 7},
9140  	{0, 6},
9141  	{209, 12},
9142  	{209, 12},
9143  	{209, 12},
9144  	{209, 12},
9145  	{209, 12},
9146  	{209, 12},
9147  	{209, 12},
9148  	{145, 3},
9149  	{207, 11},
9150  	{156, 8},
9151  	{168, 8},
9152  	{146, 4},
9153  	{180, 8},
9154  	{149, 4},
9155  	{161, 4},
9156  	{64, 4},
9157  	{209, 12},
9158  	{159, 8},
9159  	{117, 11},
9160  	{72, 8},
9161  	{135, 11},
9162  	{78, 8},
9163  	{96, 8},
9164  	{65, 5},
9165  	{195, 8},
9166  	{84, 8},
9167  	{102, 8},
9168  	{67, 5},
9169  	{120, 8},
9170  	{73, 5},
9171  	{91, 5},
9172  	{64, 4},
9173  	{209, 12},
9174  	{209, 12},
9175  	{174, 8},
9176  	{148, 6},
9177  	{141, 11},
9178  	{80, 8},
9179  	{98, 8},
9180  	{66, 6},
9181  	{198, 8},
9182  	{86, 8},
9183  	{104, 8},
9184  	{68, 6},
9185  	{122, 8},
9186  	{74, 6},
9187  	{92, 6},
9188  	{3, 8},
9189  	{209, 12},
9190  	{157, 6},
9191  	{110, 8},
9192  	{70, 6},
9193  	{128, 8},
9194  	{76, 6},
9195  	{94, 6},
9196  	{5, 8},
9197  	{193, 6},
9198  	{82, 6},
9199  	{100, 6},
9200  	{9, 8},
9201  	{118, 6},
9202  	{17, 8},
9203  	{33, 8},
9204  	{0, 6},
9205  	{209, 12},
9206  	{209, 12},
9207  	{209, 12},
9208  	{209, 12},
9209  	{189, 8},
9210  	{152, 7},
9211  	{164, 7},
9212  	{145, 3},
9213  	{201, 8},
9214  	{88, 8},
9215  	{106, 8},
9216  	{69, 7},
9217  	{124, 8},
9218  	{75, 7},
9219  	{93, 7},
9220  	{64, 4},
9221  	{209, 12},
9222  	{158, 7},
9223  	{112, 8},
9224  	{71, 7},
9225  	{130, 8},
9226  	{77, 7},
9227  	{95, 7},
9228  	{6, 8},
9229  	{194, 7},
9230  	{83, 7},
9231  	{101, 7},
9232  	{10, 8},
9233  	{119, 7},
9234  	{18, 8},
9235  	{34, 8},
9236  	{1, 7},
9237  	{209, 12},
9238  	{209, 12},
9239  	{173, 7},
9240  	{148, 6},
9241  	{136, 8},
9242  	{79, 7},
9243  	{97, 7},
9244  	{66, 6},
9245  	{197, 7},
9246  	{85, 7},
9247  	{103, 7},
9248  	{12, 8},
9249  	{121, 7},
9250  	{20, 8},
9251  	{36, 8},
9252  	{2, 7},
9253  	{209, 12},
9254  	{157, 6},
9255  	{109, 7},
9256  	{70, 6},
9257  	{127, 7},
9258  	{24, 8},
9259  	{40, 8},
9260  	{4, 7},
9261  	{193, 6},
9262  	{82, 6},
9263  	{48, 8},
9264  	{8, 7},
9265  	{118, 6},
9266  	{16, 7},
9267  	{32, 7},
9268  	{0, 6},
9269  	{209, 12},
9270  	{209, 12},
9271  	{209, 12},
9272  	{209, 12},
9273  	{209, 12},
9274  	{209, 12},
9275  	{209, 12},
9276  	{145, 3},
9277  	{209, 12},
9278  	{209, 12},
9279  	{209, 12},
9280  	{146, 4},
9281  	{209, 12},
9282  	{149, 4},
9283  	{161, 4},
9284  	{64, 4},
9285  	{209, 12},
9286  	{160, 9},
9287  	{172, 9},
9288  	{147, 5},
9289  	{184, 9},
9290  	{150, 5},
9291  	{162, 5},
9292  	{65, 5},
9293  	{196, 9},
9294  	{153, 5},
9295  	{165, 5},
9296  	{67, 5},
9297  	{177, 5},
9298  	{73, 5},
9299  	{91, 5},
9300  	{64, 4},
9301  	{209, 12},
9302  	{209, 12},
9303  	{175, 9},
9304  	{148, 6},
9305  	{143, 11},
9306  	{81, 9},
9307  	{99, 9},
9308  	{66, 6},
9309  	{199, 9},
9310  	{87, 9},
9311  	{105, 9},
9312  	{68, 6},
9313  	{123, 9},
9314  	{74, 6},
9315  	{92, 6},
9316  	{64, 4},
9317  	{209, 12},
9318  	{157, 6},
9319  	{111, 9},
9320  	{70, 6},
9321  	{129, 9},
9322  	{76, 6},
9323  	{94, 6},
9324  	{65, 5},
9325  	{193, 6},
9326  	{82, 6},
9327  	{100, 6},
9328  	{67, 5},
9329  	{118, 6},
9330  	{73, 5},
9331  	{91, 5},
9332  	{0, 6},
9333  	{209, 12},
9334  	{209, 12},
9335  	{209, 12},
9336  	{209, 12},
9337  	{190, 9},
9338  	{152, 7},
9339  	{164, 7},
9340  	{145, 3},
9341  	{202, 9},
9342  	{89, 9},
9343  	{107, 9},
9344  	{69, 7},
9345  	{125, 9},
9346  	{75, 7},
9347  	{93, 7},
9348  	{64, 4},
9349  	{209, 12},
9350  	{158, 7},
9351  	{113, 9},
9352  	{71, 7},
9353  	{131, 9},
9354  	{31, 11},
9355  	{47, 11},
9356  	{7, 9},
9357  	{194, 7},
9358  	{83, 7},
9359  	{55, 11},
9360  	{11, 9},
9361  	{119, 7},
9362  	{19, 9},
9363  	{35, 9},
9364  	{1, 7},
9365  	{209, 12},
9366  	{209, 12},
9367  	{173, 7},
9368  	{148, 6},
9369  	{137, 9},
9370  	{79, 7},
9371  	{97, 7},
9372  	{66, 6},
9373  	{197, 7},
9374  	{85, 7},
9375  	{59, 11},
9376  	{13, 9},
9377  	{121, 7},
9378  	{21, 9},
9379  	{37, 9},
9380  	{2, 7},
9381  	{209, 12},
9382  	{157, 6},
9383  	{109, 7},
9384  	{70, 6},
9385  	{127, 7},
9386  	{25, 9},
9387  	{41, 9},
9388  	{4, 7},
9389  	{193, 6},
9390  	{82, 6},
9391  	{49, 9},
9392  	{8, 7},
9393  	{118, 6},
9394  	{16, 7},
9395  	{32, 7},
9396  	{0, 6},
9397  	{209, 12},
9398  	{209, 12},
9399  	{209, 12},
9400  	{209, 12},
9401  	{209, 12},
9402  	{209, 12},
9403  	{209, 12},
9404  	{145, 3},
9405  	{205, 9},
9406  	{156, 8},
9407  	{168, 8},
9408  	{146, 4},
9409  	{180, 8},
9410  	{149, 4},
9411  	{161, 4},
9412  	{64, 4},
9413  	{209, 12},
9414  	{159, 8},
9415  	{115, 9},
9416  	{72, 8},
9417  	{133, 9},
9418  	{78, 8},
9419  	{96, 8},
9420  	{65, 5},
9421  	{195, 8},
9422  	{84, 8},
9423  	{102, 8},
9424  	{67, 5},
9425  	{120, 8},
9426  	{73, 5},
9427  	{91, 5},
9428  	{64, 4},
9429  	{209, 12},
9430  	{209, 12},
9431  	{174, 8},
9432  	{148, 6},
9433  	{139, 9},
9434  	{80, 8},
9435  	{98, 8},
9436  	{66, 6},
9437  	{198, 8},
9438  	{86, 8},
9439  	{61, 11},
9440  	{14, 9},
9441  	{122, 8},
9442  	{22, 9},
9443  	{38, 9},
9444  	{3, 8},
9445  	{209, 12},
9446  	{157, 6},
9447  	{110, 8},
9448  	{70, 6},
9449  	{128, 8},
9450  	{26, 9},
9451  	{42, 9},
9452  	{5, 8},
9453  	{193, 6},
9454  	{82, 6},
9455  	{50, 9},
9456  	{9, 8},
9457  	{118, 6},
9458  	{17, 8},
9459  	{33, 8},
9460  	{0, 6},
9461  	{209, 12},
9462  	{209, 12},
9463  	{209, 12},
9464  	{209, 12},
9465  	{189, 8},
9466  	{152, 7},
9467  	{164, 7},
9468  	{145, 3},
9469  	{201, 8},
9470  	{88, 8},
9471  	{106, 8},
9472  	{69, 7},
9473  	{124, 8},
9474  	{75, 7},
9475  	{93, 7},
9476  	{64, 4},
9477  	{209, 12},
9478  	{158, 7},
9479  	{112, 8},
9480  	{71, 7},
9481  	{130, 8},
9482  	{28, 9},
9483  	{44, 9},
9484  	{6, 8},
9485  	{194, 7},
9486  	{83, 7},
9487  	{52, 9},
9488  	{10, 8},
9489  	{119, 7},
9490  	{18, 8},
9491  	{34, 8},
9492  	{1, 7},
9493  	{209, 12},
9494  	{209, 12},
9495  	{173, 7},
9496  	{148, 6},
9497  	{136, 8},
9498  	{79, 7},
9499  	{97, 7},
9500  	{66, 6},
9501  	{197, 7},
9502  	{85, 7},
9503  	{56, 9},
9504  	{12, 8},
9505  	{121, 7},
9506  	{20, 8},
9507  	{36, 8},
9508  	{2, 7},
9509  	{209, 12},
9510  	{157, 6},
9511  	{109, 7},
9512  	{70, 6},
9513  	{127, 7},
9514  	{24, 8},
9515  	{40, 8},
9516  	{4, 7},
9517  	{193, 6},
9518  	{82, 6},
9519  	{48, 8},
9520  	{8, 7},
9521  	{118, 6},
9522  	{16, 7},
9523  	{32, 7},
9524  	{0, 6},
9525  	{209, 12},
9526  	{209, 12},
9527  	{209, 12},
9528  	{209, 12},
9529  	{209, 12},
9530  	{209, 12},
9531  	{209, 12},
9532  	{145, 3},
9533  	{209, 12},
9534  	{209, 12},
9535  	{209, 12},
9536  	{146, 4},
9537  	{209, 12},
9538  	{149, 4},
9539  	{161, 4},
9540  	{64, 4},
9541  	{209, 12},
9542  	{209, 12},
9543  	{209, 12},
9544  	{147, 5},
9545  	{209, 12},
9546  	{150, 5},
9547  	{162, 5},
9548  	{65, 5},
9549  	{209, 12},
9550  	{153, 5},
9551  	{165, 5},
9552  	{67, 5},
9553  	{177, 5},
9554  	{73, 5},
9555  	{91, 5},
9556  	{64, 4},
9557  	{209, 12},
9558  	{209, 12},
9559  	{176, 10},
9560  	{148, 6},
9561  	{188, 10},
9562  	{151, 6},
9563  	{163, 6},
9564  	{66, 6},
9565  	{200, 10},
9566  	{154, 6},
9567  	{166, 6},
9568  	{68, 6},
9569  	{178, 6},
9570  	{74, 6},
9571  	{92, 6},
9572  	{64, 4},
9573  	{209, 12},
9574  	{157, 6},
9575  	{169, 6},
9576  	{70, 6},
9577  	{181, 6},
9578  	{76, 6},
9579  	{94, 6},
9580  	{65, 5},
9581  	{193, 6},
9582  	{82, 6},
9583  	{100, 6},
9584  	{67, 5},
9585  	{118, 6},
9586  	{73, 5},
9587  	{91, 5},
9588  	{0, 6},
9589  	{209, 12},
9590  	{209, 12},
9591  	{209, 12},
9592  	{209, 12},
9593  	{191, 10},
9594  	{152, 7},
9595  	{164, 7},
9596  	{145, 3},
9597  	{203, 10},
9598  	{90, 10},
9599  	{108, 10},
9600  	{69, 7},
9601  	{126, 10},
9602  	{75, 7},
9603  	{93, 7},
9604  	{64, 4},
9605  	{209, 12},
9606  	{158, 7},
9607  	{114, 10},
9608  	{71, 7},
9609  	{132, 10},
9610  	{77, 7},
9611  	{95, 7},
9612  	{65, 5},
9613  	{194, 7},
9614  	{83, 7},
9615  	{101, 7},
9616  	{67, 5},
9617  	{119, 7},
9618  	{73, 5},
9619  	{91, 5},
9620  	{1, 7},
9621  	{209, 12},
9622  	{209, 12},
9623  	{173, 7},
9624  	{148, 6},
9625  	{138, 10},
9626  	{79, 7},
9627  	{97, 7},
9628  	{66, 6},
9629  	{197, 7},
9630  	{85, 7},
9631  	{103, 7},
9632  	{68, 6},
9633  	{121, 7},
9634  	{74, 6},
9635  	{92, 6},
9636  	{2, 7},
9637  	{209, 12},
9638  	{157, 6},
9639  	{109, 7},
9640  	{70, 6},
9641  	{127, 7},
9642  	{76, 6},
9643  	{94, 6},
9644  	{4, 7},
9645  	{193, 6},
9646  	{82, 6},
9647  	{100, 6},
9648  	{8, 7},
9649  	{118, 6},
9650  	{16, 7},
9651  	{32, 7},
9652  	{0, 6},
9653  	{209, 12},
9654  	{209, 12},
9655  	{209, 12},
9656  	{209, 12},
9657  	{209, 12},
9658  	{209, 12},
9659  	{209, 12},
9660  	{145, 3},
9661  	{206, 10},
9662  	{156, 8},
9663  	{168, 8},
9664  	{146, 4},
9665  	{180, 8},
9666  	{149, 4},
9667  	{161, 4},
9668  	{64, 4},
9669  	{209, 12},
9670  	{159, 8},
9671  	{116, 10},
9672  	{72, 8},
9673  	{134, 10},
9674  	{78, 8},
9675  	{96, 8},
9676  	{65, 5},
9677  	{195, 8},
9678  	{84, 8},
9679  	{102, 8},
9680  	{67, 5},
9681  	{120, 8},
9682  	{73, 5},
9683  	{91, 5},
9684  	{64, 4},
9685  	{209, 12},
9686  	{209, 12},
9687  	{174, 8},
9688  	{148, 6},
9689  	{140, 10},
9690  	{80, 8},
9691  	{98, 8},
9692  	{66, 6},
9693  	{198, 8},
9694  	{86, 8},
9695  	{62, 11},
9696  	{15, 10},
9697  	{122, 8},
9698  	{23, 10},
9699  	{39, 10},
9700  	{3, 8},
9701  	{209, 12},
9702  	{157, 6},
9703  	{110, 8},
9704  	{70, 6},
9705  	{128, 8},
9706  	{27, 10},
9707  	{43, 10},
9708  	{5, 8},
9709  	{193, 6},
9710  	{82, 6},
9711  	{51, 10},
9712  	{9, 8},
9713  	{118, 6},
9714  	{17, 8},
9715  	{33, 8},
9716  	{0, 6},
9717  	{209, 12},
9718  	{209, 12},
9719  	{209, 12},
9720  	{209, 12},
9721  	{189, 8},
9722  	{152, 7},
9723  	{164, 7},
9724  	{145, 3},
9725  	{201, 8},
9726  	{88, 8},
9727  	{106, 8},
9728  	{69, 7},
9729  	{124, 8},
9730  	{75, 7},
9731  	{93, 7},
9732  	{64, 4},
9733  	{209, 12},
9734  	{158, 7},
9735  	{112, 8},
9736  	{71, 7},
9737  	{130, 8},
9738  	{29, 10},
9739  	{45, 10},
9740  	{6, 8},
9741  	{194, 7},
9742  	{83, 7},
9743  	{53, 10},
9744  	{10, 8},
9745  	{119, 7},
9746  	{18, 8},
9747  	{34, 8},
9748  	{1, 7},
9749  	{209, 12},
9750  	{209, 12},
9751  	{173, 7},
9752  	{148, 6},
9753  	{136, 8},
9754  	{79, 7},
9755  	{97, 7},
9756  	{66, 6},
9757  	{197, 7},
9758  	{85, 7},
9759  	{57, 10},
9760  	{12, 8},
9761  	{121, 7},
9762  	{20, 8},
9763  	{36, 8},
9764  	{2, 7},
9765  	{209, 12},
9766  	{157, 6},
9767  	{109, 7},
9768  	{70, 6},
9769  	{127, 7},
9770  	{24, 8},
9771  	{40, 8},
9772  	{4, 7},
9773  	{193, 6},
9774  	{82, 6},
9775  	{48, 8},
9776  	{8, 7},
9777  	{118, 6},
9778  	{16, 7},
9779  	{32, 7},
9780  	{0, 6},
9781  	{209, 12},
9782  	{209, 12},
9783  	{209, 12},
9784  	{209, 12},
9785  	{209, 12},
9786  	{209, 12},
9787  	{209, 12},
9788  	{145, 3},
9789  	{209, 12},
9790  	{209, 12},
9791  	{209, 12},
9792  	{146, 4},
9793  	{209, 12},
9794  	{149, 4},
9795  	{161, 4},
9796  	{64, 4},
9797  	{209, 12},
9798  	{160, 9},
9799  	{172, 9},
9800  	{147, 5},
9801  	{184, 9},
9802  	{150, 5},
9803  	{162, 5},
9804  	{65, 5},
9805  	{196, 9},
9806  	{153, 5},
9807  	{165, 5},
9808  	{67, 5},
9809  	{177, 5},
9810  	{73, 5},
9811  	{91, 5},
9812  	{64, 4},
9813  	{209, 12},
9814  	{209, 12},
9815  	{175, 9},
9816  	{148, 6},
9817  	{142, 10},
9818  	{81, 9},
9819  	{99, 9},
9820  	{66, 6},
9821  	{199, 9},
9822  	{87, 9},
9823  	{105, 9},
9824  	{68, 6},
9825  	{123, 9},
9826  	{74, 6},
9827  	{92, 6},
9828  	{64, 4},
9829  	{209, 12},
9830  	{157, 6},
9831  	{111, 9},
9832  	{70, 6},
9833  	{129, 9},
9834  	{76, 6},
9835  	{94, 6},
9836  	{65, 5},
9837  	{193, 6},
9838  	{82, 6},
9839  	{100, 6},
9840  	{67, 5},
9841  	{118, 6},
9842  	{73, 5},
9843  	{91, 5},
9844  	{0, 6},
9845  	{209, 12},
9846  	{209, 12},
9847  	{209, 12},
9848  	{209, 12},
9849  	{190, 9},
9850  	{152, 7},
9851  	{164, 7},
9852  	{145, 3},
9853  	{202, 9},
9854  	{89, 9},
9855  	{107, 9},
9856  	{69, 7},
9857  	{125, 9},
9858  	{75, 7},
9859  	{93, 7},
9860  	{64, 4},
9861  	{209, 12},
9862  	{158, 7},
9863  	{113, 9},
9864  	{71, 7},
9865  	{131, 9},
9866  	{30, 10},
9867  	{46, 10},
9868  	{7, 9},
9869  	{194, 7},
9870  	{83, 7},
9871  	{54, 10},
9872  	{11, 9},
9873  	{119, 7},
9874  	{19, 9},
9875  	{35, 9},
9876  	{1, 7},
9877  	{209, 12},
9878  	{209, 12},
9879  	{173, 7},
9880  	{148, 6},
9881  	{137, 9},
9882  	{79, 7},
9883  	{97, 7},
9884  	{66, 6},
9885  	{197, 7},
9886  	{85, 7},
9887  	{58, 10},
9888  	{13, 9},
9889  	{121, 7},
9890  	{21, 9},
9891  	{37, 9},
9892  	{2, 7},
9893  	{209, 12},
9894  	{157, 6},
9895  	{109, 7},
9896  	{70, 6},
9897  	{127, 7},
9898  	{25, 9},
9899  	{41, 9},
9900  	{4, 7},
9901  	{193, 6},
9902  	{82, 6},
9903  	{49, 9},
9904  	{8, 7},
9905  	{118, 6},
9906  	{16, 7},
9907  	{32, 7},
9908  	{0, 6},
9909  	{209, 12},
9910  	{209, 12},
9911  	{209, 12},
9912  	{209, 12},
9913  	{209, 12},
9914  	{209, 12},
9915  	{209, 12},
9916  	{145, 3},
9917  	{205, 9},
9918  	{156, 8},
9919  	{168, 8},
9920  	{146, 4},
9921  	{180, 8},
9922  	{149, 4},
9923  	{161, 4},
9924  	{64, 4},
9925  	{209, 12},
9926  	{159, 8},
9927  	{115, 9},
9928  	{72, 8},
9929  	{133, 9},
9930  	{78, 8},
9931  	{96, 8},
9932  	{65, 5},
9933  	{195, 8},
9934  	{84, 8},
9935  	{102, 8},
9936  	{67, 5},
9937  	{120, 8},
9938  	{73, 5},
9939  	{91, 5},
9940  	{64, 4},
9941  	{209, 12},
9942  	{209, 12},
9943  	{174, 8},
9944  	{148, 6},
9945  	{139, 9},
9946  	{80, 8},
9947  	{98, 8},
9948  	{66, 6},
9949  	{198, 8},
9950  	{86, 8},
9951  	{60, 10},
9952  	{14, 9},
9953  	{122, 8},
9954  	{22, 9},
9955  	{38, 9},
9956  	{3, 8},
9957  	{209, 12},
9958  	{157, 6},
9959  	{110, 8},
9960  	{70, 6},
9961  	{128, 8},
9962  	{26, 9},
9963  	{42, 9},
9964  	{5, 8},
9965  	{193, 6},
9966  	{82, 6},
9967  	{50, 9},
9968  	{9, 8},
9969  	{118, 6},
9970  	{17, 8},
9971  	{33, 8},
9972  	{0, 6},
9973  	{209, 12},
9974  	{209, 12},
9975  	{209, 12},
9976  	{209, 12},
9977  	{189, 8},
9978  	{152, 7},
9979  	{164, 7},
9980  	{145, 3},
9981  	{201, 8},
9982  	{88, 8},
9983  	{106, 8},
9984  	{69, 7},
9985  	{124, 8},
9986  	{75, 7},
9987  	{93, 7},
9988  	{64, 4},
9989  	{209, 12},
9990  	{158, 7},
9991  	{112, 8},
9992  	{71, 7},
9993  	{130, 8},
9994  	{28, 9},
9995  	{44, 9},
9996  	{6, 8},
9997  	{194, 7},
9998  	{83, 7},
9999  	{52, 9},
10000  	{10, 8},
10001  	{119, 7},
10002  	{18, 8},
10003  	{34, 8},
10004  	{1, 7},
10005  	{209, 12},
10006  	{209, 12},
10007  	{173, 7},
10008  	{148, 6},
10009  	{136, 8},
10010  	{79, 7},
10011  	{97, 7},
10012  	{66, 6},
10013  	{197, 7},
10014  	{85, 7},
10015  	{56, 9},
10016  	{12, 8},
10017  	{121, 7},
10018  	{20, 8},
10019  	{36, 8},
10020  	{2, 7},
10021  	{209, 12},
10022  	{157, 6},
10023  	{109, 7},
10024  	{70, 6},
10025  	{127, 7},
10026  	{24, 8},
10027  	{40, 8},
10028  	{4, 7},
10029  	{193, 6},
10030  	{82, 6},
10031  	{48, 8},
10032  	{8, 7},
10033  	{118, 6},
10034  	{16, 7},
10035  	{32, 7},
10036  	{0, 6}};
10037 } // utf8_to_utf16 namespace
10038 } // tables namespace
10039 } // unnamed namespace
10040 } // namespace simdutf
10041 
10042 #endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
10043 /* end file src/tables/utf8_to_utf16_tables.h */
10044 /* begin file src/tables/utf16_to_utf8_tables.h */
10045 // file generated by scripts/sse_convert_utf16_to_utf8.py
10046 #ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
10047 #define SIMDUTF_UTF16_TO_UTF8_TABLES_H
10048 
10049 namespace simdutf {
10050 namespace {
10051 namespace tables {
10052 namespace utf16_to_utf8 {
10053 
10054   // Lookup table of 256 rows x 17 bytes: 1 byte for length, 16 bytes for mask.
        //
        // Row layout, as visible in the data below:
        //  - byte 0 is the number of meaningful mask entries: 16 for row 0, and one
        //    less for every bit set in the row index (row 255 has length 8);
        //  - bytes 1..16 hold the mask itself: values in 0..15, followed by 0x80
        //    padding in the unused trailing slots.
        // Reading the mask values as byte positions inside eight 2-byte lanes:
        // a lane whose bit is clear in the row index contributes both of its bytes,
        // odd position first (e.g. 1,0); a lane whose bit is set contributes only
        // its even-positioned byte. Row-index bits 0..7 correspond to lanes
        // 0,4,1,5,2,6,3,7 respectively (compare rows 1, 2, 4, 8, 16, 32, 64, 128).
        // NOTE(review): 0x80 looks like the "produce zero" selector of a
        // pshufb-style byte shuffle, and the name suggests packing code units that
        // need 1 or 2 UTF-8 bytes; the consumer of this table is not shown here,
        // so confirm both against the code that indexes it.
10055   const uint8_t pack_1_2_utf8_bytes[256][17] = {
10056     {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
10057     {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
10058     {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
10059     {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
10060     {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
10061     {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
10062     {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
10063     {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10064     {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
10065     {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
10066     {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
10067     {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
10068     {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
10069     {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10070     {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
10071     {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10072     {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
10073     {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
10074     {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
10075     {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10076     {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
10077     {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
10078     {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10079     {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
10080     {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
10081     {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10082     {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
10083     {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10084     {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10085     {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10086     {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10087     {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10088     {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
10089     {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
10090     {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
10091     {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
10092     {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
10093     {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10094     {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
10095     {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10096     {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
10097     {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
10098     {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
10099     {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
10100     {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
10101     {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10102     {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
10103     {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10104     {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
10105     {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10106     {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
10107     {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10108     {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10109     {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10110     {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10111     {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10112     {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
10113     {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10114     {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
10115     {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10116     {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10117     {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10118     {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10119     {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10120     {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
10121     {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
10122     {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
10123     {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10124     {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
10125     {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
10126     {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10127     {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
10128     {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
10129     {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10130     {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
10131     {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10132     {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10133     {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10134     {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10135     {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10136     {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
10137     {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
10138     {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
10139     {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
10140     {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
10141     {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
10142     {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
10143     {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10144     {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
10145     {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10146     {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10147     {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10148     {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
10149     {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10150     {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
10151     {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10152     {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
10153     {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10154     {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
10155     {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10156     {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10157     {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10158     {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10159     {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10160     {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
10161     {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10162     {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
10163     {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10164     {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10165     {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10166     {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10167     {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10168     {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
10169     {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10170     {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10171     {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10172     {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
10173     {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10174     {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10175     {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10176     {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
10177     {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10178     {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10179     {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10180     {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
10181     {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10182     {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
10183     {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10184     {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
10185     {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
10186     {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
10187     {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
10188     {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
10189     {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10190     {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
10191     {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10192     {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
10193     {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
10194     {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
10195     {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
10196     {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
10197     {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10198     {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
10199     {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10200     {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
10201     {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10202     {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
10203     {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10204     {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10205     {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10206     {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10207     {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10208     {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
10209     {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10210     {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
10211     {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10212     {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10213     {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10214     {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10215     {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10216     {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
10217     {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
10218     {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
10219     {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
10220     {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
10221     {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10222     {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
10223     {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10224     {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
10225     {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
10226     {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
10227     {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10228     {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
10229     {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10230     {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10231     {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10232     {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
10233     {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10234     {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
10235     {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10236     {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10237     {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10238     {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10239     {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10240     {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
10241     {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10242     {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10243     {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10244     {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10245     {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10246     {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10247     {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10248     {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
10249     {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10250     {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
10251     {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10252     {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10253     {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10254     {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10255     {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10256     {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
10257     {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10258     {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
10259     {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10260     {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10261     {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10262     {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10263     {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10264     {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
10265     {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10266     {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10267     {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10268     {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
10269     {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10270     {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10271     {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10272     {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
10273     {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10274     {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10275     {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10276     {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
10277     {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10278     {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10279     {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10280     {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
10281     {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10282     {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
10283     {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10284     {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10285     {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10286     {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10287     {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10288     {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
10289     {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10290     {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10291     {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10292     {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10293     {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10294     {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10295     {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10296     {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
10297     {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10298     {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10299     {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10300     {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
10301     {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10302     {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10303     {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10304     {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
10305     {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10306     {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10307     {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10308     {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
10309     {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10310     {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
10311     {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
10312   };
10313 
// pack_1_2_3_utf8_bytes: 256 rows, one per 8-bit index value.
// Row layout: 1 byte for length, 16 bytes for mask.
//   row[0]      number of meaningful entries that follow (0..12)
//   row[1..16]  that many byte indices in 0..15, padded with 0x80
// The row index is read as four 2-bit fields; field k (bits 2k and 2k+1)
// selects indices from the group 4k..4k+3:
//   0 -> 4k+2, 4k+3, 4k+1   (three entries)
//   1 -> nothing
//   2 -> 4k+3, 4k+1         (two entries)
//   3 -> 4k                 (one entry)
// NOTE(review): the name suggests each row is a byte-shuffle mask that packs
// 1-, 2- and 3-byte UTF-8 sequences, with 0x80 standing for "no source byte";
// field value 1 presumably never occurs in practice. Confirm both against the
// code that consumes this table.
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
  {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
  {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
  {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
  {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
  {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
  {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
  {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
};
10573 
10574 } // utf16_to_utf8 namespace
10575 } // tables namespace
10576 } // unnamed namespace
10577 } // namespace simdutf
10578 
10579 #endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
10580 /* end file src/tables/utf16_to_utf8_tables.h */
10581 // End of tables.
10582 
10583 // The scalar routines should be included once.
10584 /* begin file src/scalar/ascii.h */
10585 #ifndef SIMDUTF_ASCII_H
10586 #define SIMDUTF_ASCII_H
10587 
10588 namespace simdutf {
10589 namespace scalar {
10590 namespace {
10591 namespace ascii {
10592 #if SIMDUTF_IMPLEMENTATION_FALLBACK
10593 // Only used by the fallback kernel.
10594 inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
10595     const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
10596     uint64_t pos = 0;
10597     // process in blocks of 16 bytes when possible
10598     for (;pos + 16 <= len; pos += 16) {
10599         uint64_t v1;
10600         std::memcpy(&v1, data + pos, sizeof(uint64_t));
10601         uint64_t v2;
10602         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
10603         uint64_t v{v1 | v2};
10604         if ((v & 0x8080808080808080) != 0) { return false; }
10605     }
10606     // process the tail byte-by-byte
10607     for (;pos < len; pos ++) {
10608         if (data[pos] >= 0b10000000) { return false; }
10609     }
10610     return true;
10611 }
10612 #endif
10613 
10614 inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
10615     const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
10616     size_t pos = 0;
10617     // process in blocks of 16 bytes when possible
10618     for (;pos + 16 <= len; pos += 16) {
10619         uint64_t v1;
10620         std::memcpy(&v1, data + pos, sizeof(uint64_t));
10621         uint64_t v2;
10622         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
10623         uint64_t v{v1 | v2};
10624         if ((v & 0x8080808080808080) != 0) {
10625             for (;pos < len; pos ++) {
10626                 if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
10627             }
10628         }
10629     }
10630     // process the tail byte-by-byte
10631     for (;pos < len; pos ++) {
10632         if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
10633     }
10634     return result(error_code::SUCCESS, pos);
10635 }
10636 
10637 } // ascii namespace
10638 } // unnamed namespace
10639 } // namespace scalar
10640 } // namespace simdutf
10641 
10642 #endif
10643 /* end file src/scalar/ascii.h */
10644 /* begin file src/scalar/utf32.h */
10645 #ifndef SIMDUTF_UTF32_H
10646 #define SIMDUTF_UTF32_H
10647 
10648 namespace simdutf {
10649 namespace scalar {
10650 namespace {
10651 namespace utf32 {
10652 
10653 inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
10654   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10655   uint64_t pos = 0;
10656   for(;pos < len; pos++) {
10657     uint32_t word = data[pos];
10658     if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
10659         return false;
10660     }
10661   }
10662   return true;
10663 }
10664 
10665 inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
10666   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10667   size_t pos = 0;
10668   for(;pos < len; pos++) {
10669     uint32_t word = data[pos];
10670     if(word > 0x10FFFF) {
10671         return result(error_code::TOO_LARGE, pos);
10672     }
10673     if(word >= 0xD800 && word <= 0xDFFF) {
10674         return result(error_code::SURROGATE, pos);
10675     }
10676   }
10677   return result(error_code::SUCCESS, pos);
10678 }
10679 
/**
 * Counts the UTF-8 bytes needed to encode the `len` UTF-32 code units in
 * `buf`: 1 byte up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 above that.
 * The input is not validated and a BOM gets no special treatment.
 */
inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
  // We are not BOM aware.
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    const uint32_t word = words[i];
    // credit: @ttsugriy  for the vectorizable approach
    // One byte for every code unit, plus one more per threshold crossed.
    counter += 1
             + static_cast<size_t>(word > 0x7F)     // two-byte
             + static_cast<size_t>(word > 0x7FF)    // three-byte
             + static_cast<size_t>(word > 0xFFFF);  // four-bytes
  }
  return counter;
}
10693 
/**
 * Counts the UTF-16 code units needed to encode the `len` UTF-32 code units
 * in `buf`: one per input unit, plus one extra for each unit above 0xFFFF.
 * The input is not validated and a BOM gets no special treatment.
 */
inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
  // We are not BOM aware.
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // 1 for a non-surrogate word, 2 when a surrogate pair is required
    counter += 1 + static_cast<size_t>(words[i] > 0xFFFF);
  }
  return counter;
}
10704 
/**
 * Returns the number of Latin-1 bytes produced from `len` UTF-32 code units.
 * The count is returned unchanged: one output byte per input code unit.
 */
inline size_t latin1_length_from_utf32(size_t len) {
  // We are not BOM aware.
  return len; // a utf32 codepoint will always represent 1 latin1 character
}
10709 
10710 
10711 
10712 } // utf32 namespace
10713 } // unnamed namespace
10714 } // namespace scalar
10715 } // namespace simdutf
10716 
10717 #endif
10718 /* end file src/scalar/utf32.h */
10719 /* begin file src/scalar/latin1.h */
10720 #ifndef SIMDUTF_LATIN1_H
10721 #define SIMDUTF_LATIN1_H
10722 
10723 namespace simdutf {
10724 namespace scalar {
10725 namespace {
10726 namespace latin1 {
10727 
/**
 * Returns the number of UTF-32 code units produced from `len` Latin-1 bytes.
 * The count is returned unchanged: one code unit per input byte.
 */
inline size_t utf32_length_from_latin1(size_t len) {
  // We are not BOM aware.
  return len; // a utf32 unit will always represent 1 latin1 character
}
10732 
/**
 * Counts the UTF-8 bytes needed to encode the `len` Latin-1 bytes in `buf`:
 * one per input byte, plus one extra for each byte with its high bit set.
 */
inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  size_t answer = len;
  for (size_t i = 0; i < len; i++) {
    // (byte >> 7) is 1 exactly for bytes 0x80..0xFF, which take two bytes.
    answer += static_cast<size_t>(bytes[i] >> 7);
  }
  return answer;
}
10741 
/**
 * Returns the number of UTF-16 code units produced from `len` Latin-1 bytes.
 * The count is returned unchanged: one code unit per input byte.
 */
inline size_t utf16_length_from_latin1(size_t len) {
  return len;
}
10745 
10746 } // latin1 namespace
10747 } // unnamed namespace
10748 } // namespace scalar
10749 } // namespace simdutf
10750 
10751 #endif
10752 /* end file src/scalar/latin1.h */
10753 
10754 /* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
10755 #ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
10756 #define SIMDUTF_VALID_UTF32_TO_UTF8_H
10757 
10758 namespace simdutf {
10759 namespace scalar {
10760 namespace {
10761 namespace utf32_to_utf8 {
10762 
10763 #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
10764 // only used by the fallback and POWER kernel
/**
 * Encodes the `len` UTF-32 code units in `buf` as UTF-8 into `utf8_output`
 * and returns the number of bytes written.
 *
 * No validation is performed: surrogates and values above 0x10FFFF are
 * encoded like any other value of their magnitude, so the caller must pass
 * valid input. Nothing bounds the writes to `utf8_output`; each code unit
 * produces between one and four bytes.
 */
inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  char* const start = utf8_output;
  size_t pos = 0;
  while (pos < len) {
    // Fast path: when two code units remain, load both as one 64-bit word
    // and emit them directly if neither has a bit set above the low seven.
    if (len - pos >= 2) {
      uint64_t pair;
      std::memcpy(&pair, data + pos, sizeof(uint64_t));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        *utf8_output++ = char(data[pos]);
        *utf8_output++ = char(data[pos + 1]);
        pos += 2;
        continue;
      }
    }
    const uint32_t word = data[pos];
    pos++;
    if (word < 0x80) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
    } else if (word < 0x800) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if (word < 0x10000) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
  }
  return utf8_output - start;
}
10811 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
10812 
10813 } // utf32_to_utf8 namespace
10814 } // unnamed namespace
10815 } // namespace scalar
10816 } // namespace simdutf
10817 
10818 #endif
10819 /* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
10820 /* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
10821 #ifndef SIMDUTF_UTF32_TO_UTF8_H
10822 #define SIMDUTF_UTF32_TO_UTF8_H
10823 
10824 namespace simdutf {
10825 namespace scalar {
10826 namespace {
10827 namespace utf32_to_utf8 {
10828 
/**
 * Encodes the `len` UTF-32 code units in `buf` as UTF-8 into `utf8_output`
 * and returns the number of bytes written.
 *
 * Returns 0 as soon as a code unit in the surrogate range 0xD800..0xDFFF or
 * above 0x10FFFF is met; bytes written before that point are left in the
 * output buffer. A return value of 0 is therefore also what an empty input
 * yields. Nothing bounds the writes to `utf8_output`; each code unit produces
 * between one and four bytes.
 */
inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
  char* const start = utf8_output;
  size_t pos = 0;
  while (pos < len) {
    // Fast path: when two code units remain, load both as one 64-bit word
    // and emit them directly if neither has a bit set above the low seven.
    if (len - pos >= 2) {
      uint64_t pair;
      std::memcpy(&pair, data + pos, sizeof(uint64_t));
      if ((pair & 0xFFFFFF80FFFFFF80) == 0) {
        *utf8_output++ = char(data[pos]);
        *utf8_output++ = char(data[pos + 1]);
        pos += 2;
        continue;
      }
    }
    const uint32_t word = data[pos];
    if (word < 0x80) {
      // will generate one UTF-8 byte
      *utf8_output++ = char(word);
    } else if (word < 0x800) {
      // will generate two UTF-8 bytes
      // we have 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if (word < 0x10000) {
      // will generate three UTF-8 bytes
      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      // Surrogate code points are rejected.
      if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      // will generate four UTF-8 bytes
      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      // Values above 0x10FFFF are rejected.
      if (word > 0x10FFFF) { return 0; }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
    pos++;
  }
  return utf8_output - start;
}
10877 
convert_with_errors(const char32_t* buf, size_t len, char* utf8_output)10878 inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
10879   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10880   size_t pos = 0;
10881   char* start{utf8_output};
10882   while (pos < len) {
10883     // try to convert the next block of 2 ASCII characters
10884     if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
10885       uint64_t v;
10886       ::memcpy(&v, data + pos, sizeof(uint64_t));
10887       if ((v & 0xFFFFFF80FFFFFF80) == 0) {
10888         *utf8_output++ = char(buf[pos]);
10889 				*utf8_output++ = char(buf[pos+1]);
10890         pos += 2;
10891         continue;
10892       }
10893     }
10894     uint32_t word = data[pos];
10895     if((word & 0xFFFFFF80)==0) {
10896       // will generate one UTF-8 bytes
10897       *utf8_output++ = char(word);
10898       pos++;
10899     } else if((word & 0xFFFFF800)==0) {
10900       // will generate two UTF-8 bytes
10901       // we have 0b110XXXXX 0b10XXXXXX
10902       *utf8_output++ = char((word>>6) | 0b11000000);
10903       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10904       pos++;
10905     } else if((word & 0xFFFF0000)==0) {
10906       // will generate three UTF-8 bytes
10907       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
10908 			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
10909       *utf8_output++ = char((word>>12) | 0b11100000);
10910       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10911       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10912       pos++;
10913     } else {
10914       // will generate four UTF-8 bytes
10915       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
10916 			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
10917       *utf8_output++ = char((word>>18) | 0b11110000);
10918       *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10919       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10920       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10921       pos ++;
10922     }
10923   }
10924   return result(error_code::SUCCESS, utf8_output - start);
10925 }
10926 
10927 } // utf32_to_utf8 namespace
10928 } // unnamed namespace
10929 } // namespace scalar
10930 } // namespace simdutf
10931 
10932 #endif
10933 /* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
10934 
10935 /* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
10936 #ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
10937 #define SIMDUTF_VALID_UTF32_TO_UTF16_H
10938 
10939 namespace simdutf {
10940 namespace scalar {
10941 namespace {
10942 namespace utf32_to_utf16 {
10943 
10944 template <endianness big_endian>
convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output)10945 inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
10946   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10947   size_t pos = 0;
10948   char16_t* start{utf16_output};
10949   while (pos < len) {
10950     uint32_t word = data[pos];
10951     if((word & 0xFFFF0000)==0) {
10952       // will not generate a surrogate pair
10953       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10954       pos++;
10955     } else {
10956       // will generate a surrogate pair
10957       word -= 0x10000;
10958       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10959       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10960       if (!match_system(big_endian)) {
10961         high_surrogate = utf16::swap_bytes(high_surrogate);
10962         low_surrogate = utf16::swap_bytes(low_surrogate);
10963       }
10964       *utf16_output++ = char16_t(high_surrogate);
10965       *utf16_output++ = char16_t(low_surrogate);
10966       pos++;
10967     }
10968   }
10969   return utf16_output - start;
10970 }
10971 
10972 } // utf32_to_utf16 namespace
10973 } // unnamed namespace
10974 } // namespace scalar
10975 } // namespace simdutf
10976 
10977 #endif
10978 /* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
10979 /* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
10980 #ifndef SIMDUTF_UTF32_TO_UTF16_H
10981 #define SIMDUTF_UTF32_TO_UTF16_H
10982 
10983 namespace simdutf {
10984 namespace scalar {
10985 namespace {
10986 namespace utf32_to_utf16 {
10987 
10988 template <endianness big_endian>
convert(const char32_t* buf, size_t len, char16_t* utf16_output)10989 inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
10990   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
10991   size_t pos = 0;
10992   char16_t* start{utf16_output};
10993   while (pos < len) {
10994     uint32_t word = data[pos];
10995     if((word & 0xFFFF0000)==0) {
10996       if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10997       // will not generate a surrogate pair
10998       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10999     } else {
11000       // will generate a surrogate pair
11001       if (word > 0x10FFFF) { return 0; }
11002       word -= 0x10000;
11003       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11004       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11005       if (!match_system(big_endian)) {
11006         high_surrogate = utf16::swap_bytes(high_surrogate);
11007         low_surrogate = utf16::swap_bytes(low_surrogate);
11008       }
11009       *utf16_output++ = char16_t(high_surrogate);
11010       *utf16_output++ = char16_t(low_surrogate);
11011     }
11012     pos++;
11013   }
11014   return utf16_output - start;
11015 }
11016 
11017 template <endianness big_endian>
convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)11018 inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
11019   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
11020   size_t pos = 0;
11021   char16_t* start{utf16_output};
11022   while (pos < len) {
11023     uint32_t word = data[pos];
11024     if((word & 0xFFFF0000)==0) {
11025       if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
11026       // will not generate a surrogate pair
11027       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
11028     } else {
11029       // will generate a surrogate pair
11030       if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
11031       word -= 0x10000;
11032       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11033       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11034       if (!match_system(big_endian)) {
11035         high_surrogate = utf16::swap_bytes(high_surrogate);
11036         low_surrogate = utf16::swap_bytes(low_surrogate);
11037       }
11038       *utf16_output++ = char16_t(high_surrogate);
11039       *utf16_output++ = char16_t(low_surrogate);
11040     }
11041     pos++;
11042   }
11043   return result(error_code::SUCCESS, utf16_output - start);
11044 }
11045 
11046 } // utf32_to_utf16 namespace
11047 } // unnamed namespace
11048 } // namespace scalar
11049 } // namespace simdutf
11050 
11051 #endif
11052 /* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
11053 
11054 /* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
11055 #ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
11056 #define SIMDUTF_VALID_UTF16_TO_UTF8_H
11057 
11058 namespace simdutf {
11059 namespace scalar {
11060 namespace {
11061 namespace utf16_to_utf8 {
11062 
11063 template <endianness big_endian>
convert_valid(const char16_t* buf, size_t len, char* utf8_output)11064 inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
11065  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11066   size_t pos = 0;
11067   char* start{utf8_output};
11068   while (pos < len) {
11069     // try to convert the next block of 4 ASCII characters
11070     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
11071       uint64_t v;
11072       ::memcpy(&v, data + pos, sizeof(uint64_t));
11073       if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); }
11074       if ((v & 0xFF80FF80FF80FF80) == 0) {
11075         size_t final_pos = pos + 4;
11076         while(pos < final_pos) {
11077           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
11078           pos++;
11079         }
11080         continue;
11081       }
11082     }
11083 
11084     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11085     if((word & 0xFF80)==0) {
11086       // will generate one UTF-8 bytes
11087       *utf8_output++ = char(word);
11088       pos++;
11089     } else if((word & 0xF800)==0) {
11090       // will generate two UTF-8 bytes
11091       // we have 0b110XXXXX 0b10XXXXXX
11092       *utf8_output++ = char((word>>6) | 0b11000000);
11093       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11094       pos++;
11095     } else if((word &0xF800 ) != 0xD800) {
11096       // will generate three UTF-8 bytes
11097       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
11098       *utf8_output++ = char((word>>12) | 0b11100000);
11099       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11100       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11101       pos++;
11102     } else {
11103       // must be a surrogate pair
11104       uint16_t diff = uint16_t(word - 0xD800);
11105       if(pos + 1 >= len) { return 0; } // minimal bound checking
11106       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11107       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11108       uint32_t value = (diff << 10) + diff2 + 0x10000;
11109       // will generate four UTF-8 bytes
11110       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
11111       *utf8_output++ = char((value>>18) | 0b11110000);
11112       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
11113       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
11114       *utf8_output++ = char((value & 0b111111) | 0b10000000);
11115       pos += 2;
11116     }
11117   }
11118   return utf8_output - start;
11119 }
11120 
11121 } // utf16_to_utf8 namespace
11122 } // unnamed namespace
11123 } // namespace scalar
11124 } // namespace simdutf
11125 
11126 #endif
11127 /* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
11128 /* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
11129 #ifndef SIMDUTF_UTF16_TO_UTF8_H
11130 #define SIMDUTF_UTF16_TO_UTF8_H
11131 
11132 namespace simdutf {
11133 namespace scalar {
11134 namespace {
11135 namespace utf16_to_utf8 {
11136 
11137 template <endianness big_endian>
convert(const char16_t* buf, size_t len, char* utf8_output)11138 inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
11139  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11140   size_t pos = 0;
11141   char* start{utf8_output};
11142   while (pos < len) {
11143     // try to convert the next block of 8 bytes
11144     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
11145       uint64_t v;
11146       ::memcpy(&v, data + pos, sizeof(uint64_t));
11147       if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); }
11148       if ((v & 0xFF80FF80FF80FF80) == 0) {
11149         size_t final_pos = pos + 4;
11150         while(pos < final_pos) {
11151           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
11152           pos++;
11153         }
11154         continue;
11155       }
11156     }
11157     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11158     if((word & 0xFF80)==0) {
11159       // will generate one UTF-8 bytes
11160       *utf8_output++ = char(word);
11161       pos++;
11162     } else if((word & 0xF800)==0) {
11163       // will generate two UTF-8 bytes
11164       // we have 0b110XXXXX 0b10XXXXXX
11165       *utf8_output++ = char((word>>6) | 0b11000000);
11166       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11167       pos++;
11168     } else if((word &0xF800 ) != 0xD800) {
11169       // will generate three UTF-8 bytes
11170       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
11171       *utf8_output++ = char((word>>12) | 0b11100000);
11172       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11173       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11174       pos++;
11175     } else {
11176       // must be a surrogate pair
11177       if(pos + 1 >= len) { return 0; }
11178       uint16_t diff = uint16_t(word - 0xD800);
11179       if(diff > 0x3FF) { return 0; }
11180       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11181       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11182       if(diff2 > 0x3FF) { return 0; }
11183       uint32_t value = (diff << 10) + diff2 + 0x10000;
11184       // will generate four UTF-8 bytes
11185       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
11186       *utf8_output++ = char((value>>18) | 0b11110000);
11187       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
11188       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
11189       *utf8_output++ = char((value & 0b111111) | 0b10000000);
11190       pos += 2;
11191     }
11192   }
11193   return utf8_output - start;
11194 }
11195 
11196 template <endianness big_endian>
convert_with_errors(const char16_t* buf, size_t len, char* utf8_output)11197 inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
11198  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11199   size_t pos = 0;
11200   char* start{utf8_output};
11201   while (pos < len) {
11202     // try to convert the next block of 8 bytes
11203     if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
11204       uint64_t v;
11205       ::memcpy(&v, data + pos, sizeof(uint64_t));
11206       if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
11207       if ((v & 0xFF80FF80FF80FF80) == 0) {
11208         size_t final_pos = pos + 4;
11209         while(pos < final_pos) {
11210           *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
11211           pos++;
11212         }
11213         continue;
11214       }
11215     }
11216     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11217     if((word & 0xFF80)==0) {
11218       // will generate one UTF-8 bytes
11219       *utf8_output++ = char(word);
11220       pos++;
11221     } else if((word & 0xF800)==0) {
11222       // will generate two UTF-8 bytes
11223       // we have 0b110XXXXX 0b10XXXXXX
11224       *utf8_output++ = char((word>>6) | 0b11000000);
11225       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11226       pos++;
11227     } else if((word &0xF800 ) != 0xD800) {
11228       // will generate three UTF-8 bytes
11229       // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
11230       *utf8_output++ = char((word>>12) | 0b11100000);
11231       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11232       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11233       pos++;
11234     } else {
11235       // must be a surrogate pair
11236       if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
11237       uint16_t diff = uint16_t(word - 0xD800);
11238       if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
11239       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11240       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11241       if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
11242       uint32_t value = (diff << 10) + diff2 + 0x10000;
11243       // will generate four UTF-8 bytes
11244       // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
11245       *utf8_output++ = char((value>>18) | 0b11110000);
11246       *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
11247       *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
11248       *utf8_output++ = char((value & 0b111111) | 0b10000000);
11249       pos += 2;
11250     }
11251   }
11252   return result(error_code::SUCCESS, utf8_output - start);
11253 }
11254 
11255 } // utf16_to_utf8 namespace
11256 } // unnamed namespace
11257 } // namespace scalar
11258 } // namespace simdutf
11259 
11260 #endif
11261 /* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
11262 
11263 /* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
11264 #ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
11265 #define SIMDUTF_VALID_UTF16_TO_UTF32_H
11266 
11267 namespace simdutf {
11268 namespace scalar {
11269 namespace {
11270 namespace utf16_to_utf32 {
11271 
11272 template <endianness big_endian>
convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output)11273 inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) {
11274  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11275   size_t pos = 0;
11276   char32_t* start{utf32_output};
11277   while (pos < len) {
11278     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11279     if((word &0xF800 ) != 0xD800) {
11280       // No surrogate pair, extend 16-bit word to 32-bit word
11281       *utf32_output++ = char32_t(word);
11282       pos++;
11283     } else {
11284       // must be a surrogate pair
11285       uint16_t diff = uint16_t(word - 0xD800);
11286       if(pos + 1 >= len) { return 0; } // minimal bound checking
11287       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11288       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11289       uint32_t value = (diff << 10) + diff2 + 0x10000;
11290       *utf32_output++ = char32_t(value);
11291       pos += 2;
11292     }
11293   }
11294   return utf32_output - start;
11295 }
11296 
11297 } // utf16_to_utf32 namespace
11298 } // unnamed namespace
11299 } // namespace scalar
11300 } // namespace simdutf
11301 
11302 #endif
11303 /* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
11304 /* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
11305 #ifndef SIMDUTF_UTF16_TO_UTF32_H
11306 #define SIMDUTF_UTF16_TO_UTF32_H
11307 
11308 namespace simdutf {
11309 namespace scalar {
11310 namespace {
11311 namespace utf16_to_utf32 {
11312 
11313 template <endianness big_endian>
convert(const char16_t* buf, size_t len, char32_t* utf32_output)11314 inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) {
11315  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11316   size_t pos = 0;
11317   char32_t* start{utf32_output};
11318   while (pos < len) {
11319     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11320     if((word &0xF800 ) != 0xD800) {
11321       // No surrogate pair, extend 16-bit word to 32-bit word
11322       *utf32_output++ = char32_t(word);
11323       pos++;
11324     } else {
11325       // must be a surrogate pair
11326       uint16_t diff = uint16_t(word - 0xD800);
11327       if(diff > 0x3FF) { return 0; }
11328       if(pos + 1 >= len) { return 0; } // minimal bound checking
11329       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11330       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11331       if(diff2 > 0x3FF) { return 0; }
11332       uint32_t value = (diff << 10) + diff2 + 0x10000;
11333       *utf32_output++ = char32_t(value);
11334       pos += 2;
11335     }
11336   }
11337   return utf32_output - start;
11338 }
11339 
11340 template <endianness big_endian>
convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)11341 inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
11342  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
11343   size_t pos = 0;
11344   char32_t* start{utf32_output};
11345   while (pos < len) {
11346     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11347     if((word &0xF800 ) != 0xD800) {
11348       // No surrogate pair, extend 16-bit word to 32-bit word
11349       *utf32_output++ = char32_t(word);
11350       pos++;
11351     } else {
11352       // must be a surrogate pair
11353       uint16_t diff = uint16_t(word - 0xD800);
11354       if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
11355       if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
11356       uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
11357       uint16_t diff2 = uint16_t(next_word - 0xDC00);
11358       if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
11359       uint32_t value = (diff << 10) + diff2 + 0x10000;
11360       *utf32_output++ = char32_t(value);
11361       pos += 2;
11362     }
11363   }
11364   return result(error_code::SUCCESS, utf32_output - start);
11365 }
11366 
11367 } // utf16_to_utf32 namespace
11368 } // unnamed namespace
11369 } // namespace scalar
11370 } // namespace simdutf
11371 
11372 #endif
11373 /* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
11374 
11375 /* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
11376 #ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
11377 #define SIMDUTF_VALID_UTF8_TO_UTF16_H
11378 
11379 namespace simdutf {
11380 namespace scalar {
11381 namespace {
11382 namespace utf8_to_utf16 {
11383 
11384 template <endianness big_endian>
convert_valid(const char* buf, size_t len, char16_t* utf16_output)11385 inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
11386  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11387   size_t pos = 0;
11388   char16_t* start{utf16_output};
11389   while (pos < len) {
11390     // try to convert the next block of 8 ASCII bytes
11391     if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
11392       uint64_t v;
11393       ::memcpy(&v, data + pos, sizeof(uint64_t));
11394       if ((v & 0x8080808080808080) == 0) {
11395         size_t final_pos = pos + 8;
11396         while(pos < final_pos) {
11397           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11398           pos++;
11399         }
11400         continue;
11401       }
11402     }
11403     uint8_t leading_byte = data[pos]; // leading byte
11404     if (leading_byte < 0b10000000) {
11405       // converting one ASCII byte !!!
11406       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
11407       pos++;
11408     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11409       // We have a two-byte UTF-8, it should become
11410       // a single UTF-16 word.
11411       if(pos + 1 >= len) { break; } // minimal bound checking
11412       uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
11413       if (!match_system(big_endian)) {
11414         code_point = utf16::swap_bytes(uint16_t(code_point));
11415       }
11416       *utf16_output++ = char16_t(code_point);
11417       pos += 2;
11418     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11419       // We have a three-byte UTF-8, it should become
11420       // a single UTF-16 word.
11421       if(pos + 2 >= len) { break; } // minimal bound checking
11422       uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
11423       if (!match_system(big_endian)) {
11424         code_point = utf16::swap_bytes(uint16_t(code_point));
11425       }
11426       *utf16_output++ = char16_t(code_point);
11427       pos += 3;
11428     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11429       // we have a 4-byte UTF-8 word.
11430       if(pos + 3 >= len) { break; } // minimal bound checking
11431       uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
11432                            | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
11433       code_point -= 0x10000;
11434       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11435       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11436       if (!match_system(big_endian)) {
11437         high_surrogate = utf16::swap_bytes(high_surrogate);
11438         low_surrogate = utf16::swap_bytes(low_surrogate);
11439       }
11440       *utf16_output++ = char16_t(high_surrogate);
11441       *utf16_output++ = char16_t(low_surrogate);
11442       pos += 4;
11443     } else {
11444       // we may have a continuation but we do not do error checking
11445       return 0;
11446     }
11447   }
11448   return utf16_output - start;
11449 }
11450 
11451 
11452 } // namespace utf8_to_utf16
11453 } // unnamed namespace
11454 } // namespace scalar
11455 } // namespace simdutf
11456 
11457 #endif
11458 /* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
11459 /* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
11460 #ifndef SIMDUTF_UTF8_TO_UTF16_H
11461 #define SIMDUTF_UTF8_TO_UTF16_H
11462 
11463 namespace simdutf {
11464 namespace scalar {
11465 namespace {
11466 namespace utf8_to_utf16 {
11467 
11468 template <endianness big_endian>
convert(const char* buf, size_t len, char16_t* utf16_output)11469 inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
11470  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11471   size_t pos = 0;
11472   char16_t* start{utf16_output};
11473   while (pos < len) {
11474     // try to convert the next block of 16 ASCII bytes
11475     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11476       uint64_t v1;
11477       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11478       uint64_t v2;
11479       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11480       uint64_t v{v1 | v2};
11481       if ((v & 0x8080808080808080) == 0) {
11482         size_t final_pos = pos + 16;
11483         while(pos < final_pos) {
11484           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11485           pos++;
11486         }
11487         continue;
11488       }
11489     }
11490 
11491     uint8_t leading_byte = data[pos]; // leading byte
11492     if (leading_byte < 0b10000000) {
11493       // converting one ASCII byte !!!
11494       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
11495       pos++;
11496     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11497       // We have a two-byte UTF-8, it should become
11498       // a single UTF-16 word.
11499       if(pos + 1 >= len) { return 0; } // minimal bound checking
11500       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11501       // range check
11502       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11503       if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
11504       if (!match_system(big_endian)) {
11505         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11506       }
11507       *utf16_output++ = char16_t(code_point);
11508       pos += 2;
11509     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11510       // We have a three-byte UTF-8, it should become
11511       // a single UTF-16 word.
11512       if(pos + 2 >= len) { return 0; } // minimal bound checking
11513 
11514       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11515       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
11516       // range check
11517       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11518                    (data[pos + 1] & 0b00111111) << 6 |
11519                    (data[pos + 2] & 0b00111111);
11520       if (code_point < 0x800 || 0xffff < code_point ||
11521           (0xd7ff < code_point && code_point < 0xe000)) {
11522         return 0;
11523       }
11524       if (!match_system(big_endian)) {
11525         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11526       }
11527       *utf16_output++ = char16_t(code_point);
11528       pos += 3;
11529     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11530       // we have a 4-byte UTF-8 word.
11531       if(pos + 3 >= len) { return 0; } // minimal bound checking
11532       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
11533       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
11534       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
11535 
11536       // range check
11537       uint32_t code_point =
11538           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11539           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11540       if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
11541       code_point -= 0x10000;
11542       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11543       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11544       if (!match_system(big_endian)) {
11545         high_surrogate = utf16::swap_bytes(high_surrogate);
11546         low_surrogate = utf16::swap_bytes(low_surrogate);
11547       }
11548       *utf16_output++ = char16_t(high_surrogate);
11549       *utf16_output++ = char16_t(low_surrogate);
11550       pos += 4;
11551     } else {
11552       return 0;
11553     }
11554   }
11555   return utf16_output - start;
11556 }
11557 
11558 template <endianness big_endian>
convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)11559 inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
11560  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11561   size_t pos = 0;
11562   char16_t* start{utf16_output};
11563   while (pos < len) {
11564     // try to convert the next block of 16 ASCII bytes
11565     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11566       uint64_t v1;
11567       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11568       uint64_t v2;
11569       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11570       uint64_t v{v1 | v2};
11571       if ((v & 0x8080808080808080) == 0) {
11572         size_t final_pos = pos + 16;
11573         while(pos < final_pos) {
11574           *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
11575           pos++;
11576         }
11577         continue;
11578       }
11579     }
11580     uint8_t leading_byte = data[pos]; // leading byte
11581     if (leading_byte < 0b10000000) {
11582       // converting one ASCII byte !!!
11583       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
11584       pos++;
11585     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11586       // We have a two-byte UTF-8, it should become
11587       // a single UTF-16 word.
11588       if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11589       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11590       // range check
11591       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11592       if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
11593       if (!match_system(big_endian)) {
11594         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11595       }
11596       *utf16_output++ = char16_t(code_point);
11597       pos += 2;
11598     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11599       // We have a three-byte UTF-8, it should become
11600       // a single UTF-16 word.
11601       if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11602 
11603       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11604       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11605       // range check
11606       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11607                    (data[pos + 1] & 0b00111111) << 6 |
11608                    (data[pos + 2] & 0b00111111);
11609       if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
11610       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
11611       if (!match_system(big_endian)) {
11612         code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
11613       }
11614       *utf16_output++ = char16_t(code_point);
11615       pos += 3;
11616     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11617       // we have a 4-byte UTF-8 word.
11618       if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11619       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11620       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11621       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11622 
11623       // range check
11624       uint32_t code_point =
11625           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11626           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11627       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
11628       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
11629       code_point -= 0x10000;
11630       uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
11631       uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
11632       if (!match_system(big_endian)) {
11633         high_surrogate = utf16::swap_bytes(high_surrogate);
11634         low_surrogate = utf16::swap_bytes(low_surrogate);
11635       }
11636       *utf16_output++ = char16_t(high_surrogate);
11637       *utf16_output++ = char16_t(low_surrogate);
11638       pos += 4;
11639     } else {
11640       // we either have too many continuation bytes or an invalid leading byte
11641       if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
11642       else { return result(error_code::HEADER_BITS, pos); }
11643     }
11644   }
11645   return result(error_code::SUCCESS, utf16_output - start);
11646 }
11647 
11648 /**
11649  * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
11650  * up to len input bytes left, and we encountered some error. It is possible that
11651  * the error is at 'buf' exactly, but it could also be in the previous bytes  (up to 3 bytes back).
11652  *
11653  * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
11654  * and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
11655  *
11656  * The caller is responsible to ensure that len > 0.
11657  *
11658  * If the error is believed to have occured prior to 'buf', the count value contain in the result
11659  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
11660  */
11661 template <endianness endian>
rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output)11662 inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
11663   size_t extra_len{0};
11664   // We potentially need to go back in time and find a leading byte.
11665   // In theory '3' would be sufficient, but sometimes the error can go back quite far.
11666   size_t how_far_back = prior_bytes;
11667   // size_t how_far_back = 3; // 3 bytes in the past + current position
11668   // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
11669   bool found_leading_bytes{false};
11670   // important: it is i <= how_far_back and not 'i < how_far_back'.
11671   for(size_t i = 0; i <= how_far_back; i++) {
11672     unsigned char byte = buf[0-i];
11673     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
11674     if(found_leading_bytes) {
11675       buf -= i;
11676       extra_len = i;
11677       break;
11678     }
11679   }
11680   //
11681   // It is possible for this function to return a negative count in its result.
11682   // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
11683   // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
11684   //
11685   // An unsigned type will simply wrap round arithmetically (well defined).
11686   //
11687   if(!found_leading_bytes) {
11688     // If how_far_back == 3, we may have four consecutive continuation bytes!!!
11689     // [....] [continuation] [continuation] [continuation] | [buf is continuation]
11690     // Or we possibly have a stream that does not start with a leading byte.
11691     return result(error_code::TOO_LONG, 0-how_far_back);
11692   }
11693   result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
11694   if (res.error) {
11695     res.count -= extra_len;
11696   }
11697   return res;
11698 }
11699 
11700 } // utf8_to_utf16 namespace
11701 } // unnamed namespace
11702 } // namespace scalar
11703 } // namespace simdutf
11704 
11705 #endif
11706 /* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
11707 
11708 /* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
11709 #ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
11710 #define SIMDUTF_VALID_UTF8_TO_UTF32_H
11711 
11712 namespace simdutf {
11713 namespace scalar {
11714 namespace {
11715 namespace utf8_to_utf32 {
11716 
/**
 * Transcodes UTF-8 into UTF-32 without validating the input: continuation
 * bytes are not checked and no range checks are performed.
 *
 * @param buf           UTF-8 input bytes
 * @param len           number of input bytes
 * @param utf32_output  destination for the UTF-32 code points
 * @return number of char32_t values written. Decoding stops early (returning
 *         what was written so far) when a multi-byte sequence is cut off by
 *         the end of the input, and 0 is returned when a byte that cannot
 *         start a sequence is met.
 */
inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: when 8 bytes are readable and all of them are ASCII,
    // widen them in one go.
    if (idx + 8 <= len) {
      uint64_t chunk;
      ::memcpy(&chunk, bytes + idx, sizeof(uint64_t));
      if ((chunk & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 8; idx < stop; idx++) {
          *utf32_output++ = char32_t(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];

    // 0xxxxxxx: a single ASCII byte.
    if (lead < 0x80) {
      *utf32_output++ = char32_t(lead);
      idx++;
      continue;
    }

    // 110xxxxx: two-byte sequence.
    if ((lead & 0xE0) == 0xC0) {
      if (idx + 1 >= len) { break; } // truncated input
      *utf32_output++ = char32_t(((lead & 0x1F) << 6) | (bytes[idx + 1] & 0x3F));
      idx += 2;
      continue;
    }

    // 1110xxxx: three-byte sequence.
    if ((lead & 0xF0) == 0xE0) {
      if (idx + 2 >= len) { break; } // truncated input
      *utf32_output++ = char32_t(((lead & 0x0F) << 12) |
                                 ((bytes[idx + 1] & 0x3F) << 6) |
                                 (bytes[idx + 2] & 0x3F));
      idx += 3;
      continue;
    }

    // 11110xxx: four-byte sequence.
    if ((lead & 0xF8) == 0xF0) {
      if (idx + 3 >= len) { break; } // truncated input
      const uint32_t cp = ((lead & 0x07) << 18) |
                          ((bytes[idx + 1] & 0x3F) << 12) |
                          ((bytes[idx + 2] & 0x3F) << 6) |
                          (bytes[idx + 3] & 0x3F);
      *utf32_output++ = char32_t(cp);
      idx += 4;
      continue;
    }

    // Possibly a stray continuation byte; no error details are reported here.
    return 0;
  }

  return utf32_output - out_begin;
}
11764 
11765 
11766 } // namespace utf8_to_utf32
11767 } // unnamed namespace
11768 } // namespace scalar
11769 } // namespace simdutf
11770 
11771 #endif
11772 /* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
11773 /* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
11774 #ifndef SIMDUTF_UTF8_TO_UTF32_H
11775 #define SIMDUTF_UTF8_TO_UTF32_H
11776 
11777 namespace simdutf {
11778 namespace scalar {
11779 namespace {
11780 namespace utf8_to_utf32 {
11781 
/**
 * Transcodes UTF-8 into UTF-32, validating the input along the way.
 *
 * @param buf           UTF-8 input bytes
 * @param len           number of input bytes
 * @param utf32_output  destination for the UTF-32 code points
 * @return number of char32_t values written, or 0 as soon as an invalid
 *         sequence is found (truncated sequence, bad continuation byte,
 *         overlong encoding, surrogate, value above U+10FFFF, or a byte that
 *         cannot start a sequence).
 */
inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char32_t* const out_begin = utf32_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: when 16 bytes are readable and all of them are ASCII,
    // widen them in one go.
    if (idx + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, bytes + idx, sizeof(uint64_t));
      ::memcpy(&hi, bytes + idx + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 16; idx < stop; idx++) {
          *utf32_output++ = char32_t(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];

    // 0xxxxxxx: a single ASCII byte.
    if (lead < 0x80) {
      *utf32_output++ = char32_t(lead);
      idx++;
      continue;
    }

    // 110xxxxx 10xxxxxx
    if ((lead & 0xE0) == 0xC0) {
      // The bound check comes first so the continuation byte is only read when it exists.
      if (idx + 1 >= len || (bytes[idx + 1] & 0xC0) != 0x80) { return 0; }
      const uint32_t cp = (uint32_t(lead & 0x1F) << 6) | uint32_t(bytes[idx + 1] & 0x3F);
      if (cp < 0x80 || 0x7ff < cp) { return 0; } // overlong
      *utf32_output++ = char32_t(cp);
      idx += 2;
      continue;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xF0) == 0xE0) {
      if (idx + 2 >= len ||
          (bytes[idx + 1] & 0xC0) != 0x80 ||
          (bytes[idx + 2] & 0xC0) != 0x80) {
        return 0;
      }
      const uint32_t cp = (uint32_t(lead & 0x0F) << 12) |
                          (uint32_t(bytes[idx + 1] & 0x3F) << 6) |
                          uint32_t(bytes[idx + 2] & 0x3F);
      const bool out_of_range = cp < 0x800 || 0xffff < cp;
      const bool is_surrogate = 0xd7ff < cp && cp < 0xe000;
      if (out_of_range || is_surrogate) { return 0; }
      *utf32_output++ = char32_t(cp);
      idx += 3;
      continue;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xF8) == 0xF0) {
      if (idx + 3 >= len ||
          (bytes[idx + 1] & 0xC0) != 0x80 ||
          (bytes[idx + 2] & 0xC0) != 0x80 ||
          (bytes[idx + 3] & 0xC0) != 0x80) {
        return 0;
      }
      const uint32_t cp = (uint32_t(lead & 0x07) << 18) |
                          (uint32_t(bytes[idx + 1] & 0x3F) << 12) |
                          (uint32_t(bytes[idx + 2] & 0x3F) << 6) |
                          uint32_t(bytes[idx + 3] & 0x3F);
      if (cp <= 0xffff || 0x10ffff < cp) { return 0; }
      *utf32_output++ = char32_t(cp);
      idx += 4;
      continue;
    }

    // Stray continuation byte or invalid leading byte.
    return 0;
  }

  return utf32_output - out_begin;
}
11853 
convert_with_errors(const char* buf, size_t len, char32_t* utf32_output)11854 inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
11855  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
11856   size_t pos = 0;
11857   char32_t* start{utf32_output};
11858   while (pos < len) {
11859     // try to convert the next block of 16 ASCII bytes
11860     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
11861       uint64_t v1;
11862       ::memcpy(&v1, data + pos, sizeof(uint64_t));
11863       uint64_t v2;
11864       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
11865       uint64_t v{v1 | v2};
11866       if ((v & 0x8080808080808080) == 0) {
11867         size_t final_pos = pos + 16;
11868         while(pos < final_pos) {
11869           *utf32_output++ = char32_t(buf[pos]);
11870           pos++;
11871         }
11872         continue;
11873       }
11874     }
11875     uint8_t leading_byte = data[pos]; // leading byte
11876     if (leading_byte < 0b10000000) {
11877       // converting one ASCII byte !!!
11878       *utf32_output++ = char32_t(leading_byte);
11879       pos++;
11880     } else if ((leading_byte & 0b11100000) == 0b11000000) {
11881       // We have a two-byte UTF-8
11882       if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11883       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11884       // range check
11885       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
11886       if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
11887       *utf32_output++ = char32_t(code_point);
11888       pos += 2;
11889     } else if ((leading_byte & 0b11110000) == 0b11100000) {
11890       // We have a three-byte UTF-8
11891       if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11892 
11893       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11894       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11895       // range check
11896       uint32_t code_point = (leading_byte & 0b00001111) << 12 |
11897                    (data[pos + 1] & 0b00111111) << 6 |
11898                    (data[pos + 2] & 0b00111111);
11899       if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); }
11900       if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
11901       *utf32_output++ = char32_t(code_point);
11902       pos += 3;
11903     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
11904       // we have a 4-byte UTF-8 word.
11905       if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
11906       if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);}
11907       if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11908       if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
11909 
11910       // range check
11911       uint32_t code_point =
11912           (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
11913           (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
11914       if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
11915       if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
11916       *utf32_output++ = char32_t(code_point);
11917       pos += 4;
11918     } else {
11919       // we either have too many continuation bytes or an invalid leading byte
11920       if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
11921       else { return result(error_code::HEADER_BITS, pos); }
11922     }
11923   }
11924   return result(error_code::SUCCESS, utf32_output - start);
11925 }
11926 
11927 /**
11928  * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and we have
11929  * up to len input bytes left, and we encountered some error. It is possible that
11930  * the error is at 'buf' exactly, but it could also be in the previous bytes location (up to 3 bytes back).
11931  *
11932  * prior_bytes indicates how many bytes, prior to 'buf' may belong to the current memory section
11933  * and can be safely accessed. We prior_bytes to access safely up to three bytes before 'buf'.
11934  *
11935  * The caller is responsible to ensure that len > 0.
11936  *
11937  * If the error is believed to have occured prior to 'buf', the count value contain in the result
11938  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
11939  */
rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output)11940 inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
11941   size_t extra_len{0};
11942   // We potentially need to go back in time and find a leading byte.
11943   size_t how_far_back = 3; // 3 bytes in the past + current position
11944   if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
11945   bool found_leading_bytes{false};
11946   // important: it is i <= how_far_back and not 'i < how_far_back'.
11947   for(size_t i = 0; i <= how_far_back; i++) {
11948     unsigned char byte = buf[0-i];
11949     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
11950     if(found_leading_bytes) {
11951       buf -= i;
11952       extra_len = i;
11953       break;
11954     }
11955   }
11956   //
11957   // It is possible for this function to return a negative count in its result.
11958   // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
11959   // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
11960   //
11961   // An unsigned type will simply wrap round arithmetically (well defined).
11962   //
11963   if(!found_leading_bytes) {
11964     // If how_far_back == 3, we may have four consecutive continuation bytes!!!
11965     // [....] [continuation] [continuation] [continuation] | [buf is continuation]
11966     // Or we possibly have a stream that does not start with a leading byte.
11967     return result(error_code::TOO_LONG, 0-how_far_back);
11968   }
11969 
11970   result res = convert_with_errors(buf, len + extra_len, utf32_output);
11971   if (res.error) {
11972     res.count -= extra_len;
11973   }
11974   return res;
11975 }
11976 
11977 } // utf8_to_utf32 namespace
11978 } // unnamed namespace
11979 } // namespace scalar
11980 } // namespace simdutf
11981 
11982 #endif
11983 /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
11984 
11985 /* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
11986 #ifndef SIMDUTF_LATIN1_TO_UTF8_H
11987 #define SIMDUTF_LATIN1_TO_UTF8_H
11988 
11989 namespace simdutf {
11990 namespace scalar {
11991 namespace {
11992 namespace latin1_to_utf8 {
11993 
/**
 * Transcodes Latin-1 into UTF-8.
 *
 * Bytes below 0x80 are copied unchanged; every other byte expands to a
 * two-byte UTF-8 sequence.
 *
 * @param buf          Latin-1 input bytes
 * @param len          number of input bytes
 * @param utf8_output  destination for the UTF-8 bytes
 * @return number of bytes written to utf8_output
 */
inline size_t convert(const char* buf, size_t len, char* utf8_output) {
  const unsigned char* const bytes = reinterpret_cast<const unsigned char*>(buf);
  char* const out_begin = utf8_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: OR two 8-byte words together; if no byte has its high bit
    // set, all 16 bytes are ASCII and can be copied verbatim.
    if (idx + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, bytes + idx, sizeof(uint64_t));
      ::memcpy(&hi, bytes + idx + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 16; idx < stop; idx++) {
          *utf8_output++ = char(buf[idx]);
        }
        continue;
      }
    }

    const unsigned char current = bytes[idx];
    idx++;
    if ((current & 0x80) != 0) {
      // 0x80..0xFF: emit 110000xx 10xxxxxx.
      *utf8_output++ = char((current >> 6) | 0xC0);
      *utf8_output++ = char((current & 0x3F) | 0x80);
    } else {
      // ASCII: emitted as a single byte.
      *utf8_output++ = char(current);
    }
  }

  return utf8_output - out_begin;
}
12030 
12031 } // latin1_to_utf8 namespace
12032 } // unnamed namespace
12033 } // namespace scalar
12034 } // namespace simdutf
12035 
12036 #endif
12037 /* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
12038 /* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
12039 #ifndef SIMDUTF_LATIN1_TO_UTF16_H
12040 #define SIMDUTF_LATIN1_TO_UTF16_H
12041 
12042 namespace simdutf {
12043 namespace scalar {
12044 namespace {
12045 namespace latin1_to_utf16 {
12046 
12047 template <endianness big_endian>
convert(const char* buf, size_t len, char16_t* utf16_output)12048 inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
12049   const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
12050   size_t pos = 0;
12051   char16_t* start{ utf16_output };
12052 
12053   while (pos < len) {
12054     uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12055     *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12056     pos++;
12057   }
12058 
12059   return utf16_output - start;
12060 }
12061 
12062 template <endianness big_endian>
convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)12063 inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
12064   const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
12065   size_t pos = 0;
12066   char16_t* start{ utf16_output };
12067 
12068   while (pos < len) {
12069     uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12070     *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12071     pos++;
12072   }
12073 
12074   return result(error_code::SUCCESS, utf16_output - start);
12075 }
12076 
12077 } // latin1_to_utf16 namespace
12078 } // unnamed namespace
12079 } // namespace scalar
12080 } // namespace simdutf
12081 
12082 #endif
12083 /* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
12084 /* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
12085 #ifndef SIMDUTF_LATIN1_TO_UTF32_H
12086 #define SIMDUTF_LATIN1_TO_UTF32_H
12087 
12088 namespace simdutf {
12089 namespace scalar {
12090 namespace {
12091 namespace latin1_to_utf32 {
12092 
12093 
/**
 * Transcodes Latin-1 into UTF-32: each input byte is zero-extended to one
 * char32_t.
 *
 * @param buf           Latin-1 input bytes
 * @param len           number of input bytes
 * @param utf32_output  destination for the UTF-32 code points
 * @return number of char32_t values written (equal to len)
 */
inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
  const unsigned char *cursor = reinterpret_cast<const unsigned char *>(buf);
  const unsigned char *const input_end = cursor + len;
  char32_t *const out_begin = utf32_output;
  while (cursor != input_end) {
    *utf32_output++ = static_cast<char32_t>(*cursor++);
  }
  return utf32_output - out_begin;
}
12102 
12103 } // latin1_to_utf32 namespace
12104 } // unnamed namespace
12105 } // namespace scalar
12106 } // namespace simdutf
12107 
12108 #endif
12109 /* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
12110 
12111 /* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
12112 #ifndef SIMDUTF_UTF8_TO_LATIN1_H
12113 #define SIMDUTF_UTF8_TO_LATIN1_H
12114 #include <iostream>
12115 
12116 namespace simdutf {
12117 namespace scalar {
12118 namespace {
12119 namespace utf8_to_latin1 {
12120 
/**
 * Transcodes UTF-8 into Latin-1, validating the input along the way.
 *
 * Only code points U+0000..U+00FF can be represented, so the accepted input
 * consists of ASCII bytes and two-byte sequences decoding to U+0080..U+00FF.
 *
 * @param buf           UTF-8 input bytes
 * @param len           number of input bytes
 * @param latin_output  destination for the Latin-1 bytes
 * @return number of bytes written, or 0 as soon as the input is found to be
 *         invalid or not representable in Latin-1.
 */
inline size_t convert(const char* buf, size_t len, char* latin_output) {
  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
  char* const out_begin = latin_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: OR two 8-byte words together; if no byte has its high bit
    // set, all 16 bytes are ASCII and can be copied verbatim.
    if (idx + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      ::memcpy(&lo, bytes + idx, sizeof(uint64_t));
      ::memcpy(&hi, bytes + idx + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 16; idx < stop; idx++) {
          *latin_output++ = char(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];

    // 0xxxxxxx: a single ASCII byte.
    if (lead < 0x80) {
      *latin_output++ = char(lead);
      idx++;
      continue;
    }

    // Anything that is not a two-byte leader (110xxxxx) is rejected here.
    if ((lead & 0xE0) != 0xC0) {
      return 0;
    }

    // The continuation byte must exist...
    if (idx + 1 >= len) {
      return 0;
    }
    // ...and must have the form 10xxxxxx.
    if ((bytes[idx + 1] & 0xC0) != 0x80) {
      return 0;
    }

    // Drop the 110 / 10 markers and join the five and six payload bits.
    const uint32_t cp = (uint32_t(lead & 0x1F) << 6) | uint32_t(bytes[idx + 1] & 0x3F);
    // Below 0x80 the encoding is overlong (ASCII must use one byte); above
    // 0xFF the code point does not fit in Latin-1.
    if (cp < 0x80 || 0xFF < cp) {
      return 0;
    }
    *latin_output++ = char(cp);
    idx += 2;
  }

  return latin_output - out_begin;
}
12169 
convert_with_errors(const char* buf, size_t len, char* latin_output)12170 inline result convert_with_errors(const char* buf, size_t len, char* latin_output) {
12171  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
12172   size_t pos = 0;
12173   char* start{latin_output};
12174 
12175   while (pos < len) {
12176     // try to convert the next block of 16 ASCII bytes
12177     if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
12178       uint64_t v1;
12179       ::memcpy(&v1, data + pos, sizeof(uint64_t));
12180       uint64_t v2;
12181       ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
12182       uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000...etc
12183       if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
12184         size_t final_pos = pos + 16;
12185         while(pos < final_pos) {
12186           *latin_output++ = char(buf[pos]);
12187           pos++;
12188         }
12189         continue;
12190       }
12191     }
12192     // suppose it is not an all ASCII byte sequence
12193     uint8_t leading_byte = data[pos]; // leading byte
12194     if (leading_byte < 0b10000000) {
12195       // converting one ASCII byte !!!
12196       *latin_output++ = char(leading_byte);
12197       pos++;
12198     } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
12199       // We have a two-byte UTF-8
12200       if(pos + 1 >= len) {
12201         return result(error_code::TOO_SHORT, pos); } // minimal bound checking
12202       if ((data[pos + 1] & 0b11000000) != 0b10000000) {
12203         return result(error_code::TOO_SHORT, pos); } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
12204       // range check -
12205       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
12206       if (code_point < 0x80) {
12207         return result(error_code::OVERLONG, pos);
12208       }
12209       if (0xFF < code_point) {
12210         return result(error_code::TOO_LARGE, pos);
12211       } // We only care about the range 129-255 which is Non-ASCII latin1 characters
12212       *latin_output++ = char(code_point);
12213       pos += 2;
12214     } else if ((leading_byte & 0b11110000) == 0b11100000) {
12215       // We have a three-byte UTF-8
12216       return result(error_code::TOO_LARGE, pos);
12217     } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
12218       // we have a 4-byte UTF-8 word.
12219       return result(error_code::TOO_LARGE, pos);
12220     } else {
12221       // we either have too many continuation bytes or an invalid leading byte
12222       if ((leading_byte & 0b11000000) == 0b10000000) {
12223         return result(error_code::TOO_LONG, pos);
12224       }
12225 
12226       return result(error_code::HEADER_BITS, pos);
12227 
12228     }
12229   }
12230   return result(error_code::SUCCESS, latin_output - start);
12231 }
12232 
12233 
rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char* latin1_output)12234 inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char* latin1_output) {
12235   size_t extra_len{0};
12236   // We potentially need to go back in time and find a leading byte.
12237   // In theory '3' would be sufficient, but sometimes the error can go back quite far.
12238   size_t how_far_back = prior_bytes;
12239   // size_t how_far_back = 3; // 3 bytes in the past + current position
12240   // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
12241   bool found_leading_bytes{false};
12242   // important: it is i <= how_far_back and not 'i < how_far_back'.
12243   for(size_t i = 0; i <= how_far_back; i++) {
12244     unsigned char byte = buf[0-i];
12245     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
12246     if(found_leading_bytes) {
12247       buf -= i;
12248       extra_len = i;
12249       break;
12250     }
12251   }
12252   //
12253   // It is possible for this function to return a negative count in its result.
12254   // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
12255   // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
12256   //
12257   // An unsigned type will simply wrap round arithmetically (well defined).
12258   //
12259   if(!found_leading_bytes) {
12260     // If how_far_back == 3, we may have four consecutive continuation bytes!!!
12261     // [....] [continuation] [continuation] [continuation] | [buf is continuation]
12262     // Or we possibly have a stream that does not start with a leading byte.
12263     return result(error_code::TOO_LONG, 0-how_far_back);
12264   }
12265   result res = convert_with_errors(buf, len + extra_len, latin1_output);
12266   if (res.error) {
12267     res.count -= extra_len;
12268   }
12269   return res;
12270 }
12271 
12272 
12273 } // utf8_to_latin1 namespace
12274 } // unnamed namespace
12275 } // namespace scalar
12276 } // namespace simdutf
12277 
12278 #endif
12279 /* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
12280 /* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
12281 #ifndef SIMDUTF_UTF16_TO_LATIN1_H
12282 #define SIMDUTF_UTF16_TO_LATIN1_H
12283 
12284 namespace simdutf {
12285 namespace scalar {
12286 namespace {
12287 namespace utf16_to_latin1 {
12288 
12289 #include <cstring>  // for std::memcpy
12290 
12291 template <endianness big_endian>
convert(const char16_t* buf, size_t len, char* latin_output)12292 inline size_t convert(const char16_t* buf, size_t len, char* latin_output) {
12293   const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
12294   size_t pos = 0;
12295   std::vector<char> temp_output(len);
12296   char* current_write = temp_output.data();
12297   uint16_t word = 0;
12298   uint16_t too_large = 0;
12299 
12300   while (pos < len) {
12301     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12302     too_large |= word;
12303     *current_write++ = char(word & 0xFF);
12304     pos++;
12305   }
12306   if((too_large & 0xFF00) != 0) { return 0; }
12307 
12308   // Only copy to latin_output if there were no errors
12309   std::memcpy(latin_output, temp_output.data(), len);
12310 
12311   return current_write - temp_output.data();
12312 }
12313 
12314 template <endianness big_endian>
convert_with_errors(const char16_t* buf, size_t len, char* latin_output)12315 inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output) {
12316  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
12317   size_t pos = 0;
12318   char* start{latin_output};
12319   uint16_t word;
12320 
12321   while (pos < len) {
12322     if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that they are Latin1
12323       uint64_t v1, v2, v3, v4;
12324       ::memcpy(&v1, data + pos, sizeof(uint64_t));
12325       ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
12326       ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
12327       ::memcpy(&v4, data + pos  + 12, sizeof(uint64_t));
12328 
12329       if (!match_system(big_endian)) { v1 = (v1 >> 8) | (v1 << (64 - 8)); }
12330       if (!match_system(big_endian)) { v2 = (v2 >> 8) | (v2 << (64 - 8)); }
12331       if (!match_system(big_endian)) { v3 = (v3 >> 8) | (v3 << (64 - 8)); }
12332       if (!match_system(big_endian)) { v4 = (v1 >> 8) | (v4 << (64 - 8)); }
12333 
12334       if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
12335         size_t final_pos = pos + 16;
12336         while(pos < final_pos) {
12337           *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]);
12338           pos++;
12339         }
12340         continue;
12341       }
12342     }
12343     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12344     if((word & 0xFF00 ) == 0) {
12345         *latin_output++ = char(word & 0xFF);
12346         pos++;
12347     } else { return result(error_code::TOO_LARGE, pos); }
12348   }
12349   return result(error_code::SUCCESS,latin_output - start);
12350 }
12351 
12352 
12353 } // utf16_to_latin1 namespace
12354 } // unnamed namespace
12355 } // namespace scalar
12356 } // namespace simdutf
12357 
12358 #endif
12359 /* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
12360 /* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
12361 #ifndef SIMDUTF_UTF32_TO_LATIN1_H
12362 #define SIMDUTF_UTF32_TO_LATIN1_H
12363 
12364 namespace simdutf {
12365 namespace scalar {
12366 namespace {
12367 namespace utf32_to_latin1 {
12368 
/**
 * Convert `len` UTF-32 code points to Latin-1.
 *
 * Every code point is truncated to its low byte and stored unconditionally;
 * validity is decided afterwards from the OR of all inputs.
 *
 * @return `len` when every code point is <= 0xFF, otherwise 0 (the output
 *         buffer has nevertheless been written in that case).
 */
inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  // Accumulates every bit that appears in any input word.
  uint32_t seen_bits = 0;

  for (size_t i = 0; i < len; ++i) {
    const uint32_t code_point = words[i];
    seen_bits |= code_point;
    latin1_output[i] = static_cast<char>(code_point & 0xFF);
  }

  // Any bit above the low byte means some code point was outside Latin-1.
  return (seen_bits & 0xFFFFFF00) == 0 ? len : 0;
}
12385 
convert_with_errors(const char32_t *buf, size_t len, char *latin1_output)12386 inline result convert_with_errors(const char32_t *buf, size_t len, char *latin1_output) {
12387   const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
12388   char* start{latin1_output};
12389   size_t pos = 0;
12390   while (pos < len) {
12391     if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
12392       uint64_t v;
12393       ::memcpy(&v, data + pos, sizeof(uint64_t));
12394       if ((v & 0xFFFFFF00FFFFFF00) == 0) {
12395         *latin1_output++ = char(buf[pos]);
12396         *latin1_output++ = char(buf[pos+1]);
12397         pos += 2;
12398         continue;
12399       }
12400     }
12401     uint32_t utf32_char = data[pos];
12402     if ((utf32_char & 0xFFFFFF00) == 0) { // Check if the character can be represented in Latin-1
12403       *latin1_output++ = (char)(utf32_char & 0xFF);
12404       pos++;
12405     } else { return result(error_code::TOO_LARGE, pos); };
12406   }
12407   return result(error_code::SUCCESS, latin1_output - start);
12408 }
12409 
12410 } // utf32_to_latin1 namespace
12411 } // unnamed namespace
12412 } // namespace scalar
12413 } // namespace simdutf
12414 
12415 #endif
12416 /* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
12417 
12418 /* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
12419 #ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
12420 #define SIMDUTF_VALID_UTF8_TO_LATIN1_H
12421 
12422 namespace simdutf {
12423 namespace scalar {
12424 namespace {
12425 namespace utf8_to_latin1 {
12426 
/**
 * Convert UTF-8 that is assumed to be valid and Latin-1 representable.
 *
 * Only one- and two-byte sequences are handled. Error checking is minimal:
 *  - a lead byte of any other kind (or a stray continuation byte) returns 0;
 *  - a two-byte lead not followed by a continuation byte returns 0;
 *  - a two-byte lead that is the very last input byte stops the conversion
 *    and returns what was produced so far.
 * The decoded two-byte value is not range-checked, it is truncated to 8 bits.
 *
 * @return the number of Latin-1 bytes written to `latin_output`.
 */
inline size_t convert_valid(const char* buf, size_t len, char* latin_output) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  char *const out_begin = latin_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: when 16 bytes are available and none has its top bit set,
    // they are all ASCII and are copied as-is.
    if (idx + 16 <= len) {
      uint64_t first_half;
      uint64_t second_half;
      ::memcpy(&first_half, bytes + idx, sizeof(uint64_t));
      ::memcpy(&second_half, bytes + idx + sizeof(uint64_t), sizeof(uint64_t));
      if (((first_half | second_half) & 0x8080808080808080) == 0) {
        for (const size_t stop = idx + 16; idx < stop; ++idx) {
          *latin_output++ = char(buf[idx]);
        }
        continue;
      }
    }

    const uint8_t lead = bytes[idx];

    // Single ASCII byte.
    if (lead < 0x80) {
      *latin_output++ = char(lead);
      ++idx;
      continue;
    }

    // Anything that is not a two-byte lead (110xxxxx) is rejected; it may be
    // a continuation byte, but no detailed error checking is done here.
    if ((lead & 0xE0) != 0xC0) {
      return 0;
    }

    // Two-byte sequence: minimal bounds check, then continuation check.
    if (idx + 1 >= len) { break; }
    const uint8_t trail = bytes[idx + 1];
    if ((trail & 0xC0) != 0x80) { return 0; }

    // Drop the 110 / 10 markers and join the 5 + 6 payload bits.
    const uint32_t code_point = (lead & 0x1F) << 6 | (trail & 0x3F);
    *latin_output++ = char(code_point);
    idx += 2;
  }
  return latin_output - out_begin;
}
12472 
12473 } // utf8_to_latin1 namespace
12474 } // unnamed namespace
12475 } // namespace scalar
12476 } // namespace simdutf
12477 
12478 #endif
12479 /* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
12480 /* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
12481 #ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
12482 #define SIMDUTF_VALID_UTF16_TO_LATIN1_H
12483 
12484 namespace simdutf {
12485 namespace scalar {
12486 namespace {
12487 namespace utf16_to_latin1 {
12488 
12489 template <endianness big_endian>
convert_valid(const char16_t* buf, size_t len, char* latin_output)12490 inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output) {
12491  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
12492   size_t pos = 0;
12493   char* start{latin_output};
12494   uint16_t word = 0;
12495 
12496   while (pos < len) {
12497     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12498     *latin_output++ = char(word);
12499     pos++;
12500   }
12501 
12502   return latin_output - start;
12503 }
12504 
12505 } // utf16_to_latin1 namespace
12506 } // unnamed namespace
12507 } // namespace scalar
12508 } // namespace simdutf
12509 
12510 #endif
12511 /* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
12512 /* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
12513 #ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
12514 #define SIMDUTF_VALID_UTF32_TO_LATIN1_H
12515 
12516 namespace simdutf {
12517 namespace scalar {
12518 namespace {
12519 namespace utf32_to_latin1 {
12520 
/**
 * Convert `len` UTF-32 code points to Latin-1 without validation.
 *
 * Pairs of code points that both fit in 8 bits are emitted together; any
 * other code point is truncated to its low byte.
 *
 * @return the number of bytes written, which is always `len`.
 */
inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output) {
  const uint32_t *words = reinterpret_cast<const uint32_t *>(buf);
  char *const out_begin = latin1_output;
  size_t idx = 0;

  while (idx < len) {
    // Fast path: two code points (8 bytes) checked with a single mask.
    if (idx + 2 <= len) {
      uint64_t pair;
      ::memcpy(&pair, words + idx, sizeof(uint64_t));
      if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
        *latin1_output++ = char(buf[idx]);
        *latin1_output++ = char(buf[idx+1]);
        idx += 2;
        continue;
      }
    }

    // Otherwise emit a single code point, keeping only its low byte.
    *latin1_output++ = (char)(words[idx] & 0xFF);
    ++idx;
  }
  return latin1_output - out_begin;
}
12546 
12547 
12548 } // utf32_to_latin1 namespace
12549 } // unnamed namespace
12550 } // namespace scalar
12551 } // namespace simdutf
12552 
12553 #endif
12554 /* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
12555 
12556 
12557 
12558 SIMDUTF_PUSH_DISABLE_WARNINGS
12559 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
12560 
12561 
12562 #if SIMDUTF_IMPLEMENTATION_ARM64
12563 /* begin file src/arm64/implementation.cpp */
12564 /* begin file src/simdutf/arm64/begin.h */
12565 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
12566 // #define SIMDUTF_IMPLEMENTATION arm64
12567 /* end file src/simdutf/arm64/begin.h */
12568 namespace simdutf {
12569 namespace arm64 {
12570 namespace {
12571 #ifndef SIMDUTF_ARM64_H
12572 #error "arm64.h must be included"
12573 #endif
12574 using namespace simd;
12575 
is_ascii(const simd8x64<uint8_t>& input)12576 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
12577     simd8<uint8_t> bits = input.reduce_or();
12578     return bits.max_val() < 0b10000000u;
12579 }
12580 
must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)12581 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
12582     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
12583     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
12584     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
12585     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
12586     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
12587     // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
12588     // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character.
12589     // The error will be detected there.
12590     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
12591 }
12592 
must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)12593 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
12594     simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
12595     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
12596     return is_third_byte ^ is_fourth_byte;
12597 }
12598 
12599 // common functions for utf8 conversions
convert_utf8_3_byte_to_utf16(uint8x16_t in)12600 simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
12601   // Low half contains  10cccccc|1110aaaa
12602   // High half contains 10bbbbbb|10bbbbbb
12603 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
12604   const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10);
12605 #else
12606   const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
12607 #endif
12608   uint8x16_t perm = vqtbl1q_u8(in, sh);
12609   // Split into half vectors.
12610   // 10cccccc|1110aaaa
12611   uint8x8_t perm_low = vget_low_u8(perm); // no-op
12612   // 10bbbbbb|10bbbbbb
12613   uint8x8_t perm_high = vget_high_u8(perm);
12614   // xxxxxxxx 10bbbbbb
12615   uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
12616   // xxxxxxxx 1110aaaa
12617   uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
12618   // Assemble with shift left insert.
12619   // xxxxxxaa aabbbbbb
12620   uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
12621   // (perm_low << 8) | (perm_low >> 8)
12622   // xxxxxxxx 10cccccc
12623   uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
12624   // Shift left insert into the low bits
12625   // aaaabbbb bbcccccc
12626   uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
12627   return composed;
12628 }
12629 
convert_utf8_2_byte_to_utf16(uint8x16_t in)12630 simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
12631   // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters.
12632   // Technically this calculates 8, but 6 does better and happens more often
12633   // (The languages which use these codepoints use ASCII spaces so 8 would need to be
12634   // in the middle of a very long word).
12635 
12636   // 10bbbbbb 110aaaaa
12637   uint16x8_t upper = vreinterpretq_u16_u8(in);
12638   // (in << 8) | (in >> 8)
12639   // 110aaaaa 10bbbbbb
12640   uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
12641   // 00000000 000aaaaa
12642   uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
12643   // Assemble with shift left insert.
12644   // 00000aaa aabbbbbb
12645   uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
12646   return composed;
12647 }
12648 
convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx)12649 simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
12650   // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
12651   // This is a relatively easy scenario
12652   // we process SIX (6) input code-code units. The max length in bytes of six code
12653   // code units spanning between 1 and 2 bytes each is 12 bytes.
12654   uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
12655   // Shuffle
12656   // 1 byte: 00000000 0bbbbbbb
12657   // 2 byte: 110aaaaa 10bbbbbb
12658   uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
12659   // Mask
12660   // 1 byte: 00000000 0bbbbbbb
12661   // 2 byte: 00000000 00bbbbbb
12662   uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
12663   // 1 byte: 00000000 00000000
12664   // 2 byte: 000aaaaa 00000000
12665   uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
12666   // Combine with a shift right accumulate
12667   // 1 byte: 00000000 0bbbbbbb
12668   // 2 byte: 00000aaa aabbbbbb
12669   uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
12670   return composed;
12671 }
12672 
12673 /* begin file src/arm64/arm_detect_encodings.cpp */
12674 template<class checker>
12675 // len is known to be a multiple of 2 when this is called
arm_detect_encodings(const char * buf, size_t len)12676 int arm_detect_encodings(const char * buf, size_t len) {
12677     const char* start = buf;
12678     const char* end = buf + len;
12679 
12680     bool is_utf8 = true;
12681     bool is_utf16 = true;
12682     bool is_utf32 = true;
12683 
12684     int out = 0;
12685 
12686     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
12687     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
12688 
12689     uint32x4_t currentmax = vmovq_n_u32(0x0);
12690 
12691     checker check{};
12692 
12693     while(buf + 64 <= end) {
12694         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
12695         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
12696         uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
12697         uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
12698 
12699         const auto u0 = simd16<uint16_t>(in);
12700         const auto u1 = simd16<uint16_t>(secondin);
12701         const auto u2 = simd16<uint16_t>(thirdin);
12702         const auto u3 = simd16<uint16_t>(fourthin);
12703 
12704         const auto v0 = u0.shr<8>();
12705         const auto v1 = u1.shr<8>();
12706         const auto v2 = u2.shr<8>();
12707         const auto v3 = u3.shr<8>();
12708 
12709         const auto in16 = simd16<uint16_t>::pack(v0, v1);
12710         const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
12711 
12712         const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64();
12713         const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64();
12714 
12715         // Check for surrogates
12716         if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) {
12717             // Cannot be UTF8
12718             is_utf8 = false;
12719             // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
12720             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
12721             // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
12722             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
12723             // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.
12724 
12725             if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) {
12726                 is_utf32 = false;
12727                 // Code from arm_validate_utf16le.cpp
12728                 // Not efficient, we do not process surrogates_wordmask1
12729                 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
12730                 const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
12731 
12732                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
12733                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
12734 
12735                 const uint64_t V0 = ~surrogates_wordmask0;
12736 
12737                 const auto vH0 = ((in16 & v_fc) ==  v_dc);
12738                 const uint64_t H0 = vH0.to_bitmask64();
12739 
12740                 const uint64_t L0 = ~H0 & surrogates_wordmask0;
12741 
12742                 const uint64_t a0 = L0 & (H0 >> 4);
12743 
12744                 const uint64_t b0 = a0 << 4;
12745 
12746                 const uint64_t c0 = V0 | a0 | b0;
12747                 if (c0 == ~0ull) {
12748                     input += 16;
12749                 } else if (c0 == 0xfffffffffffffffull) {
12750                     input += 15;
12751                 } else {
12752                     is_utf16 = false;
12753                     break;
12754                 }
12755 
12756                 while (input + 16 < end16) {
12757                     const auto in0 = simd16<uint16_t>(input);
12758                     const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12759                     const auto t0 = in0.shr<8>();
12760                     const auto t1 = in1.shr<8>();
12761                     const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
12762 
12763                     const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
12764                     if(surrogates_wordmask == 0) {
12765                         input += 16;
12766                     } else {
12767                         const uint64_t V = ~surrogates_wordmask;
12768 
12769                         const auto vH = ((in_16 & v_fc) ==  v_dc);
12770                         const uint64_t H = vH.to_bitmask64();
12771 
12772                         const uint64_t L = ~H & surrogates_wordmask;
12773 
12774                         const uint64_t a = L & (H >> 4);
12775 
12776                         const uint64_t b = a << 4;
12777 
12778                         const uint64_t c = V | a | b;
12779                         if (c == ~0ull) {
12780                             input += 16;
12781                         } else if (c == 0xfffffffffffffffull) {
12782                             input += 15;
12783                         } else {
12784                             is_utf16 = false;
12785                             break;
12786                         }
12787                     }
12788                 }
12789             } else {
12790                 is_utf16 = false;
12791                 // Check for UTF-32
12792                 if (len % 4 == 0) {
12793                     const char32_t * input = reinterpret_cast<const char32_t*>(buf);
12794                     const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
12795 
12796                     // Must start checking for surrogates
12797                     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
12798                     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
12799                     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
12800 
12801                     const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
12802                     const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
12803                     const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
12804                     const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
12805 
12806                     currentmax = vmaxq_u32(in32,currentmax);
12807                     currentmax = vmaxq_u32(secondin32,currentmax);
12808                     currentmax = vmaxq_u32(thirdin32,currentmax);
12809                     currentmax = vmaxq_u32(fourthin32,currentmax);
12810 
12811                     currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
12812                     currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
12813                     currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax);
12814                     currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax);
12815 
12816                     while (input + 4 < end32) {
12817                         const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
12818                         currentmax = vmaxq_u32(in_32,currentmax);
12819                         currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
12820                         input += 4;
12821                     }
12822 
12823                     uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
12824                     if(vmaxvq_u32(forbidden_words) != 0) {
12825                         is_utf32 = false;
12826                     }
12827                 } else {
12828                     is_utf32 = false;
12829                 }
12830             }
12831             break;
12832         }
12833         // If no surrogate, validate under other encodings as well
12834 
12835         // UTF-32 validation
12836         currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
12837         currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
12838         currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
12839         currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
12840 
12841         // UTF-8 validation
12842         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
12843         simd::simd8x64<uint8_t> in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin));
12844         check.check_next_input(in8);
12845 
12846         buf += 64;
12847     }
12848 
12849     // Check which encodings are possible
12850 
12851     if (is_utf8) {
12852         if (static_cast<size_t>(buf - start) != len) {
12853             uint8_t block[64]{};
12854             std::memset(block, 0x20, 64);
12855             std::memcpy(block, buf, len - (buf - start));
12856             simd::simd8x64<uint8_t> in(block);
12857             check.check_next_input(in);
12858         }
12859         if (!check.errors()) {
12860             out |= simdutf::encoding_type::UTF8;
12861         }
12862     }
12863 
12864     if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
12865         out |= simdutf::encoding_type::UTF16_LE;
12866     }
12867 
12868     if (is_utf32 && (len % 4 == 0)) {
12869         const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
12870         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
12871         if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
12872             out |= simdutf::encoding_type::UTF32_LE;
12873         }
12874     }
12875 
12876     return out;
12877 }
12878 /* end file src/arm64/arm_detect_encodings.cpp */
12879 
12880 /* begin file src/arm64/arm_validate_utf16.cpp */
12881 template <endianness big_endian>
arm_validate_utf16(const char16_t* input, size_t size)12882 const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
12883     const char16_t* end = input + size;
12884     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
12885     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
12886     const auto v_fc = simd8<uint8_t>::splat(0xfc);
12887     const auto v_dc = simd8<uint8_t>::splat(0xdc);
12888     while (input + 16 < end) {
12889         // 0. Load data: since the validation takes into account only higher
12890         //    byte of each word, we compress the two vectors into one which
12891         //    consists only the higher bytes.
12892         auto in0 = simd16<uint16_t>(input);
12893         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12894         if (!match_system(big_endian)) {
12895             in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
12896             in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
12897         }
12898         const auto t0 = in0.shr<8>();
12899         const auto t1 = in1.shr<8>();
12900         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
12901         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12902         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
12903         if(surrogates_wordmask == 0) {
12904             input += 16;
12905         } else {
12906             // 2. We have some surrogates that have to be distinguished:
12907             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
12908             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
12909             //
12910             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12911 
12912             // V - non-surrogate code units
12913             //     V = not surrogates_wordmask
12914             const uint64_t V = ~surrogates_wordmask;
12915 
12916             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12917             const auto vH = ((in & v_fc) ==  v_dc);
12918             const uint64_t H = vH.to_bitmask64();
12919 
12920             // L - word mask for low surrogates
12921             //     L = not H and surrogates_wordmask
12922             const uint64_t L = ~H & surrogates_wordmask;
12923 
12924             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
12925                               // (A low surrogate placed in the 7th register's word
12926                               // is an exception we handle.)
12927             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
12928                           // thanks to that we have only two masks for valid case.
12929             const uint64_t c = V | a | b;      // Combine all the masks into the final one.
12930             if (c == ~0ull) {
12931                 // The whole input register contains valid UTF-16, i.e.,
12932                 // either single code units or proper surrogate pairs.
12933                 input += 16;
12934             } else if (c == 0xfffffffffffffffull) {
12935                 // The 15 lower code units of the input register contains valid UTF-16.
12936                 // The 15th word may be either a low or high surrogate. It the next
12937                 // iteration we 1) check if the low surrogate is followed by a high
12938                 // one, 2) reject sole high surrogate.
12939                 input += 15;
12940             } else {
12941                 return nullptr;
12942             }
12943         }
12944     }
12945     return input;
12946 }
12947 
12948 
12949 template <endianness big_endian>
arm_validate_utf16_with_errors(const char16_t* input, size_t size)12950 const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
12951     const char16_t* start = input;
12952     const char16_t* end = input + size;
12953 
12954     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
12955     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
12956     const auto v_fc = simd8<uint8_t>::splat(0xfc);
12957     const auto v_dc = simd8<uint8_t>::splat(0xdc);
12958     while (input + 16 < end) {
12959         // 0. Load data: since the validation takes into account only higher
12960         //    byte of each word, we compress the two vectors into one which
12961         //    consists only the higher bytes.
12962         auto in0 = simd16<uint16_t>(input);
12963         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12964 
12965         if (!match_system(big_endian)) {
12966             in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
12967             in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
12968         }
12969         const auto t0 = in0.shr<8>();
12970         const auto t1 = in1.shr<8>();
12971         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
12972         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12973         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
12974         if(surrogates_wordmask == 0) {
12975             input += 16;
12976         } else {
12977             // 2. We have some surrogates that have to be distinguished:
12978             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
12979             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
12980             //
12981             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12982 
12983             // V - non-surrogate code units
12984             //     V = not surrogates_wordmask
12985             const uint64_t V = ~surrogates_wordmask;
12986 
12987             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12988             const auto vH = ((in & v_fc) ==  v_dc);
12989             const uint64_t H = vH.to_bitmask64();
12990 
12991             // L - word mask for low surrogates
12992             //     L = not H and surrogates_wordmask
12993             const uint64_t L = ~H & surrogates_wordmask;
12994 
12995             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
12996                               // (A low surrogate placed in the 7th register's word
12997                               // is an exception we handle.)
12998             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
12999                           // thanks to that we have only two masks for valid case.
13000             const uint64_t c = V | a | b;      // Combine all the masks into the final one.
13001             if (c == ~0ull) {
13002                 // The whole input register contains valid UTF-16, i.e.,
13003                 // either single code units or proper surrogate pairs.
13004                 input += 16;
13005             } else if (c == 0xfffffffffffffffull) {
13006                 // The 15 lower code units of the input register contains valid UTF-16.
13007                 // The 15th word may be either a low or high surrogate. It the next
13008                 // iteration we 1) check if the low surrogate is followed by a high
13009                 // one, 2) reject sole high surrogate.
13010                 input += 15;
13011             } else {
13012                 return result(error_code::SURROGATE, input - start);
13013             }
13014         }
13015     }
13016     return result(error_code::SUCCESS, input - start);
13017 }
13018 /* end file src/arm64/arm_validate_utf16.cpp */
13019 /* begin file src/arm64/arm_validate_utf32le.cpp */
13020 
arm_validate_utf32le(const char32_t* input, size_t size)13021 const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
13022     const char32_t* end = input + size;
13023 
13024     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
13025     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
13026     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
13027     uint32x4_t currentmax = vmovq_n_u32(0x0);
13028     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
13029 
13030     while (input + 4 < end) {
13031         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
13032         currentmax = vmaxq_u32(in,currentmax);
13033         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
13034         input += 4;
13035     }
13036 
13037     uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
13038     if(vmaxvq_u32(is_zero) != 0) {
13039         return nullptr;
13040     }
13041 
13042     is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
13043     if(vmaxvq_u32(is_zero) != 0) {
13044         return nullptr;
13045     }
13046 
13047     return input;
13048 }
13049 
13050 
arm_validate_utf32le_with_errors(const char32_t* input, size_t size)13051 const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
13052     const char32_t* start = input;
13053     const char32_t* end = input + size;
13054 
13055     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
13056     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
13057     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
13058     uint32x4_t currentmax = vmovq_n_u32(0x0);
13059     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
13060 
13061     while (input + 4 < end) {
13062         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
13063         currentmax = vmaxq_u32(in,currentmax);
13064         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
13065 
13066         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
13067         if(vmaxvq_u32(is_zero) != 0) {
13068             return result(error_code::TOO_LARGE, input - start);
13069         }
13070 
13071         is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
13072         if(vmaxvq_u32(is_zero) != 0) {
13073             return result(error_code::SURROGATE, input - start);
13074         }
13075 
13076         input += 4;
13077     }
13078 
13079     return result(error_code::SUCCESS, input - start);
13080 }
13081 /* end file src/arm64/arm_validate_utf32le.cpp */
13082 
13083 /* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */
13084 /*
13085   Returns a pair: the first unprocessed byte from buf and utf8_output
13086   A scalar routing should carry on the conversion of the tail.
13087 */
13088 std::pair<const char *, char *>
arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, char *utf8_out)13089 arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
13090                            char *utf8_out) {
13091   uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
13092   const char *end = latin1_input + len;
13093   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
13094   // We always write 16 bytes, of which more than the first 8 bytes
13095   // are valid. A safety margin of 8 is more than sufficient.
13096   while (latin1_input + 16 + 8 <= end) {
13097     uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
13098     if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
13099       vst1q_u8(utf8_output, in8);
13100       utf8_output += 16;
13101       latin1_input += 16;
13102       continue;
13103     }
13104 
13105     // We just fallback on UTF-16 code. This could be optimized/simplified
13106     // further.
13107     uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
13108     // 1. prepare 2-byte values
13109     // input 8-bit word : [aabb|bbbb] x 8
13110     // expected output   : [1100|00aa|10bb|bbbb] x 8
13111     const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
13112     const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
13113 
13114     // t0 = [0000|00aa|bbbb|bb00]
13115     const uint16x8_t t0 = vshlq_n_u16(in16, 2);
13116     // t1 = [0000|00aa|0000|0000]
13117     const uint16x8_t t1 = vandq_u16(t0, v_1f00);
13118     // t2 = [0000|0000|00bb|bbbb]
13119     const uint16x8_t t2 = vandq_u16(in16, v_003f);
13120     // t3 = [0000|00aa|00bb|bbbb]
13121     const uint16x8_t t3 = vorrq_u16(t1, t2);
13122     // t4 = [1100|00aa|10bb|bbbb]
13123     const uint16x8_t t4 = vorrq_u16(t3, v_c080);
13124     // 2. merge ASCII and 2-byte codewords
13125     const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13126     const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
13127     const uint8x16_t utf8_unpacked =
13128         vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
13129     // 3. prepare bitmask for 8-bit lookup
13130 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13131     const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004, 0x0010, 0x0040,
13132                                             0x0002, 0x0008, 0x0020, 0x0080);
13133 #else
13134     const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
13135                              0x0002, 0x0008, 0x0020, 0x0080};
13136 #endif
13137     uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
13138     // 4. pack the bytes
13139     const uint8_t *row =
13140         &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
13141     const uint8x16_t shuffle = vld1q_u8(row + 1);
13142     const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
13143 
13144     // 5. store bytes
13145     vst1q_u8(utf8_output, utf8_packed);
13146     // 6. adjust pointers
13147     latin1_input += 8;
13148     utf8_output += row[0];
13149 
13150   } // while
13151 
13152   return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
13153 }
13154 /* end file src/arm64/arm_convert_latin1_to_utf8.cpp */
13155 /* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */
13156 template <endianness big_endian>
arm_convert_latin1_to_utf16(const char* buf, size_t len, char16_t* utf16_output)13157 std::pair<const char*, char16_t*> arm_convert_latin1_to_utf16(const char* buf, size_t len, char16_t* utf16_output) {
13158     const char* end = buf + len;
13159 
13160     while (buf + 16 <= end) {
13161         uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
13162         uint16x8_t inlow = vmovl_u8(vget_low_u8(in8));
13163         if (!match_system(big_endian)) { inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); }
13164         vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), inlow);
13165         uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8));
13166         if (!match_system(big_endian)) { inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh))); }
13167         vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output+8), inhigh);
13168         utf16_output += 16;
13169         buf += 16;
13170     }
13171 
13172     return std::make_pair(buf, utf16_output);
13173 }
13174 /* end file src/arm64/arm_convert_latin1_to_utf16.cpp */
13175 /* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */
arm_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output)13176 std::pair<const char*, char32_t*> arm_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
13177     const char* end = buf + len;
13178 
13179     while (buf + 16 <= end) {
13180         uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
13181         uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
13182         uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));
13183         uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low));
13184         uint16x8_t in8high = vmovl_u8(vget_high_u8(in8));
13185         uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high));
13186         uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high));
13187         vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow);
13188         vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output+4), in16lowhigh);
13189         vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output+8), in8highlow);
13190         vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output+12), in8highhigh);
13191 
13192         utf32_output += 16;
13193         buf += 16;
13194     }
13195 
13196     return std::make_pair(buf, utf32_output);
13197 }
13198 /* end file src/arm64/arm_convert_latin1_to_utf32.cpp */
13199 
13200 /* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. The ASCII fast path inspects the least significant
// 16 bits of the mask; every other path uses only the least significant
// 12 bits.
//
// 16 bytes are always loaded from `input`, whatever the mask says.
// `utf16_output` is taken by reference and advanced past the code units that
// were produced. Several paths store a full vector and then advance by fewer
// code units than were stored (e.g. 8 stored, 6 kept).
// NOTE(review): this implies the caller must guarantee 16 readable input
// bytes and some slack in the output buffer -- confirm at the call sites.
//
// The input is not validated here: a mask that matches no supported layout
// falls through to the final branch, which skips 12 bytes without writing.
//
// It returns how many bytes were consumed (up to 16, usually 12).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                           uint64_t utf8_end_of_code_point_mask,
                           char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
  // beneficial to have fast paths that depend on branch prediction but have less latency.
  // This results in more instructions but, potentially, also higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  // (Every one of the 16 bytes ends a code point, i.e. all are 1-byte sequences.)
  if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
    // We process in chunks of 16 bytes
    // The routine in simd.h is reused.
    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
    temp.store_ascii_as_utf16<big_endian>(utf16_output);
    utf16_output += 16; // We wrote 16 16-bit characters.
    return 16; // We consumed 16 bytes.
  }

  // 3 byte sequences are the next most common, as seen in CJK, which has long sequences
  // of these.
  // 0x924 = 0b1001'0010'0100: a code point ends at every third byte.
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units.
    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);
    utf16_output += 4; // We wrote 4 16-bit characters.
    return 12; // We consumed 12 bytes.
  }

  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  // 0xaaa = 0b1010'1010'1010: a code point ends at every second byte.
  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte UTF-16 code units.
    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    // Stores all 8 lanes; only the first 6 are kept (see the pointer advance below).
    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);

    utf16_output += 6; // We wrote 6 16-bit characters.
    return 12; // We consumed 12 bytes.
  }

  /// We do not have a fast path available, or the fast path is unimportant, so we fallback.
  // The table maps the 12-bit mask to [0] a case/shuffle index and [1] the
  // number of input bytes that the selected case consumes.
  const uint8_t idx =
      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed =
      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code units, each 1 or 2 bytes long
    // Convert to UTF-16
    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
    }
    // Store (all 8 lanes are written; only the first 6 are kept)
    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);
    utf16_output += 6; // We wrote 6 16-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code units, each 1 to 3 bytes long
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // XXX: depending on the system scalar instructions might be faster.
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // 1 byte: 00000000 0ccccccc
    // 2 byte: xx0bbbbb x0cccccc
    // 3 byte: xxbbbbbb x0cccccc
    uint16x4_t lowperm = vmovn_u32(perm);
    // Partially mask with bic (doesn't require a temporary register unlike and)
    // The shift left insert below will clear the top bits.
    // 1 byte: 00000000 00000000
    // 2 byte: xx0bbbbb 00000000
    // 3 byte: xxbbbbbb 00000000
    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
    // ASCII
    // 1 byte: 00000000 0ccccccc
    // 2+byte: 00000000 00cccccc
    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
    // Split into narrow vectors.
    // 2 byte: 00000000 00000000
    // 3 byte: 00000000 xxxxaaaa
    uint16x4_t highperm = vshrn_n_u32(perm, 16);
    // Shift right accumulate the middle byte
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00xx0bbb bbcccccc
    // 3 byte: 00xxbbbb bbcccccc
    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
    // Shift left and insert the top 4 bits, overwriting the garbage
    // 1 byte: 00000000 0ccccccc
    // 2 byte: 00000bbb bbcccccc
    // 3 byte: aaaabbbb bbcccccc
    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
    }
    vst1_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);

    utf16_output += 4; // We wrote 4 16-bit code units
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code units, each 1 to 4 bytes long
    // 0x888 = 0b1000'1000'1000: a code point ends at every fourth byte.
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-16 pairs.
      // Generating surrogate pairs is a little tricky though, but it is easier when we
      // can assume they are all pairs.
      // This version does not use the LUT, but 4 byte sequences are less common and the
      // overhead of the extra memory access is less important than the early branch overhead
      // in shorter sequences.

      // Swap byte pairs
      // 10dddddd 10cccccc|10bbbbbb 11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      uint8x16_t swap = vrev16q_u8(in);
      // Shift left 2 bits
      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
      // Create a magic number containing the low 2 bits of the trail surrogate and all the
      // corrections needed to create the pair.
      // UTF-8 4b prefix   = -0x0000|0xF000
      // surrogate offset  = -0x0000|0x0040 (0x10000 >> 10)
      // surrogate high    = +0x0000|0xD800
      // surrogate low     = +0xDC00|0x0000
      // -------------------------------
      //                   = +0xDC00|0xE7C0
      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
      // Generate unadjusted trail surrogate minus lowest 2 bits
      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
      uint32x4_t trail = vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
      // Insert low 2 bits of trail surrogate to magic number for later
      // 11011100 00000000 11100111 110000cc
      uint16x8_t magic_with_low_2 = vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
      // Generate lead surrogate
      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
      uint32x4_t lead = vreinterpretq_u32_u16(vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
      // Mask out lead
      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
      // Blend pairs
      // 000000cc ccdddddd|11110aaa bbbbbb00
      uint16x8_t blend = vreinterpretq_u16_u32(vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
      // Add magic number to finish the result
      // 110111CC CCDDDDDD|110110AA BBBBBBCC
      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
      // Byte swap if necessary
      if (!match_system(big_endian)) {
        composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
      }
      // Stores all 8 lanes; only the first 6 are kept.
      vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
      utf16_output += 6; // We wrote 3 surrogate pairs (6 16-bit code units).
      return 12; // We consumed 12 bytes.
    }
    // 3 1-4 byte sequences
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));

    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Mask the low and middle bytes
    // 00000000 00000000 00000000 0ddddddd
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
    // Because the surrogates need more work, the high surrogate is computed first.
    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
    // 00000000 00000000 00cccccc 00000000
    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
    // Start assembling the sequence. Since the 4th byte is in the same position as it
    // would be in a surrogate and there is no dependency, shift left instead of right.
    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
    // Top 16 bits contains the high ten bits of the surrogate pair before correction
    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
    uint32x4_t abc = vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
    // Combine the low 6 or 7 bits by a shift right accumulate
    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o correction
    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
    // After this is for surrogates
    // Blend the low and high surrogates
    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits yet as
    // 0x10000 was not subtracted from the codepoint yet.
    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
    uint16x8_t masked_pair =
        vreinterpretq_u16_u32(vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
    // Correct the remaining UTF-8 prefix, surrogate offset, and add the surrogate prefixes
    // in one magic 16-bit addition.
    // similar magic number but without the continue byte adjust and halfword swapped
    // UTF-8 4b prefix   = -0xF000|0x0000
    // surrogate offset  = -0x0040|0x0000 (0x10000 >> 10)
    // surrogate high    = +0xD800|0x0000
    // surrogate low     = +0x0000|0xDC00
    // -----------------------------------
    //                   = +0xE7C0|0xDC00
    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
    uint32x4_t surrogates = vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));

    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
    // Byte swap if necessary
    if (!match_system(big_endian)) {
      selected = vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
    }
    // Attempting to shuffle and store would be complex, just scalarize.
    uint32_t buffer[4];
    vst1q_u32(buffer, selected);
    // Test for the top bit of the surrogate mask.
    // After the 16-bit byte swap above, the top bit of the upper half-word
    // (bit 31) sits at bit 23, hence the second constant.
    const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000;
    // Only the first three lanes hold code points in this case.
    for (size_t i = 0; i < 3; i++) {
      // Surrogate
      if (buffer[i] & SURROGATE_MASK) {
        // Upper half-word first, then lower half-word.
        utf16_output[0] = uint16_t(buffer[i] >> 16);
        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output += 2;
      } else {
        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
        utf16_output++;
      }
    }
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    // (nothing is written; 12 input bytes are reported as consumed)
    return 12;
  }
}
13458 
13459 /* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
13460 /* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
13461 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
13462 // end of the code points. Only the least significant 12 bits of the mask
13463 // are accessed.
13464 // It returns how many bytes were consumed (up to 12).
convert_masked_utf8_to_utf32(const char *input, uint64_t utf8_end_of_code_point_mask, char32_t *&utf32_out)13465 size_t convert_masked_utf8_to_utf32(const char *input,
13466                            uint64_t utf8_end_of_code_point_mask,
13467                            char32_t *&utf32_out) {
13468   // we use an approach where we try to process up to 12 input bytes.
13469   // Why 12 input bytes and not 16? Because we are concerned with the size of
13470   // the lookup tables. Also 12 is nicely divisible by two and three.
13471   //
13472   uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
13473   uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
13474   const uint16_t input_utf8_end_of_code_point_mask =
13475       utf8_end_of_code_point_mask & 0xFFF;
13476   //
13477   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
13478   // beneficial to have fast paths that depend on branch prediction but have less latency.
13479   // This results in more instructions but, potentially, also higher speeds.
13480   //
13481   // We first try a few fast paths.
13482   if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
13483     // We process in chunks of 16 bytes.
13484     // use fast implementation in src/simdutf/arm64/simd.h
13485     // Ideally the compiler can keep the tables in registers.
13486     simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
13487     temp.store_ascii_as_utf32_tbl(utf32_out);
13488     utf32_output += 16; // We wrote 16 32-bit characters.
13489     return 16; // We consumed 16 bytes.
13490   }
13491   if(input_utf8_end_of_code_point_mask == 0x924) {
13492     // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units.
13493     // Convert to UTF-16
13494     uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
13495     // Zero extend and store via ST2 with a zero.
13496     uint16x4x2_t interleaver = {{ composed_utf16, vmov_n_u16(0) }};
13497     vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
13498     utf32_output += 4; // We wrote 4 32-bit characters.
13499     return 12; // We consumed 12 bytes.
13500   }
13501 
13502   // 2 byte sequences occur in short bursts in languages like Greek and Russian.
13503   if(input_utf8_end_of_code_point_mask == 0xaaa) {
13504     // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte UTF-32 code units.
13505     // Convert to UTF-16
13506     uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
13507     // Zero extend and store via ST2 with a zero.
13508     uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }};
13509     vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
13510     utf32_output += 6; // We wrote 6 32-bit characters.
13511     return 12; // We consumed 12 bytes.
13512   }
13513   /// Either no fast path or an unimportant fast path.
13514 
13515   const uint8_t idx =
13516       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
13517   const uint8_t consumed =
13518       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
13519 
13520 
13521   if (idx < 64) {
13522     // SIX (6) input code-code units
13523     // Convert to UTF-16
13524     uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
13525     // Zero extend and store with ST2 and zero
13526     uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }};
13527     vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
13528     utf32_output += 6; // We wrote 6 32-bit characters.
13529     return consumed;
13530   } else if (idx < 145) {
13531     // FOUR (4) input code-code units
13532     // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
13533     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
13534     // Shuffle
13535     // 1 byte: 00000000 00000000 0ccccccc
13536     // 2 byte: 00000000 110bbbbb 10cccccc
13537     // 3 byte: 1110aaaa 10bbbbbb 10cccccc
13538     uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
13539     // Split
13540     // 00000000 00000000 0ccccccc
13541     uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));    // 6 or 7 bits
13542     // Note: unmasked
13543     // xxxxxxxx aaaaxxxx xxxxxxxx
13544     uint32x4_t high = vshrq_n_u32(perm, 4);                   // 4 bits
13545     // Use 16 bit bic instead of and.
13546     // The top bits will be corrected later in the bsl
13547     // 00000000 10bbbbbb 00000000
13548     uint32x4_t middle =
13549         vreinterpretq_u32_u16(vbicq_u16(vreinterpretq_u16_u32(perm), vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
13550     // Combine low and middle with shift right accumulate
13551     // 00000000 00xxbbbb bbcccccc
13552     uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
13553     // Insert top 4 bits from high byte with bitwise select
13554     // 00000000 aaaabbbb bbcccccc
13555     uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
13556     vst1q_u32(utf32_output, composed);
13557     utf32_output += 4; // We wrote 4 32-bit characters.
13558     return consumed;
13559   } else if (idx < 209) {
13560     // THREE (3) input code-code units
13561     if (input_utf8_end_of_code_point_mask == 0x888) {
13562       // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte UTF-32 code units.
13563       // This uses the same method as the fixed 3 byte version, reversing and shift left insert.
13564       // However, there is no need for a shuffle mask now, just rev16 and rev32.
13565       //
13566       // This version does not use the LUT, but 4 byte sequences are less common and the
13567       // overhead of the extra memory access is less important than the early branch overhead
13568       // in shorter sequences, so it comes last.
13569 
13570       // Swap pairs of bytes
13571       // 10dddddd|10cccccc|10bbbbbb|11110aaa
13572       // 10cccccc 10dddddd|11110aaa 10bbbbbb
13573       uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
13574       // Shift left and insert
13575       // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
13576       uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
13577       // Swap 16-bit lanes
13578       // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
13579       // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
13580       uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
13581       // Shift insert again
13582       // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
13583       uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
13584       // Clear the garbage
13585       // 00000000 000aaabb bbbbcccc ccdddddd
13586       uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
13587       // Store
13588       vst1q_u32(utf32_output, composed);
13589 
13590       utf32_output += 3; // We wrote 3 32-bit characters.
13591       return 12; // We consumed 12 bytes.
13592     }
13593     // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit due to
13594     // surrogates no longer being involved.
13595     uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
13596     // 1 byte: 00000000 00000000 00000000 0ddddddd
13597     // 2 byte: 00000000 00000000 110ccccc 10dddddd
13598     // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
13599     // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
13600     uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
13601     // Ascii
13602     uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
13603     uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
13604     // When converting the way we do, the 3 byte prefix will be interpreted as the
13605     // 18th bit being set, since the code would interpret the lead byte (0b1110bbbb)
13606     // as a continuation byte (0b10bbbbbb). To fix this, we can either xor or do an
13607     // 8 bit add of the 6th bit shifted right by 1. Since NEON has shift right accumulate,
13608     // we use that.
13609     //  4 byte   3 byte
13610     // 10bbbbbb 1110bbbb
13611     // 00000000 01000000 6th bit
13612     // 00000000 00100000 shift right
13613     // 10bbbbbb 0000bbbb add
13614     // 00bbbbbb 0000bbbb mask
13615     uint8x16_t correction =
13616         vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
13617     uint32x4_t corrected =
13618         vreinterpretq_u32_u8(vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
13619     // 00000000 00000000 0000cccc ccdddddd
13620     uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
13621     // Insert twice
13622     // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
13623     uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), vshrq_n_u32(corrected, 4));
13624     // 00000000 000aaabb bbbbcccc ccdddddd
13625     uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
13626     // Store
13627     vst1q_u32(utf32_output, composed);
13628     utf32_output += 3; // We wrote 3 32-bit characters.
13629     return consumed;
13630   } else {
13631     // here we know that there is an error but we do not handle errors
13632     return 12;
13633   }
13634 }
13635 /* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
13636 /* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */
13637 // Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
13638 // end of the code points. Only the least significant 12 bits of the mask
13639 // are accessed.
13640 // It returns how many bytes were consumed (up to 16, usually 12).
convert_masked_utf8_to_latin1(const char *input, uint64_t utf8_end_of_code_point_mask, char *&latin1_output)13641 size_t convert_masked_utf8_to_latin1(const char *input,
13642                            uint64_t utf8_end_of_code_point_mask,
13643                            char *&latin1_output) {
13644   // we use an approach where we try to process up to 12 input bytes.
13645   // Why 12 input bytes and not 16? Because we are concerned with the size of
13646   // the lookup tables. Also 12 is nicely divisible by two and three.
13647   //
13648   uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
13649   const uint16_t input_utf8_end_of_code_point_mask =
13650       utf8_end_of_code_point_mask & 0xfff;
13651   //
13652   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
13653   // beneficial to have fast paths that depend on branch prediction but have less latency.
13654   // This results in more instructions but, potentially, also higher speeds.
13655 
13656   // We first try a few fast paths.
13657   // The obvious first test is ASCII, which actually consumes the full 16.
13658   if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
13659     // We process in chunks of 16 bytes
13660     vst1q_u8(reinterpret_cast<uint8_t*>(latin1_output), in);
13661     latin1_output += 16; // We wrote 16 18-bit characters.
13662     return 16; // We consumed 16 bytes.
13663   }
13664   /// We do not have a fast path available, or the fast path is unimportant, so we fallback.
13665   const uint8_t idx =
13666       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
13667 
13668   const uint8_t consumed =
13669       simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
13670   // this indicates an invalid input:
13671   if(idx >= 64) { return consumed; }
13672   // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere.
13673   // SIX (6) input code-code units
13674   // this is a relatively easy scenario
13675   // we process SIX (6) input code-code units. The max length in bytes of six code
13676   // code units spanning between 1 and 2 bytes each is 12 bytes.
13677   // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
13678   // This is a relatively easy scenario
13679   // we process SIX (6) input code-code units. The max length in bytes of six code
13680   // code units spanning between 1 and 2 bytes each is 12 bytes.
13681   uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
13682   // Shuffle
13683   // 1 byte: 00000000 0bbbbbbb
13684   // 2 byte: 110aaaaa 10bbbbbb
13685   uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
13686   // Mask
13687   // 1 byte: 00000000 0bbbbbbb
13688   // 2 byte: 00000000 00bbbbbb
13689   uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
13690   // 1 byte: 00000000 00000000
13691   // 2 byte: 000aaaaa 00000000
13692   uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
13693   // Combine with a shift right accumulate
13694   // 1 byte: 00000000 0bbbbbbb
13695   // 2 byte: 00000aaa aabbbbbb
13696   uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
13697   // writing 8 bytes even though we only care about the first 6 bytes.
13698   uint8x8_t latin1_packed = vmovn_u16(composed);
13699   vst1_u8(reinterpret_cast<uint8_t*>(latin1_output), latin1_packed);
13700   latin1_output += 6; // We wrote 6 bytes.
13701   return consumed;
13702 }
13703 
13704 /* end file src/arm64/arm_convert_utf8_to_latin1.cpp */
13705 
13706 /* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */
13707 
13708 template <endianness big_endian>
arm_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output)13709 std::pair<const char16_t*, char*> arm_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) {
13710   const char16_t* end = buf + len;
13711   while (buf + 8 <= end) {
13712     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
13713     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
13714     if (vmaxvq_u16(in) <= 0xff) {
13715         // 1. pack the bytes
13716         uint8x8_t latin1_packed = vmovn_u16(in);
13717         // 2. store (8 bytes)
13718         vst1_u8(reinterpret_cast<uint8_t*>(latin1_output), latin1_packed);
13719         // 3. adjust pointers
13720         buf += 8;
13721         latin1_output += 8;
13722     } else {
13723       return std::make_pair(nullptr, reinterpret_cast<char*>(latin1_output));
13724     }
13725   } // while
13726   return std::make_pair(buf, latin1_output);
13727 }
13728 
13729 template <endianness big_endian>
arm_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output)13730 std::pair<result, char*> arm_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) {
13731   const char16_t* start = buf;
13732   const char16_t* end = buf + len;
13733   while (buf + 8 <= end) {
13734     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
13735     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
13736     if (vmaxvq_u16(in) <= 0xff) {
13737         // 1. pack the bytes
13738         uint8x8_t latin1_packed = vmovn_u16(in);
13739         // 2. store (8 bytes)
13740         vst1_u8(reinterpret_cast<uint8_t*>(latin1_output), latin1_packed);
13741         // 3. adjust pointers
13742         buf += 8;
13743         latin1_output += 8;
13744     } else {
13745       // Let us do a scalar fallback.
13746       for(int k = 0; k < 8; k++) {
13747         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
13748         if(word <= 0xff) {
13749           *latin1_output++ = char(word);
13750         } else {
13751           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output);
13752         }
13753       }
13754     }
13755   } // while
13756   return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output);
13757 }
13758 /* end file src/arm64/arm_convert_utf16_to_latin1.cpp */
13759 /* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
13760 /*
    The vectorized algorithm works on a single 128-bit NEON register, i.e.,
    it loads eight 16-bit code units.
13763 
13764     We consider three cases:
13765     1. an input register contains no surrogates and each value
13766        is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
13769     3. an input register contains surrogates --- i.e. codepoints
13770        can have 16 or 32 bits.
13771 
13772     Ad 1.
13773 
13774     When values are less than 0x0800, it means that a 16-bit code unit
13775     can be converted into: 1) single UTF8 byte (when it's an ASCII
13776     char) or 2) two UTF8 bytes.
13777 
    For this case we only need to build these 2-byte codes and
    finally compress the whole NEON register with a single
    shuffle (table lookup).
13781 
13782     We need 256-entry lookup table to get a compression pattern
13783     and the number of output bytes in the compressed vector register.
13784     Each entry occupies 17 bytes.
13785 
13786     Ad 2.
13787 
13788     When values fit in 16-bit code units, but are above 0x07ff, then
13789     a single word may produce one, two or three UTF8 bytes.
13790 
13791     We prepare data for all these three cases in two registers.
13792     The first register contains lower two UTF8 bytes (used in all
13793     cases), while the second one contains just the third byte for
13794     the three-UTF8-bytes case.
13795 
    Finally these two registers are interleaved forming an eight-element
    array of 32-bit values. The array spans two NEON registers.
13798     The bytes from the registers are compressed using two shuffles.
13799 
13800     We need 256-entry lookup table to get a compression pattern
13801     and the number of output bytes in the compressed vector register.
13802     Each entry occupies 17 bytes.
13803 
13804 
13805     To summarize:
13806     - We need two 256-entry tables that have 8704 bytes in total.
13807 */
13808 /*
13809   Returns a pair: the first unprocessed byte from buf and utf8_output
13810   A scalar routing should carry on the conversion of the tail.
13811 */
13812 template <endianness big_endian>
arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out)13813 std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
13814   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
13815   const char16_t* end = buf + len;
13816 
13817   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
13818   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
13819   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
13820   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
13821   while (buf + 16 + safety_margin <= end) {
13822     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
13823     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
13824     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
13825         // It is common enough that we have sequences of 16 consecutive ASCII characters.
13826         uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
13827         if (!match_system(big_endian)) { nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); }
13828         if(vmaxvq_u16(nextin) > 0x7F) {
13829           // 1. pack the bytes
13830           // obviously suboptimal.
13831           uint8x8_t utf8_packed = vmovn_u16(in);
13832           // 2. store (8 bytes)
13833           vst1_u8(utf8_output, utf8_packed);
13834           // 3. adjust pointers
13835           buf += 8;
13836           utf8_output += 8;
13837           in = nextin;
13838         } else {
13839           // 1. pack the bytes
13840           // obviously suboptimal.
13841           uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
13842           // 2. store (16 bytes)
13843           vst1q_u8(utf8_output, utf8_packed);
13844           // 3. adjust pointers
13845           buf += 16;
13846           utf8_output += 16;
13847           continue; // we are done for this round!
13848         }
13849     }
13850 
13851     if (vmaxvq_u16(in) <= 0x7FF) {
13852           // 1. prepare 2-byte values
13853           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13854           // expected output   : [110a|aaaa|10bb|bbbb] x 8
13855           const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
13856           const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
13857 
13858           // t0 = [000a|aaaa|bbbb|bb00]
13859           const uint16x8_t t0 = vshlq_n_u16(in, 2);
13860           // t1 = [000a|aaaa|0000|0000]
13861           const uint16x8_t t1 = vandq_u16(t0, v_1f00);
13862           // t2 = [0000|0000|00bb|bbbb]
13863           const uint16x8_t t2 = vandq_u16(in, v_003f);
13864           // t3 = [000a|aaaa|00bb|bbbb]
13865           const uint16x8_t t3 = vorrq_u16(t1, t2);
13866           // t4 = [110a|aaaa|10bb|bbbb]
13867           const uint16x8_t t4 = vorrq_u16(t3, v_c080);
13868           // 2. merge ASCII and 2-byte codewords
13869           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13870           const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
13871           const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
13872           // 3. prepare bitmask for 8-bit lookup
13873 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13874           const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004,
13875                                     0x0010, 0x0040,
13876                                     0x0002, 0x0008,
13877                                     0x0020, 0x0080);
13878 #else
13879           const uint16x8_t mask = { 0x0001, 0x0004,
13880                                     0x0010, 0x0040,
13881                                     0x0002, 0x0008,
13882                                     0x0020, 0x0080 };
13883 #endif
13884           uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
13885           // 4. pack the bytes
13886           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
13887           const uint8x16_t shuffle = vld1q_u8(row + 1);
13888           const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
13889 
13890           // 5. store bytes
13891           vst1q_u8(utf8_output, utf8_packed);
13892 
13893           // 6. adjust pointers
13894           buf += 8;
13895           utf8_output += row[0];
13896           continue;
13897 
13898     }
13899     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
13900     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
13901     // it is likely an uncommon occurrence.
13902     if (vmaxvq_u16(surrogates_bytemask) == 0) {
13903         // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
13904 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13905         const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
13906                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
13907 #else
13908         const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
13909                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
13910 #endif
13911         /* In this branch we handle three cases:
13912            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
13913            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
13914            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
13915 
13916           We expand the input word (16-bit) into two code units (32-bit), thus
13917           we have room for four bytes. However, we need five distinct bit
13918           layouts. Note that the last byte in cases #2 and #3 is the same.
13919 
13920           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
13921           in register t2.
13922 
13923           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
13924           either byte 1 for case #2 or byte 2 for case #3. Note that they
13925           differ by exactly one bit.
13926 
13927           Finally from these two code units we build proper UTF-8 sequence, taking
13928           into account the case (i.e, the number of bytes to write).
13929         */
13930         /**
13931          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
13932          * t2 => [0ccc|cccc] [10cc|cccc]
13933          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
13934          */
13935 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
13936         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
13937         const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
13938         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
13939         const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
13940         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
13941         const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
13942 
13943         // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
13944         const uint16x8_t s0 = vshrq_n_u16(in, 12);
13945         // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
13946         const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
13947         // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
13948         const uint16x8_t s1s = vshlq_n_u16(s1, 2);
13949         // [00bb|bbbb|0000|aaaa]
13950         const uint16x8_t s2 = vorrq_u16(s0, s1s);
13951         // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
13952         const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
13953         const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
13954         const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
13955         const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
13956         const uint16x8_t s4 = veorq_u16(s3, m0);
13957 #undef simdutf_vec
13958 
13959         // 4. expand code units 16-bit => 32-bit
13960         const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
13961         const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
13962 
13963         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
13964         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
13965         const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
13966 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
13967         const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004,
13968                                     0x0010, 0x0040,
13969                                     0x0100, 0x0400,
13970                                     0x1000, 0x4000 );
13971         const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008,
13972                                     0x0020, 0x0080,
13973                                     0x0200, 0x0800,
13974                                     0x2000, 0x8000 );
13975 #else
13976         const uint16x8_t onemask = { 0x0001, 0x0004,
13977                                     0x0010, 0x0040,
13978                                     0x0100, 0x0400,
13979                                     0x1000, 0x4000 };
13980         const uint16x8_t twomask = { 0x0002, 0x0008,
13981                                     0x0020, 0x0080,
13982                                     0x0200, 0x0800,
13983                                     0x2000, 0x8000 };
13984 #endif
13985         const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
13986         const uint16_t mask = vaddvq_u16(combined);
13987         // The following fast path may or may not be beneficial.
13988         /*if(mask == 0) {
13989           // We only have three-byte code units. Use fast path.
13990           const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
13991           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
13992           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
13993           vst1q_u8(utf8_output, utf8_0);
13994           utf8_output += 12;
13995           vst1q_u8(utf8_output, utf8_1);
13996           utf8_output += 12;
13997           buf += 8;
13998           continue;
13999         }*/
14000         const uint8_t mask0 = uint8_t(mask);
14001 
14002         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
14003         const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
14004         const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
14005 
14006         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
14007         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
14008         const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
14009         const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
14010 
14011         vst1q_u8(utf8_output, utf8_0);
14012         utf8_output += row0[0];
14013         vst1q_u8(utf8_output, utf8_1);
14014         utf8_output += row1[0];
14015 
14016         buf += 8;
14017     // surrogate pair(s) in a register
14018     } else {
14019       // Let us do a scalar fallback.
14020       // It may seem wasteful to use scalar code, but being efficient with SIMD
14021       // in the presence of surrogate pairs may require non-trivial tables.
14022       size_t forward = 15;
14023       size_t k = 0;
14024       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14025       for(; k < forward; k++) {
14026         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14027         if((word & 0xFF80)==0) {
14028           *utf8_output++ = char(word);
14029         } else if((word & 0xF800)==0) {
14030           *utf8_output++ = char((word>>6) | 0b11000000);
14031           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14032         } else if((word &0xF800 ) != 0xD800) {
14033           *utf8_output++ = char((word>>12) | 0b11100000);
14034           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14035           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14036         } else {
14037           // must be a surrogate pair
14038           uint16_t diff = uint16_t(word - 0xD800);
14039           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
14040           k++;
14041           uint16_t diff2 = uint16_t(next_word - 0xDC00);
14042           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14043           uint32_t value = (diff << 10) + diff2 + 0x10000;
14044           *utf8_output++ = char((value>>18) | 0b11110000);
14045           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
14046           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
14047           *utf8_output++ = char((value & 0b111111) | 0b10000000);
14048         }
14049       }
14050       buf += k;
14051     }
14052   } // while
14053 
14054   return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
14055 }
14056 
14057 
14058 /*
14059   Returns a pair: a result struct and utf8_output.
14060   If there is an error, the count field of the result is the position of the error.
14061   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
  A scalar routine should carry on the conversion of the tail if needed.
14063 */
14064 template <endianness big_endian>
arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out)14065 std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
14066   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
14067     const char16_t* start = buf;
14068   const char16_t* end = buf + len;
14069 
14070   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
14071   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
14072   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
14073   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
14074 
14075   while (buf + 16 + safety_margin <= end) {
14076     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
14077     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
14078     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
14079         // It is common enough that we have sequences of 16 consecutive ASCII characters.
14080         uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
14081         if (!match_system(big_endian)) { nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); }
14082         if(vmaxvq_u16(nextin) > 0x7F) {
14083           // 1. pack the bytes
14084           // obviously suboptimal.
14085           uint8x8_t utf8_packed = vmovn_u16(in);
14086           // 2. store (8 bytes)
14087           vst1_u8(utf8_output, utf8_packed);
14088           // 3. adjust pointers
14089           buf += 8;
14090           utf8_output += 8;
14091           in = nextin;
14092         } else {
14093           // 1. pack the bytes
14094           // obviously suboptimal.
14095           uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
14096           // 2. store (16 bytes)
14097           vst1q_u8(utf8_output, utf8_packed);
14098           // 3. adjust pointers
14099           buf += 16;
14100           utf8_output += 16;
14101           continue; // we are done for this round!
14102         }
14103     }
14104 
14105     if (vmaxvq_u16(in) <= 0x7FF) {
14106           // 1. prepare 2-byte values
14107           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14108           // expected output   : [110a|aaaa|10bb|bbbb] x 8
14109           const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
14110           const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
14111 
14112           // t0 = [000a|aaaa|bbbb|bb00]
14113           const uint16x8_t t0 = vshlq_n_u16(in, 2);
14114           // t1 = [000a|aaaa|0000|0000]
14115           const uint16x8_t t1 = vandq_u16(t0, v_1f00);
14116           // t2 = [0000|0000|00bb|bbbb]
14117           const uint16x8_t t2 = vandq_u16(in, v_003f);
14118           // t3 = [000a|aaaa|00bb|bbbb]
14119           const uint16x8_t t3 = vorrq_u16(t1, t2);
14120           // t4 = [110a|aaaa|10bb|bbbb]
14121           const uint16x8_t t4 = vorrq_u16(t3, v_c080);
14122           // 2. merge ASCII and 2-byte codewords
14123           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14124           const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
14125           const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
14126           // 3. prepare bitmask for 8-bit lookup
14127 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14128           const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14129                                     0x0010, 0x0040,
14130                                     0x0002, 0x0008,
14131                                     0x0020, 0x0080);
14132 #else
14133           const uint16x8_t mask = { 0x0001, 0x0004,
14134                                     0x0010, 0x0040,
14135                                     0x0002, 0x0008,
14136                                     0x0020, 0x0080 };
14137 #endif
14138           uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
14139           // 4. pack the bytes
14140           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
14141           const uint8x16_t shuffle = vld1q_u8(row + 1);
14142           const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
14143 
14144           // 5. store bytes
14145           vst1q_u8(utf8_output, utf8_packed);
14146 
14147           // 6. adjust pointers
14148           buf += 8;
14149           utf8_output += row[0];
14150           continue;
14151 
14152     }
14153     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
14154     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
14155     // it is likely an uncommon occurrence.
14156     if (vmaxvq_u16(surrogates_bytemask) == 0) {
14157         // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
14158 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14159         const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
14160                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
14161 #else
14162         const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
14163                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
14164 #endif
14165         /* In this branch we handle three cases:
14166            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
14167            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
14168            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
14169 
14170           We expand the input word (16-bit) into two code units (32-bit), thus
14171           we have room for four bytes. However, we need five distinct bit
14172           layouts. Note that the last byte in cases #2 and #3 is the same.
14173 
14174           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
14175           in register t2.
14176 
14177           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
14178           either byte 1 for case #2 or byte 2 for case #3. Note that they
14179           differ by exactly one bit.
14180 
14181           Finally from these two code units we build proper UTF-8 sequence, taking
14182           into account the case (i.e, the number of bytes to write).
14183         */
14184         /**
14185          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
14186          * t2 => [0ccc|cccc] [10cc|cccc]
14187          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
14188          */
14189 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
14190         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
14191         const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
14192         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
14193         const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
14194         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
14195         const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
14196 
14197         // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
14198         const uint16x8_t s0 = vshrq_n_u16(in, 12);
14199         // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
14200         const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
14201         // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
14202         const uint16x8_t s1s = vshlq_n_u16(s1, 2);
14203         // [00bb|bbbb|0000|aaaa]
14204         const uint16x8_t s2 = vorrq_u16(s0, s1s);
14205         // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
14206         const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
14207         const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
14208         const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
14209         const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
14210         const uint16x8_t s4 = veorq_u16(s3, m0);
14211 #undef simdutf_vec
14212 
14213         // 4. expand code units 16-bit => 32-bit
14214         const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
14215         const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
14216 
14217         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
14218         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14219         const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
14220 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14221         const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14222                                     0x0010, 0x0040,
14223                                     0x0100, 0x0400,
14224                                     0x1000, 0x4000 );
14225         const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008,
14226                                     0x0020, 0x0080,
14227                                     0x0200, 0x0800,
14228                                     0x2000, 0x8000 );
14229 #else
14230         const uint16x8_t onemask = { 0x0001, 0x0004,
14231                                     0x0010, 0x0040,
14232                                     0x0100, 0x0400,
14233                                     0x1000, 0x4000 };
14234         const uint16x8_t twomask = { 0x0002, 0x0008,
14235                                     0x0020, 0x0080,
14236                                     0x0200, 0x0800,
14237                                     0x2000, 0x8000 };
14238 #endif
14239         const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
14240         const uint16_t mask = vaddvq_u16(combined);
14241         // The following fast path may or may not be beneficial.
14242         /*if(mask == 0) {
14243           // We only have three-byte code units. Use fast path.
14244           const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
14245           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
14246           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
14247           vst1q_u8(utf8_output, utf8_0);
14248           utf8_output += 12;
14249           vst1q_u8(utf8_output, utf8_1);
14250           utf8_output += 12;
14251           buf += 8;
14252           continue;
14253         }*/
14254         const uint8_t mask0 = uint8_t(mask);
14255 
14256         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
14257         const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
14258         const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
14259 
14260         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
14261         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
14262         const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
14263         const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
14264 
14265         vst1q_u8(utf8_output, utf8_0);
14266         utf8_output += row0[0];
14267         vst1q_u8(utf8_output, utf8_1);
14268         utf8_output += row1[0];
14269 
14270         buf += 8;
14271     // surrogate pair(s) in a register
14272     } else {
14273       // Let us do a scalar fallback.
14274       // It may seem wasteful to use scalar code, but being efficient with SIMD
14275       // in the presence of surrogate pairs may require non-trivial tables.
14276       size_t forward = 15;
14277       size_t k = 0;
14278       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14279       for(; k < forward; k++) {
14280         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14281         if((word & 0xFF80)==0) {
14282           *utf8_output++ = char(word);
14283         } else if((word & 0xF800)==0) {
14284           *utf8_output++ = char((word>>6) | 0b11000000);
14285           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14286         } else if((word &0xF800 ) != 0xD800) {
14287           *utf8_output++ = char((word>>12) | 0b11100000);
14288           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14289           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14290         } else {
14291           // must be a surrogate pair
14292           uint16_t diff = uint16_t(word - 0xD800);
14293           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
14294           k++;
14295           uint16_t diff2 = uint16_t(next_word - 0xDC00);
14296           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
14297           uint32_t value = (diff << 10) + diff2 + 0x10000;
14298           *utf8_output++ = char((value>>18) | 0b11110000);
14299           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
14300           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
14301           *utf8_output++ = char((value & 0b111111) | 0b10000000);
14302         }
14303       }
14304       buf += k;
14305     }
14306   } // while
14307 
14308   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
14309 }
14310 /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
14311 /* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
14312 /*
    The vectorized algorithm works on a single 128-bit SIMD register, i.e.,
    it loads eight 16-bit code units.
14315 
14316     We consider three cases:
14317     1. an input register contains no surrogates and each value
14318        is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
14321     3. an input register contains surrogates --- i.e. codepoints
14322        can have 16 or 32 bits.
14323 
14324     Ad 1.
14325 
14326     When values are less than 0x0800, it means that a 16-bit code unit
14327     can be converted into: 1) single UTF8 byte (when it's an ASCII
14328     char) or 2) two UTF8 bytes.
14329 
14330     For this case we do only some shuffle to obtain these 2-byte
14331     codes and finally compress the whole SSE register with a single
14332     shuffle.
14333 
14334     We need 256-entry lookup table to get a compression pattern
14335     and the number of output bytes in the compressed vector register.
14336     Each entry occupies 17 bytes.
14337 
14338     Ad 2.
14339 
14340     When values fit in 16-bit code units, but are above 0x07ff, then
14341     a single word may produce one, two or three UTF8 bytes.
14342 
14343     We prepare data for all these three cases in two registers.
14344     The first register contains lower two UTF8 bytes (used in all
14345     cases), while the second one contains just the third byte for
14346     the three-UTF8-bytes case.
14347 
14348     Finally these two registers are interleaved forming eight-element
14349     array of 32-bit values. The array spans two SSE registers.
14350     The bytes from the registers are compressed using two shuffles.
14351 
14352     We need 256-entry lookup table to get a compression pattern
14353     and the number of output bytes in the compressed vector register.
14354     Each entry occupies 17 bytes.
14355 
14356 
14357     To summarize:
14358     - We need two 256-entry tables that have 8704 bytes in total.
14359 */
14360 /*
14361   Returns a pair: the first unprocessed byte from buf and utf8_output
14362   A scalar routing should carry on the conversion of the tail.
14363 */
14364 template <endianness big_endian>
arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out)14365 std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
14366   uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
14367   const char16_t* end = buf + len;
14368 
14369   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
14370   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
14371 
14372   while (buf + 8 <= end) {
14373     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
14374     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
14375 
14376     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
14377     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
14378     // it is likely an uncommon occurrence.
14379     if (vmaxvq_u16(surrogates_bytemask) == 0) {
14380       // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units
14381       vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
14382       vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
14383       utf32_output += 8;
14384       buf += 8;
14385     // surrogate pair(s) in a register
14386     } else {
14387       // Let us do a scalar fallback.
14388       // It may seem wasteful to use scalar code, but being efficient with SIMD
14389       // in the presence of surrogate pairs may require non-trivial tables.
14390       size_t forward = 15;
14391       size_t k = 0;
14392       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14393       for(; k < forward; k++) {
14394         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14395         if((word &0xF800 ) != 0xD800) {
14396           *utf32_output++ = char32_t(word);
14397         } else {
14398           // must be a surrogate pair
14399           uint16_t diff = uint16_t(word - 0xD800);
14400           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
14401           k++;
14402           uint16_t diff2 = uint16_t(next_word - 0xDC00);
14403           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
14404           uint32_t value = (diff << 10) + diff2 + 0x10000;
14405           *utf32_output++ = char32_t(value);
14406         }
14407       }
14408       buf += k;
14409     }
14410   } // while
14411   return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
14412 }
14413 
14414 
14415 /*
14416   Returns a pair: a result struct and utf8_output.
14417   If there is an error, the count field of the result is the position of the error.
14418   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
14419   A scalar routing should carry on the conversion of the tail if needed.
14420 */
14421 template <endianness big_endian>
arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out)14422 std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
14423   uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
14424   const char16_t* start = buf;
14425   const char16_t* end = buf + len;
14426 
14427   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
14428   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
14429 
14430   while (buf + 8 <= end) {
14431     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
14432     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
14433 
14434     const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
14435     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
14436     // it is likely an uncommon occurrence.
14437     if (vmaxvq_u16(surrogates_bytemask) == 0) {
14438       // case: no surrogate pairs, extend all 16-bit code units to 32-bit code units
14439       vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
14440       vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
14441       utf32_output += 8;
14442       buf += 8;
14443     // surrogate pair(s) in a register
14444     } else {
14445       // Let us do a scalar fallback.
14446       // It may seem wasteful to use scalar code, but being efficient with SIMD
14447       // in the presence of surrogate pairs may require non-trivial tables.
14448       size_t forward = 15;
14449       size_t k = 0;
14450       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14451       for(; k < forward; k++) {
14452         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14453         if((word &0xF800 ) != 0xD800) {
14454           *utf32_output++ = char32_t(word);
14455         } else {
14456           // must be a surrogate pair
14457           uint16_t diff = uint16_t(word - 0xD800);
14458           uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
14459           k++;
14460           uint16_t diff2 = uint16_t(next_word - 0xDC00);
14461           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
14462           uint32_t value = (diff << 10) + diff2 + 0x10000;
14463           *utf32_output++ = char32_t(value);
14464         }
14465       }
14466       buf += k;
14467     }
14468   } // while
14469   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
14470 }
14471 /* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
14472 
14473 /* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
arm_convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output)14474 std::pair<const char32_t*, char*> arm_convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) {
14475   const char32_t* end = buf + len;
14476   while (buf + 8 <= end) {
14477     uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
14478     uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
14479 
14480     uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
14481     if (vmaxvq_u16(utf16_packed) <= 0xff) {
14482         // 1. pack the bytes
14483         uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
14484         // 2. store (8 bytes)
14485         vst1_u8(reinterpret_cast<uint8_t*>(latin1_output), latin1_packed);
14486         // 3. adjust pointers
14487         buf += 8;
14488         latin1_output += 8;
14489     } else {
14490       return std::make_pair(nullptr, reinterpret_cast<char*>(latin1_output));
14491     }
14492   } // while
14493   return std::make_pair(buf, latin1_output);
14494 }
14495 
14496 
arm_convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output)14497 std::pair<result, char*> arm_convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) {
14498   const char32_t* start = buf;
14499   const char32_t* end = buf + len;
14500 
14501   while (buf + 8 <= end) {
14502     uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
14503     uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
14504 
14505     uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
14506 
14507     if (vmaxvq_u16(utf16_packed) <= 0xff) {
14508         // 1. pack the bytes
14509         uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
14510         // 2. store (8 bytes)
14511         vst1_u8(reinterpret_cast<uint8_t*>(latin1_output), latin1_packed);
14512         // 3. adjust pointers
14513         buf += 8;
14514         latin1_output += 8;
14515     } else {
14516       // Let us do a scalar fallback.
14517       for(int k = 0; k < 8; k++) {
14518         uint32_t word = buf[k];
14519         if(word <= 0xff) {
14520           *latin1_output++ = char(word);
14521         } else {
14522           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output);
14523         }
14524       }
14525     }
14526   } // while
14527   return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output);
14528 }
14529 /* end file src/arm64/arm_convert_utf32_to_latin1.cpp */
14530 /* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out)14531 std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
14532   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
14533   const char32_t* end = buf + len;
14534 
14535   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
14536 
14537   uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
14538 
14539   while (buf + 8 < end) {
14540     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
14541     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
14542 
14543     // Check if no bits set above 16th
14544     if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
14545       // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
14546       // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
14547       uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
14548       if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
14549         // 1. pack the bytes
14550         // obviously suboptimal.
14551         uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
14552         // 2. store (8 bytes)
14553         vst1_u8(utf8_output, utf8_packed);
14554         // 3. adjust pointers
14555         buf += 8;
14556         utf8_output += 8;
14557         continue; // we are done for this round!
14558       }
14559 
14560       if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
14561         // 1. prepare 2-byte values
14562         // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14563         // expected output   : [110a|aaaa|10bb|bbbb] x 8
14564         const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
14565         const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
14566 
14567         // t0 = [000a|aaaa|bbbb|bb00]
14568         const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
14569         // t1 = [000a|aaaa|0000|0000]
14570         const uint16x8_t t1 = vandq_u16(t0, v_1f00);
14571         // t2 = [0000|0000|00bb|bbbb]
14572         const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
14573         // t3 = [000a|aaaa|00bb|bbbb]
14574         const uint16x8_t t3 = vorrq_u16(t1, t2);
14575         // t4 = [110a|aaaa|10bb|bbbb]
14576         const uint16x8_t t4 = vorrq_u16(t3, v_c080);
14577         // 2. merge ASCII and 2-byte codewords
14578         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14579         const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
14580         const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
14581         // 3. prepare bitmask for 8-bit lookup
14582   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14583         const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14584                                   0x0010, 0x0040,
14585                                   0x0002, 0x0008,
14586                                   0x0020, 0x0080);
14587   #else
14588         const uint16x8_t mask = { 0x0001, 0x0004,
14589                                   0x0010, 0x0040,
14590                                   0x0002, 0x0008,
14591                                   0x0020, 0x0080 };
14592   #endif
14593         uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
14594         // 4. pack the bytes
14595         const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
14596         const uint8x16_t shuffle = vld1q_u8(row + 1);
14597         const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
14598 
14599         // 5. store bytes
14600         vst1q_u8(utf8_output, utf8_packed);
14601 
14602         // 6. adjust pointers
14603         buf += 8;
14604         utf8_output += row[0];
14605         continue;
14606       } else {
14607         // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
14608         const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
14609         const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
14610         forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
14611 
14612   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14613           const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
14614                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
14615   #else
14616           const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
14617                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
14618   #endif
14619           /* In this branch we handle three cases:
14620             1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
14621             2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
14622             3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
14623 
14624             We expand the input word (16-bit) into two code units (32-bit), thus
14625             we have room for four bytes. However, we need five distinct bit
14626             layouts. Note that the last byte in cases #2 and #3 is the same.
14627 
14628             We precompute byte 1 for case #1 and the common byte for cases #2 & #3
14629             in register t2.
14630 
14631             We precompute byte 1 for case #3 and -- **conditionally** -- precompute
14632             either byte 1 for case #2 or byte 2 for case #3. Note that they
14633             differ by exactly one bit.
14634 
14635             Finally from these two code units we build proper UTF-8 sequence, taking
14636             into account the case (i.e, the number of bytes to write).
14637           */
14638           /**
14639            * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
14640            * t2 => [0ccc|cccc] [10cc|cccc]
14641            * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
14642            */
14643   #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
14644           // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
14645           const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
14646           // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
14647           const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
14648           // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
14649           const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
14650 
14651           // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
14652           const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
14653           // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
14654           const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
14655           // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
14656           const uint16x8_t s1s = vshlq_n_u16(s1, 2);
14657           // [00bb|bbbb|0000|aaaa]
14658           const uint16x8_t s2 = vorrq_u16(s0, s1s);
14659           // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
14660           const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
14661           const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
14662           const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
14663           const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
14664           const uint16x8_t s4 = veorq_u16(s3, m0);
14665   #undef simdutf_vec
14666 
14667           // 4. expand code units 16-bit => 32-bit
14668           const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
14669           const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
14670 
14671           // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
14672           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14673           const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
14674   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14675           const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14676                                       0x0010, 0x0040,
14677                                       0x0100, 0x0400,
14678                                       0x1000, 0x4000 );
14679           const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008,
14680                                       0x0020, 0x0080,
14681                                       0x0200, 0x0800,
14682                                       0x2000, 0x8000 );
14683   #else
14684           const uint16x8_t onemask = { 0x0001, 0x0004,
14685                                       0x0010, 0x0040,
14686                                       0x0100, 0x0400,
14687                                       0x1000, 0x4000 };
14688           const uint16x8_t twomask = { 0x0002, 0x0008,
14689                                       0x0020, 0x0080,
14690                                       0x0200, 0x0800,
14691                                       0x2000, 0x8000 };
14692   #endif
14693           const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
14694           const uint16_t mask = vaddvq_u16(combined);
14695           // The following fast path may or may not be beneficial.
14696           /*if(mask == 0) {
14697             // We only have three-byte code units. Use fast path.
14698             const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
14699             const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
14700             const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
14701             vst1q_u8(utf8_output, utf8_0);
14702             utf8_output += 12;
14703             vst1q_u8(utf8_output, utf8_1);
14704             utf8_output += 12;
14705             buf += 8;
14706             continue;
14707           }*/
14708           const uint8_t mask0 = uint8_t(mask);
14709           const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
14710           const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
14711           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
14712 
14713           const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
14714           const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
14715           const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
14716           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
14717 
14718           vst1q_u8(utf8_output, utf8_0);
14719           utf8_output += row0[0];
14720           vst1q_u8(utf8_output, utf8_1);
14721           utf8_output += row1[0];
14722 
14723           buf += 8;
14724       }
14725     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14726     } else {
14727       // Let us do a scalar fallback.
14728       // It may seem wasteful to use scalar code, but being efficient with SIMD
14729       // in the presence of surrogate pairs may require non-trivial tables.
14730       size_t forward = 15;
14731       size_t k = 0;
14732       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14733       for(; k < forward; k++) {
14734         uint32_t word = buf[k];
14735         if((word & 0xFFFFFF80)==0) {
14736           *utf8_output++ = char(word);
14737         } else if((word & 0xFFFFF800)==0) {
14738           *utf8_output++ = char((word>>6) | 0b11000000);
14739           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14740         } else if((word & 0xFFFF0000)==0) {
14741           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14742           *utf8_output++ = char((word>>12) | 0b11100000);
14743           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14744           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14745         } else {
14746           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14747           *utf8_output++ = char((word>>18) | 0b11110000);
14748           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14749           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14750           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14751         }
14752       }
14753       buf += k;
14754     }
14755   } // while
14756 
14757   // check for invalid input
14758   if (vmaxvq_u16(forbidden_bytemask) != 0) {
14759     return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
14760   }
14761   return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
14762 }
14763 
14764 
arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out)14765 std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
14766   uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
14767   const char32_t* start = buf;
14768   const char32_t* end = buf + len;
14769 
14770   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
14771 
14772   while (buf + 8 < end) {
14773     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
14774     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
14775 
14776     // Check if no bits set above 16th
14777     if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
14778       // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
14779       // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
14780       uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
14781       if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
14782           // 1. pack the bytes
14783           // obviously suboptimal.
14784           uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
14785           // 2. store (8 bytes)
14786           vst1_u8(utf8_output, utf8_packed);
14787           // 3. adjust pointers
14788           buf += 8;
14789           utf8_output += 8;
14790           continue; // we are done for this round!
14791       }
14792 
14793       if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
14794         // 1. prepare 2-byte values
14795         // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14796         // expected output   : [110a|aaaa|10bb|bbbb] x 8
14797         const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
14798         const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
14799 
14800         // t0 = [000a|aaaa|bbbb|bb00]
14801         const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
14802         // t1 = [000a|aaaa|0000|0000]
14803         const uint16x8_t t1 = vandq_u16(t0, v_1f00);
14804         // t2 = [0000|0000|00bb|bbbb]
14805         const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
14806         // t3 = [000a|aaaa|00bb|bbbb]
14807         const uint16x8_t t3 = vorrq_u16(t1, t2);
14808         // t4 = [110a|aaaa|10bb|bbbb]
14809         const uint16x8_t t4 = vorrq_u16(t3, v_c080);
14810         // 2. merge ASCII and 2-byte codewords
14811         const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14812         const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
14813         const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
14814         // 3. prepare bitmask for 8-bit lookup
14815   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14816         const uint16x8_t mask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14817                                   0x0010, 0x0040,
14818                                   0x0002, 0x0008,
14819                                   0x0020, 0x0080);
14820   #else
14821         const uint16x8_t mask = { 0x0001, 0x0004,
14822                                   0x0010, 0x0040,
14823                                   0x0002, 0x0008,
14824                                   0x0020, 0x0080 };
14825   #endif
14826         uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
14827         // 4. pack the bytes
14828         const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
14829         const uint8x16_t shuffle = vld1q_u8(row + 1);
14830         const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
14831 
14832         // 5. store bytes
14833         vst1q_u8(utf8_output, utf8_packed);
14834 
14835         // 6. adjust pointers
14836         buf += 8;
14837         utf8_output += row[0];
14838         continue;
14839       } else {
14840         // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
14841 
14842         // check for invalid input
14843         const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
14844         const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
14845         const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
14846         if (vmaxvq_u16(forbidden_bytemask) != 0) {
14847           return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
14848         }
14849 
14850   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14851           const uint16x8_t dup_even = simdutf_make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
14852                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
14853   #else
14854           const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
14855                                       0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
14856   #endif
14857           /* In this branch we handle three cases:
14858             1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
14859             2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
14860             3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
14861 
14862             We expand the input word (16-bit) into two code units (32-bit), thus
14863             we have room for four bytes. However, we need five distinct bit
14864             layouts. Note that the last byte in cases #2 and #3 is the same.
14865 
14866             We precompute byte 1 for case #1 and the common byte for cases #2 & #3
14867             in register t2.
14868 
14869             We precompute byte 1 for case #3 and -- **conditionally** -- precompute
14870             either byte 1 for case #2 or byte 2 for case #3. Note that they
14871             differ by exactly one bit.
14872 
14873             Finally from these two code units we build proper UTF-8 sequence, taking
14874             into account the case (i.e, the number of bytes to write).
14875           */
14876           /**
14877            * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
14878            * t2 => [0ccc|cccc] [10cc|cccc]
14879            * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
14880            */
14881   #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
14882           // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
14883           const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
14884           // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
14885           const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
14886           // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
14887           const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
14888 
14889           // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
14890           const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
14891           // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
14892           const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
14893           // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
14894           const uint16x8_t s1s = vshlq_n_u16(s1, 2);
14895           // [00bb|bbbb|0000|aaaa]
14896           const uint16x8_t s2 = vorrq_u16(s0, s1s);
14897           // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
14898           const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
14899           const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
14900           const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
14901           const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
14902           const uint16x8_t s4 = veorq_u16(s3, m0);
14903   #undef simdutf_vec
14904 
14905           // 4. expand code units 16-bit => 32-bit
14906           const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
14907           const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
14908 
14909           // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
14910           const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
14911           const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
14912   #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
14913           const uint16x8_t onemask = simdutf_make_uint16x8_t(0x0001, 0x0004,
14914                                       0x0010, 0x0040,
14915                                       0x0100, 0x0400,
14916                                       0x1000, 0x4000 );
14917           const uint16x8_t twomask = simdutf_make_uint16x8_t(0x0002, 0x0008,
14918                                       0x0020, 0x0080,
14919                                       0x0200, 0x0800,
14920                                       0x2000, 0x8000 );
14921   #else
14922           const uint16x8_t onemask = { 0x0001, 0x0004,
14923                                       0x0010, 0x0040,
14924                                       0x0100, 0x0400,
14925                                       0x1000, 0x4000 };
14926           const uint16x8_t twomask = { 0x0002, 0x0008,
14927                                       0x0020, 0x0080,
14928                                       0x0200, 0x0800,
14929                                       0x2000, 0x8000 };
14930   #endif
14931           const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
14932           const uint16_t mask = vaddvq_u16(combined);
14933           // The following fast path may or may not be beneficial.
14934           /*if(mask == 0) {
14935             // We only have three-byte code units. Use fast path.
14936             const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
14937             const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
14938             const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
14939             vst1q_u8(utf8_output, utf8_0);
14940             utf8_output += 12;
14941             vst1q_u8(utf8_output, utf8_1);
14942             utf8_output += 12;
14943             buf += 8;
14944             continue;
14945           }*/
14946           const uint8_t mask0 = uint8_t(mask);
14947 
14948           const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
14949           const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
14950           const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
14951 
14952           const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
14953           const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
14954           const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
14955           const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
14956 
14957           vst1q_u8(utf8_output, utf8_0);
14958           utf8_output += row0[0];
14959           vst1q_u8(utf8_output, utf8_1);
14960           utf8_output += row1[0];
14961 
14962           buf += 8;
14963       }
14964     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14965     } else {
14966       // Let us do a scalar fallback.
14967       // It may seem wasteful to use scalar code, but being efficient with SIMD
14968       // in the presence of surrogate pairs may require non-trivial tables.
14969       size_t forward = 15;
14970       size_t k = 0;
14971       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
14972       for(; k < forward; k++) {
14973         uint32_t word = buf[k];
14974         if((word & 0xFFFFFF80)==0) {
14975           *utf8_output++ = char(word);
14976         } else if((word & 0xFFFFF800)==0) {
14977           *utf8_output++ = char((word>>6) | 0b11000000);
14978           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14979         } else if((word & 0xFFFF0000)==0) {
14980           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14981           *utf8_output++ = char((word>>12) | 0b11100000);
14982           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14983           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14984         } else {
14985           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14986           *utf8_output++ = char((word>>18) | 0b11110000);
14987           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14988           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14989           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14990         }
14991       }
14992       buf += k;
14993     }
14994   } // while
14995 
14996   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
14997 }
14998 /* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
14999 /* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
15000 template <endianness big_endian>
arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out)15001 std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) {
15002   uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
15003   const char32_t* end = buf + len;
15004 
15005   uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
15006 
15007   while(buf + 4 <= end) {
15008     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
15009 
15010     // Check if no bits set above 16th
15011     if(vmaxvq_u32(in) <= 0xFFFF) {
15012       uint16x4_t utf16_packed = vmovn_u32(in);
15013 
15014       const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
15015       const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
15016       forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
15017 
15018       if (!match_system(big_endian)) { utf16_packed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); }
15019       vst1_u16(utf16_output, utf16_packed);
15020       utf16_output += 4;
15021       buf += 4;
15022     } else {
15023       size_t forward = 3;
15024       size_t k = 0;
15025       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
15026       for(; k < forward; k++) {
15027         uint32_t word = buf[k];
15028         if((word & 0xFFFF0000)==0) {
15029           // will not generate a surrogate pair
15030           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15031           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15032         } else {
15033           // will generate a surrogate pair
15034           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15035           word -= 0x10000;
15036           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15037           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15038           if (!match_system(big_endian)) {
15039             high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
15040             low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
15041           }
15042           *utf16_output++ = char16_t(high_surrogate);
15043           *utf16_output++ = char16_t(low_surrogate);
15044         }
15045       }
15046       buf += k;
15047     }
15048   }
15049 
15050   // check for invalid input
15051   if (vmaxv_u16(forbidden_bytemask) != 0) {
15052     return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
15053   }
15054 
15055   return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
15056 }
15057 
15058 
15059 template <endianness big_endian>
arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out)15060 std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) {
15061   uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
15062   const char32_t* start = buf;
15063   const char32_t* end = buf + len;
15064 
15065   while(buf + 4 <= end) {
15066     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
15067 
15068     // Check if no bits set above 16th
15069     if(vmaxvq_u32(in) <= 0xFFFF) {
15070       uint16x4_t utf16_packed = vmovn_u32(in);
15071 
15072       const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
15073       const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
15074       const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
15075       if (vmaxv_u16(forbidden_bytemask) != 0) {
15076         return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
15077       }
15078 
15079       if (!match_system(big_endian)) { utf16_packed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); }
15080       vst1_u16(utf16_output, utf16_packed);
15081       utf16_output += 4;
15082       buf += 4;
15083     } else {
15084       size_t forward = 3;
15085       size_t k = 0;
15086       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
15087       for(; k < forward; k++) {
15088         uint32_t word = buf[k];
15089         if((word & 0xFFFF0000)==0) {
15090           // will not generate a surrogate pair
15091           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15092           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15093         } else {
15094           // will generate a surrogate pair
15095           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15096           word -= 0x10000;
15097           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15098           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15099           if (!match_system(big_endian)) {
15100             high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
15101             low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
15102           }
15103           *utf16_output++ = char16_t(high_surrogate);
15104           *utf16_output++ = char16_t(low_surrogate);
15105         }
15106       }
15107       buf += k;
15108     }
15109   }
15110 
15111   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
15112 }
15113 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
15114 } // unnamed namespace
15115 } // namespace arm64
15116 } // namespace simdutf
15117 /* begin file src/generic/buf_block_reader.h */
15118 namespace simdutf {
15119 namespace arm64 {
15120 namespace {
15121 
// Walks through a buffer in block-sized increments, loading the last part with spaces
// STEP_SIZE is the block size in bytes. The reader does not copy the buffer; it
// only keeps the pointer it was given.
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  // Wraps the _len bytes starting at _buf; the read position starts at 0.
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  // Current read position, as a byte offset from the start of the buffer.
  simdutf_really_inline size_t block_index();
  // True while the read position is strictly before lenminusstep, i.e. while
  // more than STEP_SIZE bytes remain.
  simdutf_really_inline bool has_full_block() const;
  // Pointer to the bytes at the current read position.
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
   * function returns 0 and leaves dst untouched). In particular, if len == STEP_SIZE there
   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
   *
   * @param dst destination for the remainder; STEP_SIZE bytes are written whenever the
   *            return value is nonzero.
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  // Moves the read position forward by STEP_SIZE bytes.
  simdutf_really_inline void advance();
private:
  const uint8_t *buf;        // start of the input (not owned)
  const size_t len;          // total number of input bytes
  const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
  size_t idx;                // current read position (byte offset into buf)
};
15147 
15148 // Routines to print masks and text for debugging bitmask operations
format_input_text_64(const uint8_t *text)15149 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
15150   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
15151   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
15152     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
15153   }
15154   buf[sizeof(simd8x64<uint8_t>)] = '\0';
15155   return buf;
15156 }
15157 
15158 // Routines to print masks and text for debugging bitmask operations
15159 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
15160   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
15161   in.store(reinterpret_cast<uint8_t*>(buf));
15162   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
15163     if (buf[i] < ' ') { buf[i] = '_'; }
15164   }
15165   buf[sizeof(simd8x64<uint8_t>)] = '\0';
15166   return buf;
15167 }
15168 
15169 simdutf_unused static char * format_mask(uint64_t mask) {
15170   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
15171   for (size_t i=0; i<64; i++) {
15172     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
15173   }
15174   buf[64] = '\0';
15175   return buf;
15176 }
15177 
15178 template<size_t STEP_SIZE>
15179 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
15180 
15181 template<size_t STEP_SIZE>
15182 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
15183 
15184 template<size_t STEP_SIZE>
15185 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
15186   return idx < lenminusstep;
15187 }
15188 
15189 template<size_t STEP_SIZE>
15190 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
15191   return &buf[idx];
15192 }
15193 
15194 template<size_t STEP_SIZE>
15195 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
15196   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
15197   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
15198   std::memcpy(dst, buf + idx, len - idx);
15199   return len - idx;
15200 }
15201 
15202 template<size_t STEP_SIZE>
15203 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
15204   idx += STEP_SIZE;
15205 }
15206 
15207 } // unnamed namespace
15208 } // namespace arm64
15209 } // namespace simdutf
15210 /* end file src/generic/buf_block_reader.h */
15211 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
15212 namespace simdutf {
15213 namespace arm64 {
15214 namespace {
15215 namespace utf8_validation {
15216 
15217 using namespace simd;
15218 
  // Classifies every pair of adjacent bytes (prev1 lane, input lane) against the
  // known two-byte UTF-8 error patterns.
  //
  // input: the bytes being checked.
  // prev1: for each lane, the byte that precedes the corresponding lane of
  //        input (the caller builds it with input.prev<1>(prev_input)).
  //
  // Three 16-entry tables are consulted: one indexed by the high nibble of the
  // first byte, one by its low nibble, one by the high nibble of the second
  // byte. Each entry is the set of error bits still possible given that nibble;
  // the AND of the three lookups is returned, so a bit survives only when all
  // three nibbles agree with that error pattern.
  // NOTE(review): lookup_16 is assumed to select, per lane, the argument whose
  // position equals the nibble value -- confirm against its definition.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte (the two share this bit)
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: errors still possible given the HIGH nibble of the first byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // Table 2: errors still possible given the LOW nibble of the first byte.
    // The CARRY errors do not depend on this nibble, so every entry includes them.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: errors still possible given the HIGH nibble of the second byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A bit survives only if all three nibbles are consistent with that error.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
15309   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
15310       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
15311     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
15312     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
15313     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
15314     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
15315     return must23_80 ^ sc;
15316   }
15317 
15318   //
15319   // Return nonzero if there are incomplete multibyte characters at the end of the block:
15320   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
15321   //
15322   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
15323     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
15324     // ... 1111____ 111_____ 11______
15325     static const uint8_t max_array[32] = {
15326       255, 255, 255, 255, 255, 255, 255, 255,
15327       255, 255, 255, 255, 255, 255, 255, 255,
15328       255, 255, 255, 255, 255, 255, 255, 255,
15329       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
15330     };
15331     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
15332     return input.gt_bits(max_value);
15333   }
15334 
15335   struct utf8_checker {
15336     // If this is nonzero, there has been a UTF-8 error.
15337     simd8<uint8_t> error;
15338     // The last input we received
15339     simd8<uint8_t> prev_input_block;
15340     // Whether the last input we received was incomplete (used for ASCII fast path)
15341     simd8<uint8_t> prev_incomplete;
15342 
15343     //
15344     // Check whether the current bytes are valid UTF-8.
15345     //
15346     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
15347       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
15348       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
15349       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
15350       simd8<uint8_t> sc = check_special_cases(input, prev1);
15351       this->error |= check_multibyte_lengths(input, prev_input, sc);
15352     }
15353 
15354     // The only problem that can happen at EOF is that a multibyte character is too short
15355     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
15356     // too large in the first of two bytes.
15357     simdutf_really_inline void check_eof() {
15358       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
15359       // possibly finish them.
15360       this->error |= this->prev_incomplete;
15361     }
15362 
15363     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
15364       if(simdutf_likely(is_ascii(input))) {
15365         this->error |= this->prev_incomplete;
15366       } else {
15367         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
15368         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
15369             "We support either two or four chunks per 64-byte block.");
15370         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
15371           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
15372           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15373         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
15374           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
15375           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15376           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15377           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15378         }
15379         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
15380         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
15381 
15382       }
15383     }
15384 
15385     // do not forget to call check_eof!
15386     simdutf_really_inline bool errors() const {
15387       return this->error.any_bits_set_anywhere();
15388     }
15389 
15390   }; // struct utf8_checker
15391 } // namespace utf8_validation
15392 
15393 using utf8_validation::utf8_checker;
15394 
15395 } // unnamed namespace
15396 } // namespace arm64
15397 } // namespace simdutf
15398 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
15399 /* begin file src/generic/utf8_validation/utf8_validator.h */
15400 namespace simdutf {
15401 namespace arm64 {
15402 namespace {
15403 namespace utf8_validation {
15404 
15405 /**
15406  * Validates that the string is actual UTF-8.
15407  */
15408 template<class checker>
15409 bool generic_validate_utf8(const uint8_t * input, size_t length) {
15410     checker c{};
15411     buf_block_reader<64> reader(input, length);
15412     while (reader.has_full_block()) {
15413       simd::simd8x64<uint8_t> in(reader.full_block());
15414       c.check_next_input(in);
15415       reader.advance();
15416     }
15417     uint8_t block[64]{};
15418     reader.get_remainder(block);
15419     simd::simd8x64<uint8_t> in(block);
15420     c.check_next_input(in);
15421     reader.advance();
15422     c.check_eof();
15423     return !c.errors();
15424 }
15425 
15426 bool generic_validate_utf8(const char * input, size_t length) {
15427   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15428 }
15429 
15430 /**
15431  * Validates that the string is actual UTF-8 and stops on errors.
15432  */
15433 template<class checker>
15434 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
15435     checker c{};
15436     buf_block_reader<64> reader(input, length);
15437     size_t count{0};
15438     while (reader.has_full_block()) {
15439       simd::simd8x64<uint8_t> in(reader.full_block());
15440       c.check_next_input(in);
15441       if(c.errors()) {
15442         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
15443         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
15444         res.count += count;
15445         return res;
15446       }
15447       reader.advance();
15448       count += 64;
15449     }
15450     uint8_t block[64]{};
15451     reader.get_remainder(block);
15452     simd::simd8x64<uint8_t> in(block);
15453     c.check_next_input(in);
15454     reader.advance();
15455     c.check_eof();
15456     if (c.errors()) {
15457       if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
15458       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
15459       res.count += count;
15460       return res;
15461     } else {
15462       return result(error_code::SUCCESS, length);
15463     }
15464 }
15465 
15466 result generic_validate_utf8_with_errors(const char * input, size_t length) {
15467   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15468 }
15469 
15470 template<class checker>
15471 bool generic_validate_ascii(const uint8_t * input, size_t length) {
15472     buf_block_reader<64> reader(input, length);
15473     uint8_t blocks[64]{};
15474     simd::simd8x64<uint8_t> running_or(blocks);
15475     while (reader.has_full_block()) {
15476       simd::simd8x64<uint8_t> in(reader.full_block());
15477       running_or |= in;
15478       reader.advance();
15479     }
15480     uint8_t block[64]{};
15481     reader.get_remainder(block);
15482     simd::simd8x64<uint8_t> in(block);
15483     running_or |= in;
15484     return running_or.is_ascii();
15485 }
15486 
15487 bool generic_validate_ascii(const char * input, size_t length) {
15488   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15489 }
15490 
15491 template<class checker>
15492 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
15493   buf_block_reader<64> reader(input, length);
15494   size_t count{0};
15495   while (reader.has_full_block()) {
15496     simd::simd8x64<uint8_t> in(reader.full_block());
15497     if (!in.is_ascii()) {
15498       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
15499       return result(res.error, count + res.count);
15500     }
15501     reader.advance();
15502 
15503     count += 64;
15504   }
15505   uint8_t block[64]{};
15506   reader.get_remainder(block);
15507   simd::simd8x64<uint8_t> in(block);
15508   if (!in.is_ascii()) {
15509     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
15510     return result(res.error, count + res.count);
15511   } else {
15512     return result(error_code::SUCCESS, length);
15513   }
15514 }
15515 
15516 result generic_validate_ascii_with_errors(const char * input, size_t length) {
15517   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15518 }
15519 
15520 } // namespace utf8_validation
15521 } // unnamed namespace
15522 } // namespace arm64
15523 } // namespace simdutf
15524 /* end file src/generic/utf8_validation/utf8_validator.h */
15525 // transcoding from UTF-8 to UTF-16
15526 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
15527 
15528 
15529 namespace simdutf {
15530 namespace arm64 {
15531 namespace {
15532 namespace utf8_to_utf16 {
15533 
15534 using namespace simd;
15535 
15536 template <endianness endian>
15537 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
15538     char16_t* utf16_output) noexcept {
15539   // The implementation is not specific to haswell and should be moved to the generic directory.
15540   size_t pos = 0;
15541   char16_t* start{utf16_output};
15542   const size_t safety_margin = 16; // to avoid overruns!
15543   while(pos + 64 + safety_margin <= size) {
15544     // this loop could be unrolled further. For example, we could process the mask
15545     // far more than 64 bytes.
15546     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
15547     if(in.is_ascii()) {
15548       in.store_ascii_as_utf16<endian>(utf16_output);
15549       utf16_output += 64;
15550       pos += 64;
15551     } else {
15552       // Slow path. We hope that the compiler will recognize that this is a slow path.
15553       // Anything that is not a continuation mask is a 'leading byte', that is, the
15554       // start of a new code point.
15555       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
15556       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
15557       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
15558       // The *start* of code points is not so useful, rather, we want the *end* of code points.
15559       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
15560       // We process in blocks of up to 12 bytes except possibly
15561       // for fast paths which may process up to 16 bytes. For the
15562       // slow path to work, we should have at least 12 input bytes left.
15563       size_t max_starting_point = (pos + 64) - 12;
15564       // Next loop is going to run at least five times when using solely
15565       // the slow/regular path, and at least four times if there are fast paths.
15566       while(pos < max_starting_point) {
15567         // Performance note: our ability to compute 'consumed' and
15568         // then shift and recompute is critical. If there is a
15569         // latency of, say, 4 cycles on getting 'consumed', then
15570         // the inner loop might have a total latency of about 6 cycles.
15571         // Yet we process between 6 to 12 inputs bytes, thus we get
15572         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
15573         // for this section of the code. Hence, there is a limit
15574         // to how much we can further increase this latency before
15575         // it seriously harms performance.
15576         //
15577         // Thus we may allow convert_masked_utf8_to_utf16 to process
15578         // more bytes at a time under a fast-path mode where 16 bytes
15579         // are consumed at once (e.g., when encountering ASCII).
15580         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
15581                             utf8_end_of_code_point_mask, utf16_output);
15582         pos += consumed;
15583         utf8_end_of_code_point_mask >>= consumed;
15584       }
15585       // At this point there may remain between 0 and 12 bytes in the
15586       // 64-byte block. These bytes will be processed again. So we have an
15587       // 80% efficiency (in the worst case). In practice we expect an
15588       // 85% to 90% efficiency.
15589     }
15590   }
15591   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
15592   return utf16_output - start;
15593 }
15594 
15595 } // namespace utf8_to_utf16
15596 } // unnamed namespace
15597 } // namespace arm64
15598 } // namespace simdutf
15599 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
15600 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
15601 
15602 
15603 namespace simdutf {
15604 namespace arm64 {
15605 namespace {
15606 namespace utf8_to_utf16 {
15607 using namespace simd;
15608 
15609 
  // Classifies every pair of adjacent bytes (prev1 = byte 1, input = byte 2)
  // against the error patterns below. Three 16-entry lookups are performed,
  // on the high nibble of byte 1, the low nibble of byte 1 and the high
  // nibble of byte 2; each yields the set of error bits that nibble is
  // compatible with, and the three sets are AND-ed together. A bit therefore
  // survives in a lane only when all three nibbles agree on that error.
  // The order of the arguments of each lookup_16 call is significant: entry i
  // is the value selected for nibble value i.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (byte 2 is 1000____) / Overlong 4-byte (the two share one bit)
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: both patterns have 1111____ 1000____ and are
    // told apart only by the low nibble of byte 1.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Errors compatible with the high nibble of byte 1.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // Errors that do not depend on the low nibble of byte 1 are carried through
    // every entry of the next table so that the final AND does not drop them.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Errors compatible with the low nibble of byte 1.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Errors compatible with the high nibble of byte 2.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the errors on which all three nibbles agree remain set.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
15700   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
15701       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
15702     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
15703     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
15704     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
15705     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
15706     return must23_80 ^ sc;
15707   }
15708 
15709 
15710   struct validating_transcoder {
15711     // If this is nonzero, there has been a UTF-8 error.
15712     simd8<uint8_t> error;
15713 
15714     validating_transcoder() : error(uint8_t(0)) {}
15715     //
15716     // Check whether the current bytes are valid UTF-8.
15717     //
15718     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
15719       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
15720       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
15721       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
15722       simd8<uint8_t> sc = check_special_cases(input, prev1);
15723       this->error |= check_multibyte_lengths(input, prev_input, sc);
15724     }
15725 
15726 
15727     template <endianness endian>
15728     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
15729       size_t pos = 0;
15730       char16_t* start{utf16_output};
15731       // In the worst case, we have the haswell kernel which can cause an overflow of
15732       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
15733       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
15734       // much more than 8 bytes. However, you cannot generally assume that you have valid
15735       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
15736       // to give us a good margin.
15737       size_t leading_byte = 0;
15738       size_t margin = size;
15739       for(; margin > 0 && leading_byte < 8; margin--) {
15740         leading_byte += (int8_t(in[margin-1]) > -65);
15741       }
15742       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
15743       const size_t safety_margin = size - margin + 1; // to avoid overruns!
15744       while(pos + 64 + safety_margin <= size) {
15745         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
15746         if(input.is_ascii()) {
15747           input.store_ascii_as_utf16<endian>(utf16_output);
15748           utf16_output += 64;
15749           pos += 64;
15750         } else {
15751           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
15752           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
15753               "We support either two or four chunks per 64-byte block.");
15754           auto zero = simd8<uint8_t>{uint8_t(0)};
15755           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
15756             this->check_utf8_bytes(input.chunks[0], zero);
15757             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15758           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
15759             this->check_utf8_bytes(input.chunks[0], zero);
15760             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15761             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15762             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15763           }
15764           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
15765           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
15766           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
15767           // We process in blocks of up to 12 bytes except possibly
15768           // for fast paths which may process up to 16 bytes. For the
15769           // slow path to work, we should have at least 12 input bytes left.
15770           size_t max_starting_point = (pos + 64) - 12;
15771           // Next loop is going to run at least five times.
15772           while(pos < max_starting_point) {
15773             // Performance note: our ability to compute 'consumed' and
15774             // then shift and recompute is critical. If there is a
15775             // latency of, say, 4 cycles on getting 'consumed', then
15776             // the inner loop might have a total latency of about 6 cycles.
15777             // Yet we process between 6 to 12 inputs bytes, thus we get
15778             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
15779             // for this section of the code. Hence, there is a limit
15780             // to how much we can further increase this latency before
15781             // it seriously harms performance.
15782             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
15783                             utf8_end_of_code_point_mask, utf16_output);
15784             pos += consumed;
15785             utf8_end_of_code_point_mask >>= consumed;
15786           }
15787           // At this point there may remain between 0 and 12 bytes in the
15788           // 64-byte block. These bytes will be processed again. So we have an
15789           // 80% efficiency (in the worst case). In practice we expect an
15790           // 85% to 90% efficiency.
15791         }
15792       }
15793       if(errors()) { return 0; }
15794       if(pos < size) {
15795         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
15796         if(howmany == 0) { return 0; }
15797         utf16_output += howmany;
15798       }
15799       return utf16_output - start;
15800     }
15801 
15802     template <endianness endian>
15803     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
15804       size_t pos = 0;
15805       char16_t* start{utf16_output};
15806       // In the worst case, we have the haswell kernel which can cause an overflow of
15807       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
15808       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
15809       // much more than 8 bytes. However, you cannot generally assume that you have valid
15810       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
15811       // to give us a good margin.
15812       size_t leading_byte = 0;
15813       size_t margin = size;
15814       for(; margin > 0 && leading_byte < 8; margin--) {
15815         leading_byte += (int8_t(in[margin-1]) > -65);
15816       }
15817       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
15818       const size_t safety_margin = size - margin + 1; // to avoid overruns!
15819       while(pos + 64 + safety_margin <= size) {
15820         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
15821         if(input.is_ascii()) {
15822           input.store_ascii_as_utf16<endian>(utf16_output);
15823           utf16_output += 64;
15824           pos += 64;
15825         } else {
15826           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
15827           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
15828               "We support either two or four chunks per 64-byte block.");
15829           auto zero = simd8<uint8_t>{uint8_t(0)};
15830           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
15831             this->check_utf8_bytes(input.chunks[0], zero);
15832             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15833           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
15834             this->check_utf8_bytes(input.chunks[0], zero);
15835             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15836             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15837             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15838           }
15839           if (errors()) {
15840             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
15841             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
15842             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
15843             res.count += pos;
15844             return res;
15845           }
15846           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
15847           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
15848           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
15849           // We process in blocks of up to 12 bytes except possibly
15850           // for fast paths which may process up to 16 bytes. For the
15851           // slow path to work, we should have at least 12 input bytes left.
15852           size_t max_starting_point = (pos + 64) - 12;
15853           // Next loop is going to run at least five times.
15854           while(pos < max_starting_point) {
15855             // Performance note: our ability to compute 'consumed' and
15856             // then shift and recompute is critical. If there is a
15857             // latency of, say, 4 cycles on getting 'consumed', then
15858             // the inner loop might have a total latency of about 6 cycles.
15859             // Yet we process between 6 to 12 inputs bytes, thus we get
15860             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
15861             // for this section of the code. Hence, there is a limit
15862             // to how much we can further increase this latency before
15863             // it seriously harms performance.
15864             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
15865                             utf8_end_of_code_point_mask, utf16_output);
15866             pos += consumed;
15867             utf8_end_of_code_point_mask >>= consumed;
15868           }
15869           // At this point there may remain between 0 and 12 bytes in the
15870           // 64-byte block. These bytes will be processed again. So we have an
15871           // 80% efficiency (in the worst case). In practice we expect an
15872           // 85% to 90% efficiency.
15873         }
15874       }
15875       if(errors()) {
15876         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
15877         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
15878         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
15879         res.count += pos;
15880         return res;
15881       }
15882       if(pos < size) {
15883         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
15884         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
15885         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
15886         if (res.error) {    // In case of error, we want the error position
15887           res.count += pos;
15888           return res;
15889         } else {    // In case of success, we want the number of word written
15890           utf16_output += res.count;
15891         }
15892       }
15893       return result(error_code::SUCCESS, utf16_output - start);
15894     }
15895 
15896     simdutf_really_inline bool errors() const {
15897       return this->error.any_bits_set_anywhere();
15898     }
15899 
15900   }; // struct utf8_checker
15901 } // utf8_to_utf16 namespace
15902 } // unnamed namespace
15903 } // namespace arm64
15904 } // namespace simdutf
15905 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
15906 // transcoding from UTF-8 to UTF-32
15907 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
15908 
15909 namespace simdutf {
15910 namespace arm64 {
15911 namespace {
15912 namespace utf8_to_utf32 {
15913 
15914 using namespace simd;
15915 
15916 
15917 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
15918     char32_t* utf32_output) noexcept {
15919   size_t pos = 0;
15920   char32_t* start{utf32_output};
15921   const size_t safety_margin = 16; // to avoid overruns!
15922   while(pos + 64 + safety_margin <= size) {
15923     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
15924     if(in.is_ascii()) {
15925       in.store_ascii_as_utf32(utf32_output);
15926       utf32_output += 64;
15927       pos += 64;
15928     } else {
15929     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
15930     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
15931     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
15932     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
15933     size_t max_starting_point = (pos + 64) - 12;
15934     while(pos < max_starting_point) {
15935       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
15936                           utf8_end_of_code_point_mask, utf32_output);
15937       pos += consumed;
15938       utf8_end_of_code_point_mask >>= consumed;
15939       }
15940     }
15941   }
15942   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
15943   return utf32_output - start;
15944 }
15945 
15946 
15947 } // namespace utf8_to_utf32
15948 } // unnamed namespace
15949 } // namespace arm64
15950 } // namespace simdutf
15951 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
15952 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
15953 
15954 
15955 namespace simdutf {
15956 namespace arm64 {
15957 namespace {
15958 namespace utf8_to_utf32 {
15959 using namespace simd;
15960 
15961 
15962   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
15963 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
15964 // Bit 1 = Too Long (ASCII followed by continuation)
15965 // Bit 2 = Overlong 3-byte
15966 // Bit 4 = Surrogate
15967 // Bit 5 = Overlong 2-byte
15968 // Bit 7 = Two Continuations
15969     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
15970                                                 // 11______ 11______
15971     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
15972     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
15973     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
15974     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
15975     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
15976     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
15977                                                 // 11110100 101_____
15978                                                 // 11110101 1001____
15979                                                 // 11110101 101_____
15980                                                 // 1111011_ 1001____
15981                                                 // 1111011_ 101_____
15982                                                 // 11111___ 1001____
15983                                                 // 11111___ 101_____
15984     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
15985                                                 // 11110101 1000____
15986                                                 // 1111011_ 1000____
15987                                                 // 11111___ 1000____
15988     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
15989 
15990     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
15991       // 0_______ ________ <ASCII in byte 1>
15992       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
15993       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
15994       // 10______ ________ <continuation in byte 1>
15995       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
15996       // 1100____ ________ <two byte lead in byte 1>
15997       TOO_SHORT | OVERLONG_2,
15998       // 1101____ ________ <two byte lead in byte 1>
15999       TOO_SHORT,
16000       // 1110____ ________ <three byte lead in byte 1>
16001       TOO_SHORT | OVERLONG_3 | SURROGATE,
16002       // 1111____ ________ <four+ byte lead in byte 1>
16003       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
16004     );
16005     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
16006     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
16007       // ____0000 ________
16008       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
16009       // ____0001 ________
16010       CARRY | OVERLONG_2,
16011       // ____001_ ________
16012       CARRY,
16013       CARRY,
16014 
16015       // ____0100 ________
16016       CARRY | TOO_LARGE,
16017       // ____0101 ________
16018       CARRY | TOO_LARGE | TOO_LARGE_1000,
16019       // ____011_ ________
16020       CARRY | TOO_LARGE | TOO_LARGE_1000,
16021       CARRY | TOO_LARGE | TOO_LARGE_1000,
16022 
16023       // ____1___ ________
16024       CARRY | TOO_LARGE | TOO_LARGE_1000,
16025       CARRY | TOO_LARGE | TOO_LARGE_1000,
16026       CARRY | TOO_LARGE | TOO_LARGE_1000,
16027       CARRY | TOO_LARGE | TOO_LARGE_1000,
16028       CARRY | TOO_LARGE | TOO_LARGE_1000,
16029       // ____1101 ________
16030       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
16031       CARRY | TOO_LARGE | TOO_LARGE_1000,
16032       CARRY | TOO_LARGE | TOO_LARGE_1000
16033     );
16034     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
16035       // ________ 0_______ <ASCII in byte 2>
16036       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
16037       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
16038 
16039       // ________ 1000____
16040       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
16041       // ________ 1001____
16042       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
16043       // ________ 101_____
16044       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
16045       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
16046 
16047       // ________ 11______
16048       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
16049     );
16050     return (byte_1_high & byte_1_low & byte_2_high);
16051   }
16052   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
16053       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
16054     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
16055     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
16056     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
16057     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
16058     return must23_80 ^ sc;
16059   }
16060 
16061 
16062   struct validating_transcoder {
16063     // If this is nonzero, there has been a UTF-8 error.
16064     simd8<uint8_t> error;
16065 
16066     validating_transcoder() : error(uint8_t(0)) {}
16067     //
16068     // Check whether the current bytes are valid UTF-8.
16069     //
16070     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
16071       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
16072       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
16073       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
16074       simd8<uint8_t> sc = check_special_cases(input, prev1);
16075       this->error |= check_multibyte_lengths(input, prev_input, sc);
16076     }
16077 
16078 
16079 
16080     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
16081       size_t pos = 0;
16082       char32_t* start{utf32_output};
16083       // In the worst case, we have the haswell kernel which can cause an overflow of
16084       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
16085       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
16086       // much more than 8 bytes. However, you cannot generally assume that you have valid
16087       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16088       // to give us a good margin.
16089       size_t leading_byte = 0;
16090       size_t margin = size;
16091       for(; margin > 0 && leading_byte < 4; margin--) {
16092         leading_byte += (int8_t(in[margin-1]) > -65);
16093       }
16094       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
16095       const size_t safety_margin = size - margin + 1; // to avoid overruns!
16096       while(pos + 64 + safety_margin <= size) {
16097         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16098         if(input.is_ascii()) {
16099           input.store_ascii_as_utf32(utf32_output);
16100           utf32_output += 64;
16101           pos += 64;
16102         } else {
16103           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
16104           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
16105               "We support either two or four chunks per 64-byte block.");
16106           auto zero = simd8<uint8_t>{uint8_t(0)};
16107           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
16108             this->check_utf8_bytes(input.chunks[0], zero);
16109             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16110           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
16111             this->check_utf8_bytes(input.chunks[0], zero);
16112             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16113             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16114             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16115           }
16116           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16117           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
16118           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
16119           // We process in blocks of up to 12 bytes except possibly
16120           // for fast paths which may process up to 16 bytes. For the
16121           // slow path to work, we should have at least 12 input bytes left.
16122           size_t max_starting_point = (pos + 64) - 12;
16123           // Next loop is going to run at least five times.
16124           while(pos < max_starting_point) {
16125             // Performance note: our ability to compute 'consumed' and
16126             // then shift and recompute is critical. If there is a
16127             // latency of, say, 4 cycles on getting 'consumed', then
16128             // the inner loop might have a total latency of about 6 cycles.
16129             // Yet we process between 6 to 12 inputs bytes, thus we get
16130             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
16131             // for this section of the code. Hence, there is a limit
16132             // to how much we can further increase this latency before
16133             // it seriously harms performance.
16134             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
16135                             utf8_end_of_code_point_mask, utf32_output);
16136             pos += consumed;
16137             utf8_end_of_code_point_mask >>= consumed;
16138           }
16139           // At this point there may remain between 0 and 12 bytes in the
16140           // 64-byte block. These bytes will be processed again. So we have an
16141           // 80% efficiency (in the worst case). In practice we expect an
16142           // 85% to 90% efficiency.
16143         }
16144       }
16145       if(errors()) { return 0; }
16146       if(pos < size) {
16147         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
16148         if(howmany == 0) { return 0; }
16149         utf32_output += howmany;
16150       }
16151       return utf32_output - start;
16152     }
16153 
16154     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
16155       size_t pos = 0;
16156       char32_t* start{utf32_output};
16157       // In the worst case, we have the haswell kernel which can cause an overflow of
16158       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
16159       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
16160       // much more than 8 bytes. However, you cannot generally assume that you have valid
16161       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16162       // to give us a good margin.
16163       size_t leading_byte = 0;
16164       size_t margin = size;
16165       for(; margin > 0 && leading_byte < 4; margin--) {
16166         leading_byte += (int8_t(in[margin-1]) > -65);
16167       }
16168       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
16169       const size_t safety_margin = size - margin + 1; // to avoid overruns!
16170       while(pos + 64 + safety_margin <= size) {
16171         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16172         if(input.is_ascii()) {
16173           input.store_ascii_as_utf32(utf32_output);
16174           utf32_output += 64;
16175           pos += 64;
16176         } else {
16177           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
16178           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
16179               "We support either two or four chunks per 64-byte block.");
16180           auto zero = simd8<uint8_t>{uint8_t(0)};
16181           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
16182             this->check_utf8_bytes(input.chunks[0], zero);
16183             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16184           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
16185             this->check_utf8_bytes(input.chunks[0], zero);
16186             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16187             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16188             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16189           }
16190           if (errors()) {
16191             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
16192             res.count += pos;
16193             return res;
16194           }
16195           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16196           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
16197           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
16198           // We process in blocks of up to 12 bytes except possibly
16199           // for fast paths which may process up to 16 bytes. For the
16200           // slow path to work, we should have at least 12 input bytes left.
16201           size_t max_starting_point = (pos + 64) - 12;
16202           // Next loop is going to run at least five times.
16203           while(pos < max_starting_point) {
16204             // Performance note: our ability to compute 'consumed' and
16205             // then shift and recompute is critical. If there is a
16206             // latency of, say, 4 cycles on getting 'consumed', then
16207             // the inner loop might have a total latency of about 6 cycles.
16208             // Yet we process between 6 to 12 inputs bytes, thus we get
16209             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
16210             // for this section of the code. Hence, there is a limit
16211             // to how much we can further increase this latency before
16212             // it seriously harms performance.
16213             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
16214                             utf8_end_of_code_point_mask, utf32_output);
16215             pos += consumed;
16216             utf8_end_of_code_point_mask >>= consumed;
16217           }
16218           // At this point there may remain between 0 and 12 bytes in the
16219           // 64-byte block. These bytes will be processed again. So we have an
16220           // 80% efficiency (in the worst case). In practice we expect an
16221           // 85% to 90% efficiency.
16222         }
16223       }
16224       if(errors()) {
16225         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
16226         res.count += pos;
16227         return res;
16228       }
16229       if(pos < size) {
16230         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
16231         if (res.error) {    // In case of error, we want the error position
16232           res.count += pos;
16233           return res;
16234         } else {    // In case of success, we want the number of word written
16235           utf32_output += res.count;
16236         }
16237       }
16238       return result(error_code::SUCCESS, utf32_output - start);
16239     }
16240 
16241     simdutf_really_inline bool errors() const {
16242       return this->error.any_bits_set_anywhere();
16243     }
16244 
16245   }; // struct utf8_checker
16246 } // utf8_to_utf32 namespace
16247 } // unnamed namespace
16248 } // namespace arm64
16249 } // namespace simdutf
16250 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
16251 // other functions
16252 /* begin file src/generic/utf8.h */
16253 
16254 namespace simdutf {
16255 namespace arm64 {
16256 namespace {
16257 namespace utf8 {
16258 
16259 using namespace simd;
16260 
16261 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
16262     size_t pos = 0;
16263     size_t count = 0;
16264     for(;pos + 64 <= size; pos += 64) {
16265       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16266       uint64_t utf8_continuation_mask = input.gt(-65);
16267       count += count_ones(utf8_continuation_mask);
16268     }
16269     return count + scalar::utf8::count_code_points(in + pos, size - pos);
16270 }
16271 
16272 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
16273     size_t pos = 0;
16274     size_t count = 0;
16275     // This algorithm could no doubt be improved!
16276     for(;pos + 64 <= size; pos += 64) {
16277       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16278       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16279       // We count one word for anything that is not a continuation (so
16280       // leading bytes).
16281       count += 64 - count_ones(utf8_continuation_mask);
16282       int64_t utf8_4byte = input.gteq_unsigned(240);
16283       count += count_ones(utf8_4byte);
16284     }
16285     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
16286 }
16287 } // utf8 namespace
16288 } // unnamed namespace
16289 } // namespace arm64
16290 } // namespace simdutf
16291 /* end file src/generic/utf8.h */
16292 /* begin file src/generic/utf16.h */
16293 namespace simdutf {
16294 namespace arm64 {
16295 namespace {
16296 namespace utf16 {
16297 
16298 template <endianness big_endian>
16299 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
16300     size_t pos = 0;
16301     size_t count = 0;
16302     for(;pos < size/32*32; pos += 32) {
16303       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16304       if (!match_system(big_endian)) { input.swap_bytes(); }
16305       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
16306       count += count_ones(not_pair) / 2;
16307     }
16308     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
16309 }
16310 
16311 template <endianness big_endian>
16312 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
16313     size_t pos = 0;
16314     size_t count = 0;
16315     // This algorithm could no doubt be improved!
16316     for(;pos < size/32*32; pos += 32) {
16317       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16318       if (!match_system(big_endian)) { input.swap_bytes(); }
16319       uint64_t ascii_mask = input.lteq(0x7F);
16320       uint64_t twobyte_mask = input.lteq(0x7FF);
16321       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
16322 
16323       size_t ascii_count = count_ones(ascii_mask) / 2;
16324       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
16325       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
16326       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
16327       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
16328     }
16329     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
16330 }
16331 
16332 template <endianness big_endian>
16333 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
16334     return count_code_points<big_endian>(in, size);
16335 }
16336 
16337 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
16338   size_t pos = 0;
16339 
16340   while (pos < size/32*32) {
16341     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16342     input.swap_bytes();
16343     input.store(reinterpret_cast<uint16_t *>(output));
16344     pos += 32;
16345     output += 32;
16346   }
16347 
16348   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
16349 }
16350 
16351 } // utf16
16352 } // unnamed namespace
16353 } // namespace arm64
16354 } // namespace simdutf
16355 /* end file src/generic/utf16.h */
16356 // transcoding from UTF-8 to Latin 1
16357 /* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
16358 
16359 
16360 namespace simdutf {
16361 namespace arm64 {
16362 namespace {
16363 namespace utf8_to_latin1 {
16364 using namespace simd;
16365 
16366 
16367   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
16368 // For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte,
16369 // but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else.
16370 //
16371 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
16372 // Bit 1 = Too Long (ASCII followed by continuation)
16373 // Bit 2 = Overlong 3-byte
16374 // Bit 4 = Surrogate
16375 // Bit 5 = Overlong 2-byte
16376 // Bit 7 = Two Continuations
16377     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
16378                                                 // 11______ 11______
16379     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
16380     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
16381     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
16382     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
16383     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
16384     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
16385                                                 // 11110100 101_____
16386                                                 // 11110101 1001____
16387                                                 // 11110101 101_____
16388                                                 // 1111011_ 1001____
16389                                                 // 1111011_ 101_____
16390                                                 // 11111___ 1001____
16391                                                 // 11111___ 101_____
16392     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
16393                                                 // 11110101 1000____
16394                                                 // 1111011_ 1000____
16395                                                 // 11111___ 1000____
16396     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
16397     constexpr const uint8_t FORBIDDEN  = 0xff;
16398 
16399     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
16400       // 0_______ ________ <ASCII in byte 1>
16401       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
16402       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
16403       // 10______ ________ <continuation in byte 1>
16404       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
16405       // 1100____ ________ <two byte lead in byte 1>
16406       TOO_SHORT | OVERLONG_2,
16407       // 1101____ ________ <two byte lead in byte 1>
16408       FORBIDDEN,
16409       // 1110____ ________ <three byte lead in byte 1>
16410       FORBIDDEN,
16411       // 1111____ ________ <four+ byte lead in byte 1>
16412       FORBIDDEN
16413     );
16414     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
16415     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
16416       // ____0000 ________
16417       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
16418       // ____0001 ________
16419       CARRY | OVERLONG_2,
16420       // ____001_ ________
16421       CARRY,
16422       CARRY,
16423 
16424       // ____0100 ________
16425       FORBIDDEN,
16426       // ____0101 ________
16427       FORBIDDEN,
16428       // ____011_ ________
16429       FORBIDDEN,
16430       FORBIDDEN,
16431 
16432       // ____1___ ________
16433       FORBIDDEN,
16434       FORBIDDEN,
16435       FORBIDDEN,
16436       FORBIDDEN,
16437       FORBIDDEN,
16438       // ____1101 ________
16439       FORBIDDEN,
16440       FORBIDDEN,
16441       FORBIDDEN
16442     );
16443     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
16444       // ________ 0_______ <ASCII in byte 2>
16445       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
16446       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
16447 
16448       // ________ 1000____
16449       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
16450       // ________ 1001____
16451       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
16452       // ________ 101_____
16453       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
16454       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
16455 
16456       // ________ 11______
16457       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
16458     );
16459     return (byte_1_high & byte_1_low & byte_2_high);
16460   }
16461 
16462   struct validating_transcoder {
16463     // If this is nonzero, there has been a UTF-8 error.
16464     simd8<uint8_t> error;
16465 
16466     validating_transcoder() : error(uint8_t(0)) {}
16467     //
16468     // Check whether the current bytes are valid UTF-8.
16469     //
16470     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
16471       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
16472       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
16473       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
16474       this->error |= check_special_cases(input, prev1);
16475     }
16476 
16477 
16478     simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) {
16479       size_t pos = 0;
16480       char* start{latin1_output};
16481       // In the worst case, we have the haswell kernel which can cause an overflow of
16482       // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
16483       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
16484       // much more than 8 bytes. However, you cannot generally assume that you have valid
16485       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16486       // to give us a good margin.
16487       size_t leading_byte = 0;
16488       size_t margin = size;
16489       for(; margin > 0 && leading_byte < 8; margin--) {
16490         leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ...
16491       }
16492       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
16493       const size_t safety_margin = size - margin + 1; // to avoid overruns!
16494       while(pos + 64 + safety_margin <= size) {
16495         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16496         if(input.is_ascii()) {
16497           input.store((int8_t*)latin1_output);
16498           latin1_output += 64;
16499           pos += 64;
16500         } else {
16501           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
16502           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
16503               "We support either two or four chunks per 64-byte block.");
16504           auto zero = simd8<uint8_t>{uint8_t(0)};
16505           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
16506             this->check_utf8_bytes(input.chunks[0], zero);
16507             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16508           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
16509             this->check_utf8_bytes(input.chunks[0], zero);
16510             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16511             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16512             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16513           }
16514           uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
16515           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
16516           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
16517           // We process in blocks of up to 12 bytes except possibly
16518           // for fast paths which may process up to 16 bytes. For the
16519           // slow path to work, we should have at least 12 input bytes left.
16520           size_t max_starting_point = (pos + 64) - 12;
16521           // Next loop is going to run at least five times.
16522           while(pos < max_starting_point) {
16523             // Performance note: our ability to compute 'consumed' and
16524             // then shift and recompute is critical. If there is a
16525             // latency of, say, 4 cycles on getting 'consumed', then
16526             // the inner loop might have a total latency of about 6 cycles.
16527             // Yet we process between 6 to 12 inputs bytes, thus we get
16528             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
16529             // for this section of the code. Hence, there is a limit
16530             // to how much we can further increase this latency before
16531             // it seriously harms performance.
16532             size_t consumed = convert_masked_utf8_to_latin1(in + pos,
16533                             utf8_end_of_code_point_mask, latin1_output);
16534             pos += consumed;
16535             utf8_end_of_code_point_mask >>= consumed;
16536           }
16537           // At this point there may remain between 0 and 12 bytes in the
16538           // 64-byte block. These bytes will be processed again. So we have an
16539           // 80% efficiency (in the worst case). In practice we expect an
16540           // 85% to 90% efficiency.
16541         }
16542       }
16543       if(errors()) { return 0; }
16544       if(pos < size) {
16545         size_t howmany  = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
16546         if(howmany == 0) { return 0; }
16547         latin1_output += howmany;
16548       }
16549       return latin1_output - start;
16550     }
16551 
16552     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) {
16553       size_t pos = 0;
16554       char* start{latin1_output};
16555       // In the worst case, we have the haswell kernel which can cause an overflow of
16556       // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
16557       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
16558       // much more than 8 bytes. However, you cannot generally assume that you have valid
16559       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16560       // to give us a good margin.
16561       size_t leading_byte = 0;
16562       size_t margin = size;
16563       for(; margin > 0 && leading_byte < 8; margin--) {
16564         leading_byte += (int8_t(in[margin-1]) > -65);
16565       }
16566       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
16567       const size_t safety_margin = size - margin + 1; // to avoid overruns!
16568       while(pos + 64 + safety_margin <= size) {
16569         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16570         if(input.is_ascii()) {
16571           input.store((int8_t*)latin1_output);
16572           latin1_output += 64;
16573           pos += 64;
16574         } else {
16575           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
16576           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
16577               "We support either two or four chunks per 64-byte block.");
16578           auto zero = simd8<uint8_t>{uint8_t(0)};
16579           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
16580             this->check_utf8_bytes(input.chunks[0], zero);
16581             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16582           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
16583             this->check_utf8_bytes(input.chunks[0], zero);
16584             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16585             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16586             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16587           }
16588           if (errors()) {
16589             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
16590             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
16591             result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
16592             res.count += pos;
16593             return res;
16594           }
16595           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16596           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
16597           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
16598           // We process in blocks of up to 12 bytes except possibly
16599           // for fast paths which may process up to 16 bytes. For the
16600           // slow path to work, we should have at least 12 input bytes left.
16601           size_t max_starting_point = (pos + 64) - 12;
16602           // Next loop is going to run at least five times.
16603           while(pos < max_starting_point) {
16604             // Performance note: our ability to compute 'consumed' and
16605             // then shift and recompute is critical. If there is a
16606             // latency of, say, 4 cycles on getting 'consumed', then
16607             // the inner loop might have a total latency of about 6 cycles.
16608             // Yet we process between 6 to 12 inputs bytes, thus we get
16609             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
16610             // for this section of the code. Hence, there is a limit
16611             // to how much we can further increase this latency before
16612             // it seriously harms performance.
16613             size_t consumed = convert_masked_utf8_to_latin1(in + pos,
16614                             utf8_end_of_code_point_mask, latin1_output);
16615             pos += consumed;
16616             utf8_end_of_code_point_mask >>= consumed;
16617           }
16618           // At this point there may remain between 0 and 12 bytes in the
16619           // 64-byte block. These bytes will be processed again. So we have an
16620           // 80% efficiency (in the worst case). In practice we expect an
16621           // 85% to 90% efficiency.
16622         }
16623       }
16624       if(errors()) {
16625         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
16626         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
16627         result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
16628         res.count += pos;
16629         return res;
16630       }
16631       if(pos < size) {
16632         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
16633         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
16634         result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
16635         if (res.error) {    // In case of error, we want the error position
16636           res.count += pos;
16637           return res;
16638         } else {    // In case of success, we want the number of word written
16639           latin1_output += res.count;
16640         }
16641       }
16642       return result(error_code::SUCCESS, latin1_output - start);
16643     }
16644 
16645     simdutf_really_inline bool errors() const {
16646       return this->error.any_bits_set_anywhere();
16647     }
16648 
16649   }; // struct utf8_checker
16650 } // utf8_to_latin1 namespace
16651 } // unnamed namespace
16652 } // namespace arm64
16653 } // namespace simdutf
16654 /* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
16655 /* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
16656 
16657 
16658 namespace simdutf {
16659 namespace arm64 {
16660 namespace {
16661 namespace utf8_to_latin1 {
16662 using namespace simd;
16663 
16664 
16665     simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) {
16666       size_t pos = 0;
16667       char* start{latin1_output};
16668       // In the worst case, we have the haswell kernel which can cause an overflow of
16669       // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
16670       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
16671       // much more than 8 bytes. However, you cannot generally assume that you have valid
16672       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16673       // to give us a good margin.
16674       size_t leading_byte = 0;
16675       size_t margin = size;
16676       for(; margin > 0 && leading_byte < 8; margin--) {
16677         leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ...
16678       }
16679       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
16680       const size_t safety_margin = size - margin + 1; // to avoid overruns!
16681       while(pos + 64 + safety_margin <= size) {
16682         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16683         if(input.is_ascii()) {
16684           input.store((int8_t*)latin1_output);
16685           latin1_output += 64;
16686           pos += 64;
16687         } else {
16688           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
16689           uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
16690           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
16691           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
16692           // We process in blocks of up to 12 bytes except possibly
16693           // for fast paths which may process up to 16 bytes. For the
16694           // slow path to work, we should have at least 12 input bytes left.
16695           size_t max_starting_point = (pos + 64) - 12;
16696           // Next loop is going to run at least five times.
16697           while(pos < max_starting_point) {
16698             // Performance note: our ability to compute 'consumed' and
16699             // then shift and recompute is critical. If there is a
16700             // latency of, say, 4 cycles on getting 'consumed', then
16701             // the inner loop might have a total latency of about 6 cycles.
16702             // Yet we process between 6 to 12 inputs bytes, thus we get
16703             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
16704             // for this section of the code. Hence, there is a limit
16705             // to how much we can further increase this latency before
16706             // it seriously harms performance.
16707             size_t consumed = convert_masked_utf8_to_latin1(in + pos,
16708                             utf8_end_of_code_point_mask, latin1_output);
16709             pos += consumed;
16710             utf8_end_of_code_point_mask >>= consumed;
16711           }
16712           // At this point there may remain between 0 and 12 bytes in the
16713           // 64-byte block. These bytes will be processed again. So we have an
16714           // 80% efficiency (in the worst case). In practice we expect an
16715           // 85% to 90% efficiency.
16716         }
16717       }
16718       if(pos < size) {
16719         size_t howmany  = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output);
16720         latin1_output += howmany;
16721       }
16722       return latin1_output - start;
16723     }
16724 
16725   }
16726 }   // utf8_to_latin1 namespace
16727 }   // unnamed namespace
16728 }   // namespace arm64
16729  // namespace simdutf
16730 /* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
16731 
16732 // placeholder scalars
16733 
16734 //
16735 // Implementation-specific overrides
16736 //
16737 namespace simdutf {
16738 namespace arm64 {
16739 
16740 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
16741   // If there is a BOM, then we trust it.
16742   auto bom_encoding = simdutf::BOM::check_bom(input, length);
16743   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
16744   if (length % 2 == 0) {
16745     return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
16746   } else {
16747     if (implementation::validate_utf8(input, length)) {
16748       return simdutf::encoding_type::UTF8;
16749     } else {
16750       return simdutf::encoding_type::unspecified;
16751     }
16752   }
16753 }
16754 
16755 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
16756   return arm64::utf8_validation::generic_validate_utf8(buf,len);
16757 }
16758 
16759 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
16760   return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
16761 }
16762 
16763 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
16764   return arm64::utf8_validation::generic_validate_ascii(buf,len);
16765 }
16766 
16767 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
16768   return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
16769 }
16770 
16771 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
16772   const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
16773   if (tail) {
16774     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
16775   } else {
16776     return false;
16777   }
16778 }
16779 
16780 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
16781   const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
16782   if (tail) {
16783     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
16784   } else {
16785     return false;
16786   }
16787 }
16788 
16789 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
16790   result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
16791   if (res.count != len) {
16792     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
16793     return result(scalar_res.error, res.count + scalar_res.count);
16794   } else {
16795     return res;
16796   }
16797 }
16798 
16799 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
16800   result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
16801   if (res.count != len) {
16802     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
16803     return result(scalar_res.error, res.count + scalar_res.count);
16804   } else {
16805     return res;
16806   }
16807 }
16808 
16809 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
16810   const char32_t* tail = arm_validate_utf32le(buf, len);
16811   if (tail) {
16812     return scalar::utf32::validate(tail, len - (tail - buf));
16813   } else {
16814     return false;
16815   }
16816 }
16817 
16818 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
16819   result res = arm_validate_utf32le_with_errors(buf, len);
16820   if (res.count != len) {
16821     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
16822     return result(scalar_res.error, res.count + scalar_res.count);
16823   } else {
16824     return res;
16825   }
16826 }
16827 
16828 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
16829   std::pair<const char*, char*> ret = arm_convert_latin1_to_utf8(buf, len, utf8_output);
16830   size_t converted_chars = ret.second - utf8_output;
16831 
16832   if (ret.first != buf + len) {
16833     const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
16834       ret.first, len - (ret.first - buf), ret.second);
16835     converted_chars += scalar_converted_chars;
16836   }
16837   return converted_chars;
16838 }
16839 
16840 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16841   std::pair<const char*, char16_t*> ret = arm_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
16842   size_t converted_chars = ret.second - utf16_output;
16843   if (ret.first != buf + len) {
16844     const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::LITTLE>(
16845       ret.first, len - (ret.first - buf), ret.second);
16846     converted_chars += scalar_converted_chars;
16847   }
16848   return converted_chars;
16849 }
16850 
16851 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16852   std::pair<const char*, char16_t*> ret = arm_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
16853   size_t converted_chars = ret.second - utf16_output;
16854   if (ret.first != buf + len) {
16855     const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::BIG>(
16856       ret.first, len - (ret.first - buf), ret.second);
16857     converted_chars += scalar_converted_chars;
16858   }
16859   return converted_chars;
16860 }
16861 
16862 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
16863   std::pair<const char*, char32_t*> ret = arm_convert_latin1_to_utf32(buf, len, utf32_output);
16864   size_t converted_chars = ret.second - utf32_output;
16865   if (ret.first != buf + len) {
16866     const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
16867       ret.first, len - (ret.first - buf), ret.second);
16868     converted_chars += scalar_converted_chars;
16869   }
16870   return converted_chars;
16871 }
16872 
16873 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
16874   utf8_to_latin1::validating_transcoder converter;
16875   return converter.convert(buf, len, latin1_output);
16876 }
16877 
16878 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
16879   utf8_to_latin1::validating_transcoder converter;
16880   return converter.convert_with_errors(buf, len, latin1_output);
16881 }
16882 
16883 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
16884   return arm64::utf8_to_latin1::convert_valid(buf,len,latin1_output);
16885 }
16886 
16887 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16888   utf8_to_utf16::validating_transcoder converter;
16889   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
16890 }
16891 
16892 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16893   utf8_to_utf16::validating_transcoder converter;
16894   return converter.convert<endianness::BIG>(buf, len, utf16_output);
16895 }
16896 
16897 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16898   utf8_to_utf16::validating_transcoder converter;
16899   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
16900 }
16901 
16902 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
16903   utf8_to_utf16::validating_transcoder converter;
16904   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
16905 }
16906 
16907 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
16908     char16_t* utf16_output) const noexcept {
16909   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
16910 }
16911 
16912 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
16913     char16_t* utf16_output) const noexcept {
16914   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
16915 }
16916 
16917 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
16918   utf8_to_utf32::validating_transcoder converter;
16919   return converter.convert(buf, len, utf32_output);
16920 }
16921 
16922 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
16923   utf8_to_utf32::validating_transcoder converter;
16924   return converter.convert_with_errors(buf, len, utf32_output);
16925 }
16926 
16927 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
16928     char32_t* utf32_output) const noexcept {
16929   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
16930 }
16931 
16932 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
16933   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
16934   if (ret.first == nullptr) { return 0; }
16935   size_t saved_bytes = ret.second - latin1_output;
16936 
16937   if (ret.first != buf + len) {
16938     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::LITTLE>(
16939                                         ret.first, len - (ret.first - buf), ret.second);
16940     if (scalar_saved_bytes == 0) { return 0; }
16941     saved_bytes += scalar_saved_bytes;
16942   }
16943   return saved_bytes;
16944 }
16945 
16946 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
16947   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
16948   if (ret.first == nullptr) { return 0; }
16949   size_t saved_bytes = ret.second - latin1_output;
16950 
16951   if (ret.first != buf + len) {
16952     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::BIG>(
16953                                         ret.first, len - (ret.first - buf), ret.second);
16954     if (scalar_saved_bytes == 0) { return 0; }
16955     saved_bytes += scalar_saved_bytes;
16956   }
16957   return saved_bytes;
16958 }
16959 
16960 simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
16961   std::pair<result, char*> ret = arm_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(buf, len, latin1_output);
16962   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
16963   if (ret.first.count != len) { // All good so far, but not finished
16964     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
16965                                         buf + ret.first.count, len - ret.first.count, ret.second);
16966     if (scalar_res.error) {
16967       scalar_res.count += ret.first.count;
16968       return scalar_res;
16969     } else {
16970       ret.second += scalar_res.count;
16971     }
16972   }
16973   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
16974   return ret.first;
16975 }
16976 
16977 simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
16978   std::pair<result, char*> ret = arm_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len, latin1_output);
16979   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
16980   if (ret.first.count != len) { // All good so far, but not finished
16981     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
16982                                         buf + ret.first.count, len - ret.first.count, ret.second);
16983     if (scalar_res.error) {
16984       scalar_res.count += ret.first.count;
16985       return scalar_res;
16986     } else {
16987       ret.second += scalar_res.count;
16988     }
16989   }
16990   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
16991   return ret.first;
16992 }
16993 
16994 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
16995   // optimization opportunity: implement a custom function.
16996   return convert_utf16be_to_latin1(buf, len, latin1_output);
16997 }
16998 
16999 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17000   // optimization opportunity: implement a custom function.
17001   return convert_utf16le_to_latin1(buf, len, latin1_output);
17002 }
17003 
17004 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17005   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
17006   if (ret.first == nullptr) { return 0; }
17007   size_t saved_bytes = ret.second - utf8_output;
17008   if (ret.first != buf + len) {
17009     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
17010                                         ret.first, len - (ret.first - buf), ret.second);
17011     if (scalar_saved_bytes == 0) { return 0; }
17012     saved_bytes += scalar_saved_bytes;
17013   }
17014   return saved_bytes;
17015 }
17016 
17017 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17018   std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
17019   if (ret.first == nullptr) { return 0; }
17020   size_t saved_bytes = ret.second - utf8_output;
17021   if (ret.first != buf + len) {
17022     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
17023                                         ret.first, len - (ret.first - buf), ret.second);
17024     if (scalar_saved_bytes == 0) { return 0; }
17025     saved_bytes += scalar_saved_bytes;
17026   }
17027   return saved_bytes;
17028 }
17029 
17030 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17031   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17032   std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
17033   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
17034   if (ret.first.count != len) { // All good so far, but not finished
17035     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
17036                                         buf + ret.first.count, len - ret.first.count, ret.second);
17037     if (scalar_res.error) {
17038       scalar_res.count += ret.first.count;
17039       return scalar_res;
17040     } else {
17041       ret.second += scalar_res.count;
17042     }
17043   }
17044   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
17045   return ret.first;
17046 }
17047 
17048 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17049   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17050   std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
17051   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
17052   if (ret.first.count != len) { // All good so far, but not finished
17053     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
17054                                         buf + ret.first.count, len - ret.first.count, ret.second);
17055     if (scalar_res.error) {
17056       scalar_res.count += ret.first.count;
17057       return scalar_res;
17058     } else {
17059       ret.second += scalar_res.count;
17060     }
17061   }
17062   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
17063   return ret.first;
17064 }
17065 
17066 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17067   return convert_utf16le_to_utf8(buf, len, utf8_output);
17068 }
17069 
17070 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17071   return convert_utf16be_to_utf8(buf, len, utf8_output);
17072 }
17073 
17074 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17075   std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
17076   if (ret.first == nullptr) { return 0; }
17077   size_t saved_bytes = ret.second - utf8_output;
17078   if (ret.first != buf + len) {
17079     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
17080                                         ret.first, len - (ret.first - buf), ret.second);
17081     if (scalar_saved_bytes == 0) { return 0; }
17082     saved_bytes += scalar_saved_bytes;
17083   }
17084   return saved_bytes;
17085 }
17086 
17087 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17088   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17089   std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
17090   if (ret.first.count != len) {
17091     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
17092                                         buf + ret.first.count, len - ret.first.count, ret.second);
17093     if (scalar_res.error) {
17094       scalar_res.count += ret.first.count;
17095       return scalar_res;
17096     } else {
17097       ret.second += scalar_res.count;
17098     }
17099   }
17100   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
17101   return ret.first;
17102 }
17103 
17104 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17105   std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
17106   if (ret.first == nullptr) { return 0; }
17107   size_t saved_bytes = ret.second - utf32_output;
17108   if (ret.first != buf + len) {
17109     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
17110                                         ret.first, len - (ret.first - buf), ret.second);
17111     if (scalar_saved_bytes == 0) { return 0; }
17112     saved_bytes += scalar_saved_bytes;
17113   }
17114   return saved_bytes;
17115 }
17116 
17117 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17118   std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
17119   if (ret.first == nullptr) { return 0; }
17120   size_t saved_bytes = ret.second - utf32_output;
17121   if (ret.first != buf + len) {
17122     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
17123                                         ret.first, len - (ret.first - buf), ret.second);
17124     if (scalar_saved_bytes == 0) { return 0; }
17125     saved_bytes += scalar_saved_bytes;
17126   }
17127   return saved_bytes;
17128 }
17129 
17130 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17131   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17132   std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
17133   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
17134   if (ret.first.count != len) { // All good so far, but not finished
17135     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
17136                                         buf + ret.first.count, len - ret.first.count, ret.second);
17137     if (scalar_res.error) {
17138       scalar_res.count += ret.first.count;
17139       return scalar_res;
17140     } else {
17141       ret.second += scalar_res.count;
17142     }
17143   }
17144   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
17145   return ret.first;
17146 }
17147 
17148 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17149   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17150   std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
17151   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
17152   if (ret.first.count != len) { // All good so far, but not finished
17153     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
17154                                         buf + ret.first.count, len - ret.first.count, ret.second);
17155     if (scalar_res.error) {
17156       scalar_res.count += ret.first.count;
17157       return scalar_res;
17158     } else {
17159       ret.second += scalar_res.count;
17160     }
17161   }
17162   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
17163   return ret.first;
17164 }
17165 
17166 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17167   std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_latin1(buf, len, latin1_output);
17168   if (ret.first == nullptr) { return 0; }
17169   size_t saved_bytes = ret.second - latin1_output;
17170 
17171   if (ret.first != buf + len) {
17172     const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
17173                                         ret.first, len - (ret.first - buf), ret.second);
17174     if (scalar_saved_bytes == 0) { return 0; }
17175     saved_bytes += scalar_saved_bytes;
17176   }
17177   return saved_bytes;
17178 }
17179 
17180 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17181   std::pair<result, char*> ret = arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
17182   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
17183   if (ret.first.count != len) { // All good so far, but not finished
17184     result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
17185                                         buf + ret.first.count, len - ret.first.count, ret.second);
17186     if (scalar_res.error) {
17187       scalar_res.count += ret.first.count;
17188       return scalar_res;
17189     } else {
17190       ret.second += scalar_res.count;
17191     }
17192   }
17193   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
17194   return ret.first;
17195 }
17196 
17197 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17198   std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_latin1(buf, len, latin1_output);
17199   if (ret.first == nullptr) { return 0; }
17200   size_t saved_bytes = ret.second - latin1_output;
17201 
17202   if (ret.first != buf + len) {
17203     const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
17204                                         ret.first, len - (ret.first - buf), ret.second);
17205     saved_bytes += scalar_saved_bytes;
17206   }
17207   return saved_bytes;
17208 }
17209 
17210 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17211   // optimization opportunity: implement a custom function.
17212   return convert_utf32_to_utf8(buf, len, utf8_output);
17213 }
17214 
17215 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17216   std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
17217   if (ret.first == nullptr) { return 0; }
17218   size_t saved_bytes = ret.second - utf16_output;
17219   if (ret.first != buf + len) {
17220     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
17221                                         ret.first, len - (ret.first - buf), ret.second);
17222     if (scalar_saved_bytes == 0) { return 0; }
17223     saved_bytes += scalar_saved_bytes;
17224   }
17225   return saved_bytes;
17226 }
17227 
17228 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17229   std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
17230   if (ret.first == nullptr) { return 0; }
17231   size_t saved_bytes = ret.second - utf16_output;
17232   if (ret.first != buf + len) {
17233     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
17234                                         ret.first, len - (ret.first - buf), ret.second);
17235     if (scalar_saved_bytes == 0) { return 0; }
17236     saved_bytes += scalar_saved_bytes;
17237   }
17238   return saved_bytes;
17239 }
17240 
17241 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17242   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17243   std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
17244   if (ret.first.count != len) {
17245     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
17246                                         buf + ret.first.count, len - ret.first.count, ret.second);
17247     if (scalar_res.error) {
17248       scalar_res.count += ret.first.count;
17249       return scalar_res;
17250     } else {
17251       ret.second += scalar_res.count;
17252     }
17253   }
17254   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
17255   return ret.first;
17256 }
17257 
17258 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17259   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
17260   std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
17261   if (ret.first.count != len) {
17262     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
17263                                         buf + ret.first.count, len - ret.first.count, ret.second);
17264     if (scalar_res.error) {
17265       scalar_res.count += ret.first.count;
17266       return scalar_res;
17267     } else {
17268       ret.second += scalar_res.count;
17269     }
17270   }
17271   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
17272   return ret.first;
17273 }
17274 
17275 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17276   return convert_utf32_to_utf16le(buf, len, utf16_output);
17277 }
17278 
17279 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17280   return convert_utf32_to_utf16be(buf, len, utf16_output);
17281 }
17282 
17283 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17284   return convert_utf16le_to_utf32(buf, len, utf32_output);
17285 }
17286 
17287 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17288   return convert_utf16be_to_utf32(buf, len, utf32_output);
17289 }
17290 
17291 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
17292   utf16::change_endianness_utf16(input, length, output);
17293 }
17294 
17295 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
17296   return utf16::count_code_points<endianness::LITTLE>(input, length);
17297 }
17298 
17299 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
17300   return utf16::count_code_points<endianness::BIG>(input, length);
17301 }
17302 
17303 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
17304   return utf8::count_code_points(input, length);
17305 }
17306 
17307 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
17308   return count_utf8(buf,len);
17309 }
17310 
17311 simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
17312   return scalar::utf16::latin1_length_from_utf16(length);
17313 }
17314 
17315 simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept {
17316   return scalar::utf32::latin1_length_from_utf32(length);
17317 }
17318 
17319 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
17320   // See https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
17321   // credit to Pete Cawley
17322   const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
17323   uint64_t result = 0;
17324   const int lanes = sizeof(uint8x16_t);
17325   uint8_t rem = length % lanes;
17326   const uint8_t *simd_end = data + (length / lanes) * lanes;
17327   const uint8x16_t threshold = vdupq_n_u8(0x80);
17328   for (; data < simd_end; data += lanes) {
17329     // load 16 bytes
17330     uint8x16_t input_vec = vld1q_u8(data);
17331     // compare to threshold (0x80)
17332     uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
17333     // vertical addition
17334     result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
17335   }
17336   return result + (length / lanes) * lanes + scalar::latin1::utf8_length_from_latin1((const char*)simd_end, rem);
17337 }
17338 
17339 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17340   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
17341 }
17342 
17343 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17344   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
17345 }
17346 
17347 
17348 simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
17349   return scalar::latin1::utf16_length_from_latin1(length);
17350 }
17351 
17352 
17353 simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
17354   return scalar::latin1::utf32_length_from_latin1(length);
17355 }
17356 
17357 
17358 
17359 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17360   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
17361 }
17362 
17363 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17364   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
17365 }
17366 
17367 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
17368   return utf8::utf16_length_from_utf8(input, length);
17369 }
17370 
17371 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17372   const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
17373   const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
17374   const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
17375   const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
17376   size_t pos = 0;
17377   size_t count = 0;
17378   for(;pos + 4 <= length; pos += 4) {
17379     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
17380     const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
17381     const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
17382     const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
17383     const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
17384 
17385     const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
17386     const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
17387     const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
17388 
17389     const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
17390     const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
17391 
17392     size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
17393     size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
17394     size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
17395 
17396     count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
17397   }
17398   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
17399 }
17400 
17401 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17402   const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
17403   const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
17404   size_t pos = 0;
17405   size_t count = 0;
17406   for(;pos + 4 <= length; pos += 4) {
17407     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
17408     const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
17409     const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
17410     const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
17411     size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
17412     count += 4 + surrogate_count;
17413   }
17414   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
17415 }
17416 
17417 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
17418   return utf8::count_code_points(input, length);
17419 }
17420 
17421 } // namespace arm64
17422 } // namespace simdutf
17423 
17424 /* begin file src/simdutf/arm64/end.h */
17425 /* end file src/simdutf/arm64/end.h */
17426 /* end file src/arm64/implementation.cpp */
17427 #endif
17428 #if SIMDUTF_IMPLEMENTATION_FALLBACK
17429 /* begin file src/fallback/implementation.cpp */
17430 /* begin file src/simdutf/fallback/begin.h */
17431 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
17432 // #define SIMDUTF_IMPLEMENTATION fallback
17433 /* end file src/simdutf/fallback/begin.h */
17434 
17435 
17436 
17437 
17438 
17439 
17440 
17441 
17442 
17443 namespace simdutf {
17444 namespace fallback {
17445 
17446 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
17447   // If there is a BOM, then we trust it.
17448   auto bom_encoding = simdutf::BOM::check_bom(input, length);
17449   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
17450   int out = 0;
17451   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
17452   if((length % 2) == 0) {
17453     if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
17454   }
17455   if((length % 4) == 0) {
17456     if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
17457   }
17458 
17459   return out;
17460 }
17461 
17462 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
17463   return scalar::utf8::validate(buf, len);
17464 }
17465 
17466 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
17467   return scalar::utf8::validate_with_errors(buf, len);
17468 }
17469 
17470 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
17471   return scalar::ascii::validate(buf, len);
17472 }
17473 
17474 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
17475   return scalar::ascii::validate_with_errors(buf, len);
17476 }
17477 
17478 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
17479   return scalar::utf16::validate<endianness::LITTLE>(buf, len);
17480 }
17481 
17482 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
17483   return scalar::utf16::validate<endianness::BIG>(buf, len);
17484 }
17485 
17486 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
17487   return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
17488 }
17489 
17490 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
17491   return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
17492 }
17493 
17494 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
17495   return scalar::utf32::validate(buf, len);
17496 }
17497 
17498 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
17499   return scalar::utf32::validate_with_errors(buf, len);
17500 }
17501 
17502 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
17503   return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
17504 }
17505 
17506 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17507   return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
17508 }
17509 
17510 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17511   return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
17512 }
17513 
17514 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept {
17515   return scalar::latin1_to_utf32::convert(buf,len,utf32_output);
17516 }
17517 
17518 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
17519   return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
17520 }
17521 
17522 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
17523   return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
17524 }
17525 
17526 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
17527   return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
17528 }
17529 
17530 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17531   return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
17532 }
17533 
17534 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17535   return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
17536 }
17537 
17538 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17539   return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
17540 }
17541 
17542 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17543   return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
17544 }
17545 
17546 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17547   return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
17548 }
17549 
17550 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
17551   return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
17552 }
17553 
17554 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
17555   return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
17556 }
17557 
17558 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
17559   return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
17560 }
17561 
17562 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
17563     char32_t* utf32_output) const noexcept {
17564   return scalar::utf8_to_utf32::convert_valid(input, size,  utf32_output);
17565 }
17566 
17567 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17568   return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
17569 }
17570 
17571 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17572   return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
17573 }
17574 
17575 simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17576   return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
17577 }
17578 
17579 simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17580   return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
17581 }
17582 
17583 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17584   return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
17585 }
17586 
17587 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
17588   return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
17589 }
17590 
17591 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17592   return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
17593 }
17594 
17595 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17596   return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
17597 }
17598 
17599 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17600   return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
17601 }
17602 
17603 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17604   return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
17605 }
17606 
17607 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17608   return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
17609 }
17610 
17611 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
17612   return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
17613 }
17614 
17615 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17616   return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
17617 }
17618 
17619 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17620   return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
17621 }
17622 
17623 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
17624   return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
17625 }
17626 
17627 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17628   return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
17629 }
17630 
17631 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17632   return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
17633 }
17634 
17635 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
17636   return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
17637 }
17638 
17639 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17640   return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
17641 }
17642 
17643 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17644   return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
17645 }
17646 
17647 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17648   return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
17649 }
17650 
17651 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17652   return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
17653 }
17654 
17655 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17656   return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
17657 }
17658 
17659 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
17660   return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
17661 }
17662 
17663 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17664   return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
17665 }
17666 
17667 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17668   return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
17669 }
17670 
17671 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17672   return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
17673 }
17674 
17675 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17676   return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
17677 }
17678 
17679 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17680   return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
17681 }
17682 
17683 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
17684   return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
17685 }
17686 
17687 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
17688   scalar::utf16::change_endianness_utf16(input, length, output);
17689 }
17690 
17691 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
17692   return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
17693 }
17694 
17695 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
17696   return scalar::utf16::count_code_points<endianness::BIG>(input, length);
17697 }
17698 
17699 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
17700   return scalar::utf8::count_code_points(input, length);
17701 }
17702 
17703 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
17704   return scalar::utf8::count_code_points(buf,len);
17705 }
17706 
17707 simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
17708   return scalar::utf16::latin1_length_from_utf16(length);
17709 }
17710 
17711 simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept {
17712   return length;
17713 }
17714 
17715 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
17716   return scalar::latin1::utf8_length_from_latin1(input,length);
17717 }
17718 
17719 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17720   return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
17721 }
17722 
17723 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17724   return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
17725 }
17726 
17727 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17728   return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
17729 }
17730 
17731 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17732   return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
17733 }
17734 
17735 simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
17736   return scalar::latin1::utf16_length_from_latin1(length);
17737 }
17738 
17739 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
17740   return scalar::utf8::utf16_length_from_utf8(input, length);
17741 }
17742 
17743 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17744   return scalar::utf32::utf8_length_from_utf32(input, length);
17745 }
17746 
17747 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17748   return scalar::utf32::utf16_length_from_utf32(input, length);
17749 }
17750 
17751 simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
17752   return scalar::latin1::utf32_length_from_latin1(length);
17753 }
17754 
17755 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
17756   return scalar::utf8::count_code_points(input, length);
17757 }
17758 
17759 } // namespace fallback
17760 } // namespace simdutf
17761 
17762 /* begin file src/simdutf/fallback/end.h */
17763 /* end file src/simdutf/fallback/end.h */
17764 /* end file src/fallback/implementation.cpp */
17765 #endif
17766 #if SIMDUTF_IMPLEMENTATION_ICELAKE
17767 /* begin file src/icelake/implementation.cpp */
17768 
17769 
17770 /* begin file src/simdutf/icelake/begin.h */
17771 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
17772 // #define SIMDUTF_IMPLEMENTATION icelake
17773 
17774 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
17775 // nothing needed.
17776 #else
17777 SIMDUTF_TARGET_ICELAKE
17778 #endif
17779 
17780 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
17781 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
17782 #endif // end of workaround
17783 /* end file src/simdutf/icelake/begin.h */
17784 namespace simdutf {
17785 namespace icelake {
17786 namespace {
17787 #ifndef SIMDUTF_ICELAKE_H
17788 #error "icelake.h must be included"
17789 #endif
17790 /* begin file src/icelake/icelake_utf8_common.inl.cpp */
// Common procedures for both validating and non-validating conversions from UTF-8.

// Block handling mode, used as a compile-time template argument by the block
// processing routines: SIMDUTF_FULL means a whole 64-byte block may be read,
// SIMDUTF_TAIL means only the first 'gap' bytes of the block are relevant.
enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL};

// Pairs of (UTF-8 input pointer, output pointer) for the UTF-16 and UTF-32
// conversions respectively.
// NOTE(review): presumably the positions reached by a conversion routine --
// confirm against the functions that return these types.
using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
17796 
17797 /*
17798     process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
17799     to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
17800     might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
17801     indicates how many input bytes are relevant.
17802 
17803     Returns true when the result is correct, otherwise it returns false.
17804 
17805     The provided in and out pointers are advanced according to how many input
17806     bytes have been processed, upon success.
17807 */
17808 template <block_processing_mode tail, endianness big_endian>
17809 simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
17810   // constants
17811   __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
17812   __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
17813   __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
17814   __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
17815   __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
17816   __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
17817   __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
17818   __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
17819   __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
17820   __m512i byteflip = _mm512_setr_epi64(
17821             0x0607040502030001,
17822             0x0e0f0c0d0a0b0809,
17823             0x0607040502030001,
17824             0x0e0f0c0d0a0b0809,
17825             0x0607040502030001,
17826             0x0e0f0c0d0a0b0809,
17827             0x0607040502030001,
17828             0x0e0f0c0d0a0b0809
17829         );
17830   // Note that 'tail' is a compile-time constant !
17831   __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
17832   __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
17833   __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
17834   if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
17835   // alternatively, we could do 'if (m1 == b) { '
17836     if (tail == SIMDUTF_FULL) {
17837       in += 64;          // consumed 64 bytes
17838       // we convert a full 64-byte block, writing 128 bytes.
17839       __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17840       if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
17841       _mm512_storeu_si512(out, input1);
17842       out += 32;
17843       __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
17844       if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
17845       _mm512_storeu_si512(out, input2);
17846       out += 32;
17847       return true; // we are done
17848     } else {
17849       in += gap;
17850       if (gap <= 32) {
17851         __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17852         if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
17853         _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
17854         out += gap;
17855       } else {
17856         __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17857         if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
17858         _mm512_storeu_si512(out, input1);
17859         out += 32;
17860         __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
17861         if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
17862         _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
17863         out += gap - 32;
17864       }
17865       return true; // we are done
17866     }
17867   }
17868   // classify characters further
17869   __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
17870                                         _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
17871   __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
17872                                        _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
17873 
17874   __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
17875                                                      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
17876                                                                      // Overlong 2-byte sequence
17877   if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
17878     // Overlong 2-byte sequence
17879     return false;
17880   }
17881   if (_ktestz_mask64_u8(m34, m34) == 0) {
17882     // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
17883     __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
17884                                         _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
17885 
17886     __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
17887 
17888     __mmask64 mp1 = _kshiftli_mask64(m234, 1);
17889     __mmask64 mp2 = _kshiftli_mask64(m34, 2);
17890     // We could do it as follows...
17891     // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
17892     // but GCC generates better code when we do:
17893     if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
17894       // Fast path with 1,2,3 bytes
17895       __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
17896       __mmask64 m1234 = _kor_mask64(m1, m234);
17897       // mismatched continuation bytes:
17898       if (tail == SIMDUTF_FULL) {
17899         __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
17900         // the presence of a 1 bit indicates that they overlap.
17901         // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
17902         if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
17903       } else {
17904         __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
17905         if (mc != bxorm1234) { return false; }
17906       }
17907       // mend: identifying the last bytes of each sequence to be decoded
17908       __mmask64 mend = _kshiftri_mask64(m1234, 1);
17909       if (tail != SIMDUTF_FULL) {
17910         mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
17911       }
17912 
17913 
17914       __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
17915       __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
17916 
17917       __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
17918       __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
17919       __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
17920                                                         clearedbytes); // the last byte of each character
17921 
17922       __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
17923       __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
17924       __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
17925       __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
17926                                                               beforeasciibytes); // the second last bytes (of two, three byte seq,
17927                                                                                  // surrogates)
17928       secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
17929 
17930       __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
17931                                                        indexofsecondlastbytes); // indices of the second last bytes
17932       __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
17933                                                     clearedbytes); // only those that are the third last byte of a sequece
17934       __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
17935                                                              thirdlastbyte); // the third last bytes (of three byte sequences, hi
17936                                                                              // surrogate)
17937       thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
17938       __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
17939       // the elements of Wout excluding the last element if it happens to be a high surrogate:
17940 
17941       __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
17942 
17943 
17944       // Encodings out of range...
17945       {
17946         // the location of 3-byte sequence start bytes in the input
17947         __mmask64 m3 = m34 & (b ^ m4);
17948         // code units in Wout corresponding to 3-byte sequences.
17949         __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
17950         __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
17951         __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
17952         __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
17953         __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
17954         __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
17955         if (_kor_mask32(Msmall800, M3s)) { return false; }
17956       }
17957       int64_t nout = _mm_popcnt_u64(mprocessed);
17958       in +=  64 - _lzcnt_u64(mprocessed);
17959       if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
17960       _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
17961       out += nout;
17962       return true; // ok
17963     }
17964     //
17965     // We have a 4-byte sequence, this is the general case.
17966     // Slow!
17967     __mmask64 mp3 = _kshiftli_mask64(m4, 3);
17968     __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
17969     __mmask64 m1234 = _kor_mask64(m1, m234);
17970 
17971     // mend: identifying the last bytes of each sequence to be decoded
17972     __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
17973     if (tail != SIMDUTF_FULL) {
17974       mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
17975     }
17976     __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
17977     __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
17978 
17979     __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
17980     __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
17981     __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
17982                                                       clearedbytes); // the last byte of each character
17983 
17984     __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
17985     __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
17986     __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
17987     __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
17988                                                             beforeasciibytes); // the second last bytes (of two, three byte seq,
17989                                                                                // surrogates)
17990     secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
17991 
17992     __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
17993                                                      indexofsecondlastbytes); // indices of the second last bytes
17994     __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
17995                                                   clearedbytes); // only those that are the third last byte of a sequece
17996     __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
17997                                                            thirdlastbyte); // the third last bytes (of three byte sequences, hi
17998                                                                            // surrogate)
17999     thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
18000     __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
18001     uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
18002     __mmask32 Mlo = __mmask32(Mlo_uint64);
18003     __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
18004     __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
18005                                                   mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
18006     __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
18007                                                                  4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
18008     __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
18009                                                    lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
18010     __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
18011                                          mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
18012     // the elements of Wout excluding the last element if it happens to be a high surrogate:
18013     __mmask32 Mout = ~(Mhi & 0x80000000);
18014     __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
18015 
18016 
18017     // mismatched continuation bytes:
18018     if (tail == SIMDUTF_FULL) {
18019       __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
18020       // the presence of a 1 bit indicates that they overlap.
18021       // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
18022       if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
18023     } else {
18024       __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
18025       if (mc != bxorm1234) { return false; }
18026     }
18027     // Encodings out of range...
18028     {
18029       // the location of 3-byte sequence start bytes in the input
18030       __mmask64 m3 = m34 & (b ^ m4);
18031       // code units in Wout corresponding to 3-byte sequences.
18032       __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
18033       __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
18034       __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
18035       __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
18036       __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
18037       __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
18038       __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
18039       __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
18040       if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
18041     }
18042     in += 64 - _lzcnt_u64(mprocessed);
18043     int64_t nout = _mm_popcnt_u64(mprocessed);
18044     if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
18045     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
18046     out += nout;
18047     return true; // ok
18048   }
18049   // Fast path 2: all ASCII or 2 byte
18050   __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
18051   // on top of -0xc0 we substract -2 which we get back later of the
18052   // continuation byte tags
18053   __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
18054   __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
18055   if (tail == SIMDUTF_FULL) {
18056     __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
18057     if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
18058   } else {
18059     __mmask64 bxorleading = _kxor_mask64(b, leading);
18060     if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
18061   }
18062   //
18063   if (tail == SIMDUTF_FULL) {
18064     // In the two-byte/ASCII scenario, we are easily latency bound, so we want
18065     // to increment the input buffer as quickly as possible.
18066     // We process 32 bytes unless the byte at index 32 is a continuation byte,
18067     // in which case we include it as well for a total of 33 bytes.
18068     // Note that if x is an ASCII byte, then the following is false:
18069     // int8_t(x) <= int8_t(0xc0) under two's complement.
18070     in += 32;
18071     if(int8_t(*in) <= int8_t(0xc0)) in++;
18072     // The alternative is to do
18073     // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
18074     // but it requires loading the input, doing the mask computation, and converting
18075     // back the mask to a general register. It just takes too long, leaving the
18076     // processor likely to be idle.
18077   } else {
18078     in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
18079   }
18080   __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte);          // will contain zero for ascii, and the data
18081   lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead));                 // ... zero extended into code units
18082   __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
18083   follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow));             // ... zero extended into code units
18084   lead = _mm512_slli_epi16(lead, 6);                                         // shifted into position
18085   __m512i final = _mm512_add_epi16(follow, lead);                            // combining lead and follow
18086 
18087   if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
18088   if (tail == SIMDUTF_FULL) {
18089     // Next part is UTF-16 specific and can be generalized to UTF-32.
18090     int nout = _mm_popcnt_u32(uint32_t(leading));
18091     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
18092     out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
18093   } else {
18094     int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
18095     _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
18096     out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
18097   }
18098 
18099   return true; // we are fine.
18100 }
18101 
18102 
18103 
18104 
18105 /*
18106     utf32_to_utf16_masked converts `count` lower UTF-32 code units
18107     from input `utf32` into UTF-16. It differs from utf32_to_utf16
18108     in that it 'masks' the writes.
18109 
18110     Returns how many 16-bit code units were stored.
18111 
18112     byteflip is used for flipping 16-bit code units, and it should be
18113         __m512i byteflip = _mm512_setr_epi64(
18114             0x0607040502030001,
18115             0x0e0f0c0d0a0b0809,
18116             0x0607040502030001,
18117             0x0e0f0c0d0a0b0809,
18118             0x0607040502030001,
18119             0x0e0f0c0d0a0b0809,
18120             0x0607040502030001,
18121             0x0e0f0c0d0a0b0809
18122         );
18123     We pass it to the (always inlined) function to encourage the compiler to
18124     keep the value in a (constant) register.
18125 */
18126 template <endianness big_endian>
18127 simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
18128 
18129     const __mmask16 valid = uint16_t((1 << count) - 1);
18130     // 1. check if we have any surrogate pairs
18131     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
18132     const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
18133 
18134     if (sp_mask == 0) {
18135         if(big_endian) {
18136           _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
18137 
18138         } else {
18139           _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
18140         }
18141         return count;
18142     }
18143 
18144     {
18145         // build surrogate pair code units in 32-bit lanes
18146 
18147         //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
18148         const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
18149         const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
18150 
18151         //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
18152         const __m512i t1 = _mm512_slli_epi32(t0, 6);
18153 
18154         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18155         //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
18156         const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
18157         const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
18158 
18159         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18160         //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
18161         const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
18162         const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
18163         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
18164         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
18165         __m512i t5 = _mm512_ror_epi32(t4, 16);
18166         // Here we want to trim all of the upper 16-bit code units from the 2-byte
18167         // characters represented as 4-byte values. We can compute it from
18168         // sp_mask or the following... It can be more optimized!
18169         const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
18170         const  __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
18171         if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
18172         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
18173         __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
18174         _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
18175         //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
18176     }
18177 
18178     return count + static_cast<unsigned int>(count_ones(sp_mask));
18179 }
18180 
18181 /*
18182     utf32_to_utf16 converts `count` lower UTF-32 code units
18183     from input `utf32` into UTF-16. It may overflow.
18184 
18185     Returns how many 16-bit code units were stored.
18186 
18187     byteflip is used for flipping 16-bit code units, and it should be
18188         __m512i byteflip = _mm512_setr_epi64(
18189             0x0607040502030001,
18190             0x0e0f0c0d0a0b0809,
18191             0x0607040502030001,
18192             0x0e0f0c0d0a0b0809,
18193             0x0607040502030001,
18194             0x0e0f0c0d0a0b0809,
18195             0x0607040502030001,
18196             0x0e0f0c0d0a0b0809
18197         );
18198     We pass it to the (always inlined) function to encourage the compiler to
18199     keep the value in a (constant) register.
18200 */
18201 template <endianness big_endian>
18202 simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
18203     // check if we have any surrogate pairs
18204     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
18205     const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
18206 
18207     if (sp_mask == 0) {
18208         // technically, it should be _mm256_storeu_epi16
18209         if(big_endian) {
18210           _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
18211         } else {
18212           _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
18213         }
18214         return count;
18215     }
18216 
18217     {
18218         // build surrogate pair code units in 32-bit lanes
18219 
18220         //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
18221         const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
18222         const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
18223 
18224         //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
18225         const __m512i t1 = _mm512_slli_epi32(t0, 6);
18226 
18227         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18228         //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
18229         const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
18230         const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
18231 
18232         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18233         //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
18234         const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
18235         const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
18236         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
18237         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
18238         __m512i t5 = _mm512_ror_epi32(t4, 16);
18239         const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
18240         if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
18241         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
18242         __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
18243         _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
18244         //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
18245     }
18246 
18247     return count + static_cast<unsigned int>(count_ones(sp_mask));
18248 }
18249 
18250 /**
18251  * Store the last N bytes of previous followed by 512-N bytes from input.
18252  */
18253 template <int N>
18254 __m512i prev(__m512i input, __m512i previous) {
18255     static_assert(N<=32, "N must be no larger than 32");
18256     const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
18257     const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
18258 #if SIMDUTF_GCC8 || SIMDUTF_GCC9
18259     constexpr int shift = 16-N; // workaround for GCC8,9
18260     return _mm512_alignr_epi8(input, rotated, shift);
18261 #else
18262     return _mm512_alignr_epi8(input, rotated, 16-N);
18263 #endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
18264 }
18265 
// Rearranges the four 128-bit lanes of v: output lane k is input lane idxk.
template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
__m512i shuffle_epi128(__m512i v) {
    // the indices are unsigned, so only the upper bound needs checking
    static_assert(idx0 <= 3, "idx0 must be in range 0..3");
    static_assert(idx1 <= 3, "idx1 must be in range 0..3");
    static_assert(idx2 <= 3, "idx2 must be in range 0..3");
    static_assert(idx3 <= 3, "idx3 must be in range 0..3");

    // two selector bits per output lane, lane 0 in the lowest bits
    constexpr unsigned lane_selector = (idx3 << 6) | (idx2 << 4) | (idx1 << 2) | idx0;
    return _mm512_shuffle_i32x4(v, v, lane_selector);
}
18276 
// Copies the 128-bit lane number idx of v into all four lanes of the result
// (shuffle_epi128 with the same index for every output lane).
// NOTE(review): shuffle_epi128 is not declared constexpr, so the constexpr
// specifier here cannot make a call usable in a constant expression -- confirm
// whether it is intentional.
template <unsigned idx>
constexpr __m512i broadcast_epi128(__m512i v) {
    return shuffle_epi128<idx, idx, idx, idx>(v);
}
18281 
18282 /**
18283  * Current unused.
18284  */
18285 template <int N>
18286 __m512i rotate_by_N_epi8(const __m512i input) {
18287 
18288     // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
18289     const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
18290 
18291     return _mm512_alignr_epi8(permuted, input, N);
18292 }
18293 
18294 /*
18295     expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
18296     stored at separate 32-bit lanes.
18297 
18298     For each lane we have also a character class (`char_class), given in form
18299     0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
18300     corresponding bytes during pshufb.
18301 */
18302 simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
18303     /*
18304         Input:
18305         - utf8: bytes stored at separate 32-bit code units
18306         - valid: which code units have valid UTF-8 characters
18307 
18308         Bit layout of single word. We show 4 cases for each possible
18309         UTF-8 character encoding. The `?` denotes bits we must not
18310         assume their value.
18311 
18312         |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
18313         |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
18314         |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
18315         |????.????|????.????|????.????|0aaa.aaaa| ASCII char
18316           byte 3    byte 2    byte 1     byte 0
18317     */
18318 
18319     /* 1. Reset control bits of continuation bytes and the MSB
18320           of the leading byte; this makes all bytes unsigned (and
18321           does not alter ASCII char).
18322 
18323         |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
18324         |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
18325         |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
18326         |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
18327          ^^        ^^        ^^        ^
18328     */
18329     __m512i values;
18330     const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
18331     values = _mm512_and_si512(utf8, v_3f3f_3f7f);
18332 
18333     /* 2. Swap and join fields A-B and C-D
18334 
18335         |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
18336         |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
18337         |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
18338         |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
18339     const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
18340     values = _mm512_maddubs_epi16(values, v_0140_0140);
18341 
18342     /* 3. Swap and join fields AB & CD
18343 
18344         |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
18345         |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
18346         |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
18347         |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
18348     const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
18349     values = _mm512_madd_epi16(values, v_0001_1000);
18350 
18351     /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
18352         |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
18353         |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
18354         |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
18355         |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
18356     {
18357         /** pshufb
18358 
18359         continuation = 0
18360         ascii    = 7
18361         _2_bytes = 9
18362         _3_bytes = 10
18363         _4_bytes = 11
18364 
18365         shift_left_v3 = 4 * [
18366             ascii, # 0000
18367             ascii, # 0001
18368             ascii, # 0010
18369             ascii, # 0011
18370             ascii, # 0100
18371             ascii, # 0101
18372             ascii, # 0110
18373             ascii, # 0111
18374             continuation, # 1000
18375             continuation, # 1001
18376             continuation, # 1010
18377             continuation, # 1011
18378             _2_bytes, # 1100
18379             _2_bytes, # 1101
18380             _3_bytes, # 1110
18381             _4_bytes, # 1111
18382         ] */
18383         const __m512i shift_left_v3 = _mm512_setr_epi64(
18384             0x0707070707070707,
18385             0x0b0a090900000000,
18386             0x0707070707070707,
18387             0x0b0a090900000000,
18388             0x0707070707070707,
18389             0x0b0a090900000000,
18390             0x0707070707070707,
18391             0x0b0a090900000000
18392         );
18393 
18394         const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
18395         values = _mm512_sllv_epi32(values, shift);
18396     }
18397 
18398     /* 5. Shift right the values by variable amounts to reset lowest bits
18399         |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
18400         |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
18401         |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
18402         |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
18403     {
18404         // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
18405         const __m512i shift_right = _mm512_setr_epi64(
18406             0x1919191919191919,
18407             0x0b10151500000000,
18408             0x1919191919191919,
18409             0x0b10151500000000,
18410             0x1919191919191919,
18411             0x0b10151500000000,
18412             0x1919191919191919,
18413             0x0b10151500000000
18414         );
18415 
18416         const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
18417         values = _mm512_srlv_epi32(values, shift);
18418     }
18419 
18420     return values;
18421 }
18422 
18423 
18424 simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
18425     const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
18426     const __m512i expand_ver2 = _mm512_setr_epi64(
18427                 0x0403020103020100,
18428                 0x0605040305040302,
18429                 0x0807060507060504,
18430                 0x0a09080709080706,
18431                 0x0c0b0a090b0a0908,
18432                 0x0e0d0c0b0d0c0b0a,
18433                 0x000f0e0d0f0e0d0c,
18434                 0x0201000f01000f0e
18435     );
18436     const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
18437     const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
18438     const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
18439     const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
18440     const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
18441     count = static_cast<int>(count_ones(leading_bytes));
18442     return  _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
18443 }
18444 
18445 simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
18446     __m512i char_class = _mm512_srli_epi32(input, 4);
18447     /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
18448     const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
18449     const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
18450     char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
18451     return expanded_utf8_to_utf32(char_class, input);
18452 }
18453 /* end file src/icelake/icelake_utf8_common.inl.cpp */
18454 /* begin file src/icelake/icelake_macros.inl.cpp */
18455 
18456 /*
    This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a UTF-8 string)
    and loads all possible 4-byte substrings into an AVX512 register.
18459 
18460     For example if we have bytes abcdefgh... we create following 32-bit lanes
18461 
18462     [abcd|bcde|cdef|defg|efgh|...]
18463      ^                          ^
18464      byte 0 of reg              byte 63 of reg
18465 */
18466 /** pshufb
        # lane{0,1,2} have got bytes: [  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
        # lane3 has got bytes:        [ 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
18469 
18470         expand_ver2 = [
18471             # lane 0:
18472             0, 1, 2, 3,
18473             1, 2, 3, 4,
18474             2, 3, 4, 5,
18475             3, 4, 5, 6,
18476 
18477             # lane 1:
18478             4, 5, 6, 7,
18479             5, 6, 7, 8,
18480             6, 7, 8, 9,
18481             7, 8, 9, 10,
18482 
18483             # lane 2:
18484              8,  9, 10, 11,
18485              9, 10, 11, 12,
18486             10, 11, 12, 13,
18487             11, 12, 13, 14,
18488 
            # lane 3 order: 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
18490             12, 13, 14, 15,
18491             13, 14, 15,  0,
18492             14, 15,  0,  1,
18493             15,  0,  1,  2,
18494         ]
18495 */
18496 
/*
    SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) expands to a braced block that:
      1. patches the first four bytes of LANE1 into the highest 128-bit lane of LANE0
         and shuffles the result into 16 overlapping 4-byte windows,
      2. marks the windows whose first byte is not a continuation byte (10xx.xxxx),
      3. converts every window with expanded_utf8_to_utf32 and packs the marked
         results to the front of a vector,
      4. stores that vector through `output` and advances `output`: as UTF-32 when
         `UTF32` is true, otherwise via utf32_to_utf16 / utf32_to_utf16_masked.
    MASKED selects the masked store variants, which in the UTF-32 case write only
    the first `valid_count` elements.

    The macro is not self-contained: `UTF32`, `output`, `big_endian` and `byteflip`
    must be visible at the expansion site. Its local names (merged, input,
    leading_bytes, char_class, ...) live inside the block; do not pass arguments
    that use the same identifiers.
*/
#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                    \
        {                                                                                                    \
            const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                              \
            const __m512i expand_ver2 = _mm512_setr_epi64(                                                   \
                0x0403020103020100,                                                                          \
                0x0605040305040302,                                                                          \
                0x0807060507060504,                                                                          \
                0x0a09080709080706,                                                                          \
                0x0c0b0a090b0a0908,                                                                          \
                0x0e0d0c0b0d0c0b0a,                                                                          \
                0x000f0e0d0f0e0d0c,                                                                          \
                0x0201000f01000f0e                                                                           \
            );                                                                                               \
            const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                  \
                                                                                                             \
            __mmask16 leading_bytes;                                                                         \
            const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                             \
            const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                         \
            const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                             \
            leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                       \
                                                                                                             \
            __m512i char_class;                                                                              \
            char_class = _mm512_srli_epi32(input, 4);                                                        \
            /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                           \
            const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                             \
            const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                       \
            char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);              \
                                                                                                             \
            const int valid_count = static_cast<int>(count_ones(leading_bytes));                             \
            const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                 \
                                                                                                             \
            const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);    \
                                                                                                             \
            if (UTF32) {                                                                                     \
                if(MASKED) {                                                                                 \
                    const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                \
                    _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                  \
                } else {                                                                                     \
                    _mm512_storeu_si512((__m512i*)output, out);                                              \
                }                                                                                            \
                output += valid_count;                                                                       \
            } else {                                                                                         \
                if(MASKED) {                                                                                 \
                    output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
                } else {                                                                                     \
                    output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output));        \
                }                                                                                            \
            }                                                                                                \
        }
18546 
/*
    SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) expands to a braced
    block that stores the UTF-32 vector INPUT through `output` and advances `output`.

    - When `UTF32` is true the vector is stored as is (MASKED: only the first
      VALID_COUNT 32-bit elements are written; otherwise a full 64-byte store is
      issued) and `output` advances by VALID_COUNT.
    - Otherwise the vector is passed to utf32_to_utf16 / utf32_to_utf16_masked and
      `output` advances by the value that call returns.

    `UTF32`, `output`, `big_endian` and `byteflip` must be visible at the expansion
    site. VALID_COUNT is expanded without parentheses, so pass a plain identifier
    or a parenthesised expression.
*/
#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                    \
{                                                                                                           \
    if (UTF32) {                                                                                            \
        if(MASKED) {                                                                                        \
            const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                  \
            _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                  \
        } else {                                                                                            \
            _mm512_storeu_si512((__m512i*)output, INPUT);                                              \
        }                                                                                                   \
        output += VALID_COUNT;                                                                              \
    } else {                                                                                                \
        if(MASKED) {                                                                                        \
            output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));      \
        } else {                                                                                            \
            output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));             \
        }                                                                                                   \
    }                                                                                                       \
}
18565 
18566 
/*
    SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) zero-extends the 64 bytes held
    in `utf8` and stores them at `output`:
      - UTF32 true:  four stores of 16 x 32-bit code units,
      - UTF32 false: two stores of 32 x 16-bit code units, byte-swapped through
                     `byteflip` when `big_endian` is set.
    The macro does NOT advance `output`; the caller is expected to do so.
    In the UTF-16 branch `big_endian` and `byteflip` must be visible at the
    expansion site.

    The expansion is wrapped in braces (like the other macros of this file) so
    that it behaves as a single statement and cannot capture a following `else`.
*/
#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                  \
        {                                                                                 \
            if (UTF32) {                                                                  \
                const __m128i t0 = _mm512_castsi512_si128(utf8);                          \
                const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                    \
                const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                    \
                const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                    \
                _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_cvtepu8_epi32(t0)); \
                _mm512_storeu_si512((__m512i*)((output) + 1*16), _mm512_cvtepu8_epi32(t1)); \
                _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_cvtepu8_epi32(t2)); \
                _mm512_storeu_si512((__m512i*)((output) + 3*16), _mm512_cvtepu8_epi32(t3)); \
            } else {                                                                      \
                const __m256i h0 = _mm512_castsi512_si256(utf8);                          \
                const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                    \
                if(big_endian) {                                                          \
                    _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
                    _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
                } else {                                                                  \
                    _mm512_storeu_si512((__m512i*)((output) + 0*16), _mm512_cvtepu8_epi16(h0)); \
                    _mm512_storeu_si512((__m512i*)((output) + 2*16), _mm512_cvtepu8_epi16(h1)); \
                }                                                                         \
            }                                                                             \
        }
18588 /* end file src/icelake/icelake_macros.inl.cpp */
18589 /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
18590 // file included directly
18591 
18592 // File contains conversion procedure from VALID UTF-8 strings.
18593 
18594 /*
18595     valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
18596 
18597     The `OUTPUT` template type decides what to do with UTF-32: store
18598     it directly or convert into UTF-16 (with AVX512).
18599 
    Input:
    - str           - valid UTF-8 string
    - len           - string length
    - dwords        - output buffer
18604 
18605     Result:
18606     - pair.first    - the first unprocessed input byte
18607     - pair.second   - the first unprocessed output word
18608 */
18609 template <endianness big_endian, typename OUTPUT>
18610 std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
18611     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
18612     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
18613     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
18614     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
18615 
18616     __m512i byteflip = _mm512_setr_epi64(
18617             0x0607040502030001,
18618             0x0e0f0c0d0a0b0809,
18619             0x0607040502030001,
18620             0x0e0f0c0d0a0b0809,
18621             0x0607040502030001,
18622             0x0e0f0c0d0a0b0809,
18623             0x0607040502030001,
18624             0x0e0f0c0d0a0b0809
18625         );
18626     const char* ptr = str;
18627     const char* end = ptr + len;
18628 
18629     OUTPUT* output = dwords;
18630     /**
18631      * In the main loop, we consume 64 bytes per iteration,
18632      * but we access 64 + 4 bytes.
18633      * We check for ptr + 64 + 64 <= end because
18634      * we want to be do maskless writes without overruns.
18635      */
18636     while (ptr + 64 + 64 <= end) {
18637         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
18638         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
18639         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
18640         if(ascii == 0) {
18641             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
18642             output += 64;
18643             ptr += 64;
18644             continue;
18645         }
18646 
18647         const __m512i lane0 = broadcast_epi128<0>(utf8);
18648         const __m512i lane1 = broadcast_epi128<1>(utf8);
18649         int valid_count0;
18650         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
18651         const __m512i lane2 = broadcast_epi128<2>(utf8);
18652         int valid_count1;
18653         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
18654         if(valid_count0 + valid_count1 <= 16) {
18655             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
18656             valid_count0 += valid_count1;
18657             vec0 = expand_utf8_to_utf32(vec0);
18658             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
18659         } else {
18660             vec0 = expand_utf8_to_utf32(vec0);
18661             vec1 = expand_utf8_to_utf32(vec1);
18662             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
18663             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
18664         }
18665         const __m512i lane3 = broadcast_epi128<3>(utf8);
18666         int valid_count2;
18667         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
18668         uint32_t tmp1;
18669         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
18670         const __m512i lane4 = _mm512_set1_epi32(tmp1);
18671         int valid_count3;
18672         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
18673         if(valid_count2 + valid_count3 <= 16) {
18674             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
18675             valid_count2 += valid_count3;
18676             vec2 = expand_utf8_to_utf32(vec2);
18677             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
18678         } else {
18679             vec2 = expand_utf8_to_utf32(vec2);
18680             vec3 = expand_utf8_to_utf32(vec3);
18681             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
18682             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
18683         }
18684         ptr += 4*16;
18685     }
18686 
18687     if (ptr + 64 <= end) {
18688         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
18689         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
18690         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
18691         if(ascii == 0) {
18692             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
18693             output += 64;
18694             ptr += 64;
18695         } else {
18696             const __m512i lane0 = broadcast_epi128<0>(utf8);
18697             const __m512i lane1 = broadcast_epi128<1>(utf8);
18698             int valid_count0;
18699             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
18700             const __m512i lane2 = broadcast_epi128<2>(utf8);
18701             int valid_count1;
18702             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
18703             if(valid_count0 + valid_count1 <= 16) {
18704                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
18705                 valid_count0 += valid_count1;
18706                 vec0 = expand_utf8_to_utf32(vec0);
18707                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
18708             } else {
18709                 vec0 = expand_utf8_to_utf32(vec0);
18710                 vec1 = expand_utf8_to_utf32(vec1);
18711                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
18712                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
18713             }
18714 
18715             const __m512i lane3 = broadcast_epi128<3>(utf8);
18716             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
18717 
18718             ptr += 3*16;
18719         }
18720     }
18721     return {ptr, output};
18722 }
18723 
18724 
// Pair of (position reached in the UTF-8 input, position reached in the UTF-16 output).
using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
18726 /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
18727 /* begin file src/icelake/icelake_utf8_validation.inl.cpp */
18728 // file included directly
18729 
18730 
18731 simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
18732   __m512i mask1 = _mm512_setr_epi64(
18733         0x0202020202020202,
18734         0x4915012180808080,
18735         0x0202020202020202,
18736         0x4915012180808080,
18737         0x0202020202020202,
18738         0x4915012180808080,
18739         0x0202020202020202,
18740         0x4915012180808080);
18741     const __m512i v_0f = _mm512_set1_epi8(0x0f);
18742     __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);
18743 
18744     __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
18745     __m512i mask2 = _mm512_setr_epi64(
18746         0xcbcbcb8b8383a3e7,
18747         0xcbcbdbcbcbcbcbcb,
18748         0xcbcbcb8b8383a3e7,
18749         0xcbcbdbcbcbcbcbcb,
18750         0xcbcbcb8b8383a3e7,
18751         0xcbcbdbcbcbcbcbcb,
18752         0xcbcbcb8b8383a3e7,
18753         0xcbcbdbcbcbcbcbcb);
18754      __m512i index2 = _mm512_and_si512(prev1, v_0f);
18755 
18756     __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
18757     __m512i mask3 = _mm512_setr_epi64(
18758         0x101010101010101,
18759         0x1010101babaaee6,
18760         0x101010101010101,
18761         0x1010101babaaee6,
18762         0x101010101010101,
18763         0x1010101babaaee6,
18764         0x101010101010101,
18765         0x1010101babaaee6
18766     );
18767     __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
18768     __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
18769     return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
18770   }
18771 
18772   simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
18773       const __m512i prev_input, const __m512i sc) {
18774     __m512i prev2 = prev<2>(input, prev_input);
18775     __m512i prev3 = prev<3>(input, prev_input);
18776     __m512i is_third_byte  = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0
18777     __m512i is_fourth_byte  = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0
18778     __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte);
18779     const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
18780     is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
18781     // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
18782     const __m512i v_80 = _mm512_set1_epi8(char(0x80));
18783     return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010);
18784     //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80);
18785     //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
18786   }
18787   //
18788   // Return nonzero if there are incomplete multibyte characters at the end of the block:
18789   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
18790   //
18791   simdutf_really_inline __m512i is_incomplete(const __m512i input) {
18792     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
18793     // ... 1111____ 111_____ 11______
18794     __m512i max_value = _mm512_setr_epi64(
18795         0xffffffffffffffff,
18796         0xffffffffffffffff,
18797         0xffffffffffffffff,
18798         0xffffffffffffffff,
18799         0xffffffffffffffff,
18800         0xffffffffffffffff,
18801         0xffffffffffffffff,
18802         0xbfdfefffffffffff);
18803     return _mm512_subs_epu8(input, max_value);
18804   }
18805 
18806   struct avx512_utf8_checker {
18807     // If this is nonzero, there has been a UTF-8 error.
18808     __m512i error{};
18809 
18810     // The last input we received
18811     __m512i prev_input_block{};
18812     // Whether the last input we received was incomplete (used for ASCII fast path)
18813     __m512i prev_incomplete{};
18814 
18815     //
18816     // Check whether the current bytes are valid UTF-8.
18817     //
18818     simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
18819       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
18820       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
18821       __m512i prev1 = prev<1>(input, prev_input);
18822       __m512i sc = check_special_cases(input, prev1);
18823       this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
18824     }
18825 
18826     // The only problem that can happen at EOF is that a multibyte character is too short
18827     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
18828     // too large in the first of two bytes.
18829     simdutf_really_inline void check_eof() {
18830       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
18831       // possibly finish them.
18832       this->error = _mm512_or_si512(this->error, this->prev_incomplete);
18833     }
18834 
18835     // returns true if ASCII.
18836     simdutf_really_inline bool check_next_input(const __m512i input) {
18837       const __m512i v_80 = _mm512_set1_epi8(char(0x80));
18838       const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
18839       if(ascii == 0) {
18840         this->error = _mm512_or_si512(this->error, this->prev_incomplete);
18841         return true;
18842       } else {
18843         this->check_utf8_bytes(input, this->prev_input_block);
18844         this->prev_incomplete = is_incomplete(input);
18845         this->prev_input_block = input;
18846         return false;
18847       }
18848     }
18849     // do not forget to call check_eof!
18850     simdutf_really_inline bool errors() const {
18851         return _mm512_test_epi8_mask(this->error, this->error) != 0;
18852     }
18853 
18854   }; // struct avx512_utf8_checker
18855 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
18856 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
18857 // file included directly
18858 
18859 // File contains conversion procedure from possibly invalid UTF-8 strings.
18860 
18861 /**
18862  * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to
18863  * out.
18864  * Returns the position of the input and output after the processing is
18865  * completed. Upon error, the output is set to null.
18866  */
18867 
18868 template <endianness big_endian>
18869 utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
18870   const char *const final_in = in + len;
18871   bool result = true;
18872   while (result) {
18873     if (in + 64 <= final_in) {
18874         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
18875     } else if(in < final_in) {
18876         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
18877     } else { break; }
18878   }
18879   if(!result) { out = nullptr; }
18880   return std::make_pair(in, out);
18881 }
18882 
18883 template <endianness big_endian>
18884 simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
18885   const char *const init_in = in;
18886   const char16_t *const init_out = out;
18887   const char *const final_in = in + len;
18888   bool  result = true;
18889   while (result) {
18890     if (in + 64 <= final_in) {
18891         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
18892     } else if(in < final_in) {
18893         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
18894     } else { break; }
18895   }
18896   if(!result) {
18897     // rewind_and_convert_with_errors will seek a potential error from in onward,
18898     // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
18899     simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
18900     res.count += (in - init_in);
18901     return res;
18902   } else {
18903     return simdutf::result(error_code::SUCCESS,out - init_out);
18904   }
18905 }
18906 
18907 
18908 template <endianness big_endian, typename OUTPUT>
      /**
       * Validates UTF-8 input while transcoding it to a fixed-width output type.
       *
       * Template parameters:
       *  - big_endian: byte order selector; rejected at compile time together
       *    with UTF-32 output.
       *  - OUTPUT: uint32_t (UTF-32) or char16_t (UTF-16), enforced by
       *    static_assert.
       *
       * Parameters: str/len describe the input bytes, dwords is the output buffer.
       *
       * Returns {ptr, output}: the input position up to which data was transcoded
       * and the current output position. The last (fewer than 64) bytes are only
       * validated, not transcoded, so ptr may be short of str + len.
       * If the checker reports an error, the second member is nullptr.
       *
       * NOTE(review): the SIMDUTF_ICELAKE_* macros are defined elsewhere; they
       * presumably write through and advance `output` - confirm at the macro
       * definitions.
       */
18909 std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
18910     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
18911     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
18912     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
18913     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
18914 
18915     const char* ptr = str;
18916     const char* end = ptr + len;
          // Shuffle control that swaps the two bytes of every 16-bit unit.
          // It is not referenced by name in this function; presumably the write
          // macros pick it up for big-endian UTF-16 output (TODO confirm).
18917     __m512i byteflip = _mm512_setr_epi64(
18918             0x0607040502030001,
18919             0x0e0f0c0d0a0b0809,
18920             0x0607040502030001,
18921             0x0e0f0c0d0a0b0809,
18922             0x0607040502030001,
18923             0x0e0f0c0d0a0b0809,
18924             0x0607040502030001,
18925             0x0e0f0c0d0a0b0809
18926         );
18927     OUTPUT* output = dwords;
18928     avx512_utf8_checker checker{};
18929     /**
18930      * In the main loop, we consume 64 bytes per iteration,
18931      * but we access 64 + 4 bytes.
18932      * We check for ptr + 64 + 64 <= end because
18933      * we want to do maskless writes without overruns.
18934      */
18935     while (ptr + 64 + 64 <= end) {
18936         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
              // check_next_input returns true when the block is pure ASCII;
              // in that case the 64 bytes are stored directly as 64 outputs.
18937         if(checker.check_next_input(utf8)) {
18938             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
18939             output += 64;
18940             ptr += 64;
18941             continue;
18942         }
              // Non-ASCII block: process it as four 16-byte lanes, each paired
              // with the lane that follows it. broadcast_epi128 and
              // expand_and_identify are defined elsewhere; the latter fills the
              // valid_countN argument (presumably the number of code points that
              // start in its first lane - TODO confirm).
18943         const __m512i lane0 = broadcast_epi128<0>(utf8);
18944         const __m512i lane1 = broadcast_epi128<1>(utf8);
18945         int valid_count0;
18946         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
18947         const __m512i lane2 = broadcast_epi128<2>(utf8);
18948         int valid_count1;
18949         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
              // If both results fit in one 16-element vector, place the first
              // valid_count1 elements of vec1 right after the valid_count0
              // elements of vec0 and write once; otherwise write them separately.
18950         if(valid_count0 + valid_count1 <= 16) {
18951             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
18952             valid_count0 += valid_count1;
18953             vec0 = expand_utf8_to_utf32(vec0);
18954             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
18955         } else {
18956             vec0 = expand_utf8_to_utf32(vec0);
18957             vec1 = expand_utf8_to_utf32(vec1);
18958             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
18959             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
18960         }
18961         const __m512i lane3 = broadcast_epi128<3>(utf8);
18962         int valid_count2;
18963         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
              // The last lane needs a successor lane too: read the 4 bytes that
              // follow this 64-byte block (in bounds thanks to the loop condition)
              // and broadcast them as 32-bit elements.
18964         uint32_t tmp1;
18965         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
18966         const __m512i lane4 = _mm512_set1_epi32(tmp1);
18967         int valid_count3;
18968         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
              // Same merge-or-split decision as for the first two lanes.
18969         if(valid_count2 + valid_count3 <= 16) {
18970             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
18971             valid_count2 += valid_count3;
18972             vec2 = expand_utf8_to_utf32(vec2);
18973             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
18974         } else {
18975             vec2 = expand_utf8_to_utf32(vec2);
18976             vec3 = expand_utf8_to_utf32(vec3);
18977             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
18978             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
18979         }
18980         ptr += 4*16;
18981     }
18982     const char* validatedptr = ptr; // validated up to ptr
18983 
18984     // For the final pass, we validate 64 bytes, but we only transcode
18985     // 3*16 bytes, so we may end up double-validating 16 bytes.
18986     if (ptr + 64 <= end) {
18987         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
18988         if(checker.check_next_input(utf8)) {
18989             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
18990             output += 64;
18991             ptr += 64;
18992         } else {
                  // Same lane processing as in the main loop, but the write macros
                  // are invoked with `true` as their last argument here (the main
                  // loop passes `false`).
18993             const __m512i lane0 = broadcast_epi128<0>(utf8);
18994             const __m512i lane1 = broadcast_epi128<1>(utf8);
18995             int valid_count0;
18996             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
18997             const __m512i lane2 = broadcast_epi128<2>(utf8);
18998             int valid_count1;
18999             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
19000             if(valid_count0 + valid_count1 <= 16) {
19001                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
19002                 valid_count0 += valid_count1;
19003                 vec0 = expand_utf8_to_utf32(vec0);
19004                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
19005             } else {
19006                 vec0 = expand_utf8_to_utf32(vec0);
19007                 vec1 = expand_utf8_to_utf32(vec1);
19008                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
19009                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
19010             }
19011 
19012             const __m512i lane3 = broadcast_epi128<3>(utf8);
19013             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
19014 
                  // Only three lanes (48 bytes) were transcoded: there are no
                  // guaranteed bytes after this block to pair with the fourth lane.
19015             ptr += 3*16;
19016         }
              // All 64 bytes went through the checker, whichever branch ran.
19017         validatedptr += 4*16;
19018     }
          // Fewer than 64 bytes remain after validatedptr at this point, so the
          // shift below stays below 64. The masked load zero-fills the unused
          // bytes; this tail is validated only, not transcoded.
19019     {
19020        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
19021        checker.check_next_input(utf8);
19022     }
19023     checker.check_eof();
19024     if(checker.errors()) {
19025         return {ptr, nullptr}; // We found an error.
19026     }
19027     return {ptr, output};
19028 }
19029 
19030 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
      //
      // Template parameters:
      //  - big_endian: byte order selector; rejected at compile time together with
      //    UTF-32 output.
      //  - OUTPUT: uint32_t (UTF-32) or char16_t (UTF-16), enforced by static_assert.
      //
      // Parameters: str/len describe the input bytes, dwords is the output buffer.
      //
      // Returns {ptr, output, ok}: the input position up to which data was
      // transcoded, the current output position, and false if the checker reported
      // an error (true otherwise). The last (fewer than 64) bytes are only
      // validated, not transcoded, so ptr may be short of str + len.
      //
      // NOTE(review): the SIMDUTF_ICELAKE_* macros are defined elsewhere; they
      // presumably write through and advance `output` - confirm at the macro
      // definitions.
19031 template <endianness big_endian, typename OUTPUT>
19032 std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
19033     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
19034     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
19035     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
19036     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
19037 
19038     const char* ptr = str;
19039     const char* end = ptr + len;
          // Shuffle control that swaps the two bytes of every 16-bit unit.
          // It is not referenced by name in this function; presumably the write
          // macros pick it up for big-endian UTF-16 output (TODO confirm).
19040     __m512i byteflip = _mm512_setr_epi64(
19041             0x0607040502030001,
19042             0x0e0f0c0d0a0b0809,
19043             0x0607040502030001,
19044             0x0e0f0c0d0a0b0809,
19045             0x0607040502030001,
19046             0x0e0f0c0d0a0b0809,
19047             0x0607040502030001,
19048             0x0e0f0c0d0a0b0809
19049         );
19050     OUTPUT* output = dwords;
19051     avx512_utf8_checker checker{};
19052     /**
19053      * In the main loop, we consume 64 bytes per iteration,
19054      * but we access 64 + 4 bytes.
19055      * We check for ptr + 64 + 64 <= end because
19056      * we want to do maskless writes without overruns.
19057      */
19058     while (ptr + 64 + 64 <= end) {
19059         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
              // check_next_input returns true when the block is pure ASCII;
              // in that case the 64 bytes are stored directly as 64 outputs.
              // Note that this path continues without consulting errors().
19060         if(checker.check_next_input(utf8)) {
19061             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
19062             output += 64;
19063             ptr += 64;
19064             continue;
19065         }
              // Early exit: stop before transcoding a block once the checker has
              // recorded an error.
19066         if(checker.errors()) {
19067             return {ptr, output, false}; // We found an error.
19068         }
              // Non-ASCII block: process it as four 16-byte lanes, each paired
              // with the lane that follows it. broadcast_epi128 and
              // expand_and_identify are defined elsewhere; the latter fills the
              // valid_countN argument (presumably the number of code points that
              // start in its first lane - TODO confirm).
19069         const __m512i lane0 = broadcast_epi128<0>(utf8);
19070         const __m512i lane1 = broadcast_epi128<1>(utf8);
19071         int valid_count0;
19072         __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
19073         const __m512i lane2 = broadcast_epi128<2>(utf8);
19074         int valid_count1;
19075         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
              // If both results fit in one 16-element vector, place the first
              // valid_count1 elements of vec1 right after the valid_count0
              // elements of vec0 and write once; otherwise write them separately.
19076         if(valid_count0 + valid_count1 <= 16) {
19077             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
19078             valid_count0 += valid_count1;
19079             vec0 = expand_utf8_to_utf32(vec0);
19080             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
19081         } else {
19082             vec0 = expand_utf8_to_utf32(vec0);
19083             vec1 = expand_utf8_to_utf32(vec1);
19084             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
19085             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
19086         }
19087         const __m512i lane3 = broadcast_epi128<3>(utf8);
19088         int valid_count2;
19089         __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
              // The last lane needs a successor lane too: read the 4 bytes that
              // follow this 64-byte block (in bounds thanks to the loop condition)
              // and broadcast them as 32-bit elements.
19090         uint32_t tmp1;
19091         ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
19092         const __m512i lane4 = _mm512_set1_epi32(tmp1);
19093         int valid_count3;
19094         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
              // Same merge-or-split decision as for the first two lanes.
19095         if(valid_count2 + valid_count3 <= 16) {
19096             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
19097             valid_count2 += valid_count3;
19098             vec2 = expand_utf8_to_utf32(vec2);
19099             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
19100         } else {
19101             vec2 = expand_utf8_to_utf32(vec2);
19102             vec3 = expand_utf8_to_utf32(vec3);
19103             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
19104             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
19105         }
19106         ptr += 4*16;
19107     }
19108     const char* validatedptr = ptr; // validated up to ptr
19109 
19110     // For the final pass, we validate 64 bytes, but we only transcode
19111     // 3*16 bytes, so we may end up double-validating 16 bytes.
19112     if (ptr + 64 <= end) {
19113         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
19114         if(checker.check_next_input(utf8)) {
19115             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
19116             output += 64;
19117             ptr += 64;
19118         } else if(checker.errors()) {
19119             return {ptr, output, false}; // We found an error.
19120         } else {
                  // Same lane processing as in the main loop, but the write macros
                  // are invoked with `true` as their last argument here (the main
                  // loop passes `false`).
19121             const __m512i lane0 = broadcast_epi128<0>(utf8);
19122             const __m512i lane1 = broadcast_epi128<1>(utf8);
19123             int valid_count0;
19124             __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
19125             const __m512i lane2 = broadcast_epi128<2>(utf8);
19126             int valid_count1;
19127             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
19128             if(valid_count0 + valid_count1 <= 16) {
19129                 vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
19130                 valid_count0 += valid_count1;
19131                 vec0 = expand_utf8_to_utf32(vec0);
19132                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
19133             } else {
19134                 vec0 = expand_utf8_to_utf32(vec0);
19135                 vec1 = expand_utf8_to_utf32(vec1);
19136                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
19137                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
19138             }
19139 
19140             const __m512i lane3 = broadcast_epi128<3>(utf8);
19141             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
19142 
                  // Only three lanes (48 bytes) were transcoded: there are no
                  // guaranteed bytes after this block to pair with the fourth lane.
19143             ptr += 3*16;
19144         }
              // All 64 bytes went through the checker on every branch that
              // reaches this line.
19145         validatedptr += 4*16;
19146     }
          // Fewer than 64 bytes remain after validatedptr at this point, so the
          // shift below stays below 64. The masked load zero-fills the unused
          // bytes; this tail is validated only, not transcoded.
19147     {
19148        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
19149        checker.check_next_input(utf8);
19150     }
19151     checker.check_eof();
19152     if(checker.errors()) {
19153         return {ptr, output, false}; // We found an error.
19154     }
19155     return {ptr, output, true};
19156 }
19157 /* end file src/icelake/icelake_from_utf8.inl.cpp */
19158 /* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
19159 // file included directly
19160 
19161 // File contains conversion procedure from possibly invalid UTF-8 strings.
19162 
19163 // template <bool is_remaining, bool use_masked_store>
19164 template <bool is_remaining>
19165 simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf, size_t len,
19166                                            char *latin_output, __m512i minus64,
19167                                            __m512i one,
19168                                            __mmask64 *next_leading_ptr,
19169                                            __mmask64 *next_bit6_ptr) {
19170   __mmask64 load_mask =
19171       is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
19172   __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
19173   __mmask64 nonascii = _mm512_movepi8_mask(input);
19174 
19175   if (nonascii == 0) {
19176     is_remaining
19177         ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
19178         : _mm512_storeu_si512((__m512i *)latin_output, input);
19179     return len;
19180   }
19181 
19182   __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
19183 
19184   __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
19185   __mmask64 invalid_leading_bytes =
19186       _mm512_mask_cmpgt_epu8_mask(leading, highbits, one);
19187 
19188   if (invalid_leading_bytes) {
19189     return 0; // Indicates error
19190   }
19191 
19192   __mmask64 leading_shift = (leading << 1) | *next_leading_ptr;
19193   *next_leading_ptr = leading >> 63;
19194 
19195   if ((nonascii ^ leading) != leading_shift) {
19196     return 0; // Indicates error
19197   }
19198 
19199   __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
19200   input =
19201       _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
19202   *next_bit6_ptr = bit6 >> 63;
19203 
19204   __mmask64 retain = ~leading & load_mask;
19205   __m512i output = _mm512_maskz_compress_epi8(retain, input);
19206   int64_t written_out = count_ones(retain);
19207   __mmask64 store_mask = (1ULL << written_out) - 1;
19208 
19209   // ***************************
19210   //  Possible optimization? (Nick Nuon)
19211   //  This commented out line is 5% faster but sadly it'll also write past
19212   //  memory bounds for latin1_output: is_remaining ?
19213   //  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output) :
19214   //  _mm512_storeu_si512((__m512i *)latin_output, output); I tried using
19215   //  _mm512_storeu_si512 and have the next process_block start from the
19216   //  "written_out" point but the compiler shuffles memory in such a way that it
19217   //  is signifcantly slower...
19218   // ****************************
19219   _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
19220 
19221   return written_out;
19222 }
19223 
19224 size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) {
19225   char *start = latin_output;
19226   size_t pos = 0;
19227   __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
19228   __m512i one = _mm512_set1_epi8(1);
19229   __mmask64 next_leading = 0;
19230   __mmask64 next_bit6 = 0;
19231 
19232   while (pos + 64 <= len) {
19233     size_t written = process_block_from_utf8_to_latin1<false>(buf + pos, 64, latin_output, minus64,
19234                                           one, &next_leading, &next_bit6);
19235     if (written == 0) {
19236       return 0; // Indicates error
19237     }
19238     latin_output += written;
19239     pos += 64;
19240   }
19241 
19242   if (pos < len) {
19243     size_t remaining = len - pos;
19244     size_t written =
19245         process_block_from_utf8_to_latin1<true>(buf + pos, remaining, latin_output, minus64, one,
19246                             &next_leading, &next_bit6);
19247     if (written == 0) {
19248       return 0; // Indicates error
19249     }
19250     latin_output += written;
19251   }
19252 
19253   return (size_t)(latin_output - start);
19254 }
19255 /* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
19256 /* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
19257 // file included directly
19258 
19259 // File contains conversion procedure from valid UTF-8 strings.
19260 
19261 template <bool is_remaining>
19262 simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(const char *buf, size_t len,
19263                                                  char *latin_output,
19264                                                  __m512i minus64, __m512i one,
19265                                                  __mmask64 *next_leading_ptr,
19266                                                  __mmask64 *next_bit6_ptr) {
19267   __mmask64 load_mask =
19268       is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
19269   __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
19270   __mmask64 nonascii = _mm512_movepi8_mask(input);
19271 
19272   if (nonascii == 0) {
19273     is_remaining
19274         ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
19275         : _mm512_storeu_si512((__m512i *)latin_output, input);
19276     return len;
19277   }
19278 
19279   __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
19280 
19281   __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
19282 
19283   *next_leading_ptr = leading >> 63;
19284 
19285   __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
19286   input =
19287       _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
19288   *next_bit6_ptr = bit6 >> 63;
19289 
19290   __mmask64 retain = ~leading & load_mask;
19291   __m512i output = _mm512_maskz_compress_epi8(retain, input);
19292   int64_t written_out = count_ones(retain);
19293   __mmask64 store_mask = (1ULL << written_out) - 1;
19294   // Optimization opportunity: sometimes, masked writes are not needed.
19295   _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
19296   return written_out;
19297 }
19298 
19299 size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
19300                                    char *latin_output) {
19301   char *start = latin_output;
19302   size_t pos = 0;
19303   __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
19304   __m512i one = _mm512_set1_epi8(1);
19305   __mmask64 next_leading = 0;
19306   __mmask64 next_bit6 = 0;
19307 
19308   while (pos + 64 <= len) {
19309     size_t written = process_valid_block_from_utf8_to_latin1<false>(
19310         buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
19311     latin_output += written;
19312     pos += 64;
19313   }
19314 
19315   if (pos < len) {
19316     size_t remaining = len - pos;
19317     size_t written =
19318         process_valid_block_from_utf8_to_latin1<true>(buf + pos, remaining, latin_output, minus64,
19319                                   one, &next_leading, &next_bit6);
19320     latin_output += written;
19321   }
19322 
19323   return (size_t)(latin_output - start);
19324 }
19325 /* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
19326 /* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
19327 // file included directly
19328 template <endianness big_endian>
19329 size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
19330                                        char *latin1_output) {
19331   const char16_t *end = buf + len;
19332   __m512i v_0xFF = _mm512_set1_epi16(0xff);
19333   __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
19334                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
19335                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
19336                                        0x0607040502030001, 0x0e0f0c0d0a0b0809);
19337   __m512i shufmask = _mm512_set_epi8(
19338       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19339       0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
19340       36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
19341   while (buf + 32 <= end) {
19342     __m512i in = _mm512_loadu_si512((__m512i *)buf);
19343     if (big_endian) {
19344       in = _mm512_shuffle_epi8(in, byteflip);
19345     }
19346     if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
19347       return 0;
19348     }
19349     _mm256_storeu_si256(
19350         (__m256i *)latin1_output,
19351         _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
19352     latin1_output += 32;
19353     buf += 32;
19354   }
19355   if (buf < end) {
19356     uint32_t mask(uint32_t(1 << (end - buf)) - 1);
19357     __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
19358     if (big_endian) {
19359       in = _mm512_shuffle_epi8(in, byteflip);
19360     }
19361     if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
19362       return 0;
19363     }
19364     _mm256_mask_storeu_epi8(
19365         latin1_output, mask,
19366         _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
19367   }
19368   return len;
19369 }
19370 
19371 template <endianness big_endian>
19372 std::pair<result, char *>
19373 icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
19374                                             char *latin1_output) {
19375   const char16_t *end = buf + len;
19376   const char16_t *start = buf;
19377   __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
19378                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
19379                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
19380                                        0x0607040502030001, 0x0e0f0c0d0a0b0809);
19381   __m512i v_0xFF = _mm512_set1_epi16(0xff);
19382   __m512i shufmask = _mm512_set_epi8(
19383       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19384       0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
19385       36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
19386   while (buf + 32 <= end) {
19387     __m512i in = _mm512_loadu_si512((__m512i *)buf);
19388     if (big_endian) {
19389       in = _mm512_shuffle_epi8(in, byteflip);
19390     }
19391     if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
19392       uint16_t word;
19393       while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19394                                  : uint16_t(*buf))) <= 0xff) {
19395         *latin1_output++ = uint8_t(word);
19396         buf++;
19397       }
19398       return std::make_pair(result(error_code::TOO_LARGE, buf - start),
19399                             latin1_output);
19400     }
19401     _mm256_storeu_si256(
19402         (__m256i *)latin1_output,
19403         _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
19404     latin1_output += 32;
19405     buf += 32;
19406   }
19407   if (buf < end) {
19408     uint32_t mask(uint32_t(1 << (end - buf)) - 1);
19409     __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
19410     if (big_endian) {
19411       in = _mm512_shuffle_epi8(in, byteflip);
19412     }
19413     if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
19414 
19415       uint16_t word;
19416       while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19417                                  : uint16_t(*buf))) <= 0xff) {
19418         *latin1_output++ = uint8_t(word);
19419         buf++;
19420       }
19421       return std::make_pair(result(error_code::TOO_LARGE, buf - start),
19422                             latin1_output);
19423     }
19424     _mm256_mask_storeu_epi8(
19425         latin1_output, mask,
19426         _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
19427   }
19428   return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
19429 }
19430 /* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
19431 /* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
19432 // file included directly
19433 
19434 /**
19435  * This function converts the input (inbuf, inlen), assumed to be valid
19436  * UTF-16 (byte-swapped first when big_endian is set) into UTF-8 (to outbuf).
19437  * The number of code units written is stored in 'outlen' and the function
19438  * reports the number of input words consumed.
19439  */
19440 template <endianness big_endian>
19441 size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
19442                                unsigned char *outbuf, size_t *outlen) {
19443   __m512i in;
19444   __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
19445   __m512i byteflip = _mm512_setr_epi64(
19446             0x0607040502030001,
19447             0x0e0f0c0d0a0b0809,
19448             0x0607040502030001,
19449             0x0e0f0c0d0a0b0809,
19450             0x0607040502030001,
19451             0x0e0f0c0d0a0b0809,
19452             0x0607040502030001,
19453             0x0e0f0c0d0a0b0809
19454         );
19455   const char16_t * const inbuf_orig = inbuf;
19456   const unsigned char * const outbuf_orig = outbuf;
19457   size_t adjust = 0;
19458   int carry = 0;
19459 
19460   while (inlen >= 32) {
19461     in = _mm512_loadu_si512(inbuf);
19462     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
19463     inlen -= 31;
19464   lastiteration:
19465     inbuf += 31;
19466 
19467   failiteration:
19468     const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
19469       inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
19470 
19471     if (_ktestz_mask32_u8(inmask, is234byte)) {
19472       // fast path for ASCII only
19473       _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
19474       outbuf += 31;
19475       carry = 0;
19476 
19477       if (inlen < 32) {
19478         goto tail;
19479       } else {
19480         continue;
19481       }
19482     }
19483 
19484     const __mmask32 is12byte =
19485         _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
19486 
19487     if (_ktestc_mask32_u8(is12byte, inmask)) {
19488       // fast path for 1 and 2 byte only
19489 
19490       const __m512i twobytes = _mm512_ternarylogic_epi32(
19491           _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
19492           _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
19493       in = _mm512_mask_add_epi16(in, is234byte, twobytes,
19494                                  _mm512_set1_epi16(int16_t(0x80c0)));
19495       const __m512i cmpmask =
19496           _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
19497                                   _mm512_set1_epi16(0x0800));
19498       const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
19499       const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
19500       _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
19501                               out);
19502       outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
19503       carry = 0;
19504 
19505       if (inlen < 32) {
19506         goto tail;
19507       } else {
19508         continue;
19509       }
19510     }
19511     __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
19512     __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
19513 
19514 
19515     __m512i taglo = _mm512_set1_epi32(0x8080e000);
19516     __m512i taghi = taglo;
19517 
19518     const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
19519     const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
19520         inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
19521     const __mmask32 losurr = _mm512_cmp_epu16_mask(
19522         fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
19523 
19524     int carryout = 0;
19525     if (!_kortestz_mask32_u8(hisurr, losurr)) {
19526       // handle surrogates
19527 
19528       __m512i los = _mm512_alignr_epi32(hi, lo, 1);
19529       __m512i his = _mm512_alignr_epi32(lo, hi, 1);
19530 
19531       const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
19532       taglo =
19533           _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
19534       taghi =
19535           _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
19536 
19537       lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
19538       hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
19539       los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
19540       his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
19541       lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
19542       hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
19543 
19544       carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
19545 
19546       const uint32_t  h = _cvtmask32_u32(hisurr);
19547       const uint32_t  l = _cvtmask32_u32(losurr);
19548       // check for mismatched surrogates
19549       if ((h + h + carry) ^ l) {
19550         const uint32_t lonohi = l & ~(h + h + carry);
19551         const uint32_t hinolo = h & ~(l >> 1);
19552         inlen = _tzcnt_u32(hinolo | lonohi);
19553         inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
19554         in = _mm512_maskz_mov_epi16(inmask, in);
19555         adjust = (int)inlen - 31;
19556         inlen = 0;
19557         goto failiteration;
19558       }
19559     }
19560 
19561     hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
19562     carry = carryout;
19563 
19564     __m512i mslo =
19565         _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
19566 
19567     __m512i mshi =
19568         _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
19569 
19570     const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
19571     const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
19572 
19573     const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
19574     const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
19575     const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
19576 
19577     taglo =
19578         _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
19579     taghi =
19580         _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
19581     __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
19582                                       _mm512_set1_epi32(0x00010101));
19583     __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
19584                                       _mm512_set1_epi32(0x00010101));
19585 
19586 
19587     magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
19588                                       _mm512_set1_epi32(0x00010101));
19589     magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
19590                                       _mm512_set1_epi32(0x00010101));
19591 
19592     mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
19593                                      0xea); // A&B|C
19594     mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
19595                                      0xea);
19596     mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
19597 
19598     mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
19599 
19600     const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
19601     const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
19602     const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
19603     const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
19604     const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
19605     const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
19606 
19607     uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
19608     uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
19609 
19610     _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
19611     _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
19612     outbuf += advlo + advhi;
19613   }
19614   outbuf -= adjust;
19615 
19616 tail:
19617   if (inlen != 0) {
19618     // We must have inlen < 31.
19619     inmask = _cvtu32_mask32((1 << inlen) - 1);
19620     in = _mm512_maskz_loadu_epi16(inmask, inbuf);
19621     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
19622     adjust = inlen - 31;
19623     inlen = 0;
19624     goto lastiteration;
19625   }
19626   *outlen = (outbuf - outbuf_orig) + adjust;
19627   return ((inbuf - inbuf_orig) + adjust);
19628 }
19629 /* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
19630 /* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
19631 // file included directly
19632 
19633 /*
19634   Returns a pair: the first unprocessed byte from buf and utf32_output
19635   A scalar routing should carry on the conversion of the tail.
19636 */
19637 template <endianness big_endian>
19638 std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
19639   const char16_t* end = buf + len;
19640   const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
19641   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
19642   const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
19643   __mmask32 carry{0};
19644   const __m512i byteflip = _mm512_setr_epi64(
19645             0x0607040502030001,
19646             0x0e0f0c0d0a0b0809,
19647             0x0607040502030001,
19648             0x0e0f0c0d0a0b0809,
19649             0x0607040502030001,
19650             0x0e0f0c0d0a0b0809,
19651             0x0607040502030001,
19652             0x0e0f0c0d0a0b0809
19653         );
19654   while (std::distance(buf,end) >= 32) {
19655     // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
19656     __m512i in = _mm512_loadu_si512((__m512i*)buf);
19657     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
19658 
19659     // H - bitmask for high surrogates
19660     const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
19661     // H - bitmask for low surrogates
19662     const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
19663 
19664     if ((H|L)) {
19665       // surrogate pair(s) in a register
19666       const __mmask32 V = (L ^ (carry | (H << 1)));   // A high surrogate must be followed by low one and a low one must be preceded by a high one.
19667                                                       // If valid, V should be equal to 0
19668 
19669       if(V == 0) {
19670         // valid case
19671         /*
19672             Input surrogate pair:
19673             |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
19674                 low surrogate      high surrogate
19675         */
19676         /*  1. Expand all code units to 32-bit code units
19677             in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
19678         */
19679         const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
19680         const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
19681 
19682         /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
19683             in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
19684             shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
19685         */
19686         const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
19687         const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
19688 
19689         /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
19690             |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
19691         */
19692         const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
19693         const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
19694 
19695         /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
19696             in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
19697             shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
19698             constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
19699         */
19700         const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
19701         const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
19702         const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
19703 
19704         const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
19705         const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
19706 
19707         //  5. Store all valid UTF-32 code units (low surrogate positions and 32nd word are invalid)
19708         const __mmask32 valid = ~L & 0x7fffffff;
19709         // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
19710         // to ease performance portability to Zen 4.
19711         const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
19712         const size_t howmany1 = count_ones((uint16_t)(valid));
19713         _mm512_storeu_si512((__m512i *) utf32_output,  compressed_first);
19714         utf32_output += howmany1;
19715         const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
19716         const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
19717         // The following could be unsafe in some cases?
19718         //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
19719         _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
19720         utf32_output += howmany2;
19721         // Only process 31 code units, but keep track if the 31st word is a high surrogate as a carry
19722         buf += 31;
19723         carry = (H >> 30) & 0x1;
19724       } else {
19725         // invalid case
19726         return std::make_tuple(buf+carry, utf32_output, false);
19727       }
19728     } else {
19729       // no surrogates
19730       // extend all thirty-two 16-bit code units to thirty-two 32-bit code units
19731       _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
19732       _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
19733       utf32_output += 32;
19734       buf += 32;
19735       carry = 0;
19736     }
19737   } // while
19738   return std::make_tuple(buf+carry, utf32_output, true);
19739 }
19740 /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
19741 /* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
19742 // file included directly
19743 size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
19744                                        char *latin1_output) {
19745   const char32_t *end = buf + len;
19746   __m512i v_0xFF = _mm512_set1_epi32(0xff);
19747   __m512i shufmask = _mm512_set_epi8(
19748       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19749       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
19750       56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
19751   while (buf + 16 <= end) {
19752     __m512i in = _mm512_loadu_si512((__m512i *)buf);
19753     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
19754       return 0;
19755     }
19756     _mm_storeu_si128((__m128i *)latin1_output,
19757                      _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
19758     latin1_output += 16;
19759     buf += 16;
19760   }
19761   if (buf < end) {
19762     uint16_t mask = uint16_t((1 << (end - buf)) - 1);
19763     __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
19764     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
19765       return 0;
19766     }
19767     _mm_mask_storeu_epi8(
19768         latin1_output, mask,
19769         _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
19770   }
19771   return len;
19772 }
19773 
19774 std::pair<result, char *>
19775 icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
19776                                             char *latin1_output) {
19777   const char32_t *end = buf + len;
19778   const char32_t *start = buf;
19779   __m512i v_0xFF = _mm512_set1_epi32(0xff);
19780   __m512i shufmask = _mm512_set_epi8(
19781       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19782       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
19783       56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
19784   while (buf + 16 <= end) {
19785     __m512i in = _mm512_loadu_si512((__m512i *)buf);
19786     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
19787       while (uint32_t(*buf) <= 0xff) {
19788         *latin1_output++ = uint8_t(*buf++);
19789       }
19790       return std::make_pair(result(error_code::TOO_LARGE, buf - start),
19791                             latin1_output);
19792     }
19793     _mm_storeu_si128((__m128i *)latin1_output,
19794                      _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
19795     latin1_output += 16;
19796     buf += 16;
19797   }
19798   if (buf < end) {
19799     uint16_t mask = uint16_t((1 << (end - buf)) - 1);
19800     __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
19801     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
19802       while (uint32_t(*buf) <= 0xff) {
19803         *latin1_output++ = uint8_t(*buf++);
19804       }
19805       return std::make_pair(result(error_code::TOO_LARGE, buf - start),
19806                             latin1_output);
19807     }
19808     _mm_mask_storeu_epi8(
19809         latin1_output, mask,
19810         _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
19811   }
19812   return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
19813 }
19814 /* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
19815 /* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
19816 // file included directly
19817 
19818 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
19819 std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
19820   const char32_t* end = buf + len;
19821   const __m256i v_0000 = _mm256_setzero_si256();
19822   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
19823   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
19824   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
19825   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
19826   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
19827   __m256i running_max = _mm256_setzero_si256();
19828   __m256i forbidden_bytemask = _mm256_setzero_si256();
19829 
19830   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
19831 
19832   while (buf + 16 + safety_margin <= end) {
19833     __m256i in = _mm256_loadu_si256((__m256i*)buf);
19834     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
19835     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
19836 
19837     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
19838     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
19839     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
19840 
19841     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
19842 
19843     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
19844       // 1. pack the bytes
19845       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
19846       // 2. store (16 bytes)
19847       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
19848       // 3. adjust pointers
19849       buf += 16;
19850       utf8_output += 16;
19851       continue; // we are done for this round!
19852     }
19853     // no bits set above 7th bit
19854     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
19855     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
19856 
19857     // no bits set above 11th bit
19858     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
19859     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
19860     if (one_or_two_bytes_bitmask == 0xffffffff) {
19861       // 1. prepare 2-byte values
19862       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
19863       // expected output   : [110a|aaaa|10bb|bbbb] x 8
19864       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
19865       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
19866 
19867       // t0 = [000a|aaaa|bbbb|bb00]
19868       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
19869       // t1 = [000a|aaaa|0000|0000]
19870       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
19871       // t2 = [0000|0000|00bb|bbbb]
19872       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
19873       // t3 = [000a|aaaa|00bb|bbbb]
19874       const __m256i t3 = _mm256_or_si256(t1, t2);
19875       // t4 = [110a|aaaa|10bb|bbbb]
19876       const __m256i t4 = _mm256_or_si256(t3, v_c080);
19877 
19878       // 2. merge ASCII and 2-byte codewords
19879       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
19880 
19881       // 3. prepare bitmask for 8-bit lookup
19882       const uint32_t M0 = one_byte_bitmask & 0x55555555;
19883       const uint32_t M1 = M0 >> 7;
19884       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
19885       // 4. pack the bytes
19886 
19887       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
19888       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
19889 
19890       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
19891       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
19892 
19893       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
19894       // 5. store bytes
19895       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
19896       utf8_output += row[0];
19897       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
19898       utf8_output += row_2[0];
19899 
19900       // 6. adjust pointers
19901       buf += 16;
19902       continue;
19903     }
19904     // Must check for overflow in packing
19905     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
19906     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
19907     if (saturation_bitmask == 0xffffffff) {
19908       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
19909       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
19910       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
19911 
19912       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
19913                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
19914                                               0x0000, 0x0202, 0x0404, 0x0606,
19915                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
19916 
19917       /* In this branch we handle three cases:
19918         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
19919         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
19920         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
19921 
19922         We expand the input word (16-bit) into two code units (32-bit), thus
19923         we have room for four bytes. However, we need five distinct bit
19924         layouts. Note that the last byte in cases #2 and #3 is the same.
19925 
19926         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
19927         in register t2.
19928 
19929         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
19930         either byte 1 for case #2 or byte 2 for case #3. Note that they
19931         differ by exactly one bit.
19932 
19933         Finally from these two code units we build proper UTF-8 sequence, taking
19934         into account the case (i.e, the number of bytes to write).
19935       */
19936       /**
19937        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
19938        * t2 => [0ccc|cccc] [10cc|cccc]
19939        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
19940        */
19941 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
19942       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
19943       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
19944       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
19945       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
19946       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
19947       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
19948 
19949       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
19950       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
19951       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
19952       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
19953       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
19954       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
19955       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
19956       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
19957       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
19958       const __m256i s4 = _mm256_xor_si256(s3, m0);
19959 #undef simdutf_vec
19960 
19961       // 4. expand code units 16-bit => 32-bit
19962       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
19963       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
19964 
19965       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
19966       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
19967                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
19968       // Due to the wider registers, the following path is less likely to be useful.
19969       /*if(mask == 0) {
19970         // We only have three-byte code units. Use fast path.
19971         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
19972         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
19973         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
19974         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
19975         utf8_output += 12;
19976         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
19977         utf8_output += 12;
19978         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
19979         utf8_output += 12;
19980         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
19981         utf8_output += 12;
19982         buf += 16;
19983         continue;
19984       }*/
19985       const uint8_t mask0 = uint8_t(mask);
19986       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
19987       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
19988       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
19989 
19990       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
19991       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
19992       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
19993       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
19994 
19995       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
19996       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
19997       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
19998       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
19999 
20000 
20001       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
20002       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
20003       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20004       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20005 
20006       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20007       utf8_output += row0[0];
20008       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20009       utf8_output += row1[0];
20010       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20011       utf8_output += row2[0];
20012       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20013       utf8_output += row3[0];
20014       buf += 16;
20015     } else {
20016       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20017       // Let us do a scalar fallback.
20018       // It may seem wasteful to use scalar code, but being efficient with SIMD
20019       // may require large, non-trivial tables?
20020       size_t forward = 15;
20021       size_t k = 0;
20022       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20023       for(; k < forward; k++) {
20024         uint32_t word = buf[k];
20025         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
20026           *utf8_output++ = char(word);
20027         } else if((word & 0xFFFFF800)==0) { // 2-byte
20028           *utf8_output++ = char((word>>6) | 0b11000000);
20029           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20030         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
20031           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
20032           *utf8_output++ = char((word>>12) | 0b11100000);
20033           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20034           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20035         } else {  // 4-byte
20036           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
20037           *utf8_output++ = char((word>>18) | 0b11110000);
20038           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20039           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20040           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20041         }
20042       }
20043       buf += k;
20044     }
20045   } // while
20046 
20047   // check for invalid input
20048   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
20049   if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
20050     return std::make_pair(nullptr, utf8_output);
20051   }
20052 
20053   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
20054 
20055   return std::make_pair(buf, utf8_output);
20056 }
20057 
20058 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
20059 std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
20060   const char32_t* end = buf + len;
20061   const char32_t* start = buf;
20062 
20063   const __m256i v_0000 = _mm256_setzero_si256();
20064   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
20065   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
20066   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
20067   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
20068   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
20069   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
20070 
20071   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20072 
20073   while (buf + 16 + safety_margin <= end) {
20074     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20075     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
20076     // Check for too large input
20077     const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
20078     if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
20079       return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
20080     }
20081 
20082     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
20083     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
20084     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
20085 
20086     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
20087 
20088     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
20089       // 1. pack the bytes
20090       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
20091       // 2. store (16 bytes)
20092       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
20093       // 3. adjust pointers
20094       buf += 16;
20095       utf8_output += 16;
20096       continue; // we are done for this round!
20097     }
20098     // no bits set above 7th bit
20099     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
20100     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
20101 
20102     // no bits set above 11th bit
20103     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
20104     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
20105     if (one_or_two_bytes_bitmask == 0xffffffff) {
20106       // 1. prepare 2-byte values
20107       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
20108       // expected output   : [110a|aaaa|10bb|bbbb] x 8
20109       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
20110       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
20111 
20112       // t0 = [000a|aaaa|bbbb|bb00]
20113       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
20114       // t1 = [000a|aaaa|0000|0000]
20115       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
20116       // t2 = [0000|0000|00bb|bbbb]
20117       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
20118       // t3 = [000a|aaaa|00bb|bbbb]
20119       const __m256i t3 = _mm256_or_si256(t1, t2);
20120       // t4 = [110a|aaaa|10bb|bbbb]
20121       const __m256i t4 = _mm256_or_si256(t3, v_c080);
20122 
20123       // 2. merge ASCII and 2-byte codewords
20124       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
20125 
20126       // 3. prepare bitmask for 8-bit lookup
20127       const uint32_t M0 = one_byte_bitmask & 0x55555555;
20128       const uint32_t M1 = M0 >> 7;
20129       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
20130       // 4. pack the bytes
20131 
20132       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
20133       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
20134 
20135       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
20136       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
20137 
20138       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
20139       // 5. store bytes
20140       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
20141       utf8_output += row[0];
20142       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
20143       utf8_output += row_2[0];
20144 
20145       // 6. adjust pointers
20146       buf += 16;
20147       continue;
20148     }
20149     // Must check for overflow in packing
20150     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
20151     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
20152     if (saturation_bitmask == 0xffffffff) {
20153       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
20154 
20155       // Check for illegal surrogate code units
20156       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
20157       const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
20158       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
20159         return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
20160       }
20161 
20162       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
20163                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
20164                                               0x0000, 0x0202, 0x0404, 0x0606,
20165                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
20166 
20167       /* In this branch we handle three cases:
        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
20169         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
20170         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
20171 
20172         We expand the input word (16-bit) into two code units (32-bit), thus
20173         we have room for four bytes. However, we need five distinct bit
20174         layouts. Note that the last byte in cases #2 and #3 is the same.
20175 
20176         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
20177         in register t2.
20178 
20179         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
20180         either byte 1 for case #2 or byte 2 for case #3. Note that they
20181         differ by exactly one bit.
20182 
20183         Finally from these two code units we build proper UTF-8 sequence, taking
20184         into account the case (i.e, the number of bytes to write).
20185       */
20186       /**
20187        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
20188        * t2 => [0ccc|cccc] [10cc|cccc]
20189        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
20190        */
20191 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
20192       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
20193       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
20194       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
20195       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
20196       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
20197       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
20198 
20199       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
20200       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
20201       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
20202       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
20203       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
20204       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
20205       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
20206       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
20207       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
20208       const __m256i s4 = _mm256_xor_si256(s3, m0);
20209 #undef simdutf_vec
20210 
20211       // 4. expand code units 16-bit => 32-bit
20212       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
20213       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
20214 
20215       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
20216       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
20217                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
20218       // Due to the wider registers, the following path is less likely to be useful.
20219       /*if(mask == 0) {
20220         // We only have three-byte code units. Use fast path.
20221         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
20222         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
20223         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20224         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
20225         utf8_output += 12;
20226         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
20227         utf8_output += 12;
20228         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
20229         utf8_output += 12;
20230         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
20231         utf8_output += 12;
20232         buf += 16;
20233         continue;
20234       }*/
20235       const uint8_t mask0 = uint8_t(mask);
20236       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
20237       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
20238       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
20239 
20240       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
20241       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
20242       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
20243       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
20244 
20245       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
20246       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
20247       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
20248       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20249 
20250 
20251       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
20252       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
20253       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20254       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20255 
20256       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20257       utf8_output += row0[0];
20258       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20259       utf8_output += row1[0];
20260       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20261       utf8_output += row2[0];
20262       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20263       utf8_output += row3[0];
20264       buf += 16;
20265     } else {
20266       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20267       // Let us do a scalar fallback.
20268       // It may seem wasteful to use scalar code, but being efficient with SIMD
20269       // may require large, non-trivial tables?
20270       size_t forward = 15;
20271       size_t k = 0;
20272       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20273       for(; k < forward; k++) {
20274         uint32_t word = buf[k];
20275         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
20276           *utf8_output++ = char(word);
20277         } else if((word & 0xFFFFF800)==0) { // 2-byte
20278           *utf8_output++ = char((word>>6) | 0b11000000);
20279           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20280         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
20281           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
20282           *utf8_output++ = char((word>>12) | 0b11100000);
20283           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20284           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20285         } else {  // 4-byte
20286           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
20287           *utf8_output++ = char((word>>18) | 0b11110000);
20288           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20289           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20290           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20291         }
20292       }
20293       buf += k;
20294     }
20295   } // while
20296 
20297   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
20298 }
20299 /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
20300 /* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
20301 // file included directly
20302 
20303 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
20304 template <endianness big_endian>
20305 std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
20306   const char32_t* end = buf + len;
20307 
20308   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20309   __m256i forbidden_bytemask = _mm256_setzero_si256();
20310 
20311 
20312   while (buf + 8 + safety_margin <= end) {
20313     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20314 
20315     const __m256i v_00000000 = _mm256_setzero_si256();
20316     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
20317 
20318     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
20319     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
20320     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
20321 
20322     if (saturation_bitmask == 0xffffffff) {
20323       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
20324       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
20325       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
20326 
20327       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20328       if (big_endian) {
20329         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
20330         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
20331       }
20332       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
20333       utf16_output += 8;
20334       buf += 8;
20335     } else {
20336       size_t forward = 7;
20337       size_t k = 0;
20338       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20339       for(; k < forward; k++) {
20340         uint32_t word = buf[k];
20341         if((word & 0xFFFF0000)==0) {
20342           // will not generate a surrogate pair
20343           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
20344           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20345         } else {
20346           // will generate a surrogate pair
20347           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
20348           word -= 0x10000;
20349           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20350           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20351           if (big_endian) {
20352             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
20353             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
20354           }
20355           *utf16_output++ = char16_t(high_surrogate);
20356           *utf16_output++ = char16_t(low_surrogate);
20357         }
20358       }
20359       buf += k;
20360     }
20361   }
20362 
20363   // check for invalid input
20364   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
20365 
20366   return std::make_pair(buf, utf16_output);
20367 }
20368 
20369 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
20370 template <endianness big_endian>
20371 std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
20372   const char32_t* start = buf;
20373   const char32_t* end = buf + len;
20374 
20375   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
20376 
20377   while (buf + 8 + safety_margin <= end) {
20378     __m256i in = _mm256_loadu_si256((__m256i*)buf);
20379 
20380     const __m256i v_00000000 = _mm256_setzero_si256();
20381     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
20382 
20383     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
20384     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
20385     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
20386 
20387     if (saturation_bitmask == 0xffffffff) {
20388       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
20389       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
20390       const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
20391       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
20392         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
20393       }
20394 
20395       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20396       if (big_endian) {
20397         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
20398         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
20399       }
20400       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
20401       utf16_output += 8;
20402       buf += 8;
20403     } else {
20404       size_t forward = 7;
20405       size_t k = 0;
20406       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
20407       for(; k < forward; k++) {
20408         uint32_t word = buf[k];
20409         if((word & 0xFFFF0000)==0) {
20410           // will not generate a surrogate pair
20411           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
20412           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20413         } else {
20414           // will generate a surrogate pair
20415           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
20416           word -= 0x10000;
20417           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20418           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20419           if (big_endian) {
20420             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
20421             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
20422           }
20423           *utf16_output++ = char16_t(high_surrogate);
20424           *utf16_output++ = char16_t(low_surrogate);
20425         }
20426       }
20427       buf += k;
20428     }
20429   }
20430 
20431   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
20432 }
20433 /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
20434 /* begin file src/icelake/icelake_ascii_validation.inl.cpp */
20435 // file included directly
20436 
// Returns true when none of the `len` bytes at `buf` has its most
// significant bit set, i.e. the buffer is pure ASCII.
bool validate_ascii(const char* buf, size_t len) {
  const char* const stop = buf + len;
  const __m512i high_bit = _mm512_set1_epi8((uint8_t)0x80);

  // OR of (chunk & 0x80) over everything inspected so far.
  __m512i seen = _mm512_setzero_si512();

  // Full 64-byte chunks.
  while (buf + 64 <= stop) {
    const __m512i chunk = _mm512_loadu_si512((const __m512i*)buf);
    seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8); // seen | (chunk & high_bit)
    buf += 64;
  }

  // Remaining 1..63 bytes: masked load, the bytes past the end read as zero.
  if (buf < stop) {
    const __m512i chunk = _mm512_maskz_loadu_epi8((uint64_t(1) << (stop - buf)) - 1, (const __m512i*)buf);
    seen = _mm512_ternarylogic_epi32(seen, chunk, high_bit, 0xf8); // seen | (chunk & high_bit)
  }

  return _mm512_test_epi8_mask(seen, seen) == 0;
}
20451 /* end file src/icelake/icelake_ascii_validation.inl.cpp */
20452 /* begin file src/icelake/icelake_utf32_validation.inl.cpp */
20453 // file included directly
20454 
/**
 * Checks the leading full 16-word blocks of a UTF-32 buffer.
 *
 * A word is rejected when it is larger than 0x10FFFF or lies in the surrogate
 * range 0xD800..0xDFFF.
 *
 * @param buf  input words
 * @param len  number of words at buf
 * @return nullptr when an invalid word was found in the processed blocks;
 *         otherwise a pointer to the first word that was NOT processed (the
 *         last `len % 16` words are not inspected here). For len < 16 nothing
 *         is inspected and buf itself is returned (which may be nullptr when
 *         the caller passes a null buffer).
 */
const char32_t* validate_utf32(const char32_t* buf, size_t len) {
    // Not even one full vector: return before forming any end pointer. This
    // avoids comparing buf against a null sentinel (unspecified for unrelated
    // pointers, and true for a null buf, which would trigger a load).
    if (len < 16) {
      return buf;
    }
    const char32_t* end = buf + len - 16; // last position where a full load fits

    // Adding 0xffff2000 (mod 2^32) maps the surrogate range 0xD800..0xDFFF to
    // 0xfffff800..0xffffffff and every other value below 0xfffff800, so
    // surrogates can be detected with a single running maximum.
    const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
    __m512i currentmax = _mm512_setzero_si512();
    __m512i currentoffsetmax = _mm512_setzero_si512();

    while (buf <= end) {
      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
      buf += 16;
      currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
      currentmax = _mm512_max_epu32(utf32, currentmax);
    }

    // max(x, limit) ^ limit is non-zero exactly when x > limit.
    const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
    const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
    __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
    if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
      return nullptr; // some word is above 0x10FFFF
    }
    is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
    if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
      return nullptr; // some word is a surrogate
    }

    return buf;
}
20482 /* end file src/icelake/icelake_utf32_validation.inl.cpp */
20483 /* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
20484 // file included directly
20485 
// Converts up to 64 Latin-1 bytes held in `input` to UTF-8 and stores the
// result at `utf8_output`.
//
//   input        the Latin-1 bytes; lanes at index >= input_len are expected
//                to be zero (the visible caller loads the tail with a
//                zeroing mask)
//   input_len    number of meaningful bytes in `input`
//   utf8_output  destination
//   mask_output  non-zero: use masked stores so only the produced bytes are
//                written; zero: use two full 64-byte stores, which may write
//                past the produced bytes
//
// Returns input_len plus the number of bytes with the high bit set, i.e. the
// number of UTF-8 bytes produced (each such byte becomes two UTF-8 bytes).
static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, char *utf8_output, int mask_output) {
  // One bit per input byte: set when the byte has its high bit set (non-ASCII)
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  size_t output_size = input_len + (size_t)count_ones(nonascii);

  // One bit per input byte: set when the byte is >= 0xC0 (unsigned compare).
  // Those bytes need the leading byte 0xC3 instead of 0xC2.
  __mmask64 sixth =
      _mm512_cmpge_epu8_mask(input, _mm512_set1_epi8(-64)); // -64 as a byte is 1100 0000

  const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
  uint64_t ascii = ~nonascii;
  // Spread the 32 low (resp. high) ASCII bits over the even bit positions and
  // invert: in the resulting byte-keep masks every odd bit (the data byte of
  // a 16-bit lane) is set, and an even bit (the leading byte of the lane) is
  // set only when the corresponding input byte is non-ASCII.
  uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
  uint64_t maskB = ~_pdep_u64(ascii>>32, alternate_bits);

  // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD):
  // 16-bit lane i holds input[i] in its low byte and input[32+i] in its high byte
  __m512i input_interleaved = _mm512_permutexvar_epi8(_mm512_set_epi32(
    0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
    0x37173616, 0x35153414, 0x33133212, 0x31113010,
    0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
    0x27072606, 0x25052404, 0x23032202, 0x21012000
  ), input);

  // double size of each byte, and insert the leading byte 1100 0010 (0xC2)

/*
First 32 input bytes: widen each byte to a 16-bit lane whose low byte is the
leading byte 0xC2 and whose high byte is the input byte (the funnel shift
moves the low byte of each interleaved lane up and shifts 0xC2 in below it).
Lanes whose input byte is >= 0xC0 are then adjusted: the leading byte becomes
0xC3 and bit 6 of the data byte is cleared. */
  __m512i outputA = _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
  outputA = _mm512_mask_add_epi16(
                                  outputA,
                                 (__mmask32)sixth,
                                  outputA,
                                  _mm512_set1_epi16(1 - 0x4000)); // 1 - 0x4000 is 0xC001 as 16 bits: +1 on the low byte, -0x40 on the high byte

  // Last 32 input bytes: pick the per-lane constant depending on whether the
  // input byte is >= 0xC0 (second option) or not (first option)
  __m512i leadingB = _mm512_mask_blend_epi16(
                                              (__mmask32)(sixth>>32),
                                              _mm512_set1_epi16(0x00c2), // 0000 0000 1100 0010
                                              _mm512_set1_epi16(0x40c3));// 0100 0000 1100 0011
  // Keep the high byte of each interleaved lane (input[32+i]) and XOR in the
  // constant: the low byte becomes the leading byte, and for bytes >= 0xC0
  // bit 6 of the data byte is cleared.
  __m512i outputB = _mm512_ternarylogic_epi32(
                                              input_interleaved,
                                              leadingB,
                                              _mm512_set1_epi16((short)0xff00),
                                              (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB

  // prune redundant bytes (the leading bytes of ASCII lanes)
  outputA = _mm512_maskz_compress_epi8(maskA, outputA);
  outputB = _mm512_maskz_compress_epi8(maskB, outputB);


  // Bytes produced by the first 32 input bytes: 32 plus one per non-ASCII byte
  size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;

  if(mask_output) {
    if(input_len > 32) { // is the second half of the input vector used?
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
      utf8_output += output_sizeA;
      write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
    } else {
      // Only (part of) the first half is used: output_size bytes of outputA
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
    }
  } else {
    // Unmasked: two full 64-byte stores; the second one starts right after
    // the bytes produced by the first half.
    _mm512_storeu_si512(utf8_output, outputA);
    utf8_output += output_sizeA;
    _mm512_storeu_si512(utf8_output, outputB);
  }
  return output_size;
}
20556 
20557 static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_output) {
20558   __mmask64 nonascii = _mm512_movepi8_mask(input);
20559   if(nonascii) {
20560     return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
20561   } else {
20562     _mm512_storeu_si512(utf8_output, input);
20563     return 64;
20564   }
20565 }
20566 
20567 size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, char *utf8_output) {
20568   char *start = utf8_output;
20569   size_t pos = 0;
20570   // if there's at least 128 bytes remaining, we don't need to mask the output
20571   for (; pos + 128 <= len; pos += 64) {
20572     __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
20573     utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
20574   }
20575   // in the last 128 bytes, the first 64 may require masking the output
20576   if (pos + 64 <= len) {
20577     __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
20578     utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
20579     pos += 64;
20580   }
20581   // with the last 64 bytes, the input also needs to be masked
20582   if (pos < len) {
20583     __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos));
20584     __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
20585     utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
20586   }
20587   return (size_t)(utf8_output - start);
20588 }
20589 /* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
20590 /* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
20591 // file included directly
20592 template <endianness big_endian>
20593 size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
20594                                        char16_t *utf16_output) {
20595   size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32
20596 
20597   __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
20598                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
20599                                        0x0607040502030001, 0x0e0f0c0d0a0b0809,
20600                                        0x0607040502030001, 0x0e0f0c0d0a0b0809);
20601   for (size_t i = 0; i < rounded_len; i += 32) {
20602     // Load 32 Latin1 characters into a 256-bit register
20603     __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
20604     // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
20605     __m512i out = _mm512_cvtepu8_epi16(in);
20606     if (big_endian) {
20607       out = _mm512_shuffle_epi8(out, byteflip);
20608     }
20609     // Store the results back to memory
20610     _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
20611   }
20612   if (rounded_len != len) {
20613     uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1;
20614     __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);
20615 
20616     // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
20617     __m512i out = _mm512_cvtepu8_epi16(in);
20618     if (big_endian) {
20619       out = _mm512_shuffle_epi8(out, byteflip);
20620     }
20621     // Store the results back to memory
20622     _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
20623   }
20624 
20625   return len;
20626 }
20627 /* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
20628 /* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
// Converts the leading multiple-of-16 part of a Latin-1 buffer to UTF-32 by
// zero-extending every byte to 32 bits.
//
// Returns (first unconverted input byte, first unwritten output word); the
// last `len % 16` characters are left for the caller.
std::pair<const char*, char32_t*> avx512_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
    const size_t vectorized = len & ~0xF;  // largest multiple of 16 not above len

    const char* in = buf;
    char32_t* out = utf32_output;
    const char* const stop = buf + vectorized;
    while (in != stop) {
        // 16 Latin1 bytes -> 16 zero-extended 32-bit words (vpmovzxbd)
        const __m128i latin1 = _mm_loadu_si128((__m128i*)in);
        const __m512i widened = _mm512_cvtepu8_epi32(latin1);
        _mm512_storeu_si512((__m512i*)out, widened);
        in += 16;
        out += 16;
    }

    return std::make_pair(in, out);
}
20646 /* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
20647 
20648 
20649 #include <cstdint>
20650 
20651 } // namespace
20652 } // namespace icelake
20653 } // namespace simdutf
20654 
20655 namespace simdutf {
20656 namespace icelake {
20657 
// Guesses which encodings `input` could be valid in.
//
// A byte-order mark, when present, decides the answer. Otherwise:
//  - odd-length input can only be UTF-8, so only UTF-8 validation is run;
//  - even-length input is scanned 64 bytes at a time. As long as no 16-bit
//    lane holds a surrogate value, the data is tracked for UTF-8 and UTF-32
//    at once. The first chunk that contains a surrogate settles the question
//    between UTF-16LE and UTF-32LE and returns immediately.
//
// Returns a bitwise OR of encoding_type values (UTF8, UTF16_LE, UTF32_LE),
// or encoding_type::unspecified when no candidate matches.
simdutf_warn_unused int
implementation::detect_encodings(const char *input,
                                 size_t length) const noexcept {
  // If there is a BOM, then we trust it.
  auto bom_encoding = simdutf::BOM::check_bom(input, length);
  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
  if (length % 2 == 0) {
    const char *buf = input;

    const char *start = buf;
    const char *end = input + length;

    // Candidate encodings still considered possible
    bool is_utf8 = true;
    bool is_utf16 = true;
    bool is_utf32 = true;

    int out = 0;

    avx512_utf8_checker checker{};
    // Running per-lane maximum of the 32-bit words seen so far (UTF-32 range check)
    __m512i currentmax = _mm512_setzero_si512();
    while (buf + 64 <= end) {
      __m512i in = _mm512_loadu_si512((__m512i *)buf);
      // A 16-bit lane is a surrogate iff (lane - 0xD800) < 0x0800 (unsigned)
      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
      __mmask32 surrogates =
          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
      if (surrogates) {
        is_utf8 = false;

        // Can still be either UTF-16LE or UTF-32 depending on the positions
        // of the surrogates To be valid UTF-32, a surrogate cannot be in the
        // two most significant bytes of any 32-bit word. On the other hand, to
        // be valid UTF-16LE, at least one surrogate must be in the two most
        // significant bytes of a 32-bit word since they always come in pairs in
        // UTF-16LE. Note that we always proceed in multiple of 4 before this
        // point so there is no offset in 32-bit code units.

        // 0xaaaaaaaa selects the odd 16-bit lanes, i.e. the upper half of each 32-bit word
        if ((surrogates & 0xaaaaaaaa) != 0) {
          is_utf32 = false;
          // (lane - 0xD800) < 0x0400 selects the high surrogates 0xD800..0xDBFF
          __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
              diff, _mm512_set1_epi16(uint16_t(0x0400)));
          __mmask32 lowsurrogates = surrogates ^ highsurrogates;
          // high must be followed by low
          // (the shift drops a high surrogate in the last lane; it is handled below)
          if ((highsurrogates << 1) != lowsurrogates) {
            return simdutf::encoding_type::unspecified;
          }

          bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
          if (ends_with_high) {
            buf +=
                31 *
                sizeof(char16_t); // advance only by 31 code units so that we start
                                  // with the high surrogate on the next round.
          } else {
            buf += 32 * sizeof(char16_t);
          }
          // The rest of the input decides: it is either valid UTF-16LE or nothing.
          is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
                                      (end - buf) / sizeof(char16_t));
          if (!is_utf16) {
            return simdutf::encoding_type::unspecified;

          } else {
            return simdutf::encoding_type::UTF16_LE;
          }

        } else {
          // Surrogate values appear only in the lower half of 32-bit words
          is_utf16 = false;
          // Check for UTF-32
          if (length % 4 == 0) {
            // Validation starts at the current chunk and runs to the end of the input.
            // NOTE(review): `currentmax` gathered from earlier chunks is not
            // consulted on this path -- confirm that earlier chunks cannot
            // hold words above 0x10FFFF when UTF32_LE is returned here.
            // NOTE(review): the call is unqualified inside a member function;
            // confirm which validate_utf32 overload it resolves to and that
            // the one chosen also checks the trailing (< 16 word) part.
            const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
            const char32_t *end32 =
                reinterpret_cast<const char32_t *>(start) + length / 4;
            if (validate_utf32(input32, end32 - input32)) {
              return simdutf::encoding_type::UTF32_LE;
            }
          }
          return simdutf::encoding_type::unspecified;
        }
      }
      // If no surrogate, validate under other encodings as well

      // UTF-32 validation
      currentmax = _mm512_max_epu32(in, currentmax);

      // UTF-8 validation
      checker.check_next_input(in);

      buf += 64;
    }

    // Check which encodings are possible
    // (only reached when no full 64-byte chunk contained a surrogate;
    // fewer than 64 bytes remain at buf)

    if (is_utf8) {
      size_t current_length = static_cast<size_t>(buf - start);
      if (current_length != length) {
        // Feed the remaining bytes through a zero-masked load
        const __m512i utf8 = _mm512_maskz_loadu_epi8(
            (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
        checker.check_next_input(utf8);
      }
      checker.check_eof();
      if (!checker.errors()) {
        out |= simdutf::encoding_type::UTF8;
      }
    }

    // The full chunks held no surrogates, so only the tail needs UTF-16 validation
    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
                        reinterpret_cast<const char16_t *>(buf),
                        (length - (buf - start)) / 2)) {
      out |= simdutf::encoding_type::UTF16_LE;
    }

    if (is_utf32 && (length % 4 == 0)) {
      // Fold the tail into the running maximum and range-check everything at once.
      // NOTE(review): only the upper bound 0x10FFFF is tested here; the tail
      // is not scanned for surrogate values -- confirm this is intended.
      currentmax = _mm512_max_epu32(
          _mm512_maskz_loadu_epi8(
              (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
              (const __m512i *)buf),
          currentmax);
      __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
                                _MM_CMPINT_GT);
      if (outside_range == 0) {
        out |= simdutf::encoding_type::UTF32_LE;
      }
    }

    return out;
  } else if (implementation::validate_utf8(input, length)) {
    // Odd length: UTF-8 is the only candidate
    return simdutf::encoding_type::UTF8;
  } else {
    return simdutf::encoding_type::unspecified;
  }
}
20788 
20789 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
20790     avx512_utf8_checker checker{};
20791     const char* ptr = buf;
20792     const char* end = ptr + len;
20793     for (; ptr + 64 <= end; ptr += 64) {
20794         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
20795         checker.check_next_input(utf8);
20796     }
20797     {
20798        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
20799        checker.check_next_input(utf8);
20800     }
20801     checker.check_eof();
20802     return ! checker.errors();
20803 }
20804 
20805 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
20806     avx512_utf8_checker checker{};
20807     const char* ptr = buf;
20808     const char* end = ptr + len;
20809     size_t count{0};
20810     for (; ptr + 64 <= end; ptr += 64) {
20811       const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
20812       checker.check_next_input(utf8);
20813       if(checker.errors()) {
20814         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
20815         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
20816         res.count += count;
20817         return res;
20818       }
20819       count += 64;
20820     }
20821     {
20822       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
20823       checker.check_next_input(utf8);
20824       if(checker.errors()) {
20825         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
20826         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
20827         res.count += count;
20828         return res;
20829       } else {
20830         return result(error_code::SUCCESS, len);
20831       }
20832     }
20833 }
20834 
20835 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
20836   return icelake::validate_ascii(buf, len);
20837 }
20838 
20839 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
20840   const char* buf_orig = buf;
20841   const char* end = buf + len;
20842   const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
20843   for (; buf + 64 <= end; buf += 64) {
20844     const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
20845     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
20846     if(notascii) {
20847       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
20848     }
20849   }
20850   {
20851     const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
20852     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
20853     if(notascii) {
20854       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
20855     }
20856   }
20857   return result(error_code::SUCCESS, len);
20858 }
20859 
20860 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
20861     const char16_t *end = buf + len;
20862 
20863     for(;buf + 32 <= end; ) {
20864       __m512i in = _mm512_loadu_si512((__m512i*)buf);
20865       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20866       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20867       if(surrogates) {
20868         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20869         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20870         // high must be followed by low
20871         if ((highsurrogates << 1) != lowsurrogates) {
20872            return false;
20873         }
20874         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
20875         if(ends_with_high) {
20876           buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round.
20877         } else {
20878           buf += 32;
20879         }
20880       } else {
20881         buf += 32;
20882       }
20883     }
20884     if(buf < end) {
20885       __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
20886       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20887       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20888       if(surrogates) {
20889         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20890         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20891         // high must be followed by low
20892         if ((highsurrogates << 1) != lowsurrogates) {
20893            return false;
20894         }
20895       }
20896     }
20897     return true;
20898 }
20899 
20900 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
20901    const char16_t *end = buf + len;
20902    const __m512i byteflip = _mm512_setr_epi64(
20903             0x0607040502030001,
20904             0x0e0f0c0d0a0b0809,
20905             0x0607040502030001,
20906             0x0e0f0c0d0a0b0809,
20907             0x0607040502030001,
20908             0x0e0f0c0d0a0b0809,
20909             0x0607040502030001,
20910             0x0e0f0c0d0a0b0809
20911         );
20912     for(;buf + 32 <= end; ) {
20913       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
20914       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20915       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20916       if(surrogates) {
20917         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20918         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20919         // high must be followed by low
20920         if ((highsurrogates << 1) != lowsurrogates) {
20921            return false;
20922         }
20923         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
20924         if(ends_with_high) {
20925           buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round.
20926         } else {
20927           buf += 32;
20928         }
20929       } else {
20930         buf += 32;
20931       }
20932     }
20933     if(buf < end) {
20934       __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
20935       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20936       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20937       if(surrogates) {
20938         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20939         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20940         // high must be followed by low
20941         if ((highsurrogates << 1) != lowsurrogates) {
20942            return false;
20943         }
20944       }
20945     }
20946     return true;
20947 }
20948 
20949 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
20950     const char16_t *start_buf = buf;
20951     const char16_t *end = buf + len;
20952     for(;buf + 32 <= end; ) {
20953       __m512i in = _mm512_loadu_si512((__m512i*)buf);
20954       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20955       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20956       if(surrogates) {
20957         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20958         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20959         // high must be followed by low
20960         if ((highsurrogates << 1) != lowsurrogates) {
20961           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
20962           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
20963           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
20964         }
20965         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
20966         if(ends_with_high) {
20967           buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round.
20968         } else {
20969           buf += 32;
20970         }
20971       } else {
20972         buf += 32;
20973       }
20974     }
20975     if(buf < end) {
20976       __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
20977       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
20978       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
20979       if(surrogates) {
20980         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
20981         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
20982         // high must be followed by low
20983         if ((highsurrogates << 1) != lowsurrogates) {
20984           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
20985           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
20986           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
20987         }
20988       }
20989     }
20990     return result(error_code::SUCCESS, len);
20991 }
20992 
20993 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
20994     const char16_t *start_buf = buf;
20995     const char16_t *end = buf + len;
20996     const __m512i byteflip = _mm512_setr_epi64(
20997             0x0607040502030001,
20998             0x0e0f0c0d0a0b0809,
20999             0x0607040502030001,
21000             0x0e0f0c0d0a0b0809,
21001             0x0607040502030001,
21002             0x0e0f0c0d0a0b0809,
21003             0x0607040502030001,
21004             0x0e0f0c0d0a0b0809
21005         );
21006     for(;buf + 32 <= end; ) {
21007       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
21008       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
21009       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
21010       if(surrogates) {
21011         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
21012         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
21013         // high must be followed by low
21014         if ((highsurrogates << 1) != lowsurrogates) {
21015           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
21016           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
21017           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
21018         }
21019         bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
21020         if(ends_with_high) {
21021           buf += 31; // advance only by 31 code units so that we start with the high surrogate on the next round.
21022         } else {
21023           buf += 32;
21024         }
21025       } else {
21026         buf += 32;
21027       }
21028     }
21029     if(buf < end) {
21030       __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
21031       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
21032       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
21033       if(surrogates) {
21034         __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
21035         __mmask32 lowsurrogates = surrogates ^ highsurrogates;
21036         // high must be followed by low
21037         if ((highsurrogates << 1) != lowsurrogates) {
21038           uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
21039           uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
21040           return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
21041         }
21042       }
21043     }
21044     return result(error_code::SUCCESS, len);
21045 }
21046 
21047 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
21048   const char32_t * tail = icelake::validate_utf32(buf, len);
21049   if (tail) {
21050     return scalar::utf32::validate(tail, len - (tail - buf));
21051   } else {
21052     return false;
21053   }
21054 }
21055 
21056 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
21057 
21058     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
21059     const char32_t* buf_orig = buf;
21060     while (buf <= end) {
21061       __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
21062       __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
21063                                 _MM_CMPINT_GT);
21064       if (outside_range) {
21065         return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
21066       }
21067 
21068       __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
21069 
21070       __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
21071                                 _MM_CMPINT_GT);
21072       if (surrogate_range) {
21073         return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
21074       }
21075       buf += 16;
21076     }
21077     if(buf < buf_orig + len) {
21078       __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf);
21079       __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
21080                                 _MM_CMPINT_GT);
21081       if (outside_range) {
21082         return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
21083       }
21084       __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
21085 
21086       __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
21087                                 _MM_CMPINT_GT);
21088       if (surrogate_range) {
21089         return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
21090       }
21091     }
21092 
21093     return result(error_code::SUCCESS, len);
21094 }
21095 
21096 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
21097   return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output);
21098 }
21099 
21100 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21101   return icelake_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
21102 }
21103 
21104 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21105   return icelake_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
21106 }
21107 
21108 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
21109     std::pair<const char*, char32_t*> ret = avx512_convert_latin1_to_utf32(buf, len, utf32_output);
21110     if (ret.first == nullptr) { return 0; }
21111     size_t converted_chars = ret.second - utf32_output;
21112     if (ret.first != buf + len) {
21113         const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
21114                                               ret.first, len - (ret.first - buf), ret.second);
21115         if (scalar_converted_chars == 0) { return 0; }
21116         converted_chars += scalar_converted_chars;
21117     }
21118     return converted_chars;
21119 }
21120 
21121 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
21122   return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
21123 }
21124 
21125 
21126 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
21127   // Initialize output length and input length counters
21128   size_t inlen = 0;
21129 
21130   // First, try to convert as much as possible using the SIMD implementation.
21131   inlen = icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
21132 
21133   // If we have completely converted the string
21134   if(inlen == len) {
21135     return {simdutf::SUCCESS, len};
21136   }
21137 
21138   // Else if there are remaining bytes, use the scalar function to process them.
21139   // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes
21140   // the input buffer, length, and output buffer, and returns a result object with an error code
21141   // and the number of characters processed.
21142   result res = scalar::utf8_to_latin1::convert_with_errors(buf + inlen, len - inlen, latin1_output + inlen);
21143   res.count += inlen; // Add the number of characters processed by the SIMD implementation
21144 
21145   return res;
21146 }
21147 
21148 
21149 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
21150   return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
21151 }
21152 
21153 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21154   utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
21155   if (ret.second == nullptr) {
21156     return 0;
21157   }
21158   return ret.second - utf16_output;
21159 }
21160 
21161 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21162   utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
21163   if (ret.second == nullptr) {
21164     return 0;
21165   }
21166   return ret.second - utf16_output;
21167 }
21168 
21169 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21170    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
21171 }
21172 
21173 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21174    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
21175 }
21176 
21177 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21178   utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
21179   size_t saved_bytes = ret.second - utf16_output;
21180   const char* end = buf + len;
21181   if (ret.first == end) {
21182     return saved_bytes;
21183   }
21184 
21185   // Note: AVX512 procedure looks up 4 bytes forward, and
21186   //       correctly converts multi-byte chars even if their
21187   //       continuation bytes lie outsiede 16-byte window.
21188   //       It meas, we have to skip continuation bytes from
21189   //       the beginning ret.first, as they were already consumed.
21190   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
21191       ret.first += 1;
21192   }
21193 
21194   if (ret.first != end) {
21195     const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
21196                                         ret.first, len - (ret.first - buf), ret.second);
21197     if (scalar_saved_bytes == 0) { return 0; }
21198     saved_bytes += scalar_saved_bytes;
21199   }
21200 
21201   return saved_bytes;
21202 }
21203 
21204 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
21205   utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
21206   size_t saved_bytes = ret.second - utf16_output;
21207   const char* end = buf + len;
21208   if (ret.first == end) {
21209     return saved_bytes;
21210   }
21211 
21212   // Note: AVX512 procedure looks up 4 bytes forward, and
21213   //       correctly converts multi-byte chars even if their
21214   //       continuation bytes lie outsiede 16-byte window.
21215   //       It meas, we have to skip continuation bytes from
21216   //       the beginning ret.first, as they were already consumed.
21217   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
21218       ret.first += 1;
21219   }
21220 
21221   if (ret.first != end) {
21222     const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
21223                                         ret.first, len - (ret.first - buf), ret.second);
21224     if (scalar_saved_bytes == 0) { return 0; }
21225     saved_bytes += scalar_saved_bytes;
21226   }
21227 
21228   return saved_bytes;
21229 }
21230 
21231 
21232 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
21233   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
21234   utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
21235   if (ret.second == nullptr)
21236     return 0;
21237 
21238   size_t saved_bytes = ret.second - utf32_output;
21239   const char* end = buf + len;
21240   if (ret.first == end) {
21241     return saved_bytes;
21242   }
21243 
21244   // Note: the AVX512 procedure looks up 4 bytes forward, and
21245   //       correctly converts multi-byte chars even if their
21246   //       continuation bytes lie outside 16-byte window.
21247   //       It means, we have to skip continuation bytes from
21248   //       the beginning ret.first, as they were already consumed.
21249   while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
21250       ret.first += 1;
21251   }
21252 
21253   if (ret.first != end) {
21254     const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
21255                                         ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
21256     if (scalar_saved_bytes == 0) { return 0; }
21257     saved_bytes += scalar_saved_bytes;
21258   }
21259 
21260   return saved_bytes;
21261 }
21262 
21263 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
21264   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32);
21265   auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
21266   if (!std::get<2>(ret)) {
21267     auto new_buf = std::get<0>(ret);
21268     // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
21269     // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
21270     result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
21271     res.count += (std::get<0>(ret) - buf);
21272     return res;
21273   }
21274   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21275   const char* end = buf + len;
21276   if (std::get<0>(ret) == end) {
21277     return {simdutf::SUCCESS, saved_bytes};
21278   }
21279 
21280   // Note: the AVX512 procedure looks up 4 bytes forward, and
21281   //       correctly converts multi-byte chars even if their
21282   //       continuation bytes lie outside 16-byte window.
21283   //       It means, we have to skip continuation bytes from
21284   //       the beginning ret.first, as they were already consumed.
21285   while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
21286       std::get<0>(ret) += 1;
21287   }
21288 
21289   if (std::get<0>(ret) != end) {
21290     auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
21291                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
21292     if (scalar_result.error != simdutf::SUCCESS) {
21293       scalar_result.count +=  (std::get<0>(ret) - buf);
21294     } else {
21295       scalar_result.count += saved_bytes;
21296     }
21297     return scalar_result;
21298   }
21299 
21300   return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
21301 }
21302 
21303 
21304 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
21305   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
21306   utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
21307   size_t saved_bytes = ret.second - utf32_output;
21308   const char* end = buf + len;
21309   if (ret.first == end) {
21310     return saved_bytes;
21311   }
21312 
21313   // Note: AVX512 procedure looks up 4 bytes forward, and
21314   //       correctly converts multi-byte chars even if their
21315   //       continuation bytes lie outsiede 16-byte window.
21316   //       It meas, we have to skip continuation bytes from
21317   //       the beginning ret.first, as they were already consumed.
21318   while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
21319       ret.first += 1;
21320   }
21321 
21322   if (ret.first != end) {
21323     const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
21324                                         ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
21325     if (scalar_saved_bytes == 0) { return 0; }
21326     saved_bytes += scalar_saved_bytes;
21327   }
21328 
21329   return saved_bytes;
21330 }
21331 
21332 
21333 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21334   return icelake_convert_utf16_to_latin1<endianness::LITTLE>(buf,len,latin1_output);
21335 }
21336 
21337 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21338   return icelake_convert_utf16_to_latin1<endianness::BIG>(buf,len,latin1_output);
21339 }
21340 
21341 simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21342   return icelake_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(buf,len,latin1_output).first;
21343 }
21344 
21345 simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21346   return icelake_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf,len,latin1_output).first;
21347 }
21348 
21349 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21350   // optimization opportunity: implement custom function
21351   return convert_utf16be_to_latin1(buf, len, latin1_output);
21352 }
21353 
21354 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
21355   // optimization opportunity: implement custom function
21356   return convert_utf16le_to_latin1(buf, len, latin1_output);
21357 }
21358 
21359 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21360   size_t outlen;
21361   size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
21362   if(inlen != len) { return 0; }
21363   return outlen;
21364 }
21365 
21366 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21367   size_t outlen;
21368   size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
21369   if(inlen != len) { return 0; }
21370   return outlen;
21371 }
21372 
21373 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21374   size_t outlen;
21375   size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
21376   if(inlen != len) {
21377     result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
21378     res.count += inlen;
21379     return res;
21380   }
21381   return {simdutf::SUCCESS, outlen};
21382 }
21383 
21384 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21385   size_t outlen;
21386   size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
21387   if(inlen != len) {
21388     result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
21389     res.count += inlen;
21390     return res;
21391   }
21392   return {simdutf::SUCCESS, outlen};
21393 }
21394 
21395 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21396   return convert_utf16le_to_utf8(buf, len, utf8_output);
21397 }
21398 
21399 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
21400   return convert_utf16be_to_utf8(buf, len, utf8_output);
21401 }
21402 
21403 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
21404   return icelake_convert_utf32_to_latin1(buf,len,latin1_output);
21405 }
21406 
21407 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
21408   return icelake_convert_utf32_to_latin1_with_errors(buf,len,latin1_output).first;
21409 }
21410 
21411 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
21412   return icelake_convert_utf32_to_latin1(buf,len,latin1_output);
21413 }
21414 
21415 
21416 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
21417   std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
21418   if (ret.first == nullptr) { return 0; }
21419   size_t saved_bytes = ret.second - utf8_output;
21420   if (ret.first != buf + len) {
21421     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
21422                                         ret.first, len - (ret.first - buf), ret.second);
21423     if (scalar_saved_bytes == 0) { return 0; }
21424     saved_bytes += scalar_saved_bytes;
21425   }
21426   return saved_bytes;
21427 }
21428 
21429 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
21430   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
21431   std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
21432   if (ret.first.count != len) {
21433     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
21434                                         buf + ret.first.count, len - ret.first.count, ret.second);
21435     if (scalar_res.error) {
21436       scalar_res.count += ret.first.count;
21437       return scalar_res;
21438     } else {
21439       ret.second += scalar_res.count;
21440     }
21441   }
21442   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
21443   return ret.first;
21444 }
21445 
21446 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
21447   return convert_utf32_to_utf8(buf, len, utf8_output);
21448 }
21449 
21450 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21451   std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
21452   if (ret.first == nullptr) { return 0; }
21453   size_t saved_bytes = ret.second - utf16_output;
21454   if (ret.first != buf + len) {
21455     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
21456                                         ret.first, len - (ret.first - buf), ret.second);
21457     if (scalar_saved_bytes == 0) { return 0; }
21458     saved_bytes += scalar_saved_bytes;
21459   }
21460   return saved_bytes;
21461 }
21462 
21463 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21464   std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
21465   if (ret.first == nullptr) { return 0; }
21466   size_t saved_bytes = ret.second - utf16_output;
21467   if (ret.first != buf + len) {
21468     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
21469                                         ret.first, len - (ret.first - buf), ret.second);
21470     if (scalar_saved_bytes == 0) { return 0; }
21471     saved_bytes += scalar_saved_bytes;
21472   }
21473   return saved_bytes;
21474 }
21475 
21476 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21477   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
21478   std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
21479   if (ret.first.count != len) {
21480     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
21481                                         buf + ret.first.count, len - ret.first.count, ret.second);
21482     if (scalar_res.error) {
21483       scalar_res.count += ret.first.count;
21484       return scalar_res;
21485     } else {
21486       ret.second += scalar_res.count;
21487     }
21488   }
21489   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
21490   return ret.first;
21491 }
21492 
21493 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21494   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
21495   std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
21496   if (ret.first.count != len) {
21497     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
21498                                         buf + ret.first.count, len - ret.first.count, ret.second);
21499     if (scalar_res.error) {
21500       scalar_res.count += ret.first.count;
21501       return scalar_res;
21502     } else {
21503       ret.second += scalar_res.count;
21504     }
21505   }
21506   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
21507   return ret.first;
21508 }
21509 
21510 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21511   return convert_utf32_to_utf16le(buf, len, utf16_output);
21512 }
21513 
21514 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
21515   return convert_utf32_to_utf16be(buf, len, utf16_output);
21516 }
21517 
21518 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21519   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
21520   if (!std::get<2>(ret)) { return 0; }
21521   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21522   if (std::get<0>(ret) != buf + len) {
21523     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
21524                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21525     if (scalar_saved_bytes == 0) { return 0; }
21526     saved_bytes += scalar_saved_bytes;
21527   }
21528   return saved_bytes;
21529 }
21530 
21531 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21532   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
21533   if (!std::get<2>(ret)) { return 0; }
21534   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21535   if (std::get<0>(ret) != buf + len) {
21536     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
21537                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21538     if (scalar_saved_bytes == 0) { return 0; }
21539     saved_bytes += scalar_saved_bytes;
21540   }
21541   return saved_bytes;
21542 }
21543 
21544 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21545   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
21546   if (!std::get<2>(ret)) {
21547     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
21548                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21549     scalar_res.count += (std::get<0>(ret) - buf);
21550     return scalar_res;
21551   }
21552   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21553   if (std::get<0>(ret) != buf + len) {
21554     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
21555                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21556     if (scalar_res.error) {
21557       scalar_res.count += (std::get<0>(ret) - buf);
21558       return scalar_res;
21559     } else {
21560       scalar_res.count += saved_bytes;
21561       return scalar_res;
21562     }
21563   }
21564   return simdutf::result(simdutf::SUCCESS, saved_bytes);
21565 }
21566 
21567 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21568   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
21569   if (!std::get<2>(ret)) {
21570     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
21571                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21572     scalar_res.count += (std::get<0>(ret) - buf);
21573     return scalar_res;
21574   }
21575   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21576   if (std::get<0>(ret) != buf + len) {
21577     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
21578                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21579     if (scalar_res.error) {
21580       scalar_res.count += (std::get<0>(ret) - buf);
21581       return scalar_res;
21582     } else {
21583       scalar_res.count += saved_bytes;
21584       return scalar_res;
21585     }
21586   }
21587   return simdutf::result(simdutf::SUCCESS, saved_bytes);
21588 }
21589 
21590 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21591   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
21592   if (!std::get<2>(ret)) { return 0; }
21593   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21594   if (std::get<0>(ret) != buf + len) {
21595     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
21596                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21597     if (scalar_saved_bytes == 0) { return 0; }
21598     saved_bytes += scalar_saved_bytes;
21599   }
21600   return saved_bytes;
21601 }
21602 
21603 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
21604   std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
21605   if (!std::get<2>(ret)) { return 0; }
21606   size_t saved_bytes = std::get<1>(ret) - utf32_output;
21607   if (std::get<0>(ret) != buf + len) {
21608     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
21609                                         std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
21610     if (scalar_saved_bytes == 0) { return 0; }
21611     saved_bytes += scalar_saved_bytes;
21612   }
21613   return saved_bytes;
21614 }
21615 
21616 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
21617   size_t pos = 0;
21618   const __m512i byteflip = _mm512_setr_epi64(
21619             0x0607040502030001,
21620             0x0e0f0c0d0a0b0809,
21621             0x0607040502030001,
21622             0x0e0f0c0d0a0b0809,
21623             0x0607040502030001,
21624             0x0e0f0c0d0a0b0809,
21625             0x0607040502030001,
21626             0x0e0f0c0d0a0b0809
21627         );
21628   while (pos + 32 <= length) {
21629     __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
21630     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
21631     _mm512_storeu_si512(output + pos, utf16);
21632     pos += 32;
21633   }
21634   if(pos < length) {
21635     __mmask32 m((1<< (length - pos))-1);
21636     __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
21637     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
21638     _mm512_mask_storeu_epi16(output + pos, m, utf16);
21639   }
21640 }
21641 
21642 
21643 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
21644   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21645   const char16_t* ptr = input;
21646 
21647   const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
21648   const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
21649 
21650   size_t count{0};
21651 
21652   while (ptr <= end) {
21653     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
21654     ptr += 32;
21655     uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
21656     count += count_ones(not_high_surrogate);
21657   }
21658 
21659   return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
21660 }
21661 
21662 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
21663   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21664   const char16_t* ptr = input;
21665 
21666   const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
21667   const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
21668 
21669   size_t count{0};
21670   const __m512i byteflip = _mm512_setr_epi64(
21671             0x0607040502030001,
21672             0x0e0f0c0d0a0b0809,
21673             0x0607040502030001,
21674             0x0e0f0c0d0a0b0809,
21675             0x0607040502030001,
21676             0x0e0f0c0d0a0b0809,
21677             0x0607040502030001,
21678             0x0e0f0c0d0a0b0809
21679         );
21680   while (ptr <= end) {
21681     __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
21682     ptr += 32;
21683     uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
21684     count += count_ones(not_high_surrogate);
21685   }
21686 
21687   return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
21688 }
21689 
21690 
21691 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
21692   const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
21693   size_t answer =  length / sizeof(__m512i) * sizeof(__m512i); // Number of 512-bit chunks that fits into the length.
21694   size_t i = 0;
21695   __m512i unrolled_popcount{0};
21696 
21697   const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
21698 
21699   while (i + sizeof(__m512i) <= length) {
21700     size_t iterations = (length - i) / sizeof(__m512i);
21701 
21702     size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
21703     for (; i + 8*sizeof(__m512i) <= max_i; i += 8*sizeof(__m512i)) {
21704         __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
21705         __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
21706         __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
21707         __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));
21708         __m512i input5 = _mm512_loadu_si512((const __m512i *)(str + i + 4*sizeof(__m512i)));
21709         __m512i input6 = _mm512_loadu_si512((const __m512i *)(str + i + 5*sizeof(__m512i)));
21710         __m512i input7 = _mm512_loadu_si512((const __m512i *)(str + i + 6*sizeof(__m512i)));
21711         __m512i input8 = _mm512_loadu_si512((const __m512i *)(str + i + 7*sizeof(__m512i)));
21712 
21713 
21714         __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation);
21715         __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
21716         __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
21717         __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
21718         __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
21719         __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
21720         __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
21721         __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);
21722 
21723         __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, mask4, mask3, mask2, mask1);
21724 
21725 
21726         unrolled_popcount = _mm512_add_epi64(unrolled_popcount, _mm512_popcnt_epi64(mask_register));
21727     }
21728 
21729     for (; i <= max_i; i += sizeof(__m512i)) {
21730       __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
21731       uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(more_input, continuation));
21732       answer -= count_ones(continuation_bitmask);
21733     }
21734   }
21735 
21736   __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0);
21737   __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1);
21738   answer -= (size_t)_mm256_extract_epi64(first_half, 0) +
21739             (size_t)_mm256_extract_epi64(first_half, 1) +
21740             (size_t)_mm256_extract_epi64(first_half, 2) +
21741             (size_t)_mm256_extract_epi64(first_half, 3) +
21742             (size_t)_mm256_extract_epi64(second_half, 0) +
21743             (size_t)_mm256_extract_epi64(second_half, 1) +
21744             (size_t)_mm256_extract_epi64(second_half, 2) +
21745             (size_t)_mm256_extract_epi64(second_half, 3);
21746 
21747   return answer + scalar::utf8::count_code_points(reinterpret_cast<const char *>(str + i), length - i);
21748 }
21749 
21750 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
21751   return count_utf8(buf,len);
21752 }
21753 
21754 simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
21755   return scalar::utf16::latin1_length_from_utf16(length);
21756 }
21757 
21758 simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept {
21759   return scalar::utf32::latin1_length_from_utf32(length);
21760 }
21761 
21762 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
21763   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21764   const char16_t* ptr = input;
21765 
21766   const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
21767   const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
21768   const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
21769   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
21770 
21771   size_t count{0};
21772 
21773   while (ptr <= end) {
21774     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
21775     ptr += 32;
21776     __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
21777     __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
21778     __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
21779     __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
21780 
21781     size_t ascii_count = count_ones(ascii_bitmask);
21782     size_t two_bytes_count = count_ones(two_bytes_bitmask);
21783     size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
21784     size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
21785 
21786     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
21787   }
21788 
21789   return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
21790 }
21791 
21792 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
21793   const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21794   const char16_t* ptr = input;
21795 
21796   const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
21797   const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
21798   const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
21799   const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
21800 
21801   size_t count{0};
21802   const __m512i byteflip = _mm512_setr_epi64(
21803             0x0607040502030001,
21804             0x0e0f0c0d0a0b0809,
21805             0x0607040502030001,
21806             0x0e0f0c0d0a0b0809,
21807             0x0607040502030001,
21808             0x0e0f0c0d0a0b0809,
21809             0x0607040502030001,
21810             0x0e0f0c0d0a0b0809
21811         );
21812   while (ptr <= end) {
21813     __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
21814     utf16 = _mm512_shuffle_epi8(utf16, byteflip);
21815     ptr += 32;
21816     __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
21817     __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
21818     __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
21819     __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
21820 
21821     size_t ascii_count = count_ones(ascii_bitmask);
21822     size_t two_bytes_count = count_ones(two_bytes_bitmask);
21823     size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
21824     size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
21825     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
21826   }
21827 
21828   return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
21829 }
21830 
21831 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
21832   return implementation::count_utf16le(input, length);
21833 }
21834 
21835 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
21836   return implementation::count_utf16be(input, length);
21837 }
21838 
21839 simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
21840   return scalar::latin1::utf16_length_from_latin1(length);
21841 }
21842 
21843 
21844 simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
21845   return scalar::latin1::utf32_length_from_latin1(length);
21846 }
21847 
// Computes how many UTF-8 bytes a Latin-1 input needs.
//
// The tally starts at the number of bytes covered by whole 64-byte blocks and
// gains one for every byte in those blocks whose signed value is negative
// (i.e. high bit set). Bytes past the last whole block are handled by the
// scalar routine.
simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  // Number of bytes covered by whole 64-byte blocks.
  size_t answer = length / sizeof(__m512i) * sizeof(__m512i);
  size_t i = 0;
  unsigned char v_0xFF = 0xff;
  // Eight 64-bit totals of high-bit bytes, accumulated across outer iterations.
  __m512i eight_64bits = _mm512_setzero_si512();
  while (i + sizeof(__m512i) <= length) {
    // 64 per-lane 8-bit counters, reset for every outer iteration.
    __m512i runner = _mm512_setzero_si512();
    size_t iterations = (length - i) / sizeof(__m512i);
    // Each block can bump a given 8-bit lane by at most one, so limiting a
    // batch to 255 blocks keeps the lanes from wrapping.
    if (iterations > 255) {
      iterations = 255;
    }
    // Offset of the last block in this batch.
    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
    for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) {
            // Load four __m512i vectors
            __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
            __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
            __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
            __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));

            // Generate four masks
            // (0 > byte as a signed compare: set for bytes with the high bit set)
            __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
            __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
            __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
            __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
            // Apply the masks and subtract from the runner
            // (selected lanes become 0xFF; subtracting 0xFF from an 8-bit lane adds one)
            __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
            __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
            __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
            __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);

            runner = _mm512_sub_epi8(runner, not_ascii1);
            runner = _mm512_sub_epi8(runner, not_ascii2);
            runner = _mm512_sub_epi8(runner, not_ascii3);
            runner = _mm512_sub_epi8(runner, not_ascii4);
    }

    // Remaining blocks of this batch, one at a time.
    for (; i <= max_i; i += sizeof(__m512i)) {
      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));

      __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
      __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
      runner = _mm512_sub_epi8(runner, not_ascii);
    }

    // Sum of absolute differences against zero adds up the 8-bit counters
    // within each 64-bit lane.
    eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
  }

  // Fold the eight 64-bit totals into the answer.
  __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
  __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
  answer += (size_t)_mm256_extract_epi64(first_half, 0) +
            (size_t)_mm256_extract_epi64(first_half, 1) +
            (size_t)_mm256_extract_epi64(first_half, 2) +
            (size_t)_mm256_extract_epi64(first_half, 3) +
            (size_t)_mm256_extract_epi64(second_half, 0) +
            (size_t)_mm256_extract_epi64(second_half, 1) +
            (size_t)_mm256_extract_epi64(second_half, 2) +
            (size_t)_mm256_extract_epi64(second_half, 3);
  // Scalar routine covers the bytes after the last whole block.
  return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), length - i);
}
21908 
21909 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
21910     size_t pos = 0;
21911     size_t count = 0;
21912     // This algorithm could no doubt be improved!
21913     for(;pos + 64 <= length; pos += 64) {
21914       __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
21915       uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
21916       // We count one word for anything that is not a continuation (so
21917       // leading bytes).
21918       count += 64 - count_ones(utf8_continuation_mask);
21919       uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
21920       count += count_ones(utf8_4byte);
21921     }
21922     return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
21923 }
21924 
21925 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
21926   const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
21927   const char32_t* ptr = input;
21928 
21929   const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
21930   const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
21931   const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
21932 
21933   size_t count{0};
21934 
21935   while (ptr <= end) {
21936     __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
21937     ptr += 16;
21938     __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
21939     __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
21940     __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
21941 
21942     size_t ascii_count = count_ones(ascii_bitmask);
21943     size_t two_bytes_count = count_ones(two_bytes_bitmask);
21944     size_t three_bytes_count = count_ones(three_bytes_bitmask);
21945     size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
21946     count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
21947   }
21948 
21949   return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
21950 }
21951 
21952 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
21953   const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
21954   const char32_t* ptr = input;
21955 
21956   const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
21957 
21958   size_t count{0};
21959 
21960   while (ptr <= end) {
21961     __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
21962     ptr += 16;
21963     __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
21964 
21965     count += 16 + count_ones(surrogates_bitmask);
21966   }
21967 
21968   return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
21969 }
21970 
21971 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
21972   return implementation::count_utf8(input, length);
21973 }
21974 
21975 } // namespace icelake
21976 } // namespace simdutf
21977 
21978 /* begin file src/simdutf/icelake/end.h */
21979 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
21980 // nothing needed.
21981 #else
21982 SIMDUTF_UNTARGET_REGION
21983 #endif
21984 
21985 
21986 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
21987 SIMDUTF_POP_DISABLE_WARNINGS
21988 #endif // end of workaround
21989 /* end file src/simdutf/icelake/end.h */
21990 /* end file src/icelake/implementation.cpp */
21991 #endif
21992 #if SIMDUTF_IMPLEMENTATION_HASWELL
21993 /* begin file src/haswell/implementation.cpp */
21994 
21995 /* begin file src/simdutf/haswell/begin.h */
21996 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
21997 // #define SIMDUTF_IMPLEMENTATION haswell
21998 
21999 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
22000 // nothing needed.
22001 #else
22002 SIMDUTF_TARGET_HASWELL
22003 #endif
22004 
22005 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
22006 SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
22007 #endif // end of workaround
22008 /* end file src/simdutf/haswell/begin.h */
22009 namespace simdutf {
22010 namespace haswell {
22011 namespace {
22012 #ifndef SIMDUTF_HASWELL_H
22013 #error "haswell.h must be included"
22014 #endif
22015 using namespace simd;
22016 
22017 
22018 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
22019   return input.reduce_or().is_ascii();
22020 }
22021 
22022 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
22023   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
22024   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
22025   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
22026   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
22027   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
22028 }
22029 
22030 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
22031   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
22032   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
22033   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
22034   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
22035 }
22036 
22037 /* begin file src/haswell/avx2_detect_encodings.cpp */
/*
    Reports which of UTF-8, UTF-16LE and UTF-32LE the buffer validates as.

    The buffer is scanned 64 bytes at a time. As long as no 16-bit unit in the
    range 0xD800..0xDFFF shows up, each block feeds both the UTF-8 checker and
    a running maximum used for the UTF-32 range check. The first block that
    contains such a unit rules out UTF-8 and sends the function into a
    dedicated UTF-16 or UTF-32 validation path, after which the main loop is
    left.

    Template parameter:
    - checker: UTF-8 validator; it is value-initialized here and must provide
      check_next_input(simd8x64<uint8_t>) and errors().

    Parameters:
    - buf: start of the input.
    - len: input length in bytes (the caller guarantees a multiple of 2).

    Returns the bitwise OR of the simdutf::encoding_type flags (UTF8, UTF16_LE,
    UTF32_LE) of every encoding that passed, possibly 0, or
    simdutf::encoding_type::unspecified as soon as the surrogate path finds the
    input invalid.
*/
template<class checker>
// len is known to be a multiple of 2 when this is called
int avx2_detect_encodings(const char * buf, size_t len) {
    const char* start = buf;
    const char* end = buf + len;

    // Candidate encodings; a flag is cleared once the encoding is ruled out.
    bool is_utf8 = true;
    bool is_utf16 = true;
    bool is_utf32 = true;

    int out = 0;

    // (high byte & 0xf8) == 0xd8 identifies a 16-bit unit in 0xD800..0xDFFF.
    const auto v_d8 = simd8<uint8_t>::splat(0xd8);
    const auto v_f8 = simd8<uint8_t>::splat(0xf8);

    // Lane-wise unsigned maximum of the 32-bit words seen so far (UTF-32 range check).
    __m256i currentmax = _mm256_setzero_si256();

    checker check{};

    // Main loop: one 64-byte block (two 256-bit vectors) per iteration.
    while(buf + 64 <= end) {
        __m256i in = _mm256_loadu_si256((__m256i*)buf);
        __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);

        const auto u0 = simd16<uint16_t>(in);
        const auto u1 = simd16<uint16_t>(nextin);

        // Keep only the high byte of every 16-bit unit...
        const auto v0 = u0.shr<8>();
        const auto v1 = u1.shr<8>();

        // ...and gather the 32 high bytes into a single byte vector.
        const auto in16 = simd16<uint16_t>::pack(v0, v1);

        const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
        uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask();

        // Check for surrogates
        if (surrogates_bitmask0 != 0x0) {
            // Cannot be UTF8
            is_utf8 = false;
            // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
            // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
            // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
            // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
            // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.

            // 0xaaaaaaaa keeps the odd-numbered mask bits, which per the note above
            // are taken to be the upper 16-bit half of each 32-bit word.
            if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
                is_utf32 = false;
                // Code from avx2_validate_utf16le.cpp
                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;

                // (high byte & 0xfc) == 0xdc identifies a unit in 0xDC00..0xDFFF.
                const auto v_fc = simd8<uint8_t>::splat(0xfc);
                const auto v_dc = simd8<uint8_t>::splat(0xdc);

                // V0: units that are not surrogates.
                const uint32_t V0 = ~surrogates_bitmask0;

                // H0: units in 0xDC00..0xDFFF.
                const auto    vH0 = (in16 & v_fc) == v_dc;
                const uint32_t H0 = vH0.to_bitmask();

                // L0: the remaining surrogates, i.e. units in 0xD800..0xDBFF.
                const uint32_t L0 = ~H0 & surrogates_bitmask0;

                // a0: L units immediately followed by an H unit; b0: the H units of those pairs.
                const uint32_t a0 = L0 & (H0 >> 1);
                const uint32_t b0 = a0 << 1;
                // c0: every unit that is either a non-surrogate or part of a proper pair.
                const uint32_t c0 = V0 | a0 | b0;

                if (c0 == 0xffffffff) {
                    // All 32 units of the current block are accounted for.
                    input += simd16<uint16_t>::ELEMENTS * 2;
                } else if (c0 == 0x7fffffff) {
                    // Only the last unit is unresolved: consume 31 units so that
                    // it is examined again at the start of the next block.
                    input += simd16<uint16_t>::ELEMENTS * 2 - 1;
                } else {
                    return simdutf::encoding_type::unspecified;
                }

                // Validate the following blocks as UTF-16LE with the same V/L/H scheme.
                while (input + simd16<uint16_t>::ELEMENTS * 2 < end16) {
                    const auto in0 = simd16<uint16_t>(input);
                    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);

                    const auto t0 = in0.shr<8>();
                    const auto t1 = in1.shr<8>();

                    const auto in_16 = simd16<uint16_t>::pack(t0, t1);

                    const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
                    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
                    if (surrogates_bitmask == 0x0) {
                        // No surrogate in this block: nothing to verify.
                        input += simd16<uint16_t>::ELEMENTS * 2;
                    } else {
                        const uint32_t V = ~surrogates_bitmask;

                        const auto    vH = (in_16 & v_fc) == v_dc;
                        const uint32_t H = vH.to_bitmask();

                        const uint32_t L = ~H & surrogates_bitmask;

                        const uint32_t a = L & (H >> 1);

                        const uint32_t b = a << 1;

                        const uint32_t c = V | a | b;

                        if (c == 0xffffffff) {
                            input += simd16<uint16_t>::ELEMENTS * 2;
                        } else if (c == 0x7fffffff) {
                            input += simd16<uint16_t>::ELEMENTS * 2 - 1;
                        } else {
                            return simdutf::encoding_type::unspecified;
                        }
                    }
                }
            } else {
                is_utf16 = false;
                // Check for UTF-32
                if (len % 4 == 0) {
                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;

                    // Must start checking for surrogates
                    // Adding 0xffff2000 (mod 2^32) maps 0xD800..0xDFFF onto 0xfffff800..0xffffffff,
                    // so a surrogate is present exactly when a shifted lane exceeds 0xfffff7ff.
                    __m256i currentoffsetmax = _mm256_setzero_si256();
                    const __m256i offset = _mm256_set1_epi32(0xffff2000);
                    const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);

                    // Fold the two vectors of the current block into both running maxima.
                    currentmax = _mm256_max_epu32(in, currentmax);
                    currentmax = _mm256_max_epu32(nextin, currentmax);

                    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
                    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);

                    // input still points at the current block, so its 64 bytes are read a
                    // second time here; a maximum is unaffected by seeing a value twice.
                    while (input + 8 < end32) {
                        const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
                        currentmax = _mm256_max_epu32(in32,currentmax);
                        currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
                        input += 8;
                    }

                    // Non-zero exactly in the lanes whose shifted maximum exceeds standardoffsetmax.
                    __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
                    if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
                        return simdutf::encoding_type::unspecified;
                    }
                } else {
                    return simdutf::encoding_type::unspecified;
                }
            }
            // NOTE(review): buf is not advanced on this path, so the scalar checks
            // after the loop start at the beginning of the current block and go over
            // data the vector code above has already examined.
            break;
        }
        // If no surrogate, validate under other encodings as well

        // UTF-32 validation
        currentmax = _mm256_max_epu32(in, currentmax);
        currentmax = _mm256_max_epu32(nextin, currentmax);

        // UTF-8 validation
        // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
        simd::simd8x64<uint8_t> in8(in, nextin);
        check.check_next_input(in8);

        buf += 64;
    }

    // Check which encodings are possible

    if (is_utf8) {
        if (static_cast<size_t>(buf - start) != len) {
            // Fewer than 64 bytes are left: copy them into a block padded with
            // 0x20 (ASCII space) and run the checker on that block.
            uint8_t block[64]{};
            std::memset(block, 0x20, 64);
            std::memcpy(block, buf, len - (buf - start));
            simd::simd8x64<uint8_t> in(block);
            check.check_next_input(in);
        }
        if (!check.errors()) {
            out |= simdutf::encoding_type::UTF8;
        }
    }

    // Scalar UTF-16LE validation of everything from buf to the end of the input.
    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
        out |= simdutf::encoding_type::UTF16_LE;
    }

    if (is_utf32 && (len % 4 == 0)) {
        // The vector-processed words must not exceed 0x10ffff (is_zero is all zero
        // in that case) and the words from buf onwards must pass the scalar validator.
        const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
        __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
            out |= simdutf::encoding_type::UTF32_LE;
        }
    }

    return out;
}
22224 /* end file src/haswell/avx2_detect_encodings.cpp */
22225 
22226 /* begin file src/haswell/avx2_validate_utf16.cpp */
22227 /*
22228     In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
22229 
    In a vectorized algorithm we want to examine the most significant
    nibble in order to select a fast path. If none of the highest nibbles
    is 0xD (13), then we can be sure that the UTF-16 chunk in a vector
    register is valid.
22234 
22235     Let us analyze what we need to check if the nibble is 0xD. The
22236     value of the preceding nibble determines what we have:
22237 
22238     0xd000 .. 0xd7ff - a valid word
22239     0xd800 .. 0xdbff - low surrogate
22240     0xdc00 .. 0xdfff - high surrogate
22241 
22242     Other constraints we have to consider:
22243     - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
22244     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
22245     - there must not be sole low surrogate nor high surrogate
22246 
22247     We're going to build three bitmasks based on the 3rd nibble:
22248     - V = valid word,
22249     - L = low surrogate (0xd800 .. 0xdbff)
22250     - H = high surrogate (0xdc00 .. 0xdfff)
22251 
22252       0   1   2   3   4   5   6   7    <--- word index
22253     [ V | L | H | L | H | V | V | L ]
22254       1   0   0   0   0   1   1   0     - V = valid masks
22255       0   1   0   1   0   0   0   1     - L = low surrogate
22256       0   0   1   0   1   0   0   0     - H high surrogate
22257 
22258 
22259       1   0   0   0   0   1   1   0   V = valid masks
22260       0   1   0   1   0   0   0   0   a = L & (H >> 1)
22261       0   0   1   0   1   0   0   0   b = a << 1
22262       1   1   1   1   1   1   1   0   c = V | a | b
22263                                   ^
22264                                   the last bit can be zero, we just consume 7 code units
22265                                   and recheck this word in the next iteration
22266 */
22267 
22268 /* Returns:
22269    - pointer to the last unprocessed character (a scalar fallback should check the rest);
22270    - nullptr if an error was detected.
22271 */
22272 template <endianness big_endian>
22273 const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
22274     const char16_t* end = input + size;
22275 
22276     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
22277     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
22278     const auto v_fc = simd8<uint8_t>::splat(0xfc);
22279     const auto v_dc = simd8<uint8_t>::splat(0xdc);
22280 
22281     while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
22282         // 0. Load data: since the validation takes into account only higher
22283         //    byte of each word, we compress the two vectors into one which
22284         //    consists only the higher bytes.
22285         auto in0 = simd16<uint16_t>(input);
22286         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
22287 
22288         if (big_endian) {
22289             in0 = in0.swap_bytes();
22290             in1 = in1.swap_bytes();
22291         }
22292 
22293         const auto t0 = in0.shr<8>();
22294         const auto t1 = in1.shr<8>();
22295 
22296         const auto in = simd16<uint16_t>::pack(t0, t1);
22297 
22298         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22299         const auto surrogates_wordmask = (in & v_f8) == v_d8;
22300         const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
22301         if (surrogates_bitmask == 0x0) {
22302             input += simd16<uint16_t>::ELEMENTS * 2;
22303         } else {
22304             // 2. We have some surrogates that have to be distinguished:
22305             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
22306             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
22307             //
22308             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22309 
22310             // V - non-surrogate code units
22311             //     V = not surrogates_wordmask
22312             const uint32_t V = ~surrogates_bitmask;
22313 
22314             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22315             const auto    vH = (in & v_fc) == v_dc;
22316             const uint32_t H = vH.to_bitmask();
22317 
22318             // L - word mask for low surrogates
22319             //     L = not H and surrogates_wordmask
22320             const uint32_t L = ~H & surrogates_bitmask;
22321 
22322             const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
22323                                               // (A low surrogate placed in the 7th register's word
22324                                               // is an exception we handle.)
22325             const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
22326                                               // thanks to that we have only two masks for valid case.
22327             const uint32_t c = V | a | b;     // Combine all the masks into the final one.
22328 
22329             if (c == 0xffffffff) {
22330                 // The whole input register contains valid UTF-16, i.e.,
22331                 // either single code units or proper surrogate pairs.
22332                 input += simd16<uint16_t>::ELEMENTS * 2;
22333             } else if (c == 0x7fffffff) {
22334                 // The 31 lower code units of the input register contains valid UTF-16.
22335                 // The 31 word may be either a low or high surrogate. It the next
22336                 // iteration we 1) check if the low surrogate is followed by a high
22337                 // one, 2) reject sole high surrogate.
22338                 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22339             } else {
22340                 return nullptr;
22341             }
22342         }
22343     }
22344 
22345     return input;
22346 }
22347 
22348 
22349 template <endianness big_endian>
22350 const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
22351     const char16_t* start = input;
22352     const char16_t* end = input + size;
22353 
22354     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
22355     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
22356     const auto v_fc = simd8<uint8_t>::splat(0xfc);
22357     const auto v_dc = simd8<uint8_t>::splat(0xdc);
22358 
22359     while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
22360         // 0. Load data: since the validation takes into account only higher
22361         //    byte of each word, we compress the two vectors into one which
22362         //    consists only the higher bytes.
22363         auto in0 = simd16<uint16_t>(input);
22364         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
22365 
22366         if (big_endian) {
22367             in0 = in0.swap_bytes();
22368             in1 = in1.swap_bytes();
22369         }
22370 
22371         const auto t0 = in0.shr<8>();
22372         const auto t1 = in1.shr<8>();
22373 
22374         const auto in = simd16<uint16_t>::pack(t0, t1);
22375 
22376         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22377         const auto surrogates_wordmask = (in & v_f8) == v_d8;
22378         const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
22379         if (surrogates_bitmask == 0x0) {
22380             input += simd16<uint16_t>::ELEMENTS * 2;
22381         } else {
22382             // 2. We have some surrogates that have to be distinguished:
22383             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
22384             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
22385             //
22386             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22387 
22388             // V - non-surrogate code units
22389             //     V = not surrogates_wordmask
22390             const uint32_t V = ~surrogates_bitmask;
22391 
22392             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22393             const auto    vH = (in & v_fc) == v_dc;
22394             const uint32_t H = vH.to_bitmask();
22395 
22396             // L - word mask for low surrogates
22397             //     L = not H and surrogates_wordmask
22398             const uint32_t L = ~H & surrogates_bitmask;
22399 
22400             const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
22401                                               // (A low surrogate placed in the 7th register's word
22402                                               // is an exception we handle.)
22403             const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
22404                                               // thanks to that we have only two masks for valid case.
22405             const uint32_t c = V | a | b;     // Combine all the masks into the final one.
22406 
22407             if (c == 0xffffffff) {
22408                 // The whole input register contains valid UTF-16, i.e.,
22409                 // either single code units or proper surrogate pairs.
22410                 input += simd16<uint16_t>::ELEMENTS * 2;
22411             } else if (c == 0x7fffffff) {
22412                 // The 31 lower code units of the input register contains valid UTF-16.
22413                 // The 31 word may be either a low or high surrogate. It the next
22414                 // iteration we 1) check if the low surrogate is followed by a high
22415                 // one, 2) reject sole high surrogate.
22416                 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22417             } else {
22418                 return result(error_code::SURROGATE, input - start);
22419             }
22420         }
22421     }
22422 
22423     return result(error_code::SUCCESS, input - start);
22424 }
22425 /* end file src/haswell/avx2_validate_utf16.cpp */
22426 /* begin file src/haswell/avx2_validate_utf32le.cpp */
22427 /* Returns:
22428    - pointer to the last unprocessed character (a scalar fallback should check the rest);
22429    - nullptr if an error was detected.
22430 */
22431 const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
22432     const char32_t* end = input + size;
22433 
22434     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
22435     const __m256i offset = _mm256_set1_epi32(0xffff2000);
22436     const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
22437     __m256i currentmax = _mm256_setzero_si256();
22438     __m256i currentoffsetmax = _mm256_setzero_si256();
22439 
22440     while (input + 8 < end) {
22441         const __m256i in = _mm256_loadu_si256((__m256i *)input);
22442         currentmax = _mm256_max_epu32(in,currentmax);
22443         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
22444         input += 8;
22445     }
22446     __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
22447     if(_mm256_testz_si256(is_zero, is_zero) == 0) {
22448         return nullptr;
22449     }
22450 
22451     is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
22452     if(_mm256_testz_si256(is_zero, is_zero) == 0) {
22453         return nullptr;
22454     }
22455 
22456     return input;
22457 }
22458 
22459 
22460 const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
22461     const char32_t* start = input;
22462     const char32_t* end = input + size;
22463 
22464     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
22465     const __m256i offset = _mm256_set1_epi32(0xffff2000);
22466     const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
22467     __m256i currentmax = _mm256_setzero_si256();
22468     __m256i currentoffsetmax = _mm256_setzero_si256();
22469 
22470     while (input + 8 < end) {
22471         const __m256i in = _mm256_loadu_si256((__m256i *)input);
22472         currentmax = _mm256_max_epu32(in,currentmax);
22473         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
22474 
22475         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
22476         if(_mm256_testz_si256(is_zero, is_zero) == 0) {
22477             return result(error_code::TOO_LARGE, input - start);
22478         }
22479 
22480         is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
22481         if(_mm256_testz_si256(is_zero, is_zero) == 0) {
22482             return result(error_code::SURROGATE, input - start);
22483         }
22484         input += 8;
22485     }
22486 
22487     return result(error_code::SUCCESS, input - start);
22488 }
22489 /* end file src/haswell/avx2_validate_utf32le.cpp */
22490 
22491 /* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
22492 std::pair<const char *, char *> avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
22493                            char *utf8_output) {
22494   const char *end = latin1_input + len;
22495   const __m256i v_0000 = _mm256_setzero_si256();
22496   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
22497   const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
22498   const size_t safety_margin = 12;
22499 
22500   while (latin1_input + 16 + safety_margin <= end) {
22501     __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
22502     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
22503     const __m128i v_80 = _mm_set1_epi8((char)0x80);
22504     if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
22505       // 1. store (16 bytes)
22506       _mm_storeu_si128((__m128i *)utf8_output, in8);
22507       // 2. adjust pointers
22508       latin1_input += 16;
22509       utf8_output += 16;
22510       continue; // we are done for this round!
22511     }
22512     // We proceed only with the first 16 bytes.
22513     const __m256i in = _mm256_cvtepu8_epi16((in8));
22514 
22515     // 1. prepare 2-byte values
22516     // input 16-bit word : [0000|0000|aabb|bbbb] x 8
22517     // expected output   : [1100|00aa|10bb|bbbb] x 8
22518     const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
22519     const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
22520 
22521     // t0 = [0000|00aa|bbbb|bb00]
22522     const __m256i t0 = _mm256_slli_epi16(in, 2);
22523     // t1 = [0000|00aa|0000|0000]
22524     const __m256i t1 = _mm256_and_si256(t0, v_1f00);
22525     // t2 = [0000|0000|00bb|bbbb]
22526     const __m256i t2 = _mm256_and_si256(in, v_003f);
22527     // t3 = [000a|aaaa|00bb|bbbb]
22528     const __m256i t3 = _mm256_or_si256(t1, t2);
22529     // t4 = [1100|00aa|10bb|bbbb]
22530     const __m256i t4 = _mm256_or_si256(t3, v_c080);
22531 
22532     // 2. merge ASCII and 2-byte codewords
22533 
22534     // no bits set above 7th bit
22535     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
22536     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
22537 
22538     const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
22539 
22540     // 3. prepare bitmask for 8-bit lookup
22541     const uint32_t M0 = one_byte_bitmask & 0x55555555;
22542     const uint32_t M1 = M0 >> 7;
22543     const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
22544     // 4. pack the bytes
22545 
22546     const uint8_t *row =
22547         &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
22548     const uint8_t *row_2 =
22549         &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
22550                                                             [0];
22551 
22552     const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
22553     const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
22554 
22555     const __m256i utf8_packed = _mm256_shuffle_epi8(
22556         utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
22557     // 5. store bytes
22558     _mm_storeu_si128((__m128i *)utf8_output,
22559                      _mm256_castsi256_si128(utf8_packed));
22560     utf8_output += row[0];
22561     _mm_storeu_si128((__m128i *)utf8_output,
22562                      _mm256_extractf128_si256(utf8_packed, 1));
22563     utf8_output += row_2[0];
22564 
22565     // 6. adjust pointers
22566     latin1_input += 16;
22567     continue;
22568 
22569   } // while
22570   return std::make_pair(latin1_input, utf8_output);
22571 }
22572 /* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */
22573 /* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */
22574 template <endianness big_endian>
22575 std::pair<const char*, char16_t*> avx2_convert_latin1_to_utf16(const char* latin1_input, size_t len, char16_t* utf16_output) {
22576     size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32
22577 
22578     size_t i = 0;
22579     for (; i < rounded_len; i += 16) {
22580         // Load 16 bytes from the address (input + i) into a xmm register
22581         __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(latin1_input + i));
22582 
22583         // Zero extend each byte in xmm0 to word and put it in another xmm register
22584         __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
22585 
22586         // Shift xmm0 to the right by 8 bytes
22587         xmm0 = _mm_srli_si128(xmm0, 8);
22588 
22589         // Zero extend each byte in the shifted xmm0 to word in xmm0
22590         xmm0 = _mm_cvtepu8_epi16(xmm0);
22591 
22592         if (big_endian) {
22593             const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
22594             xmm0 = _mm_shuffle_epi8(xmm0, swap);
22595             xmm1 = _mm_shuffle_epi8(xmm1, swap);
22596         }
22597 
22598         // Store the contents of xmm1 into the address pointed by (output + i)
22599         _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i), xmm1);
22600 
22601         // Store the contents of xmm0 into the address pointed by (output + i + 8)
22602         _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i + 8), xmm0);
22603     }
22604 
22605     return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
22606 
22607 }
22608 /* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */
22609 /* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
22610 std::pair<const char*, char32_t*> avx2_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
22611     size_t rounded_len = ((len | 7) ^ 7);  // Round down to nearest multiple of 8
22612 
22613     for (size_t i = 0; i < rounded_len; i += 8) {
22614         // Load 8 Latin1 characters into a 64-bit register
22615         __m128i in = _mm_loadl_epi64((__m128i*)&buf[i]);
22616 
22617         // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using vpmovzxbd
22618         __m256i out = _mm256_cvtepu8_epi32(in);
22619 
22620         // Store the results back to memory
22621         _mm256_storeu_si256((__m256i*)&utf32_output[i], out);
22622     }
22623 
22624     // return pointers pointing to where we left off
22625     return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
22626 }
22627 
22628 /* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */
22629 
22630 /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
22631 // depends on "tables/utf8_to_utf16_tables.h"
22632 
22633 
// Convert one chunk of UTF-8 to UTF-16 using a mask indicating the end of the
// code points: the fast-path constants below (0xffff, 0xaaaa, 0x924) show that
// bit i of the mask is set when input[i] is the last byte of a code point.
//
// Template parameter:
//   big_endian - when it evaluates to true, the two bytes of every produced
//                16-bit code unit are swapped before being written.
// Parameters:
//   input                       - UTF-8 bytes; 16 bytes are always loaded,
//                                 regardless of how many are consumed.
//   utf8_end_of_code_point_mask - the fast paths inspect the low 16 bits; the
//                                 table-driven path only uses the low 12 bits.
//   utf16_output                - output cursor (passed by reference), advanced
//                                 by the number of code units produced. Several
//                                 paths store a whole vector and therefore write
//                                 more bytes than the cursor advance covers.
// It returns how many input bytes were consumed: 16 on the all-ASCII and
// all-two-byte fast paths, 12 on the all-three-byte fast path, and otherwise
// the value stored in the lookup table.
// Malformed input is not reported by this function (see the last branch).
template <endianness big_endian>
size_t convert_masked_utf8_to_utf16(const char *input,
                           uint64_t utf8_end_of_code_point_mask,
                           char16_t *&utf16_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
  // beneficial to have fast paths that depend on branch prediction but have less latency.
  // This results in more instructions but, potentially, also higher speeds.
  //
  // We first try a few fast paths.
  // Shuffle mask exchanging the two bytes of each 16-bit lane (big-endian output).
  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  // 12-bit key used both for the three-byte fast path and for the table lookup.
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  // Fast path 1: every one of the 16 bytes ends a code point, i.e. pure ASCII.
  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
    // We process the data in chunks of 16 bytes.
    // Zero-extend each byte to a 16-bit code unit.
    __m256i ascii = _mm256_cvtepu8_epi16(in);
    if (big_endian) {
      const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      ascii = _mm256_shuffle_epi8(ascii, swap256);
    }
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
    utf16_output += 16; // We wrote 16 16-bit characters.
    return 16; // We consumed 16 bytes.
  }
  // Fast path 2: every odd byte ends a code point, i.e. eight two-byte sequences.
  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units.
    // There is probably a more efficient sequence, but the following might do.
    // Swap each byte pair so the leading byte lands in the high half of the lane.
    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    // Low byte of each lane masked with 0x7f.
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    // Five payload bits of the leading byte, moved down next to the low bits.
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  // Fast path 3: bits 2, 5, 8 and 11 are set, i.e. four three-byte sequences.
  if(input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units.
    // There is probably a more efficient sequence, but the following might do.
    // Reverse each 3-byte group into a 32-bit lane; -1 zeroes the top byte.
    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    // Narrow the four 32-bit lanes to 16 bits (the result is duplicated in both halves).
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    // A full 16-byte store; only the first 8 bytes (4 code units) are kept.
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4;
    return 12;
  }

  // No fast path applies: the 12-bit mask selects a table entry holding a
  // shuffle index ([0]) and the number of input bytes to consume ([1]).
  const uint8_t idx =
      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code-code units
    // this is a relatively easy scenario
    // we process SIX (6) input code-code units. The max length in bytes of six code
    // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
    // where pdep/pext is fast, we might be able to use a small lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed);
    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // Same bit arithmetic as the three-byte fast path above, but the shuffle
    // comes from the table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
    utf16_output += 4; // Here we overflow by 8 bytes.
  } else if (idx < 209) {
    // TWO (2) input code-code units
    // (note that the scalar loop at the end of this branch emits three lanes)
    //////////////
    // There might be garbage inputs where a leading byte masquerades as a four-byte
    // leading byte (by being followed by 3 continuation byte), but is not greater than
    // 0xf0. This could trigger a buffer overflow if we only counted leading
    // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
    // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
    // We do so at the cost of an extra mask.
    /////////////
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    // (bit 22 of each lane is moved down one position and XORed into bit 21;
    // presumably this clears the stray bit left when the third byte of the
    // lane is a 1110xxxx leading byte rather than a continuation byte)
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    // We deliberately carry the leading four bits in highbyte if they are present,
    // we remove them later when computing hightenbits.
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    // When we need to generate a surrogate pair (four-byte sequence, leading
    // byte of the 0xF0 family), the corresponding 32-bit value in 'composed'
    // will be greater than (0xf0000000 >> 6), i.e. > 0x3c00000. This is used
    // below to identify the location of the surrogate pairs.
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    // Surrogate encoding: subtract 0x10000, then split into two 10-bit halves.
    const __m128i composedminus =
        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
    const __m128i lowtenbits =
        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
    // Notice the 0x3ff mask:
    // it drops the leading-byte bits that were deliberately carried in highbyte.
    const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
    const __m128i lowtenbitsadd =
        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
    const __m128i hightenbitsadd =
        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
    // Pack the pair into one 32-bit lane: high surrogate in the low 16 bits,
    // low surrogate in the high 16 bits.
    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
    __m128i surrogates =
        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
    uint32_t basic_buffer[4];
    // Only written, and only read, when big_endian is set.
    uint32_t basic_buffer_swap[4];
    if (big_endian) {
      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
      surrogates = _mm_shuffle_epi8(surrogates, swap);
    }
    // basic_buffer always holds the unswapped values: the threshold test
    // below must be done on the native-order numbers.
    _mm_storeu_si128((__m128i *)basic_buffer, composed);
    uint32_t surrogate_buffer[4];
    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
    // Emit lanes 0..2 one at a time: two code units for a surrogate pair,
    // one code unit otherwise.
    for (size_t i = 0; i < 3; i++) {
      if(basic_buffer[i] > 0x3c00000) {
        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
        utf16_output += 2;
      } else  {
        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
        utf16_output++;
      }
    }
  } else {
    // here we know that there is an error but we do not handle errors
    // (nothing is written; the table's 'consumed' value is still returned)
  }
  return consumed;
}
22811 /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
22812 /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
22813 // depends on "tables/utf8_to_utf16_tables.h"
22814 
22815 
// Convert one chunk of UTF-8 to UTF-32 using a mask indicating the end of the
// code points: the fast-path constants below (0xffff, 0xaaaa, 0x924) show that
// bit i of the mask is set when input[i] is the last byte of a code point.
//
// Parameters:
//   input                       - UTF-8 bytes; 16 bytes are always loaded,
//                                 regardless of how many are consumed.
//   utf8_end_of_code_point_mask - the fast paths inspect the low 16 bits; the
//                                 table-driven path only uses the low 12 bits.
//   utf32_output                - output cursor (passed by reference), advanced
//                                 by the number of code points produced. Some
//                                 paths store a whole vector and therefore write
//                                 more bytes than the cursor advance covers.
// It returns how many input bytes were consumed: 16 on the all-ASCII and
// all-two-byte fast paths, 12 on the all-three-byte fast path, and otherwise
// the value stored in the lookup table.
// Malformed input is not reported by this function (see the last branch).
size_t convert_masked_utf8_to_utf32(const char *input,
                           uint64_t utf8_end_of_code_point_mask,
                           char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
  // beneficial to have fast paths that depend on branch prediction but have less latency.
  // This results in more instructions but, potentially, also higher speeds.
  //
  // We first try a few fast paths.
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  // 12-bit key used both for the three-byte fast path and for the table lookup.
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  // Fast path 1: every one of the 16 bytes ends a code point, i.e. pure ASCII.
  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
    // We process the data in chunks of 16 bytes.
    // Two 32-byte stores: bytes 0..7 and then bytes 8..15, each zero-extended
    // to 32 bits.
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
    utf32_output += 16; // We wrote 16 32-bit characters.
    return 16; // We consumed 16 bytes.
  }
  // Fast path 2: every odd byte ends a code point, i.e. eight two-byte sequences.
  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units.
    // There is probably a more efficient sequence, but the following might do.
    // Swap each byte pair so the leading byte lands in the high half of the lane.
    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    // Widen the eight 16-bit values to 32 bits and store them.
    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
    utf32_output += 8; // We wrote 32 bytes, 8 code points.
    return 16;
  }
  // Fast path 3: bits 2, 5, 8 and 11 are set, i.e. four three-byte sequences.
  if(input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units.
    // There is probably a more efficient sequence, but the following might do.
    // Reverse each 3-byte group into a 32-bit lane; -1 zeroes the top byte.
    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    // Four 32-bit code points: exactly the 16 bytes stored.
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  // The 12-bit mask selects a table entry holding a shuffle index ([0]) and
  // the number of input bytes to consume ([1]). The UTF-16 tables are reused.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code-code units
    // this is a relatively easy scenario
    // we process SIX (6) input code-code units. The max length in bytes of six code
    // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
    // where pdep/pext is fast, we might be able to use a small lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
    // overflow of 32 - 24 = 8 bytes.
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // Same bit arithmetic as the three-byte fast path above, but the shuffle
    // comes from the table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code-code units
    // (note that the output cursor is advanced by three code points below)
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    // (bit 22 of each lane is moved down one position and XORed into bit 21;
    // presumably this clears the stray bit left when the third byte of the
    // lane is a 1110xxxx leading byte rather than a continuation byte)
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    // Only the three low bits of the top byte of each lane are kept.
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
  } else {
    // here we know that there is an error but we do not handle errors
    // (nothing is written; the table's 'consumed' value is still returned)
  }
  return consumed;
}
22938 /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
22939 
22940 /* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */
22941 template <endianness big_endian>
22942 std::pair<const char16_t *, char *>
22943 avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
22944                              char *latin1_output) {
22945   const char16_t *end = buf + len;
22946   while (buf + 16 <= end) {
22947     // Load 16 UTF-16 characters into 256-bit AVX2 register
22948     __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
22949 
22950     if (!match_system(big_endian)) {
22951       const __m256i swap = _mm256_setr_epi8(
22952           1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
22953           21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
22954       in = _mm256_shuffle_epi8(in, swap);
22955     }
22956 
22957     __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
22958     if (_mm256_testz_si256(in, high_byte_mask)) {
22959       // Pack 16-bit characters into 8-bit and store in latin1_output
22960       __m128i lo = _mm256_extractf128_si256(in, 0);
22961       __m128i hi = _mm256_extractf128_si256(in, 1);
22962       __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
22963       __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
22964       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
22965                        latin1_packed_lo);
22966       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
22967                        latin1_packed_hi);
22968       // Adjust pointers for next iteration
22969       buf += 16;
22970       latin1_output += 16;
22971     } else {
22972       return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
22973     }
22974   } // while
22975   return std::make_pair(buf, latin1_output);
22976 }
22977 
22978 template <endianness big_endian>
22979 std::pair<result, char *>
22980 avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
22981                                          char *latin1_output) {
22982   const char16_t *start = buf;
22983   const char16_t *end = buf + len;
22984   while (buf + 16 <= end) {
22985     __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
22986 
22987     if (!big_endian) {
22988       const __m256i swap = _mm256_setr_epi8(
22989           1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
22990           21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
22991       in = _mm256_shuffle_epi8(in, swap);
22992     }
22993 
22994     __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
22995     if (_mm256_testz_si256(in, high_byte_mask)) {
22996       __m128i lo = _mm256_extractf128_si256(in, 0);
22997       __m128i hi = _mm256_extractf128_si256(in, 1);
22998       __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
22999       __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
23000       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
23001                        latin1_packed_lo);
23002       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
23003                        latin1_packed_hi);
23004       buf += 16;
23005       latin1_output += 16;
23006     } else {
23007       // Fallback to scalar code for handling errors
23008       for (int k = 0; k < 16; k++) {
23009         uint16_t word = !match_system(big_endian)
23010                             ? scalar::utf16::swap_bytes(buf[k])
23011                             : buf[k];
23012         if (word <= 0xff) {
23013           *latin1_output++ = char(word);
23014         } else {
23015           return std::make_pair(
23016               result{error_code::TOO_LARGE, (size_t)(buf - start + k)},
23017               latin1_output);
23018         }
23019       }
23020       buf += 16;
23021     }
23022   } // while
23023   return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
23024                         latin1_output);
23025 }
23026 /* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */
23027 /* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
23028 /*
23029     The vectorized algorithm works on single SSE register i.e., it
23030     loads eight 16-bit code units.
23031 
23032     We consider three cases:
23033     1. an input register contains no surrogates and each value
23034        is in range 0x0000 .. 0x07ff.
23035     2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
23037     3. an input register contains surrogates --- i.e. codepoints
23038        can have 16 or 32 bits.
23039 
23040     Ad 1.
23041 
23042     When values are less than 0x0800, it means that a 16-bit code unit
23043     can be converted into: 1) single UTF8 byte (when it's an ASCII
23044     char) or 2) two UTF8 bytes.
23045 
23046     For this case we do only some shuffle to obtain these 2-byte
23047     codes and finally compress the whole SSE register with a single
23048     shuffle.
23049 
23050     We need 256-entry lookup table to get a compression pattern
23051     and the number of output bytes in the compressed vector register.
23052     Each entry occupies 17 bytes.
23053 
23054     Ad 2.
23055 
23056     When values fit in 16-bit code units, but are above 0x07ff, then
23057     a single word may produce one, two or three UTF8 bytes.
23058 
23059     We prepare data for all these three cases in two registers.
23060     The first register contains lower two UTF8 bytes (used in all
23061     cases), while the second one contains just the third byte for
23062     the three-UTF8-bytes case.
23063 
23064     Finally these two registers are interleaved forming eight-element
23065     array of 32-bit values. The array spans two SSE registers.
23066     The bytes from the registers are compressed using two shuffles.
23067 
23068     We need 256-entry lookup table to get a compression pattern
23069     and the number of output bytes in the compressed vector register.
23070     Each entry occupies 17 bytes.
23071 
23072 
23073     To summarize:
23074     - We need two 256-entry tables that have 8704 bytes in total.
23075 */
23076 
23077 
23078 /*
23079   Returns a pair: the first unprocessed byte from buf and utf8_output
  A scalar routine should carry on the conversion of the tail.
23081 */
23082 template <endianness big_endian>
23083 std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
23084   const char16_t* end = buf + len;
23085   const __m256i v_0000 = _mm256_setzero_si256();
23086   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23087   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23088   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
23089   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
23090 
23091   while (buf + 16 + safety_margin <= end) {
23092     __m256i in = _mm256_loadu_si256((__m256i*)buf);
23093     if (big_endian) {
23094       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23095                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
23096       in = _mm256_shuffle_epi8(in, swap);
23097     }
23098     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23099     const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
23100     if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
23101         // 1. pack the bytes
23102         const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
23103         // 2. store (16 bytes)
23104         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23105         // 3. adjust pointers
23106         buf += 16;
23107         utf8_output += 16;
23108         continue; // we are done for this round!
23109     }
23110     // no bits set above 7th bit
23111     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
23112     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
23113 
23114     // no bits set above 11th bit
23115     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
23116     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
23117     if (one_or_two_bytes_bitmask == 0xffffffff) {
23118 
23119           // 1. prepare 2-byte values
23120           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23121           // expected output   : [110a|aaaa|10bb|bbbb] x 8
23122           const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23123           const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23124 
23125           // t0 = [000a|aaaa|bbbb|bb00]
23126           const __m256i t0 = _mm256_slli_epi16(in, 2);
23127           // t1 = [000a|aaaa|0000|0000]
23128           const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23129           // t2 = [0000|0000|00bb|bbbb]
23130           const __m256i t2 = _mm256_and_si256(in, v_003f);
23131           // t3 = [000a|aaaa|00bb|bbbb]
23132           const __m256i t3 = _mm256_or_si256(t1, t2);
23133           // t4 = [110a|aaaa|10bb|bbbb]
23134           const __m256i t4 = _mm256_or_si256(t3, v_c080);
23135 
23136           // 2. merge ASCII and 2-byte codewords
23137           const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
23138 
23139           // 3. prepare bitmask for 8-bit lookup
23140           const uint32_t M0 = one_byte_bitmask & 0x55555555;
23141           const uint32_t M1 = M0 >> 7;
23142           const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
23143           // 4. pack the bytes
23144 
23145           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
23146           const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
23147 
23148           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23149           const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23150 
23151           const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23152           // 5. store bytes
23153           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23154           utf8_output += row[0];
23155           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23156           utf8_output += row_2[0];
23157 
23158           // 6. adjust pointers
23159           buf += 16;
23160           continue;
23161     }
23162     // 1. Check if there are any surrogate word in the input chunk.
23163     //    We have also deal with situation when there is a surrogate word
23164     //    at the end of a chunk.
23165     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23166 
23167     // bitmask = 0x0000 if there are no surrogates
23168     //         = 0xc000 if the last word is a surrogate
23169     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
23170     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
23171     // it is likely an uncommon occurrence.
23172     if (surrogates_bitmask == 0x00000000) {
23173       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
23174         const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23175                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
23176                                                 0x0000, 0x0202, 0x0404, 0x0606,
23177                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
23178 
23179         /* In this branch we handle three cases:
23180            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
23181            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
23182            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
23183 
23184           We expand the input word (16-bit) into two code units (32-bit), thus
23185           we have room for four bytes. However, we need five distinct bit
23186           layouts. Note that the last byte in cases #2 and #3 is the same.
23187 
23188           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
23189           in register t2.
23190 
23191           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
23192           either byte 1 for case #2 or byte 2 for case #3. Note that they
23193           differ by exactly one bit.
23194 
23195           Finally from these two code units we build proper UTF-8 sequence, taking
23196           into account the case (i.e, the number of bytes to write).
23197         */
23198         /**
23199          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
23200          * t2 => [0ccc|cccc] [10cc|cccc]
23201          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
23202          */
23203 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
23204         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
23205         const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
23206         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
23207         const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23208         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
23209         const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23210 
23211         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
23212         const __m256i s0 = _mm256_srli_epi16(in, 4);
23213         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
23214         const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23215         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
23216         const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23217         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
23218         const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23219         const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23220         const __m256i s4 = _mm256_xor_si256(s3, m0);
23221 #undef simdutf_vec
23222 
23223         // 4. expand code units 16-bit => 32-bit
23224         const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23225         const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23226 
23227         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
23228         const uint32_t mask = (one_byte_bitmask & 0x55555555) |
23229                               (one_or_two_bytes_bitmask & 0xaaaaaaaa);
23230         // Due to the wider registers, the following path is less likely to be useful.
23231         /*if(mask == 0) {
23232           // We only have three-byte code units. Use fast path.
23233           const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23234           const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23235           const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
23236           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23237           utf8_output += 12;
23238           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23239           utf8_output += 12;
23240           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23241           utf8_output += 12;
23242           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
23243           utf8_output += 12;
23244           buf += 16;
23245           continue;
23246         }*/
23247         const uint8_t mask0 = uint8_t(mask);
23248         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
23249         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
23250         const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
23251 
23252         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
23253         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
23254         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
23255         const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
23256 
23257         const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
23258         const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
23259         const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
23260         const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
23261 
23262 
23263         const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
23264         const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
23265         const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
23266         const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
23267 
23268         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
23269         utf8_output += row0[0];
23270         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
23271         utf8_output += row1[0];
23272         _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
23273         utf8_output += row2[0];
23274         _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
23275         utf8_output += row3[0];
23276         buf += 16;
23277     // surrogate pair(s) in a register
23278     } else {
23279       // Let us do a scalar fallback.
23280       // It may seem wasteful to use scalar code, but being efficient with SIMD
23281       // in the presence of surrogate pairs may require non-trivial tables.
23282       size_t forward = 15;
23283       size_t k = 0;
23284       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
23285       for(; k < forward; k++) {
23286         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23287         if((word & 0xFF80)==0) {
23288           *utf8_output++ = char(word);
23289         } else if((word & 0xF800)==0) {
23290           *utf8_output++ = char((word>>6) | 0b11000000);
23291           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23292         } else if((word &0xF800 ) != 0xD800) {
23293           *utf8_output++ = char((word>>12) | 0b11100000);
23294           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23295           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23296         } else {
23297           // must be a surrogate pair
23298           uint16_t diff = uint16_t(word - 0xD800);
23299           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
23300           k++;
23301           uint16_t diff2 = uint16_t(next_word - 0xDC00);
23302           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
23303           uint32_t value = (diff << 10) + diff2 + 0x10000;
23304           *utf8_output++ = char((value>>18) | 0b11110000);
23305           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
23306           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
23307           *utf8_output++ = char((value & 0b111111) | 0b10000000);
23308         }
23309       }
23310       buf += k;
23311     }
23312   } // while
23313   return std::make_pair(buf, utf8_output);
23314 }
23315 
23316 
23317 /*
23318   Returns a pair: a result struct and utf8_output.
23319   If there is an error, the count field of the result is the position of the error.
23320   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
  A scalar routine should carry on the conversion of the tail if needed.
23322 */
23323 template <endianness big_endian>
23324 std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
23325   const char16_t* start = buf;
23326   const char16_t* end = buf + len;
23327 
23328   const __m256i v_0000 = _mm256_setzero_si256();
23329   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23330   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23331   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
23332   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
23333 
23334   while (buf + 16 + safety_margin <= end) {
23335     __m256i in = _mm256_loadu_si256((__m256i*)buf);
23336     if (big_endian) {
23337       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23338                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
23339       in = _mm256_shuffle_epi8(in, swap);
23340     }
23341     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23342     const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
23343     if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
23344         // 1. pack the bytes
23345         const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
23346         // 2. store (16 bytes)
23347         _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23348         // 3. adjust pointers
23349         buf += 16;
23350         utf8_output += 16;
23351         continue; // we are done for this round!
23352     }
23353     // no bits set above 7th bit
23354     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
23355     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
23356 
23357     // no bits set above 11th bit
23358     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
23359     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
23360     if (one_or_two_bytes_bitmask == 0xffffffff) {
23361 
23362           // 1. prepare 2-byte values
23363           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23364           // expected output   : [110a|aaaa|10bb|bbbb] x 8
23365           const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23366           const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23367 
23368           // t0 = [000a|aaaa|bbbb|bb00]
23369           const __m256i t0 = _mm256_slli_epi16(in, 2);
23370           // t1 = [000a|aaaa|0000|0000]
23371           const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23372           // t2 = [0000|0000|00bb|bbbb]
23373           const __m256i t2 = _mm256_and_si256(in, v_003f);
23374           // t3 = [000a|aaaa|00bb|bbbb]
23375           const __m256i t3 = _mm256_or_si256(t1, t2);
23376           // t4 = [110a|aaaa|10bb|bbbb]
23377           const __m256i t4 = _mm256_or_si256(t3, v_c080);
23378 
23379           // 2. merge ASCII and 2-byte codewords
23380           const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
23381 
23382           // 3. prepare bitmask for 8-bit lookup
23383           const uint32_t M0 = one_byte_bitmask & 0x55555555;
23384           const uint32_t M1 = M0 >> 7;
23385           const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
23386           // 4. pack the bytes
23387 
23388           const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
23389           const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
23390 
23391           const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23392           const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23393 
23394           const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23395           // 5. store bytes
23396           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23397           utf8_output += row[0];
23398           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23399           utf8_output += row_2[0];
23400 
23401           // 6. adjust pointers
23402           buf += 16;
23403           continue;
23404     }
23405     // 1. Check if there are any surrogate word in the input chunk.
23406     //    We have also deal with situation when there is a surrogate word
23407     //    at the end of a chunk.
23408     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23409 
23410     // bitmask = 0x0000 if there are no surrogates
23411     //         = 0xc000 if the last word is a surrogate
23412     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
23413     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
23414     // it is likely an uncommon occurrence.
23415     if (surrogates_bitmask == 0x00000000) {
23416       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
23417         const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23418                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
23419                                                 0x0000, 0x0202, 0x0404, 0x0606,
23420                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
23421 
23422         /* In this branch we handle three cases:
23423            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
23424            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
23425            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
23426 
23427           We expand the input word (16-bit) into two code units (32-bit), thus
23428           we have room for four bytes. However, we need five distinct bit
23429           layouts. Note that the last byte in cases #2 and #3 is the same.
23430 
23431           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
23432           in register t2.
23433 
23434           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
23435           either byte 1 for case #2 or byte 2 for case #3. Note that they
23436           differ by exactly one bit.
23437 
23438           Finally from these two code units we build proper UTF-8 sequence, taking
23439           into account the case (i.e, the number of bytes to write).
23440         */
23441         /**
23442          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
23443          * t2 => [0ccc|cccc] [10cc|cccc]
23444          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
23445          */
23446 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
23447         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
23448         const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
23449         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
23450         const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23451         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
23452         const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23453 
23454         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
23455         const __m256i s0 = _mm256_srli_epi16(in, 4);
23456         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
23457         const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23458         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
23459         const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23460         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
23461         const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23462         const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23463         const __m256i s4 = _mm256_xor_si256(s3, m0);
23464 #undef simdutf_vec
23465 
23466         // 4. expand code units 16-bit => 32-bit
23467         const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23468         const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23469 
23470         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
23471         const uint32_t mask = (one_byte_bitmask & 0x55555555) |
23472                               (one_or_two_bytes_bitmask & 0xaaaaaaaa);
23473         // Due to the wider registers, the following path is less likely to be useful.
23474         /*if(mask == 0) {
23475           // We only have three-byte code units. Use fast path.
23476           const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23477           const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23478           const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
23479           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23480           utf8_output += 12;
23481           _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23482           utf8_output += 12;
23483           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23484           utf8_output += 12;
23485           _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
23486           utf8_output += 12;
23487           buf += 16;
23488           continue;
23489         }*/
23490         const uint8_t mask0 = uint8_t(mask);
23491         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
23492         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
23493         const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
23494 
23495         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
23496         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
23497         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
23498         const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
23499 
23500         const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
23501         const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
23502         const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
23503         const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
23504 
23505 
23506         const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
23507         const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
23508         const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
23509         const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
23510 
23511         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
23512         utf8_output += row0[0];
23513         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
23514         utf8_output += row1[0];
23515         _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
23516         utf8_output += row2[0];
23517         _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
23518         utf8_output += row3[0];
23519         buf += 16;
23520     // surrogate pair(s) in a register
23521     } else {
23522       // Let us do a scalar fallback.
23523       // It may seem wasteful to use scalar code, but being efficient with SIMD
23524       // in the presence of surrogate pairs may require non-trivial tables.
23525       size_t forward = 15;
23526       size_t k = 0;
23527       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
23528       for(; k < forward; k++) {
23529         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23530         if((word & 0xFF80)==0) {
23531           *utf8_output++ = char(word);
23532         } else if((word & 0xF800)==0) {
23533           *utf8_output++ = char((word>>6) | 0b11000000);
23534           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23535         } else if((word &0xF800 ) != 0xD800) {
23536           *utf8_output++ = char((word>>12) | 0b11100000);
23537           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23538           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23539         } else {
23540           // must be a surrogate pair
23541           uint16_t diff = uint16_t(word - 0xD800);
23542           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
23543           k++;
23544           uint16_t diff2 = uint16_t(next_word - 0xDC00);
23545           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
23546           uint32_t value = (diff << 10) + diff2 + 0x10000;
23547           *utf8_output++ = char((value>>18) | 0b11110000);
23548           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
23549           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
23550           *utf8_output++ = char((value & 0b111111) | 0b10000000);
23551         }
23552       }
23553       buf += k;
23554     }
23555   } // while
23556   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
23557 }
23558 /* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
23559 /* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
23560 /*
23561     The vectorized algorithm works on single SSE register i.e., it
23562     loads eight 16-bit code units.
23563 
23564     We consider three cases:
23565     1. an input register contains no surrogates and each value
23566        is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
23569     3. an input register contains surrogates --- i.e. codepoints
23570        can have 16 or 32 bits.
23571 
23572     Ad 1.
23573 
23574     When values are less than 0x0800, it means that a 16-bit code unit
23575     can be converted into: 1) single UTF8 byte (when it's an ASCII
23576     char) or 2) two UTF8 bytes.
23577 
23578     For this case we do only some shuffle to obtain these 2-byte
23579     codes and finally compress the whole SSE register with a single
23580     shuffle.
23581 
23582     We need 256-entry lookup table to get a compression pattern
23583     and the number of output bytes in the compressed vector register.
23584     Each entry occupies 17 bytes.
23585 
23586     Ad 2.
23587 
23588     When values fit in 16-bit code units, but are above 0x07ff, then
23589     a single word may produce one, two or three UTF8 bytes.
23590 
23591     We prepare data for all these three cases in two registers.
23592     The first register contains lower two UTF8 bytes (used in all
23593     cases), while the second one contains just the third byte for
23594     the three-UTF8-bytes case.
23595 
23596     Finally these two registers are interleaved forming eight-element
23597     array of 32-bit values. The array spans two SSE registers.
23598     The bytes from the registers are compressed using two shuffles.
23599 
23600     We need 256-entry lookup table to get a compression pattern
23601     and the number of output bytes in the compressed vector register.
23602     Each entry occupies 17 bytes.
23603 
23604 
23605     To summarize:
23606     - We need two 256-entry tables that have 8704 bytes in total.
23607 */
23608 
23609 
23610 /*
  Returns a pair: the first unprocessed code unit from buf and utf32_output.
  On an invalid surrogate pair the first member of the pair is nullptr.
  A scalar routine should carry on the conversion of the tail.
23613 */
23614 template <endianness big_endian>
23615 std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
23616   const char16_t* end = buf + len;
23617   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23618   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23619 
23620   while (buf + 16 <= end) {
23621     __m256i in = _mm256_loadu_si256((__m256i*)buf);
23622     if (big_endian) {
23623       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23624                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
23625       in = _mm256_shuffle_epi8(in, swap);
23626     }
23627 
23628     // 1. Check if there are any surrogate word in the input chunk.
23629     //    We have also deal with situation when there is a surrogate word
23630     //    at the end of a chunk.
23631     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23632 
23633     // bitmask = 0x0000 if there are no surrogates
23634     //         = 0xc000 if the last word is a surrogate
23635     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
23636     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
23637     // it is likely an uncommon occurrence.
23638     if (surrogates_bitmask == 0x00000000) {
23639       // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units
23640         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
23641         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
23642         utf32_output += 16;
23643         buf += 16;
23644     // surrogate pair(s) in a register
23645     } else {
23646       // Let us do a scalar fallback.
23647       // It may seem wasteful to use scalar code, but being efficient with SIMD
23648       // in the presence of surrogate pairs may require non-trivial tables.
23649       size_t forward = 15;
23650       size_t k = 0;
23651       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
23652       for(; k < forward; k++) {
23653         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23654         if((word &0xF800 ) != 0xD800) {
23655           // No surrogate pair
23656           *utf32_output++ = char32_t(word);
23657         } else {
23658           // must be a surrogate pair
23659           uint16_t diff = uint16_t(word - 0xD800);
23660           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
23661           k++;
23662           uint16_t diff2 = uint16_t(next_word - 0xDC00);
23663           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
23664           uint32_t value = (diff << 10) + diff2 + 0x10000;
23665           *utf32_output++ = char32_t(value);
23666         }
23667       }
23668       buf += k;
23669     }
23670   } // while
23671   return std::make_pair(buf, utf32_output);
23672 }
23673 
23674 
23675 /*
  Returns a pair: a result struct and utf32_output.
  If there is an error, the count field of the result is the position of the error.
  Otherwise, it is the position of the first unprocessed code unit in buf (even if finished).
  A scalar routine should carry on the conversion of the tail if needed.
23680 */
23681 template <endianness big_endian>
23682 std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
23683   const char16_t* start = buf;
23684   const char16_t* end = buf + len;
23685   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23686   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23687 
23688   while (buf + 16 <= end) {
23689     __m256i in = _mm256_loadu_si256((__m256i*)buf);
23690     if (big_endian) {
23691       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23692                                   17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
23693       in = _mm256_shuffle_epi8(in, swap);
23694     }
23695 
23696     // 1. Check if there are any surrogate word in the input chunk.
23697     //    We have also deal with situation when there is a surrogate word
23698     //    at the end of a chunk.
23699     const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23700 
23701     // bitmask = 0x0000 if there are no surrogates
23702     //         = 0xc000 if the last word is a surrogate
23703     const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
23704     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
23705     // it is likely an uncommon occurrence.
23706     if (surrogates_bitmask == 0x00000000) {
23707       // case: we extend all sixteen 16-bit code units to sixteen 32-bit code units
23708         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
23709         _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
23710         utf32_output += 16;
23711         buf += 16;
23712     // surrogate pair(s) in a register
23713     } else {
23714       // Let us do a scalar fallback.
23715       // It may seem wasteful to use scalar code, but being efficient with SIMD
23716       // in the presence of surrogate pairs may require non-trivial tables.
23717       size_t forward = 15;
23718       size_t k = 0;
23719       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
23720       for(; k < forward; k++) {
23721         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23722         if((word &0xF800 ) != 0xD800) {
23723           // No surrogate pair
23724           *utf32_output++ = char32_t(word);
23725         } else {
23726           // must be a surrogate pair
23727           uint16_t diff = uint16_t(word - 0xD800);
23728           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
23729           k++;
23730           uint16_t diff2 = uint16_t(next_word - 0xDC00);
23731           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
23732           uint32_t value = (diff << 10) + diff2 + 0x10000;
23733           *utf32_output++ = char32_t(value);
23734         }
23735       }
23736       buf += k;
23737     }
23738   } // while
23739   return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
23740 }
23741 /* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
23742 
23743 /* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */
// Converts UTF-32 code units to Latin-1 bytes, sixteen at a time.
// Only the leading part of the input that is a multiple of 32 code units is
// handled here; the returned input pointer tells the caller where to resume.
// Returns {next unprocessed input, next output position}, or
// {nullptr, current output position} as soon as a block contains a code unit
// above 0xFF.
std::pair<const char32_t *, char *>
avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                             char *latin1_output) {
  const size_t rounded_len =
      len & ~0x1F; // Round down to nearest multiple of 32

  // Any of these bits set means the code unit does not fit in one byte.
  const __m256i above_latin1 = _mm256_set1_epi32(0xFFFFFF00);

  // Within each 128-bit half: gather the low byte of the four code units into
  // the lowest dword and zero the remaining twelve bytes.
  const __m256i gather_low_bytes =
      _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0,
                      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

  // Dword permutations that bring the two populated dwords (0 and 4) next to
  // each other. The permute only looks at the low three index bits, so -1
  // selects dword 7, which gather_low_bytes has already zeroed.
  const __m256i to_dwords_0_1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
  const __m256i to_dwords_2_3 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);

  size_t consumed = 0;
  while (consumed < rounded_len) {
    const __m256i first_eight = _mm256_loadu_si256((__m256i *)buf);
    const __m256i second_eight = _mm256_loadu_si256((__m256i *)(buf + 8));

    // One test covers all sixteen code units.
    const __m256i any_bits = _mm256_or_si256(first_eight, second_eight);
    if (!_mm256_testz_si256(any_bits, above_latin1)) {
      return std::make_pair(nullptr, latin1_output);
    }

    // Turn UTF-32 code units into Latin-1 bytes and move them to their final
    // position: code units 0..7 into bytes 0..7, code units 8..15 into 8..15.
    const __m256i bytes_0_7 = _mm256_permutevar8x32_epi32(
        _mm256_shuffle_epi8(first_eight, gather_low_bytes), to_dwords_0_1);
    const __m256i bytes_8_15 = _mm256_permutevar8x32_epi32(
        _mm256_shuffle_epi8(second_eight, gather_low_bytes), to_dwords_2_3);

    const __m256i packed = _mm256_or_si256(bytes_0_7, bytes_8_15);
    _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(packed));

    latin1_output += 16;
    buf += 16;
    consumed += 16;
  }

  return std::make_pair(buf, latin1_output);
}
23786 std::pair<result, char *>
23787 avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
23788                                          char *latin1_output) {
23789     const size_t rounded_len =
23790         len & ~0x1F; // Round down to nearest multiple of 32
23791 
23792     __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
23793     __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
23794                                        -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
23795                                        -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
23796 
23797     const char32_t *start = buf;
23798 
23799     for (size_t i = 0; i < rounded_len; i += 16) {
23800         __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
23801         __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
23802 
23803         __m256i check_combined = _mm256_or_si256(in1, in2);
23804 
23805         if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
23806             // Fallback to scalar code for handling errors
23807             for (int k = 0; k < 8; k++) {
23808                 char32_t codepoint = buf[k];
23809                 if (codepoint <= 0xFF) {
23810                     *latin1_output++ = static_cast<char>(codepoint);
23811                 } else {
23812                     return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
23813                                           latin1_output);
23814                 }
23815             }
23816             buf += 8;
23817         } else {
23818             __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
23819             __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
23820 
23821             __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
23822             __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
23823             __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
23824             __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
23825 
23826             __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
23827             _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
23828 
23829             latin1_output += 16;
23830             buf += 16;
23831         }
23832     }
23833 
23834     return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output);
23835 }
23836 /* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */
23837 /* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
23838 std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
23839   const char32_t* end = buf + len;
23840   const __m256i v_0000 = _mm256_setzero_si256();
23841   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
23842   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
23843   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
23844   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
23845   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
23846   __m256i running_max = _mm256_setzero_si256();
23847   __m256i forbidden_bytemask = _mm256_setzero_si256();
23848 
23849   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
23850 
23851   while (buf + 16 + safety_margin <= end) {
23852     __m256i in = _mm256_loadu_si256((__m256i*)buf);
23853     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
23854     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
23855 
23856     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
23857     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
23858     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
23859 
23860     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
23861 
23862     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
23863       // 1. pack the bytes
23864       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
23865       // 2. store (16 bytes)
23866       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23867       // 3. adjust pointers
23868       buf += 16;
23869       utf8_output += 16;
23870       continue; // we are done for this round!
23871     }
23872     // no bits set above 7th bit
23873     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
23874     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
23875 
23876     // no bits set above 11th bit
23877     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
23878     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
23879     if (one_or_two_bytes_bitmask == 0xffffffff) {
23880       // 1. prepare 2-byte values
23881       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23882       // expected output   : [110a|aaaa|10bb|bbbb] x 8
23883       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23884       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23885 
23886       // t0 = [000a|aaaa|bbbb|bb00]
23887       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
23888       // t1 = [000a|aaaa|0000|0000]
23889       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23890       // t2 = [0000|0000|00bb|bbbb]
23891       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
23892       // t3 = [000a|aaaa|00bb|bbbb]
23893       const __m256i t3 = _mm256_or_si256(t1, t2);
23894       // t4 = [110a|aaaa|10bb|bbbb]
23895       const __m256i t4 = _mm256_or_si256(t3, v_c080);
23896 
23897       // 2. merge ASCII and 2-byte codewords
23898       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
23899 
23900       // 3. prepare bitmask for 8-bit lookup
23901       const uint32_t M0 = one_byte_bitmask & 0x55555555;
23902       const uint32_t M1 = M0 >> 7;
23903       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
23904       // 4. pack the bytes
23905 
23906       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
23907       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
23908 
23909       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23910       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23911 
23912       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23913       // 5. store bytes
23914       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23915       utf8_output += row[0];
23916       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23917       utf8_output += row_2[0];
23918 
23919       // 6. adjust pointers
23920       buf += 16;
23921       continue;
23922     }
23923     // Must check for overflow in packing
23924     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
23925     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
23926     if (saturation_bitmask == 0xffffffff) {
23927       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
23928       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
23929       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
23930 
23931       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23932                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
23933                                               0x0000, 0x0202, 0x0404, 0x0606,
23934                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
23935 
23936       /* In this branch we handle three cases:
23937         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
23938         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
23939         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
23940 
23941         We expand the input word (16-bit) into two code units (32-bit), thus
23942         we have room for four bytes. However, we need five distinct bit
23943         layouts. Note that the last byte in cases #2 and #3 is the same.
23944 
23945         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
23946         in register t2.
23947 
23948         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
23949         either byte 1 for case #2 or byte 2 for case #3. Note that they
23950         differ by exactly one bit.
23951 
23952         Finally from these two code units we build proper UTF-8 sequence, taking
23953         into account the case (i.e, the number of bytes to write).
23954       */
23955       /**
23956        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
23957        * t2 => [0ccc|cccc] [10cc|cccc]
23958        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
23959        */
23960 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
23961       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
23962       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
23963       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
23964       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23965       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
23966       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23967 
23968       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
23969       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
23970       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
23971       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23972       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
23973       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23974       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
23975       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23976       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23977       const __m256i s4 = _mm256_xor_si256(s3, m0);
23978 #undef simdutf_vec
23979 
23980       // 4. expand code units 16-bit => 32-bit
23981       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23982       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23983 
23984       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
23985       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
23986                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
23987       // Due to the wider registers, the following path is less likely to be useful.
23988       /*if(mask == 0) {
23989         // We only have three-byte code units. Use fast path.
23990         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23991         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23992         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
23993         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23994         utf8_output += 12;
23995         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23996         utf8_output += 12;
23997         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23998         utf8_output += 12;
23999         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
24000         utf8_output += 12;
24001         buf += 16;
24002         continue;
24003       }*/
24004       const uint8_t mask0 = uint8_t(mask);
24005       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
24006       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
24007       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
24008 
24009       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
24010       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
24011       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
24012       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
24013 
24014       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
24015       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
24016       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
24017       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
24018 
24019 
24020       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
24021       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
24022       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
24023       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
24024 
24025       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
24026       utf8_output += row0[0];
24027       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
24028       utf8_output += row1[0];
24029       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
24030       utf8_output += row2[0];
24031       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
24032       utf8_output += row3[0];
24033       buf += 16;
24034     } else {
24035       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24036       // Let us do a scalar fallback.
24037       // It may seem wasteful to use scalar code, but being efficient with SIMD
24038       // may require large, non-trivial tables?
24039       size_t forward = 15;
24040       size_t k = 0;
24041       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
24042       for(; k < forward; k++) {
24043         uint32_t word = buf[k];
24044         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
24045           *utf8_output++ = char(word);
24046         } else if((word & 0xFFFFF800)==0) { // 2-byte
24047           *utf8_output++ = char((word>>6) | 0b11000000);
24048           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24049         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
24050           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
24051           *utf8_output++ = char((word>>12) | 0b11100000);
24052           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24053           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24054         } else {  // 4-byte
24055           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
24056           *utf8_output++ = char((word>>18) | 0b11110000);
24057           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24058           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24059           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24060         }
24061       }
24062       buf += k;
24063     }
24064   } // while
24065 
24066   // check for invalid input
24067   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
24068   if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
24069     return std::make_pair(nullptr, utf8_output);
24070   }
24071 
24072   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
24073 
24074   return std::make_pair(buf, utf8_output);
24075 }
24076 
24077 
24078 std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
24079   const char32_t* end = buf + len;
24080   const char32_t* start = buf;
24081 
24082   const __m256i v_0000 = _mm256_setzero_si256();
24083   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
24084   const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
24085   const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
24086   const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
24087   const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
24088   const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
24089 
24090   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
24091 
24092   while (buf + 16 + safety_margin <= end) {
24093     __m256i in = _mm256_loadu_si256((__m256i*)buf);
24094     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
24095     // Check for too large input
24096     const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
24097     if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
24098       return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
24099     }
24100 
24101     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
24102     __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
24103     in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
24104 
24105     // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
24106 
24107     if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
24108       // 1. pack the bytes
24109       const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
24110       // 2. store (16 bytes)
24111       _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
24112       // 3. adjust pointers
24113       buf += 16;
24114       utf8_output += 16;
24115       continue; // we are done for this round!
24116     }
24117     // no bits set above 7th bit
24118     const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
24119     const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
24120 
24121     // no bits set above 11th bit
24122     const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
24123     const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
24124     if (one_or_two_bytes_bitmask == 0xffffffff) {
24125       // 1. prepare 2-byte values
24126       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
24127       // expected output   : [110a|aaaa|10bb|bbbb] x 8
24128       const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
24129       const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
24130 
24131       // t0 = [000a|aaaa|bbbb|bb00]
24132       const __m256i t0 = _mm256_slli_epi16(in_16, 2);
24133       // t1 = [000a|aaaa|0000|0000]
24134       const __m256i t1 = _mm256_and_si256(t0, v_1f00);
24135       // t2 = [0000|0000|00bb|bbbb]
24136       const __m256i t2 = _mm256_and_si256(in_16, v_003f);
24137       // t3 = [000a|aaaa|00bb|bbbb]
24138       const __m256i t3 = _mm256_or_si256(t1, t2);
24139       // t4 = [110a|aaaa|10bb|bbbb]
24140       const __m256i t4 = _mm256_or_si256(t3, v_c080);
24141 
24142       // 2. merge ASCII and 2-byte codewords
24143       const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
24144 
24145       // 3. prepare bitmask for 8-bit lookup
24146       const uint32_t M0 = one_byte_bitmask & 0x55555555;
24147       const uint32_t M1 = M0 >> 7;
24148       const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
24149       // 4. pack the bytes
24150 
24151       const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
24152       const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
24153 
24154       const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
24155       const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
24156 
24157       const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
24158       // 5. store bytes
24159       _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
24160       utf8_output += row[0];
24161       _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
24162       utf8_output += row_2[0];
24163 
24164       // 6. adjust pointers
24165       buf += 16;
24166       continue;
24167     }
24168     // Must check for overflow in packing
24169     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
24170     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
24171     if (saturation_bitmask == 0xffffffff) {
24172       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
24173 
24174       // Check for illegal surrogate code units
24175       const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
24176       const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
24177       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
24178         return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
24179       }
24180 
24181       const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
24182                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
24183                                               0x0000, 0x0202, 0x0404, 0x0606,
24184                                               0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
24185 
24186       /* In this branch we handle three cases:
24187         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
24188         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
24189         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
24190 
24191         We expand the input word (16-bit) into two code units (32-bit), thus
24192         we have room for four bytes. However, we need five distinct bit
24193         layouts. Note that the last byte in cases #2 and #3 is the same.
24194 
24195         We precompute byte 1 for case #1 and the common byte for cases #2 & #3
24196         in register t2.
24197 
24198         We precompute byte 1 for case #3 and -- **conditionally** -- precompute
24199         either byte 1 for case #2 or byte 2 for case #3. Note that they
24200         differ by exactly one bit.
24201 
24202         Finally from these two code units we build proper UTF-8 sequence, taking
24203         into account the case (i.e, the number of bytes to write).
24204       */
24205       /**
24206        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
24207        * t2 => [0ccc|cccc] [10cc|cccc]
24208        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
24209        */
24210 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
24211       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
24212       const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
24213       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
24214       const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
24215       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
24216       const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
24217 
24218       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
24219       const __m256i s0 = _mm256_srli_epi16(in_16, 4);
24220       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
24221       const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
24222       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
24223       const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
24224       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
24225       const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
24226       const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
24227       const __m256i s4 = _mm256_xor_si256(s3, m0);
24228 #undef simdutf_vec
24229 
24230       // 4. expand code units 16-bit => 32-bit
24231       const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
24232       const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
24233 
24234       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
24235       const uint32_t mask = (one_byte_bitmask & 0x55555555) |
24236                             (one_or_two_bytes_bitmask & 0xaaaaaaaa);
24237       // Due to the wider registers, the following path is less likely to be useful.
24238       /*if(mask == 0) {
24239         // We only have three-byte code units. Use fast path.
24240         const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
24241         const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
24242         const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
24243         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
24244         utf8_output += 12;
24245         _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
24246         utf8_output += 12;
24247         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
24248         utf8_output += 12;
24249         _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
24250         utf8_output += 12;
24251         buf += 16;
24252         continue;
24253       }*/
24254       const uint8_t mask0 = uint8_t(mask);
24255       const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
24256       const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
24257       const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
24258 
24259       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
24260       const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
24261       const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
24262       const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
24263 
24264       const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
24265       const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
24266       const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
24267       const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
24268 
24269 
24270       const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
24271       const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
24272       const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
24273       const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
24274 
24275       _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
24276       utf8_output += row0[0];
24277       _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
24278       utf8_output += row1[0];
24279       _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
24280       utf8_output += row2[0];
24281       _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
24282       utf8_output += row3[0];
24283       buf += 16;
24284     } else {
24285       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24286       // Let us do a scalar fallback.
24287       // It may seem wasteful to use scalar code, but being efficient with SIMD
24288       // may require large, non-trivial tables?
24289       size_t forward = 15;
24290       size_t k = 0;
24291       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
24292       for(; k < forward; k++) {
24293         uint32_t word = buf[k];
24294         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
24295           *utf8_output++ = char(word);
24296         } else if((word & 0xFFFFF800)==0) { // 2-byte
24297           *utf8_output++ = char((word>>6) | 0b11000000);
24298           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24299         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
24300           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
24301           *utf8_output++ = char((word>>12) | 0b11100000);
24302           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24303           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24304         } else {  // 4-byte
24305           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
24306           *utf8_output++ = char((word>>18) | 0b11110000);
24307           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24308           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24309           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24310         }
24311       }
24312       buf += k;
24313     }
24314   } // while
24315 
24316   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
24317 }
24318 /* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
24319 /* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
24320 template <endianness big_endian>
24321 std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
24322   const char32_t* end = buf + len;
24323 
24324   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
24325   __m256i forbidden_bytemask = _mm256_setzero_si256();
24326 
24327 
24328   while (buf + 8 + safety_margin <= end) {
24329     __m256i in = _mm256_loadu_si256((__m256i*)buf);
24330 
24331     const __m256i v_00000000 = _mm256_setzero_si256();
24332     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
24333 
24334     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
24335     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
24336     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
24337 
24338     if (saturation_bitmask == 0xffffffff) {
24339       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
24340       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
24341       forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
24342 
24343       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
24344       if (big_endian) {
24345         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
24346         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
24347       }
24348       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
24349       utf16_output += 8;
24350       buf += 8;
24351     } else {
24352       size_t forward = 7;
24353       size_t k = 0;
24354       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
24355       for(; k < forward; k++) {
24356         uint32_t word = buf[k];
24357         if((word & 0xFFFF0000)==0) {
24358           // will not generate a surrogate pair
24359           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
24360           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24361         } else {
24362           // will generate a surrogate pair
24363           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
24364           word -= 0x10000;
24365           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24366           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
24367           if (big_endian) {
24368             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
24369             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
24370           }
24371           *utf16_output++ = char16_t(high_surrogate);
24372           *utf16_output++ = char16_t(low_surrogate);
24373         }
24374       }
24375       buf += k;
24376     }
24377   }
24378 
24379   // check for invalid input
24380   if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
24381 
24382   return std::make_pair(buf, utf16_output);
24383 }
24384 
24385 
24386 template <endianness big_endian>
24387 std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
24388   const char32_t* start = buf;
24389   const char32_t* end = buf + len;
24390 
24391   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
24392 
24393   while (buf + 8 + safety_margin <= end) {
24394     __m256i in = _mm256_loadu_si256((__m256i*)buf);
24395 
24396     const __m256i v_00000000 = _mm256_setzero_si256();
24397     const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
24398 
24399     // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
24400     const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
24401     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
24402 
24403     if (saturation_bitmask == 0xffffffff) {
24404       const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
24405       const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
24406       const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
24407       if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
24408         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
24409       }
24410 
24411       __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
24412       if (big_endian) {
24413         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
24414         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
24415       }
24416       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
24417       utf16_output += 8;
24418       buf += 8;
24419     } else {
24420       size_t forward = 7;
24421       size_t k = 0;
24422       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
24423       for(; k < forward; k++) {
24424         uint32_t word = buf[k];
24425         if((word & 0xFFFF0000)==0) {
24426           // will not generate a surrogate pair
24427           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
24428           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24429         } else {
24430           // will generate a surrogate pair
24431           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
24432           word -= 0x10000;
24433           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24434           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
24435           if (big_endian) {
24436             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
24437             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
24438           }
24439           *utf16_output++ = char16_t(high_surrogate);
24440           *utf16_output++ = char16_t(low_surrogate);
24441         }
24442       }
24443       buf += k;
24444     }
24445   }
24446 
24447   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
24448 }
24449 /* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
24450 
24451 /* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */
24452 // depends on "tables/utf8_to_utf16_tables.h"
24453 
24454 // Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
24455 // end of the code points. Only the least significant 12 bits of the mask
24456 // are accessed.
24457 // It returns how many bytes were consumed (up to 12).
24458 size_t convert_masked_utf8_to_latin1(const char *input,
24459                            uint64_t utf8_end_of_code_point_mask,
24460                            char *&latin1_output) {
24461   // we use an approach where we try to process up to 12 input bytes.
24462   // Why 12 input bytes and not 16? Because we are concerned with the size of
24463   // the lookup tables. Also 12 is nicely divisible by two and three.
24464   //
24465   //
24466   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
24467   // beneficial to have fast paths that depend on branch prediction but have less latency.
24468   // This results in more instructions but, potentially, also higher speeds.
24469   //
24470   const __m128i in = _mm_loadu_si128((__m128i *)input);
24471   const __m128i in_second_half = _mm_loadu_si128((__m128i *)(input + 16));
24472 
24473   const uint16_t input_utf8_end_of_code_point_mask =
24474       utf8_end_of_code_point_mask & 0xfff; //we're only processing 12 bytes in case it`s not all ASCII
24475 
24476   if((input_utf8_end_of_code_point_mask & 0xffffffff) == 0xffffffff) {
24477     // Load the next 128 bits.
24478 
24479     // Combine the two 128-bit registers into a single 256-bit register.
24480     __m256i in_combined = _mm256_set_m128i(in_second_half, in);
24481 
24482     // We process the data in chunks of 32 bytes.
24483     _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), in_combined);
24484 
24485     latin1_output += 32; // We wrote 32 characters.
24486     return 32; // We consumed 32 bytes.
24487   }
24488 
24489 
24490   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
24491     // We process the data in chunks of 16 bytes.
24492     _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
24493     latin1_output += 16; // We wrote 16 characters.
24494     return 16; // We consumed 16 bytes.
24495   }
24496   /// We do not have a fast path available, so we fallback.
24497   const uint8_t idx =
24498       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
24499   const uint8_t consumed =
24500       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
24501   // this indicates an invalid input:
24502   if(idx >= 64) { return consumed; }
24503   // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere.
24504   // SIX (6) input code-code units
24505   // this is a relatively easy scenario
24506   // we process SIX (6) input code-code units. The max length in bytes of six code
24507   // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
24508   // where pdep/pext is fast, we might be able to use a small lookup table.
24509   const __m128i sh =
24510         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
24511   const __m128i perm = _mm_shuffle_epi8(in, sh);
24512   const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
24513   const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
24514   __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
24515   const __m128i latin1_packed = _mm_packus_epi16(composed,composed);
24516   // writing 8 bytes even though we only care about the first 6 bytes.
24517   // performance note: it would be faster to use _mm_storeu_si128, we should investigate.
24518   _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
24519   latin1_output += 6; // We wrote 6 bytes.
24520   return consumed;
24521 }
24522 /* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */
24523 
24524 } // unnamed namespace
24525 } // namespace haswell
24526 } // namespace simdutf
24527 
24528 /* begin file src/generic/buf_block_reader.h */
24529 namespace simdutf {
24530 namespace haswell {
24531 namespace {
24532 
// Walks through a buffer in block-sized increments, loading the last part with spaces.
//
// Typical use: loop while has_full_block(), reading full_block() and calling
// advance(); then call get_remainder() once to obtain the space-padded tail.
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  // Wraps the `_len` bytes starting at `_buf`; the buffer is not copied.
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  // Byte offset of the current block from the start of the buffer.
  simdutf_really_inline size_t block_index();
  // True while more than STEP_SIZE bytes remain after the current position
  // (a trailing block of exactly STEP_SIZE bytes is reported by get_remainder()).
  simdutf_really_inline bool has_full_block() const;
  // Pointer to the current block inside the original buffer (no copy is made).
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless no bytes remain
   * (e.g. len == 0), in which case this function returns 0 and leaves dst untouched.
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @param dst destination buffer; must have room for STEP_SIZE bytes.
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  // Moves the current position forward by STEP_SIZE bytes.
  simdutf_really_inline void advance();
private:
  // NOTE: the constructor's initializer list relies on this declaration order
  // (lenminusstep is computed from len).
  const uint8_t *buf;        // start of the wrapped buffer
  const size_t len;          // total number of bytes in buf
  const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
  size_t idx;                // current byte offset into buf
};
24558 
24559 // Routines to print masks and text for debugging bitmask operations
24560 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
24561   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
24562   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
24563     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
24564   }
24565   buf[sizeof(simd8x64<uint8_t>)] = '\0';
24566   return buf;
24567 }
24568 
24569 // Routines to print masks and text for debugging bitmask operations
24570 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
24571   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
24572   in.store(reinterpret_cast<uint8_t*>(buf));
24573   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
24574     if (buf[i] < ' ') { buf[i] = '_'; }
24575   }
24576   buf[sizeof(simd8x64<uint8_t>)] = '\0';
24577   return buf;
24578 }
24579 
24580 simdutf_unused static char * format_mask(uint64_t mask) {
24581   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
24582   for (size_t i=0; i<64; i++) {
24583     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
24584   }
24585   buf[64] = '\0';
24586   return buf;
24587 }
24588 
24589 template<size_t STEP_SIZE>
24590 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
24591 
24592 template<size_t STEP_SIZE>
24593 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
24594 
24595 template<size_t STEP_SIZE>
24596 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
24597   return idx < lenminusstep;
24598 }
24599 
24600 template<size_t STEP_SIZE>
24601 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
24602   return &buf[idx];
24603 }
24604 
24605 template<size_t STEP_SIZE>
24606 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
24607   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
24608   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
24609   std::memcpy(dst, buf + idx, len - idx);
24610   return len - idx;
24611 }
24612 
24613 template<size_t STEP_SIZE>
24614 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
24615   idx += STEP_SIZE;
24616 }
24617 
24618 } // unnamed namespace
24619 } // namespace haswell
24620 } // namespace simdutf
24621 /* end file src/generic/buf_block_reader.h */
24622 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
24623 namespace simdutf {
24624 namespace haswell {
24625 namespace {
24626 namespace utf8_validation {
24627 
24628 using namespace simd;
24629 
  // Classifies every (previous byte, current byte) pair with three 16-entry
  // table lookups: the high nibble of the previous byte, the low nibble of the
  // previous byte, and the high nibble of the current byte. Each table entry is
  // the set of error classes that nibble is compatible with; the bitwise AND of
  // the three lookups keeps only the classes all three agree on.
  //
  // `input` is the current block and `prev1` is the same block shifted so that
  // each lane holds the byte preceding the corresponding lane of `input`.
  //
  // Bit 7 (TWO_CONTS) is not necessarily an error by itself: the caller
  // (check_multibyte_lengths) XORs the result with its own 0x80 mask.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (code point above U+10FFFF)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with second byte 1000____, shared with Overlong 4-byte
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Deliberately the same bit as TOO_LARGE_1000: both are reported through bit 6.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: indexed by the high nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table 2: indexed by the low nibble of the previous byte. CARRY is set in
    // every entry because those classes do not depend on this nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: indexed by the high nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A class survives only if all three nibbles are compatible with it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
24720   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
24721       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
24722     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
24723     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
24724     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
24725     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
24726     return must23_80 ^ sc;
24727   }
24728 
24729   //
24730   // Return nonzero if there are incomplete multibyte characters at the end of the block:
24731   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
24732   //
24733   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
24734     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
24735     // ... 1111____ 111_____ 11______
24736     static const uint8_t max_array[32] = {
24737       255, 255, 255, 255, 255, 255, 255, 255,
24738       255, 255, 255, 255, 255, 255, 255, 255,
24739       255, 255, 255, 255, 255, 255, 255, 255,
24740       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
24741     };
24742     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
24743     return input.gt_bits(max_value);
24744   }
24745 
24746   struct utf8_checker {
24747     // If this is nonzero, there has been a UTF-8 error.
24748     simd8<uint8_t> error;
24749     // The last input we received
24750     simd8<uint8_t> prev_input_block;
24751     // Whether the last input we received was incomplete (used for ASCII fast path)
24752     simd8<uint8_t> prev_incomplete;
24753 
24754     //
24755     // Check whether the current bytes are valid UTF-8.
24756     //
24757     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
24758       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
24759       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
24760       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
24761       simd8<uint8_t> sc = check_special_cases(input, prev1);
24762       this->error |= check_multibyte_lengths(input, prev_input, sc);
24763     }
24764 
24765     // The only problem that can happen at EOF is that a multibyte character is too short
24766     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
24767     // too large in the first of two bytes.
24768     simdutf_really_inline void check_eof() {
24769       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
24770       // possibly finish them.
24771       this->error |= this->prev_incomplete;
24772     }
24773 
24774     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
24775       if(simdutf_likely(is_ascii(input))) {
24776         this->error |= this->prev_incomplete;
24777       } else {
24778         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
24779         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
24780             "We support either two or four chunks per 64-byte block.");
24781         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
24782           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
24783           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24784         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
24785           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
24786           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24787           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
24788           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
24789         }
24790         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
24791         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
24792 
24793       }
24794     }
24795 
24796     // do not forget to call check_eof!
24797     simdutf_really_inline bool errors() const {
24798       return this->error.any_bits_set_anywhere();
24799     }
24800 
24801   }; // struct utf8_checker
24802 } // namespace utf8_validation
24803 
24804 using utf8_validation::utf8_checker;
24805 
24806 } // unnamed namespace
24807 } // namespace haswell
24808 } // namespace simdutf
24809 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
24810 /* begin file src/generic/utf8_validation/utf8_validator.h */
24811 namespace simdutf {
24812 namespace haswell {
24813 namespace {
24814 namespace utf8_validation {
24815 
24816 /**
24817  * Validates that the string is actual UTF-8.
24818  */
24819 template<class checker>
24820 bool generic_validate_utf8(const uint8_t * input, size_t length) {
24821     checker c{};
24822     buf_block_reader<64> reader(input, length);
24823     while (reader.has_full_block()) {
24824       simd::simd8x64<uint8_t> in(reader.full_block());
24825       c.check_next_input(in);
24826       reader.advance();
24827     }
24828     uint8_t block[64]{};
24829     reader.get_remainder(block);
24830     simd::simd8x64<uint8_t> in(block);
24831     c.check_next_input(in);
24832     reader.advance();
24833     c.check_eof();
24834     return !c.errors();
24835 }
24836 
24837 bool generic_validate_utf8(const char * input, size_t length) {
24838   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24839 }
24840 
24841 /**
24842  * Validates that the string is actual UTF-8 and stops on errors.
24843  */
24844 template<class checker>
24845 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
24846     checker c{};
24847     buf_block_reader<64> reader(input, length);
24848     size_t count{0};
24849     while (reader.has_full_block()) {
24850       simd::simd8x64<uint8_t> in(reader.full_block());
24851       c.check_next_input(in);
24852       if(c.errors()) {
24853         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
24854         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
24855         res.count += count;
24856         return res;
24857       }
24858       reader.advance();
24859       count += 64;
24860     }
24861     uint8_t block[64]{};
24862     reader.get_remainder(block);
24863     simd::simd8x64<uint8_t> in(block);
24864     c.check_next_input(in);
24865     reader.advance();
24866     c.check_eof();
24867     if (c.errors()) {
24868       if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
24869       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
24870       res.count += count;
24871       return res;
24872     } else {
24873       return result(error_code::SUCCESS, length);
24874     }
24875 }
24876 
24877 result generic_validate_utf8_with_errors(const char * input, size_t length) {
24878   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24879 }
24880 
24881 template<class checker>
24882 bool generic_validate_ascii(const uint8_t * input, size_t length) {
24883     buf_block_reader<64> reader(input, length);
24884     uint8_t blocks[64]{};
24885     simd::simd8x64<uint8_t> running_or(blocks);
24886     while (reader.has_full_block()) {
24887       simd::simd8x64<uint8_t> in(reader.full_block());
24888       running_or |= in;
24889       reader.advance();
24890     }
24891     uint8_t block[64]{};
24892     reader.get_remainder(block);
24893     simd::simd8x64<uint8_t> in(block);
24894     running_or |= in;
24895     return running_or.is_ascii();
24896 }
24897 
24898 bool generic_validate_ascii(const char * input, size_t length) {
24899   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24900 }
24901 
24902 template<class checker>
24903 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
24904   buf_block_reader<64> reader(input, length);
24905   size_t count{0};
24906   while (reader.has_full_block()) {
24907     simd::simd8x64<uint8_t> in(reader.full_block());
24908     if (!in.is_ascii()) {
24909       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
24910       return result(res.error, count + res.count);
24911     }
24912     reader.advance();
24913 
24914     count += 64;
24915   }
24916   uint8_t block[64]{};
24917   reader.get_remainder(block);
24918   simd::simd8x64<uint8_t> in(block);
24919   if (!in.is_ascii()) {
24920     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
24921     return result(res.error, count + res.count);
24922   } else {
24923     return result(error_code::SUCCESS, length);
24924   }
24925 }
24926 
24927 result generic_validate_ascii_with_errors(const char * input, size_t length) {
24928   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24929 }
24930 
24931 } // namespace utf8_validation
24932 } // unnamed namespace
24933 } // namespace haswell
24934 } // namespace simdutf
24935 /* end file src/generic/utf8_validation/utf8_validator.h */
24936 // transcoding from UTF-8 to UTF-16
24937 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
24938 
24939 
24940 namespace simdutf {
24941 namespace haswell {
24942 namespace {
24943 namespace utf8_to_utf16 {
24944 
24945 using namespace simd;
24946 
24947 template <endianness endian>
24948 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
24949     char16_t* utf16_output) noexcept {
24950   // The implementation is not specific to haswell and should be moved to the generic directory.
24951   size_t pos = 0;
24952   char16_t* start{utf16_output};
24953   const size_t safety_margin = 16; // to avoid overruns!
24954   while(pos + 64 + safety_margin <= size) {
24955     // this loop could be unrolled further. For example, we could process the mask
24956     // far more than 64 bytes.
24957     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
24958     if(in.is_ascii()) {
24959       in.store_ascii_as_utf16<endian>(utf16_output);
24960       utf16_output += 64;
24961       pos += 64;
24962     } else {
24963       // Slow path. We hope that the compiler will recognize that this is a slow path.
24964       // Anything that is not a continuation mask is a 'leading byte', that is, the
24965       // start of a new code point.
24966       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
24967       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
24968       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
24969       // The *start* of code points is not so useful, rather, we want the *end* of code points.
24970       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
24971       // We process in blocks of up to 12 bytes except possibly
24972       // for fast paths which may process up to 16 bytes. For the
24973       // slow path to work, we should have at least 12 input bytes left.
24974       size_t max_starting_point = (pos + 64) - 12;
24975       // Next loop is going to run at least five times when using solely
24976       // the slow/regular path, and at least four times if there are fast paths.
24977       while(pos < max_starting_point) {
24978         // Performance note: our ability to compute 'consumed' and
24979         // then shift and recompute is critical. If there is a
24980         // latency of, say, 4 cycles on getting 'consumed', then
24981         // the inner loop might have a total latency of about 6 cycles.
24982         // Yet we process between 6 to 12 inputs bytes, thus we get
24983         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
24984         // for this section of the code. Hence, there is a limit
24985         // to how much we can further increase this latency before
24986         // it seriously harms performance.
24987         //
24988         // Thus we may allow convert_masked_utf8_to_utf16 to process
24989         // more bytes at a time under a fast-path mode where 16 bytes
24990         // are consumed at once (e.g., when encountering ASCII).
24991         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
24992                             utf8_end_of_code_point_mask, utf16_output);
24993         pos += consumed;
24994         utf8_end_of_code_point_mask >>= consumed;
24995       }
24996       // At this point there may remain between 0 and 12 bytes in the
24997       // 64-byte block. These bytes will be processed again. So we have an
24998       // 80% efficiency (in the worst case). In practice we expect an
24999       // 85% to 90% efficiency.
25000     }
25001   }
25002   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
25003   return utf16_output - start;
25004 }
25005 
25006 } // namespace utf8_to_utf16
25007 } // unnamed namespace
25008 } // namespace haswell
25009 } // namespace simdutf
25010 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
25011 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
25012 
25013 
25014 namespace simdutf {
25015 namespace haswell {
25016 namespace {
25017 namespace utf8_to_utf16 {
25018 using namespace simd;
25019 
25020 
25021   simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
25022 // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
25023 // Bit 1 = Too Long (ASCII followed by continuation)
25024 // Bit 2 = Overlong 3-byte
25025 // Bit 4 = Surrogate
25026 // Bit 5 = Overlong 2-byte
25027 // Bit 7 = Two Continuations
25028     constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
25029                                                 // 11______ 11______
25030     constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
25031     constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
25032     constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
25033     constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
25034     constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
25035     constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
25036                                                 // 11110100 101_____
25037                                                 // 11110101 1001____
25038                                                 // 11110101 101_____
25039                                                 // 1111011_ 1001____
25040                                                 // 1111011_ 101_____
25041                                                 // 11111___ 1001____
25042                                                 // 11111___ 101_____
25043     constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
25044                                                 // 11110101 1000____
25045                                                 // 1111011_ 1000____
25046                                                 // 11111___ 1000____
25047     constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
25048 
25049     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
25050       // 0_______ ________ <ASCII in byte 1>
25051       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
25052       TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
25053       // 10______ ________ <continuation in byte 1>
25054       TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
25055       // 1100____ ________ <two byte lead in byte 1>
25056       TOO_SHORT | OVERLONG_2,
25057       // 1101____ ________ <two byte lead in byte 1>
25058       TOO_SHORT,
25059       // 1110____ ________ <three byte lead in byte 1>
25060       TOO_SHORT | OVERLONG_3 | SURROGATE,
25061       // 1111____ ________ <four+ byte lead in byte 1>
25062       TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
25063     );
25064     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
25065     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
25066       // ____0000 ________
25067       CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
25068       // ____0001 ________
25069       CARRY | OVERLONG_2,
25070       // ____001_ ________
25071       CARRY,
25072       CARRY,
25073 
25074       // ____0100 ________
25075       CARRY | TOO_LARGE,
25076       // ____0101 ________
25077       CARRY | TOO_LARGE | TOO_LARGE_1000,
25078       // ____011_ ________
25079       CARRY | TOO_LARGE | TOO_LARGE_1000,
25080       CARRY | TOO_LARGE | TOO_LARGE_1000,
25081 
25082       // ____1___ ________
25083       CARRY | TOO_LARGE | TOO_LARGE_1000,
25084       CARRY | TOO_LARGE | TOO_LARGE_1000,
25085       CARRY | TOO_LARGE | TOO_LARGE_1000,
25086       CARRY | TOO_LARGE | TOO_LARGE_1000,
25087       CARRY | TOO_LARGE | TOO_LARGE_1000,
25088       // ____1101 ________
25089       CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
25090       CARRY | TOO_LARGE | TOO_LARGE_1000,
25091       CARRY | TOO_LARGE | TOO_LARGE_1000
25092     );
25093     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
25094       // ________ 0_______ <ASCII in byte 2>
25095       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
25096       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
25097 
25098       // ________ 1000____
25099       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
25100       // ________ 1001____
25101       TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
25102       // ________ 101_____
25103       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
25104       TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
25105 
25106       // ________ 11______
25107       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
25108     );
25109     return (byte_1_high & byte_1_low & byte_2_high);
25110   }
25111   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
25112       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
25113     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
25114     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
25115     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
25116     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
25117     return must23_80 ^ sc;
25118   }
25119 
25120 
25121   struct validating_transcoder {
25122     // If this is nonzero, there has been a UTF-8 error.
25123     simd8<uint8_t> error;
25124 
25125     validating_transcoder() : error(uint8_t(0)) {}
25126     //
25127     // Check whether the current bytes are valid UTF-8.
25128     //
25129     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
25130       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
25131       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
25132       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
25133       simd8<uint8_t> sc = check_special_cases(input, prev1);
25134       this->error |= check_multibyte_lengths(input, prev_input, sc);
25135     }
25136 
25137 
25138     template <endianness endian>
25139     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
25140       size_t pos = 0;
25141       char16_t* start{utf16_output};
25142       // In the worst case, we have the haswell kernel which can cause an overflow of
25143       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
25144       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
25145       // much more than 8 bytes. However, you cannot generally assume that you have valid
25146       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25147       // to give us a good margin.
25148       size_t leading_byte = 0;
25149       size_t margin = size;
25150       for(; margin > 0 && leading_byte < 8; margin--) {
25151         leading_byte += (int8_t(in[margin-1]) > -65);
25152       }
25153       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25154       const size_t safety_margin = size - margin + 1; // to avoid overruns!
25155       while(pos + 64 + safety_margin <= size) {
25156         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25157         if(input.is_ascii()) {
25158           input.store_ascii_as_utf16<endian>(utf16_output);
25159           utf16_output += 64;
25160           pos += 64;
25161         } else {
25162           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
25163           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
25164               "We support either two or four chunks per 64-byte block.");
25165           auto zero = simd8<uint8_t>{uint8_t(0)};
25166           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
25167             this->check_utf8_bytes(input.chunks[0], zero);
25168             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25169           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
25170             this->check_utf8_bytes(input.chunks[0], zero);
25171             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25172             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25173             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25174           }
25175           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25176           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
25177           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
25178           // We process in blocks of up to 12 bytes except possibly
25179           // for fast paths which may process up to 16 bytes. For the
25180           // slow path to work, we should have at least 12 input bytes left.
25181           size_t max_starting_point = (pos + 64) - 12;
25182           // Next loop is going to run at least five times.
25183           while(pos < max_starting_point) {
25184             // Performance note: our ability to compute 'consumed' and
25185             // then shift and recompute is critical. If there is a
25186             // latency of, say, 4 cycles on getting 'consumed', then
25187             // the inner loop might have a total latency of about 6 cycles.
25188             // Yet we process between 6 to 12 inputs bytes, thus we get
25189             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
25190             // for this section of the code. Hence, there is a limit
25191             // to how much we can further increase this latency before
25192             // it seriously harms performance.
25193             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
25194                             utf8_end_of_code_point_mask, utf16_output);
25195             pos += consumed;
25196             utf8_end_of_code_point_mask >>= consumed;
25197           }
25198           // At this point there may remain between 0 and 12 bytes in the
25199           // 64-byte block. These bytes will be processed again. So we have an
25200           // 80% efficiency (in the worst case). In practice we expect an
25201           // 85% to 90% efficiency.
25202         }
25203       }
25204       if(errors()) { return 0; }
25205       if(pos < size) {
25206         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
25207         if(howmany == 0) { return 0; }
25208         utf16_output += howmany;
25209       }
25210       return utf16_output - start;
25211     }
25212 
25213     template <endianness endian>
25214     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
25215       size_t pos = 0;
25216       char16_t* start{utf16_output};
25217       // In the worst case, we have the haswell kernel which can cause an overflow of
25218       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
25219       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
25220       // much more than 8 bytes. However, you cannot generally assume that you have valid
25221       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25222       // to give us a good margin.
25223       size_t leading_byte = 0;
25224       size_t margin = size;
25225       for(; margin > 0 && leading_byte < 8; margin--) {
25226         leading_byte += (int8_t(in[margin-1]) > -65);
25227       }
25228       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25229       const size_t safety_margin = size - margin + 1; // to avoid overruns!
25230       while(pos + 64 + safety_margin <= size) {
25231         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25232         if(input.is_ascii()) {
25233           input.store_ascii_as_utf16<endian>(utf16_output);
25234           utf16_output += 64;
25235           pos += 64;
25236         } else {
25237           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
25238           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
25239               "We support either two or four chunks per 64-byte block.");
25240           auto zero = simd8<uint8_t>{uint8_t(0)};
25241           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
25242             this->check_utf8_bytes(input.chunks[0], zero);
25243             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25244           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
25245             this->check_utf8_bytes(input.chunks[0], zero);
25246             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25247             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25248             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25249           }
25250           if (errors()) {
25251             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
25252             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
25253             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
25254             res.count += pos;
25255             return res;
25256           }
25257           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25258           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
25259           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
25260           // We process in blocks of up to 12 bytes except possibly
25261           // for fast paths which may process up to 16 bytes. For the
25262           // slow path to work, we should have at least 12 input bytes left.
25263           size_t max_starting_point = (pos + 64) - 12;
25264           // Next loop is going to run at least five times.
25265           while(pos < max_starting_point) {
25266             // Performance note: our ability to compute 'consumed' and
25267             // then shift and recompute is critical. If there is a
25268             // latency of, say, 4 cycles on getting 'consumed', then
25269             // the inner loop might have a total latency of about 6 cycles.
25270             // Yet we process between 6 to 12 inputs bytes, thus we get
25271             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
25272             // for this section of the code. Hence, there is a limit
25273             // to how much we can further increase this latency before
25274             // it seriously harms performance.
25275             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
25276                             utf8_end_of_code_point_mask, utf16_output);
25277             pos += consumed;
25278             utf8_end_of_code_point_mask >>= consumed;
25279           }
25280           // At this point there may remain between 0 and 12 bytes in the
25281           // 64-byte block. These bytes will be processed again. So we have an
25282           // 80% efficiency (in the worst case). In practice we expect an
25283           // 85% to 90% efficiency.
25284         }
25285       }
25286       if(errors()) {
25287         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
25288         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
25289         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
25290         res.count += pos;
25291         return res;
25292       }
25293       if(pos < size) {
25294         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
25295         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
25296         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
25297         if (res.error) {    // In case of error, we want the error position
25298           res.count += pos;
25299           return res;
25300         } else {    // In case of success, we want the number of word written
25301           utf16_output += res.count;
25302         }
25303       }
25304       return result(error_code::SUCCESS, utf16_output - start);
25305     }
25306 
25307     simdutf_really_inline bool errors() const {
25308       return this->error.any_bits_set_anywhere();
25309     }
25310 
25311   }; // struct utf8_checker
25312 } // utf8_to_utf16 namespace
25313 } // unnamed namespace
25314 } // namespace haswell
25315 } // namespace simdutf
25316 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
25317 // transcoding from UTF-8 to UTF-32
25318 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
25319 
25320 namespace simdutf {
25321 namespace haswell {
25322 namespace {
25323 namespace utf8_to_utf32 {
25324 
25325 using namespace simd;
25326 
25327 
25328 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
25329     char32_t* utf32_output) noexcept {
25330   size_t pos = 0;
25331   char32_t* start{utf32_output};
25332   const size_t safety_margin = 16; // to avoid overruns!
25333   while(pos + 64 + safety_margin <= size) {
25334     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
25335     if(in.is_ascii()) {
25336       in.store_ascii_as_utf32(utf32_output);
25337       utf32_output += 64;
25338       pos += 64;
25339     } else {
25340     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
25341     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
25342     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
25343     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
25344     size_t max_starting_point = (pos + 64) - 12;
25345     while(pos < max_starting_point) {
25346       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
25347                           utf8_end_of_code_point_mask, utf32_output);
25348       pos += consumed;
25349       utf8_end_of_code_point_mask >>= consumed;
25350       }
25351     }
25352   }
25353   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
25354   return utf32_output - start;
25355 }
25356 
25357 
25358 } // namespace utf8_to_utf32
25359 } // unnamed namespace
25360 } // namespace haswell
25361 } // namespace simdutf
25362 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
25363 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
25364 
25365 
25366 namespace simdutf {
25367 namespace haswell {
25368 namespace {
25369 namespace utf8_to_utf32 {
25370 using namespace simd;
25371 
25372 
  // Classifies every adjacent byte pair (prev1 = first byte, input = second
  // byte) against a set of UTF-8 error classes, one bit per class.
  // Three 16-entry tables are consulted: one indexed by the high nibble of
  // prev1, one by the low nibble of prev1, and one by the high nibble of
  // input. The three results are AND-ed, so a class bit survives only when
  // all three nibbles are compatible with that error.
  // Note that bit 7 (TWO_CONTS) is not an error on its own:
  // check_multibyte_lengths XORs it against its own continuation mask.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) and Overlong 4-byte; both share this bit
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: both apply to a 1111____ lead followed by 1000____.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: error classes still possible given the HIGH nibble of the first byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // The classes in CARRY do not depend on the low nibble of byte 1, so every
    // entry of the low-nibble table lets them through unchanged.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table 2: error classes still possible given the LOW nibble of the first byte.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: error classes still possible given the HIGH nibble of the second byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A class bit is reported only if all three tables allow it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
25463   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
25464       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
25465     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
25466     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
25467     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
25468     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
25469     return must23_80 ^ sc;
25470   }
25471 
25472 
25473   struct validating_transcoder {
25474     // If this is nonzero, there has been a UTF-8 error.
25475     simd8<uint8_t> error;
25476 
25477     validating_transcoder() : error(uint8_t(0)) {}
25478     //
25479     // Check whether the current bytes are valid UTF-8.
25480     //
25481     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
25482       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
25483       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
25484       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
25485       simd8<uint8_t> sc = check_special_cases(input, prev1);
25486       this->error |= check_multibyte_lengths(input, prev_input, sc);
25487     }
25488 
25489 
25490 
25491     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
25492       size_t pos = 0;
25493       char32_t* start{utf32_output};
25494       // In the worst case, we have the haswell kernel which can cause an overflow of
25495       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
25496       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
25497       // much more than 8 bytes. However, you cannot generally assume that you have valid
25498       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25499       // to give us a good margin.
25500       size_t leading_byte = 0;
25501       size_t margin = size;
25502       for(; margin > 0 && leading_byte < 4; margin--) {
25503         leading_byte += (int8_t(in[margin-1]) > -65);
25504       }
25505       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
25506       const size_t safety_margin = size - margin + 1; // to avoid overruns!
25507       while(pos + 64 + safety_margin <= size) {
25508         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25509         if(input.is_ascii()) {
25510           input.store_ascii_as_utf32(utf32_output);
25511           utf32_output += 64;
25512           pos += 64;
25513         } else {
25514           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
25515           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
25516               "We support either two or four chunks per 64-byte block.");
25517           auto zero = simd8<uint8_t>{uint8_t(0)};
25518           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
25519             this->check_utf8_bytes(input.chunks[0], zero);
25520             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25521           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
25522             this->check_utf8_bytes(input.chunks[0], zero);
25523             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25524             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25525             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25526           }
25527           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25528           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
25529           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
25530           // We process in blocks of up to 12 bytes except possibly
25531           // for fast paths which may process up to 16 bytes. For the
25532           // slow path to work, we should have at least 12 input bytes left.
25533           size_t max_starting_point = (pos + 64) - 12;
25534           // Next loop is going to run at least five times.
25535           while(pos < max_starting_point) {
25536             // Performance note: our ability to compute 'consumed' and
25537             // then shift and recompute is critical. If there is a
25538             // latency of, say, 4 cycles on getting 'consumed', then
25539             // the inner loop might have a total latency of about 6 cycles.
25540             // Yet we process between 6 to 12 inputs bytes, thus we get
25541             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
25542             // for this section of the code. Hence, there is a limit
25543             // to how much we can further increase this latency before
25544             // it seriously harms performance.
25545             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
25546                             utf8_end_of_code_point_mask, utf32_output);
25547             pos += consumed;
25548             utf8_end_of_code_point_mask >>= consumed;
25549           }
25550           // At this point there may remain between 0 and 12 bytes in the
25551           // 64-byte block. These bytes will be processed again. So we have an
25552           // 80% efficiency (in the worst case). In practice we expect an
25553           // 85% to 90% efficiency.
25554         }
25555       }
25556       if(errors()) { return 0; }
25557       if(pos < size) {
25558         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
25559         if(howmany == 0) { return 0; }
25560         utf32_output += howmany;
25561       }
25562       return utf32_output - start;
25563     }
25564 
25565     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
25566       size_t pos = 0;
25567       char32_t* start{utf32_output};
25568       // In the worst case, we have the haswell kernel which can cause an overflow of
25569       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
25570       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
25571       // much more than 8 bytes. However, you cannot generally assume that you have valid
25572       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25573       // to give us a good margin.
25574       size_t leading_byte = 0;
25575       size_t margin = size;
25576       for(; margin > 0 && leading_byte < 4; margin--) {
25577         leading_byte += (int8_t(in[margin-1]) > -65);
25578       }
25579       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
25580       const size_t safety_margin = size - margin + 1; // to avoid overruns!
25581       while(pos + 64 + safety_margin <= size) {
25582         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25583         if(input.is_ascii()) {
25584           input.store_ascii_as_utf32(utf32_output);
25585           utf32_output += 64;
25586           pos += 64;
25587         } else {
25588           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
25589           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
25590               "We support either two or four chunks per 64-byte block.");
25591           auto zero = simd8<uint8_t>{uint8_t(0)};
25592           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
25593             this->check_utf8_bytes(input.chunks[0], zero);
25594             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25595           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
25596             this->check_utf8_bytes(input.chunks[0], zero);
25597             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25598             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25599             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25600           }
25601           if (errors()) {
25602             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
25603             res.count += pos;
25604             return res;
25605           }
25606           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25607           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
25608           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
25609           // We process in blocks of up to 12 bytes except possibly
25610           // for fast paths which may process up to 16 bytes. For the
25611           // slow path to work, we should have at least 12 input bytes left.
25612           size_t max_starting_point = (pos + 64) - 12;
25613           // Next loop is going to run at least five times.
25614           while(pos < max_starting_point) {
25615             // Performance note: our ability to compute 'consumed' and
25616             // then shift and recompute is critical. If there is a
25617             // latency of, say, 4 cycles on getting 'consumed', then
25618             // the inner loop might have a total latency of about 6 cycles.
25619             // Yet we process between 6 to 12 inputs bytes, thus we get
25620             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
25621             // for this section of the code. Hence, there is a limit
25622             // to how much we can further increase this latency before
25623             // it seriously harms performance.
25624             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
25625                             utf8_end_of_code_point_mask, utf32_output);
25626             pos += consumed;
25627             utf8_end_of_code_point_mask >>= consumed;
25628           }
25629           // At this point there may remain between 0 and 12 bytes in the
25630           // 64-byte block. These bytes will be processed again. So we have an
25631           // 80% efficiency (in the worst case). In practice we expect an
25632           // 85% to 90% efficiency.
25633         }
25634       }
25635       if(errors()) {
25636         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
25637         res.count += pos;
25638         return res;
25639       }
25640       if(pos < size) {
25641         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
25642         if (res.error) {    // In case of error, we want the error position
25643           res.count += pos;
25644           return res;
25645         } else {    // In case of success, we want the number of word written
25646           utf32_output += res.count;
25647         }
25648       }
25649       return result(error_code::SUCCESS, utf32_output - start);
25650     }
25651 
25652     simdutf_really_inline bool errors() const {
25653       return this->error.any_bits_set_anywhere();
25654     }
25655 
25656   }; // struct utf8_checker
25657 } // utf8_to_utf32 namespace
25658 } // unnamed namespace
25659 } // namespace haswell
25660 } // namespace simdutf
25661 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
25662 // other functions
25663 /* begin file src/generic/utf8.h */
25664 
25665 namespace simdutf {
25666 namespace haswell {
25667 namespace {
25668 namespace utf8 {
25669 
25670 using namespace simd;
25671 
25672 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
25673     size_t pos = 0;
25674     size_t count = 0;
25675     for(;pos + 64 <= size; pos += 64) {
25676       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25677       uint64_t utf8_continuation_mask = input.gt(-65);
25678       count += count_ones(utf8_continuation_mask);
25679     }
25680     return count + scalar::utf8::count_code_points(in + pos, size - pos);
25681 }
25682 
25683 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
25684     size_t pos = 0;
25685     size_t count = 0;
25686     // This algorithm could no doubt be improved!
25687     for(;pos + 64 <= size; pos += 64) {
25688       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25689       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25690       // We count one word for anything that is not a continuation (so
25691       // leading bytes).
25692       count += 64 - count_ones(utf8_continuation_mask);
25693       int64_t utf8_4byte = input.gteq_unsigned(240);
25694       count += count_ones(utf8_4byte);
25695     }
25696     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
25697 }
25698 } // utf8 namespace
25699 } // unnamed namespace
25700 } // namespace haswell
25701 } // namespace simdutf
25702 /* end file src/generic/utf8.h */
25703 /* begin file src/generic/utf16.h */
25704 namespace simdutf {
25705 namespace haswell {
25706 namespace {
25707 namespace utf16 {
25708 
25709 template <endianness big_endian>
25710 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
25711     size_t pos = 0;
25712     size_t count = 0;
25713     for(;pos < size/32*32; pos += 32) {
25714       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25715       if (!match_system(big_endian)) { input.swap_bytes(); }
25716       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
25717       count += count_ones(not_pair) / 2;
25718     }
25719     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
25720 }
25721 
25722 template <endianness big_endian>
25723 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
25724     size_t pos = 0;
25725     size_t count = 0;
25726     // This algorithm could no doubt be improved!
25727     for(;pos < size/32*32; pos += 32) {
25728       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25729       if (!match_system(big_endian)) { input.swap_bytes(); }
25730       uint64_t ascii_mask = input.lteq(0x7F);
25731       uint64_t twobyte_mask = input.lteq(0x7FF);
25732       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
25733 
25734       size_t ascii_count = count_ones(ascii_mask) / 2;
25735       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
25736       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
25737       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
25738       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
25739     }
25740     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
25741 }
25742 
25743 template <endianness big_endian>
25744 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
25745     return count_code_points<big_endian>(in, size);
25746 }
25747 
25748 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
25749   size_t pos = 0;
25750 
25751   while (pos < size/32*32) {
25752     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25753     input.swap_bytes();
25754     input.store(reinterpret_cast<uint16_t *>(output));
25755     pos += 32;
25756     output += 32;
25757   }
25758 
25759   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
25760 }
25761 
25762 } // utf16
25763 } // unnamed namespace
25764 } // namespace haswell
25765 } // namespace simdutf
25766 /* end file src/generic/utf16.h */
25767 
25768 
25769 // transcoding from UTF-8 to Latin 1
25770 /* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
25771 
25772 
25773 namespace simdutf {
25774 namespace haswell {
25775 namespace {
25776 namespace utf8_to_latin1 {
25777 using namespace simd;
25778 
25779 
  // Classifies every (previous byte, current byte) pair of the input with three
  // 16-entry table lookups -- high nibble of the previous byte, low nibble of
  // the previous byte, high nibble of the current byte -- and ANDs the results.
  //
  // input: the bytes being checked.
  // prev1: the byte that precedes each input byte (same lane layout as input).
  // Returns a vector in which a nonzero byte means that all three lookups
  // agreed on at least one error class for that position.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
    // For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte,
    // but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else.
    //
    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
    // Bit 1 = Too Long (ASCII followed by continuation)
    // Bit 2 = Overlong 3-byte
    // Bit 3 = Too Large
    // Bit 4 = Surrogate
    // Bit 5 = Overlong 2-byte
    // Bit 6 = Too Large (1000____ continuation) / Overlong 4-byte (shared bit)
    // Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
    // All eight bits set: an entry with this value keeps every error bit that
    // the other two lookups propose.
    constexpr const uint8_t FORBIDDEN  = 0xff;

    // Lookup keyed by the high nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Lookup keyed by the low nibble of the previous byte. For a 1100____ lead,
    // nibbles 2 and 3 (0xC2, 0xC3) carry no OVERLONG_2 bit, so a following
    // continuation byte leaves no common bit; nibbles 0 and 1 as well as every
    // FORBIDDEN entry keep OVERLONG_2 and are therefore flagged.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      FORBIDDEN,
      // ____0101 ________
      FORBIDDEN,
      // ____011_ ________
      FORBIDDEN,
      FORBIDDEN,

      // ____1___ ________
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      // ____1101 ________
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN
    );
    // Lookup keyed by the high nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // An error bit survives only if all three lookups set it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
25874 
  // Stateful UTF-8 to Latin 1 converter that validates while it transcodes.
  // Error bits reported by check_special_cases are OR-ed into `error`, so one
  // instance is meant to be used for a single conversion.
  struct validating_transcoder {
    // If this is nonzero, there has been a UTF-8 error.
    simd8<uint8_t> error;

    // Starts with no error recorded.
    validating_transcoder() : error(uint8_t(0)) {}
    //
    // Check whether the current bytes are valid UTF-8.
    //
    // input: the chunk to check; prev_input: the chunk that precedes it.
    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
      // prev1 presumably holds, for each byte of input, the byte one position
      // earlier (the first lane coming from the end of prev_input) -- TODO
      // confirm against simd8::prev.
      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
      this->error |= check_special_cases(input, prev1);
    }


    // Converts `size` bytes of UTF-8 at `in` to Latin 1 at `latin1_output`.
    // Returns the number of bytes written, or 0 when an error was detected
    // (either by the vectorized checks or by the scalar tail conversion).
    // An empty input also yields 0.
    simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) {
      size_t pos = 0;
      char* start{latin1_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 8; margin--) {
        leading_byte += (int8_t(in[margin-1]) > -65); // as a signed byte, anything above -65 (0xBF) is not a continuation byte
      }
      // If the input is long enough, in[margin] is the eighth leading byte counting from the end.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Pure ASCII block: the 64 bytes are stored to the output unchanged.
          input.store((int8_t*)latin1_output);
          latin1_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          // The first chunk of every block is checked against an all-zero
          // predecessor; later chunks are checked against the chunk before them.
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          // A code point ends on the byte just before the next leading byte,
          // hence the shift by one.
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_latin1(in + pos,
                            utf8_end_of_code_point_mask, latin1_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      // Errors are only inspected once, after the vectorized loop.
      if(errors()) { return 0; }
      if(pos < size) {
        // The remaining bytes go through the scalar converter; a return of 0
        // from it is treated as failure.
        size_t howmany  = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
        if(howmany == 0) { return 0; }
        latin1_output += howmany;
      }
      return latin1_output - start;
    }

    // Same conversion as convert(), but returns a `result`. Errors are checked
    // after every non-ASCII block, and on error the scalar
    // rewind_and_convert_with_errors routine is run from the current position
    // and `pos` is added to the count it reports. On success the result
    // carries error_code::SUCCESS and the number of bytes written.
    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) {
      size_t pos = 0;
      char* start{latin1_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 8; margin--) {
        leading_byte += (int8_t(in[margin-1]) > -65); // counts bytes that are not continuation bytes
      }
      // If the input is long enough, in[margin] is the eighth leading byte counting from the end.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Pure ASCII block: the 64 bytes are stored to the output unchanged.
          input.store((int8_t*)latin1_output);
          latin1_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          if (errors()) {
            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
            result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
            res.count += pos;
            return res;
          }
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_latin1(in + pos,
                            utf8_end_of_code_point_mask, latin1_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      if(errors()) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
        res.count += pos;
        return res;
      }
      if(pos < size) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
        if (res.error) {    // In case of error, we want the error position
          res.count += pos;
          return res;
        } else {    // In case of success, we want the number of bytes written
          latin1_output += res.count;
        }
      }
      return result(error_code::SUCCESS, latin1_output - start);
    }

    // True once any error bit has been recorded in `error`.
    simdutf_really_inline bool errors() const {
      return this->error.any_bits_set_anywhere();
    }

  }; // struct validating_transcoder
26063 } // utf8_to_latin1 namespace
26064 } // unnamed namespace
26065 } // namespace haswell
26066 } // namespace simdutf
26067 /* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
26068 /* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
26069 
26070 
26071 namespace simdutf {
26072 namespace haswell {
26073 namespace {
26074 namespace utf8_to_latin1 {
26075 using namespace simd;
26076 
26077 
26078     simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) {
26079       size_t pos = 0;
26080       char* start{latin1_output};
26081       // In the worst case, we have the haswell kernel which can cause an overflow of
26082       // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
26083       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
26084       // much more than 8 bytes. However, you cannot generally assume that you have valid
26085       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
26086       // to give us a good margin.
26087       size_t leading_byte = 0;
26088       size_t margin = size;
26089       for(; margin > 0 && leading_byte < 8; margin--) {
26090         leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ...
26091       }
26092       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
26093       const size_t safety_margin = size - margin + 1; // to avoid overruns!
26094       while(pos + 64 + safety_margin <= size) {
26095         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
26096         if(input.is_ascii()) {
26097           input.store((int8_t*)latin1_output);
26098           latin1_output += 64;
26099           pos += 64;
26100         } else {
26101           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
26102           uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
26103           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
26104           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
26105           // We process in blocks of up to 12 bytes except possibly
26106           // for fast paths which may process up to 16 bytes. For the
26107           // slow path to work, we should have at least 12 input bytes left.
26108           size_t max_starting_point = (pos + 64) - 12;
26109           // Next loop is going to run at least five times.
26110           while(pos < max_starting_point) {
26111             // Performance note: our ability to compute 'consumed' and
26112             // then shift and recompute is critical. If there is a
26113             // latency of, say, 4 cycles on getting 'consumed', then
26114             // the inner loop might have a total latency of about 6 cycles.
26115             // Yet we process between 6 to 12 inputs bytes, thus we get
26116             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
26117             // for this section of the code. Hence, there is a limit
26118             // to how much we can further increase this latency before
26119             // it seriously harms performance.
26120             size_t consumed = convert_masked_utf8_to_latin1(in + pos,
26121                             utf8_end_of_code_point_mask, latin1_output);
26122             pos += consumed;
26123             utf8_end_of_code_point_mask >>= consumed;
26124           }
26125           // At this point there may remain between 0 and 12 bytes in the
26126           // 64-byte block. These bytes will be processed again. So we have an
26127           // 80% efficiency (in the worst case). In practice we expect an
26128           // 85% to 90% efficiency.
26129         }
26130       }
26131       if(pos < size) {
26132         size_t howmany  = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output);
26133         latin1_output += howmany;
26134       }
26135       return latin1_output - start;
26136     }
26137 
26138   }
26139 }   // utf8_to_latin1 namespace
26140 }   // unnamed namespace
26141 }   // namespace haswell
26142  // namespace simdutf
26143 /* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
26144 
26145 namespace simdutf {
26146 namespace haswell {
26147 
26148 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
26149   // If there is a BOM, then we trust it.
26150   auto bom_encoding = simdutf::BOM::check_bom(input, length);
26151   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
26152   if (length % 2 == 0) {
26153     return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
26154   } else {
26155     if (implementation::validate_utf8(input, length)) {
26156       return simdutf::encoding_type::UTF8;
26157     } else {
26158       return simdutf::encoding_type::unspecified;
26159     }
26160   }
26161 }
26162 
26163 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
26164   return haswell::utf8_validation::generic_validate_utf8(buf,len);
26165 }
26166 
26167 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
26168   return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len);
26169 }
26170 
26171 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
26172   return haswell::utf8_validation::generic_validate_ascii(buf,len);
26173 }
26174 
26175 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
26176   return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len);
26177 }
26178 
26179 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
26180   const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
26181   if (tail) {
26182     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
26183   } else {
26184     return false;
26185   }
26186 }
26187 
26188 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
26189   const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
26190   if (tail) {
26191     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
26192   } else {
26193     return false;
26194   }
26195 }
26196 
26197 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
26198   result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
26199   if (res.count != len) {
26200     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
26201     return result(scalar_res.error, res.count + scalar_res.count);
26202   } else {
26203     return res;
26204   }
26205 }
26206 
26207 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
26208   result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
26209   if (res.count != len) {
26210     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
26211     return result(scalar_res.error, res.count + scalar_res.count);
26212   } else {
26213     return res;
26214   }
26215 }
26216 
26217 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
26218   const char32_t* tail = avx2_validate_utf32le(buf, len);
26219   if (tail) {
26220     return scalar::utf32::validate(tail, len - (tail - buf));
26221   } else {
26222     return false;
26223   }
26224 }
26225 
26226 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
26227   result res = avx2_validate_utf32le_with_errors(buf, len);
26228   if (res.count != len) {
26229     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
26230     return result(scalar_res.error, res.count + scalar_res.count);
26231   } else {
26232     return res;
26233   }
26234 }
26235 
26236 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
26237   std::pair<const char*, char*> ret = avx2_convert_latin1_to_utf8(buf, len, utf8_output);
26238   size_t converted_chars = ret.second - utf8_output;
26239 
26240   if (ret.first != buf + len) {
26241     const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
26242       ret.first, len - (ret.first - buf), ret.second);
26243     converted_chars += scalar_converted_chars;
26244   }
26245 
26246   return converted_chars;
26247 }
26248 
26249 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26250     std::pair<const char*, char16_t*> ret = avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
26251     if (ret.first == nullptr) { return 0; }
26252     size_t converted_chars = ret.second - utf16_output;
26253     if (ret.first != buf + len) {
26254         const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::LITTLE>(
26255                                               ret.first, len - (ret.first - buf), ret.second);
26256         if (scalar_converted_chars == 0) { return 0; }
26257         converted_chars += scalar_converted_chars;
26258     }
26259     return converted_chars;
26260 }
26261 
26262 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26263     std::pair<const char*, char16_t*> ret = avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
26264     if (ret.first == nullptr) { return 0; }
26265     size_t converted_chars = ret.second - utf16_output;
26266     if (ret.first != buf + len) {
26267         const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::BIG>(
26268                                               ret.first, len - (ret.first - buf), ret.second);
26269         if (scalar_converted_chars == 0) { return 0; }
26270         converted_chars += scalar_converted_chars;
26271     }
26272     return converted_chars;
26273 }
26274 
26275 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
26276     std::pair<const char*, char32_t*> ret = avx2_convert_latin1_to_utf32(buf, len, utf32_output);
26277     if (ret.first == nullptr) { return 0; }
26278     size_t converted_chars = ret.second - utf32_output;
26279     if (ret.first != buf + len) {
26280         const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
26281                                               ret.first, len - (ret.first - buf), ret.second);
26282         if (scalar_converted_chars == 0) { return 0; }
26283         converted_chars += scalar_converted_chars;
26284     }
26285     return converted_chars;
26286 }
26287 
26288 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
26289   utf8_to_latin1::validating_transcoder converter;
26290   return converter.convert(buf, len, latin1_output);
26291 }
26292 
26293 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
26294   utf8_to_latin1::validating_transcoder converter;
26295   return converter.convert_with_errors(buf, len, latin1_output);
26296 }
26297 
26298 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* input, size_t size,
26299     char* latin1_output) const noexcept {
26300    return utf8_to_latin1::convert_valid(input, size,  latin1_output);
26301 }
26302 
26303 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26304   utf8_to_utf16::validating_transcoder converter;
26305   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
26306 }
26307 
26308 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26309   utf8_to_utf16::validating_transcoder converter;
26310   return converter.convert<endianness::BIG>(buf, len, utf16_output);
26311 }
26312 
26313 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26314   utf8_to_utf16::validating_transcoder converter;
26315   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
26316 }
26317 
26318 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
26319   utf8_to_utf16::validating_transcoder converter;
26320   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
26321 }
26322 
26323 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
26324     char16_t* utf16_output) const noexcept {
26325    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
26326 }
26327 
26328 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
26329     char16_t* utf16_output) const noexcept {
26330    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
26331 }
26332 
26333 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
26334   utf8_to_utf32::validating_transcoder converter;
26335   return converter.convert(buf, len, utf32_output);
26336 }
26337 
26338 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
26339   utf8_to_utf32::validating_transcoder converter;
26340   return converter.convert_with_errors(buf, len, utf32_output);
26341 }
26342 
26343 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
26344     char32_t* utf32_output) const noexcept {
26345   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
26346 }
26347 
26348 
26349 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26350   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
26351   if (ret.first == nullptr) { return 0; }
26352   size_t saved_bytes = ret.second - latin1_output;
26353   if (ret.first != buf + len) {
26354     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::LITTLE>(
26355                                         ret.first, len - (ret.first - buf), ret.second);
26356     if (scalar_saved_bytes == 0) { return 0; }
26357     saved_bytes += scalar_saved_bytes;
26358   }
26359   return saved_bytes;
26360 }
26361 
26362 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26363   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
26364   if (ret.first == nullptr) { return 0; }
26365   size_t saved_bytes = ret.second - latin1_output;
26366   if (ret.first != buf + len) {
26367     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::BIG>(
26368                                         ret.first, len - (ret.first - buf), ret.second);
26369     if (scalar_saved_bytes == 0) { return 0; }
26370     saved_bytes += scalar_saved_bytes;
26371   }
26372   return saved_bytes;
26373 }
26374 
26375 simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26376   std::pair<result, char*> ret = avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(buf, len, latin1_output);
26377   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26378   if (ret.first.count != len) { // All good so far, but not finished
26379     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
26380                                         buf + ret.first.count, len - ret.first.count, ret.second);
26381     if (scalar_res.error) {
26382       scalar_res.count += ret.first.count;
26383       return scalar_res;
26384     } else {
26385       ret.second += scalar_res.count;
26386     }
26387   }
26388   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
26389   return ret.first;
26390 }
26391 
26392 simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26393   std::pair<result, char*> ret = avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len, latin1_output);
26394   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26395   if (ret.first.count != len) { // All good so far, but not finished
26396     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
26397                                         buf + ret.first.count, len - ret.first.count, ret.second);
26398     if (scalar_res.error) {
26399       scalar_res.count += ret.first.count;
26400       return scalar_res;
26401     } else {
26402       ret.second += scalar_res.count;
26403     }
26404   }
26405   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
26406   return ret.first;
26407 }
26408 
26409 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26410   // optimization opportunity: implement a custom function
26411   return convert_utf16be_to_latin1(buf, len, latin1_output);
26412 }
26413 
26414 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
26415   // optimization opportunity: implement a custom function
26416   return convert_utf16le_to_latin1(buf, len, latin1_output);
26417 }
26418 
26419 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26420   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
26421   if (ret.first == nullptr) { return 0; }
26422   size_t saved_bytes = ret.second - utf8_output;
26423   if (ret.first != buf + len) {
26424     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
26425                                         ret.first, len - (ret.first - buf), ret.second);
26426     if (scalar_saved_bytes == 0) { return 0; }
26427     saved_bytes += scalar_saved_bytes;
26428   }
26429   return saved_bytes;
26430 }
26431 
26432 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26433   std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
26434   if (ret.first == nullptr) { return 0; }
26435   size_t saved_bytes = ret.second - utf8_output;
26436   if (ret.first != buf + len) {
26437     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
26438                                         ret.first, len - (ret.first - buf), ret.second);
26439     if (scalar_saved_bytes == 0) { return 0; }
26440     saved_bytes += scalar_saved_bytes;
26441   }
26442   return saved_bytes;
26443 }
26444 
26445 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26446   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26447   std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
26448   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26449   if (ret.first.count != len) { // All good so far, but not finished
26450     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
26451                                         buf + ret.first.count, len - ret.first.count, ret.second);
26452     if (scalar_res.error) {
26453       scalar_res.count += ret.first.count;
26454       return scalar_res;
26455     } else {
26456       ret.second += scalar_res.count;
26457     }
26458   }
26459   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
26460   return ret.first;
26461 }
26462 
26463 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26464   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26465   std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
26466   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26467   if (ret.first.count != len) { // All good so far, but not finished
26468     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
26469                                         buf + ret.first.count, len - ret.first.count, ret.second);
26470     if (scalar_res.error) {
26471       scalar_res.count += ret.first.count;
26472       return scalar_res;
26473     } else {
26474       ret.second += scalar_res.count;
26475     }
26476   }
26477   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
26478   return ret.first;
26479 }
26480 
26481 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26482   return convert_utf16le_to_utf8(buf, len, utf8_output);
26483 }
26484 
26485 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
26486   return convert_utf16be_to_utf8(buf, len, utf8_output);
26487 }
26488 
26489 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
26490   std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
26491   if (ret.first == nullptr) { return 0; }
26492   size_t saved_bytes = ret.second - utf8_output;
26493   if (ret.first != buf + len) {
26494     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
26495                                         ret.first, len - (ret.first - buf), ret.second);
26496     if (scalar_saved_bytes == 0) { return 0; }
26497     saved_bytes += scalar_saved_bytes;
26498   }
26499   return saved_bytes;
26500 }
26501 
26502 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
26503   std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_latin1(buf, len, latin1_output);
26504   if (ret.first == nullptr) { return 0; }
26505   size_t saved_bytes = ret.second - latin1_output;
26506   if (ret.first != buf + len) {
26507     const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
26508                                         ret.first, len - (ret.first - buf), ret.second);
26509     if (scalar_saved_bytes == 0) { return 0; }
26510     saved_bytes += scalar_saved_bytes;
26511   }
26512   return saved_bytes;
26513 }
26514 
26515 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
26516   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26517   std::pair<result, char*> ret = avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
26518   if (ret.first.count != len) {
26519     result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
26520                                         buf + ret.first.count, len - ret.first.count, ret.second);
26521     if (scalar_res.error) {
26522       scalar_res.count += ret.first.count;
26523       return scalar_res;
26524     } else {
26525       ret.second += scalar_res.count;
26526     }
26527   }
26528   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
26529   return ret.first;
26530 }
26531 
26532 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
26533   return convert_utf32_to_latin1(buf,len,latin1_output);
26534 }
26535 
26536 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
26537   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26538   std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
26539   if (ret.first.count != len) {
26540     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
26541                                         buf + ret.first.count, len - ret.first.count, ret.second);
26542     if (scalar_res.error) {
26543       scalar_res.count += ret.first.count;
26544       return scalar_res;
26545     } else {
26546       ret.second += scalar_res.count;
26547     }
26548   }
26549   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
26550   return ret.first;
26551 }
26552 
26553 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26554   std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
26555   if (ret.first == nullptr) { return 0; }
26556   size_t saved_bytes = ret.second - utf32_output;
26557   if (ret.first != buf + len) {
26558     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
26559                                         ret.first, len - (ret.first - buf), ret.second);
26560     if (scalar_saved_bytes == 0) { return 0; }
26561     saved_bytes += scalar_saved_bytes;
26562   }
26563   return saved_bytes;
26564 }
26565 
26566 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26567   std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
26568   if (ret.first == nullptr) { return 0; }
26569   size_t saved_bytes = ret.second - utf32_output;
26570   if (ret.first != buf + len) {
26571     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
26572                                         ret.first, len - (ret.first - buf), ret.second);
26573     if (scalar_saved_bytes == 0) { return 0; }
26574     saved_bytes += scalar_saved_bytes;
26575   }
26576   return saved_bytes;
26577 }
26578 
26579 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26580   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26581   std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
26582   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26583   if (ret.first.count != len) { // All good so far, but not finished
26584     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
26585                                         buf + ret.first.count, len - ret.first.count, ret.second);
26586     if (scalar_res.error) {
26587       scalar_res.count += ret.first.count;
26588       return scalar_res;
26589     } else {
26590       ret.second += scalar_res.count;
26591     }
26592   }
26593   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
26594   return ret.first;
26595 }
26596 
26597 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26598   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26599   std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
26600   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
26601   if (ret.first.count != len) { // All good so far, but not finished
26602     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
26603                                         buf + ret.first.count, len - ret.first.count, ret.second);
26604     if (scalar_res.error) {
26605       scalar_res.count += ret.first.count;
26606       return scalar_res;
26607     } else {
26608       ret.second += scalar_res.count;
26609     }
26610   }
26611   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
26612   return ret.first;
26613 }
26614 
26615 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
26616   return convert_utf32_to_utf8(buf, len, utf8_output);
26617 }
26618 
26619 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26620   std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
26621   if (ret.first == nullptr) { return 0; }
26622   size_t saved_bytes = ret.second - utf16_output;
26623   if (ret.first != buf + len) {
26624     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
26625                                         ret.first, len - (ret.first - buf), ret.second);
26626     if (scalar_saved_bytes == 0) { return 0; }
26627     saved_bytes += scalar_saved_bytes;
26628   }
26629   return saved_bytes;
26630 }
26631 
26632 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26633   std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
26634   if (ret.first == nullptr) { return 0; }
26635   size_t saved_bytes = ret.second - utf16_output;
26636   if (ret.first != buf + len) {
26637     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
26638                                         ret.first, len - (ret.first - buf), ret.second);
26639     if (scalar_saved_bytes == 0) { return 0; }
26640     saved_bytes += scalar_saved_bytes;
26641   }
26642   return saved_bytes;
26643 }
26644 
26645 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26646   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26647   std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
26648   if (ret.first.count != len) {
26649     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
26650                                         buf + ret.first.count, len - ret.first.count, ret.second);
26651     if (scalar_res.error) {
26652       scalar_res.count += ret.first.count;
26653       return scalar_res;
26654     } else {
26655       ret.second += scalar_res.count;
26656     }
26657   }
26658   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
26659   return ret.first;
26660 }
26661 
26662 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26663   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
26664   std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
26665   if (ret.first.count != len) {
26666     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
26667                                         buf + ret.first.count, len - ret.first.count, ret.second);
26668     if (scalar_res.error) {
26669       scalar_res.count += ret.first.count;
26670       return scalar_res;
26671     } else {
26672       ret.second += scalar_res.count;
26673     }
26674   }
26675   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
26676   return ret.first;
26677 }
26678 
26679 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26680   return convert_utf32_to_utf16le(buf, len, utf16_output);
26681 }
26682 
26683 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
26684   return convert_utf32_to_utf16be(buf, len, utf16_output);
26685 }
26686 
26687 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26688   return convert_utf16le_to_utf32(buf, len, utf32_output);
26689 }
26690 
26691 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
26692   return convert_utf16be_to_utf32(buf, len, utf32_output);
26693 }
26694 
26695 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
26696   utf16::change_endianness_utf16(input, length, output);
26697 }
26698 
26699 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
26700   return utf16::count_code_points<endianness::LITTLE>(input, length);
26701 }
26702 
26703 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
26704   return utf16::count_code_points<endianness::BIG>(input, length);
26705 }
26706 
26707 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
26708   return utf8::count_code_points(input, length);
26709 }
26710 
26711 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
26712   return count_utf8(buf,len);
26713 }
26714 
26715 simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
26716   return scalar::utf16::latin1_length_from_utf16(length);
26717 }
26718 
26719 simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept {
26720   return scalar::utf32::latin1_length_from_utf32(length);
26721 }
26722 
26723 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
26724   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
26725 }
26726 
26727 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
26728   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
26729 }
26730 
26731 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
26732   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
26733 }
26734 
26735 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
26736   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
26737 }
26738 
26739 
26740 simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
26741   return scalar::latin1::utf16_length_from_latin1(length);
26742 }
26743 
26744 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
26745   return utf8::utf16_length_from_utf8(input, length);
26746 }
26747 
26748 
26749 simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
26750   return scalar::latin1::utf32_length_from_latin1(length);
26751 }
26752 
26753 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *input, size_t len) const noexcept {
26754   const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
26755   size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
26756   size_t i = 0;
26757   __m256i four_64bits = _mm256_setzero_si256();
26758   while (i + sizeof(__m256i) <= len) {
26759     __m256i runner = _mm256_setzero_si256();
26760     // We can do up to 255 loops without overflow.
26761     size_t iterations = (len - i) / sizeof(__m256i);
26762     if (iterations > 255) {
26763       iterations = 255;
26764     }
26765     size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
26766     for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) {
26767       __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
26768       __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
26769       __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i)));
26770       __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i)));
26771       __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
26772               _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
26773       __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
26774               _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
26775       __m256i input1234 = _mm256_add_epi8(input12, input23);
26776       runner = _mm256_sub_epi8(
26777           runner, input1234);
26778     }
26779     for (; i <= max_i; i += sizeof(__m256i)) {
26780       __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i));
26781       runner = _mm256_sub_epi8(
26782           runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
26783     }
26784     four_64bits = _mm256_add_epi64(
26785         four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
26786   }
26787   answer += _mm256_extract_epi64(four_64bits, 0) +
26788             _mm256_extract_epi64(four_64bits, 1) +
26789             _mm256_extract_epi64(four_64bits, 2) +
26790             _mm256_extract_epi64(four_64bits, 3);
26791   return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(data + i), len - i);
26792 }
26793 
26794 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
26795   const __m256i v_00000000 = _mm256_setzero_si256();
26796   const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
26797   const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
26798   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
26799   size_t pos = 0;
26800   size_t count = 0;
26801   for(;pos + 8 <= length; pos += 8) {
26802     __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26803     const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
26804     const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
26805     const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
26806     const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
26807     const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
26808     const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
26809     const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
26810     const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
26811 
26812     size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
26813     size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
26814     size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
26815     count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
26816   }
26817   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
26818 }
26819 
26820 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
26821   const __m256i v_00000000 = _mm256_setzero_si256();
26822   const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
26823   size_t pos = 0;
26824   size_t count = 0;
26825   for(;pos + 8 <= length; pos += 8) {
26826     __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26827     const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
26828     const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
26829     size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4;
26830     count += 8 + surrogate_count;
26831   }
26832   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
26833 }
26834 
26835 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
26836   return utf8::count_code_points(input, length);
26837 }
26838 
26839 } // namespace haswell
26840 } // namespace simdutf
26841 
26842 /* begin file src/simdutf/haswell/end.h */
26843 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
26844 // nothing needed.
26845 #else
26846 SIMDUTF_UNTARGET_REGION
26847 #endif
26848 
26849 
26850 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
26851 SIMDUTF_POP_DISABLE_WARNINGS
26852 #endif // end of workaround
26853 /* end file src/simdutf/haswell/end.h */
26854 /* end file src/haswell/implementation.cpp */
26855 #endif
26856 #if SIMDUTF_IMPLEMENTATION_PPC64
26857 /* begin file src/ppc64/implementation.cpp */
26858 
26859 
26860 
26861 
26862 
26863 /* begin file src/simdutf/ppc64/begin.h */
26864 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
26865 // #define SIMDUTF_IMPLEMENTATION ppc64
26866 /* end file src/simdutf/ppc64/begin.h */
26867 namespace simdutf {
26868 namespace ppc64 {
26869 namespace {
26870 #ifndef SIMDUTF_PPC64_H
26871 #error "ppc64.h must be included"
26872 #endif
26873 using namespace simd;
26874 
26875 
26876 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
26877   // careful: 0x80 is not ascii.
26878   return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
26879 }
26880 
26881 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
26882   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
26883   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
26884   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
26885   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
26886   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
26887 }
26888 
26889 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
26890   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
26891   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
26892   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
26893   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
26894 }
26895 
26896 } // unnamed namespace
26897 } // namespace ppc64
26898 } // namespace simdutf
26899 
26900 /* begin file src/generic/buf_block_reader.h */
26901 namespace simdutf {
26902 namespace ppc64 {
26903 namespace {
26904 
// Walks through a buffer in block-sized increments, loading the last part with spaces.
//
// STEP_SIZE is the block size in bytes. The reader only stores the pointer and
// length it is given; it never copies or owns the underlying buffer.
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  // Starts reading at offset 0 of the len-byte buffer _buf.
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  // Current byte offset into the buffer.
  simdutf_really_inline size_t block_index();
  // True while strictly more than STEP_SIZE bytes remain past the current offset,
  // so the final STEP_SIZE (or fewer) bytes are always left for get_remainder().
  simdutf_really_inline bool has_full_block() const;
  // Pointer to the current block inside the original buffer (no copy is made).
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless no bytes remain
   * (e.g. len == 0), in which case this function leaves dst untouched and returns 0.
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @param dst destination buffer; STEP_SIZE bytes are written when any input remains.
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  // Moves the current offset forward by STEP_SIZE bytes.
  simdutf_really_inline void advance();
private:
  const uint8_t *buf;        // start of the input (not owned)
  const size_t len;          // total input length in bytes
  const size_t lenminusstep; // len - STEP_SIZE, clamped to 0 when len < STEP_SIZE
  size_t idx;                // current byte offset
};
26930 
26931 // Routines to print masks and text for debugging bitmask operations
26932 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
26933   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
26934   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
26935     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
26936   }
26937   buf[sizeof(simd8x64<uint8_t>)] = '\0';
26938   return buf;
26939 }
26940 
26941 // Routines to print masks and text for debugging bitmask operations
26942 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
26943   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
26944   in.store(reinterpret_cast<uint8_t*>(buf));
26945   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
26946     if (buf[i] < ' ') { buf[i] = '_'; }
26947   }
26948   buf[sizeof(simd8x64<uint8_t>)] = '\0';
26949   return buf;
26950 }
26951 
// Debugging helper: renders a 64-bit mask as a 64-character string.
//
// Character i is 'X' when bit i of `mask` is set (bit 0 first) and ' '
// otherwise.
//
// Returns a pointer to a NUL-terminated function-local static buffer that is
// overwritten by the next call; the caller must not free it.
simdutf_unused static char * format_mask(uint64_t mask) {
  // Static storage instead of an unchecked, never-freed heap allocation.
  static char buf[64 + 1];
  for (size_t i=0; i<64; i++) {
    // Shift a 64-bit one: size_t may be only 32 bits wide, and shifting a
    // 32-bit value by 32 or more is undefined behavior.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
26960 
26961 template<size_t STEP_SIZE>
26962 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
26963 
26964 template<size_t STEP_SIZE>
26965 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
26966 
26967 template<size_t STEP_SIZE>
26968 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
26969   return idx < lenminusstep;
26970 }
26971 
26972 template<size_t STEP_SIZE>
26973 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
26974   return &buf[idx];
26975 }
26976 
26977 template<size_t STEP_SIZE>
26978 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
26979   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
26980   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
26981   std::memcpy(dst, buf + idx, len - idx);
26982   return len - idx;
26983 }
26984 
26985 template<size_t STEP_SIZE>
26986 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
26987   idx += STEP_SIZE;
26988 }
26989 
26990 } // unnamed namespace
26991 } // namespace ppc64
26992 } // namespace simdutf
26993 /* end file src/generic/buf_block_reader.h */
26994 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
26995 namespace simdutf {
26996 namespace ppc64 {
26997 namespace {
26998 namespace utf8_validation {
26999 
27000 using namespace simd;
27001 
  // Classifies every (previous byte, current byte) pair against the two-byte
  // UTF-8 error patterns listed below.
  //
  // input: the current bytes.
  // prev1: for each position, the byte that precedes it (callers build it
  //        with input.prev<1>(...)).
  //
  // Three 16-entry tables are consulted: one indexed by the high nibble of
  // prev1, one by the low nibble of prev1, one by the high nibble of input.
  // The result is the bitwise AND of the three lookups, so an error bit
  // survives only when all three nibbles are compatible with that error.
  // A zero byte in the result means none of these patterns matched.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (above the encodable range, second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with second byte 1000____, shared with Overlong 4-byte
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // OVERLONG_4 deliberately reuses bit 6 (same value as TOO_LARGE_1000).
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: error classes still possible given the high nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table 2: error classes still possible given the low nibble of the previous byte.
    // CARRY is present in every entry because those errors do not depend on this nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: error classes still possible given the high nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // An error bit is reported only if all three tables agree on it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
27092   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27093       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27094     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27095     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27096     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27097     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27098     return must23_80 ^ sc;
27099   }
27100 
27101   //
27102   // Return nonzero if there are incomplete multibyte characters at the end of the block:
27103   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
27104   //
27105   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
27106     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
27107     // ... 1111____ 111_____ 11______
27108     static const uint8_t max_array[32] = {
27109       255, 255, 255, 255, 255, 255, 255, 255,
27110       255, 255, 255, 255, 255, 255, 255, 255,
27111       255, 255, 255, 255, 255, 255, 255, 255,
27112       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
27113     };
27114     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
27115     return input.gt_bits(max_value);
27116   }
27117 
27118   struct utf8_checker {
27119     // If this is nonzero, there has been a UTF-8 error.
27120     simd8<uint8_t> error;
27121     // The last input we received
27122     simd8<uint8_t> prev_input_block;
27123     // Whether the last input we received was incomplete (used for ASCII fast path)
27124     simd8<uint8_t> prev_incomplete;
27125 
27126     //
27127     // Check whether the current bytes are valid UTF-8.
27128     //
27129     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27130       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27131       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27132       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27133       simd8<uint8_t> sc = check_special_cases(input, prev1);
27134       this->error |= check_multibyte_lengths(input, prev_input, sc);
27135     }
27136 
27137     // The only problem that can happen at EOF is that a multibyte character is too short
27138     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
27139     // too large in the first of two bytes.
27140     simdutf_really_inline void check_eof() {
27141       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
27142       // possibly finish them.
27143       this->error |= this->prev_incomplete;
27144     }
27145 
27146     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
27147       if(simdutf_likely(is_ascii(input))) {
27148         this->error |= this->prev_incomplete;
27149       } else {
27150         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27151         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27152             "We support either two or four chunks per 64-byte block.");
27153         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27154           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27155           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27156         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27157           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27158           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27159           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27160           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27161         }
27162         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
27163         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
27164 
27165       }
27166     }
27167 
27168     // do not forget to call check_eof!
27169     simdutf_really_inline bool errors() const {
27170       return this->error.any_bits_set_anywhere();
27171     }
27172 
27173   }; // struct utf8_checker
27174 } // namespace utf8_validation
27175 
27176 using utf8_validation::utf8_checker;
27177 
27178 } // unnamed namespace
27179 } // namespace ppc64
27180 } // namespace simdutf
27181 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
27182 /* begin file src/generic/utf8_validation/utf8_validator.h */
27183 namespace simdutf {
27184 namespace ppc64 {
27185 namespace {
27186 namespace utf8_validation {
27187 
27188 /**
27189  * Validates that the string is actual UTF-8.
27190  */
27191 template<class checker>
27192 bool generic_validate_utf8(const uint8_t * input, size_t length) {
27193     checker c{};
27194     buf_block_reader<64> reader(input, length);
27195     while (reader.has_full_block()) {
27196       simd::simd8x64<uint8_t> in(reader.full_block());
27197       c.check_next_input(in);
27198       reader.advance();
27199     }
27200     uint8_t block[64]{};
27201     reader.get_remainder(block);
27202     simd::simd8x64<uint8_t> in(block);
27203     c.check_next_input(in);
27204     reader.advance();
27205     c.check_eof();
27206     return !c.errors();
27207 }
27208 
27209 bool generic_validate_utf8(const char * input, size_t length) {
27210   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27211 }
27212 
27213 /**
27214  * Validates that the string is actual UTF-8 and stops on errors.
27215  */
27216 template<class checker>
27217 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
27218     checker c{};
27219     buf_block_reader<64> reader(input, length);
27220     size_t count{0};
27221     while (reader.has_full_block()) {
27222       simd::simd8x64<uint8_t> in(reader.full_block());
27223       c.check_next_input(in);
27224       if(c.errors()) {
27225         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
27226         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
27227         res.count += count;
27228         return res;
27229       }
27230       reader.advance();
27231       count += 64;
27232     }
27233     uint8_t block[64]{};
27234     reader.get_remainder(block);
27235     simd::simd8x64<uint8_t> in(block);
27236     c.check_next_input(in);
27237     reader.advance();
27238     c.check_eof();
27239     if (c.errors()) {
27240       if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
27241       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
27242       res.count += count;
27243       return res;
27244     } else {
27245       return result(error_code::SUCCESS, length);
27246     }
27247 }
27248 
27249 result generic_validate_utf8_with_errors(const char * input, size_t length) {
27250   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27251 }
27252 
27253 template<class checker>
27254 bool generic_validate_ascii(const uint8_t * input, size_t length) {
27255     buf_block_reader<64> reader(input, length);
27256     uint8_t blocks[64]{};
27257     simd::simd8x64<uint8_t> running_or(blocks);
27258     while (reader.has_full_block()) {
27259       simd::simd8x64<uint8_t> in(reader.full_block());
27260       running_or |= in;
27261       reader.advance();
27262     }
27263     uint8_t block[64]{};
27264     reader.get_remainder(block);
27265     simd::simd8x64<uint8_t> in(block);
27266     running_or |= in;
27267     return running_or.is_ascii();
27268 }
27269 
27270 bool generic_validate_ascii(const char * input, size_t length) {
27271   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27272 }
27273 
27274 template<class checker>
27275 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
27276   buf_block_reader<64> reader(input, length);
27277   size_t count{0};
27278   while (reader.has_full_block()) {
27279     simd::simd8x64<uint8_t> in(reader.full_block());
27280     if (!in.is_ascii()) {
27281       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27282       return result(res.error, count + res.count);
27283     }
27284     reader.advance();
27285 
27286     count += 64;
27287   }
27288   uint8_t block[64]{};
27289   reader.get_remainder(block);
27290   simd::simd8x64<uint8_t> in(block);
27291   if (!in.is_ascii()) {
27292     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27293     return result(res.error, count + res.count);
27294   } else {
27295     return result(error_code::SUCCESS, length);
27296   }
27297 }
27298 
27299 result generic_validate_ascii_with_errors(const char * input, size_t length) {
27300   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27301 }
27302 
27303 } // namespace utf8_validation
27304 } // unnamed namespace
27305 } // namespace ppc64
27306 } // namespace simdutf
27307 /* end file src/generic/utf8_validation/utf8_validator.h */
27308 // transcoding from UTF-8 to UTF-16
27309 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
27310 
27311 
27312 namespace simdutf {
27313 namespace ppc64 {
27314 namespace {
27315 namespace utf8_to_utf16 {
27316 
27317 using namespace simd;
27318 
27319 template <endianness endian>
27320 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27321     char16_t* utf16_output) noexcept {
27322   // The implementation is not specific to haswell and should be moved to the generic directory.
27323   size_t pos = 0;
27324   char16_t* start{utf16_output};
27325   const size_t safety_margin = 16; // to avoid overruns!
27326   while(pos + 64 + safety_margin <= size) {
27327     // this loop could be unrolled further. For example, we could process the mask
27328     // far more than 64 bytes.
27329     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27330     if(in.is_ascii()) {
27331       in.store_ascii_as_utf16<endian>(utf16_output);
27332       utf16_output += 64;
27333       pos += 64;
27334     } else {
27335       // Slow path. We hope that the compiler will recognize that this is a slow path.
27336       // Anything that is not a continuation mask is a 'leading byte', that is, the
27337       // start of a new code point.
27338       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
27339       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
27340       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27341       // The *start* of code points is not so useful, rather, we want the *end* of code points.
27342       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27343       // We process in blocks of up to 12 bytes except possibly
27344       // for fast paths which may process up to 16 bytes. For the
27345       // slow path to work, we should have at least 12 input bytes left.
27346       size_t max_starting_point = (pos + 64) - 12;
27347       // Next loop is going to run at least five times when using solely
27348       // the slow/regular path, and at least four times if there are fast paths.
27349       while(pos < max_starting_point) {
27350         // Performance note: our ability to compute 'consumed' and
27351         // then shift and recompute is critical. If there is a
27352         // latency of, say, 4 cycles on getting 'consumed', then
27353         // the inner loop might have a total latency of about 6 cycles.
27354         // Yet we process between 6 to 12 inputs bytes, thus we get
27355         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27356         // for this section of the code. Hence, there is a limit
27357         // to how much we can further increase this latency before
27358         // it seriously harms performance.
27359         //
27360         // Thus we may allow convert_masked_utf8_to_utf16 to process
27361         // more bytes at a time under a fast-path mode where 16 bytes
27362         // are consumed at once (e.g., when encountering ASCII).
27363         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
27364                             utf8_end_of_code_point_mask, utf16_output);
27365         pos += consumed;
27366         utf8_end_of_code_point_mask >>= consumed;
27367       }
27368       // At this point there may remain between 0 and 12 bytes in the
27369       // 64-byte block. These bytes will be processed again. So we have an
27370       // 80% efficiency (in the worst case). In practice we expect an
27371       // 85% to 90% efficiency.
27372     }
27373   }
27374   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
27375   return utf16_output - start;
27376 }
27377 
27378 } // namespace utf8_to_utf16
27379 } // unnamed namespace
27380 } // namespace ppc64
27381 } // namespace simdutf
27382 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
27383 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
27384 
27385 
27386 namespace simdutf {
27387 namespace ppc64 {
27388 namespace {
27389 namespace utf8_to_utf16 {
27390 using namespace simd;
27391 
27392 
  // Classifies every (previous byte, current byte) pair against the two-byte
  // UTF-8 error patterns listed below.
  //
  // input: the current bytes.
  // prev1: for each position, the byte that precedes it (callers build it
  //        with input.prev<1>(...)).
  //
  // Three 16-entry tables are consulted: one indexed by the high nibble of
  // prev1, one by the low nibble of prev1, one by the high nibble of input.
  // The result is the bitwise AND of the three lookups, so an error bit
  // survives only when all three nibbles are compatible with that error.
  // A zero byte in the result means none of these patterns matched.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (above the encodable range, second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large with second byte 1000____, shared with Overlong 4-byte
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // OVERLONG_4 deliberately reuses bit 6 (same value as TOO_LARGE_1000).
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: error classes still possible given the high nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table 2: error classes still possible given the low nibble of the previous byte.
    // CARRY is present in every entry because those errors do not depend on this nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: error classes still possible given the high nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // An error bit is reported only if all three tables agree on it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
27483   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27484       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27485     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27486     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27487     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27488     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27489     return must23_80 ^ sc;
27490   }
27491 
27492 
27493   struct validating_transcoder {
27494     // If this is nonzero, there has been a UTF-8 error.
27495     simd8<uint8_t> error;
27496 
27497     validating_transcoder() : error(uint8_t(0)) {}
27498     //
27499     // Check whether the current bytes are valid UTF-8.
27500     //
27501     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27502       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27503       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27504       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27505       simd8<uint8_t> sc = check_special_cases(input, prev1);
27506       this->error |= check_multibyte_lengths(input, prev_input, sc);
27507     }
27508 
27509 
27510     template <endianness endian>
27511     simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
27512       size_t pos = 0;
27513       char16_t* start{utf16_output};
27514       // In the worst case, we have the haswell kernel which can cause an overflow of
27515       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
27516       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27517       // much more than 8 bytes. However, you cannot generally assume that you have valid
27518       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27519       // to give us a good margin.
27520       size_t leading_byte = 0;
27521       size_t margin = size;
27522       for(; margin > 0 && leading_byte < 8; margin--) {
27523         leading_byte += (int8_t(in[margin-1]) > -65);
27524       }
27525       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27526       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27527       while(pos + 64 + safety_margin <= size) {
27528         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27529         if(input.is_ascii()) {
27530           input.store_ascii_as_utf16<endian>(utf16_output);
27531           utf16_output += 64;
27532           pos += 64;
27533         } else {
27534           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27535           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27536               "We support either two or four chunks per 64-byte block.");
27537           auto zero = simd8<uint8_t>{uint8_t(0)};
27538           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27539             this->check_utf8_bytes(input.chunks[0], zero);
27540             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27541           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27542             this->check_utf8_bytes(input.chunks[0], zero);
27543             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27544             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27545             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27546           }
27547           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27548           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27549           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27550           // We process in blocks of up to 12 bytes except possibly
27551           // for fast paths which may process up to 16 bytes. For the
27552           // slow path to work, we should have at least 12 input bytes left.
27553           size_t max_starting_point = (pos + 64) - 12;
27554           // Next loop is going to run at least five times.
27555           while(pos < max_starting_point) {
27556             // Performance note: our ability to compute 'consumed' and
27557             // then shift and recompute is critical. If there is a
27558             // latency of, say, 4 cycles on getting 'consumed', then
27559             // the inner loop might have a total latency of about 6 cycles.
27560             // Yet we process between 6 to 12 inputs bytes, thus we get
27561             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27562             // for this section of the code. Hence, there is a limit
27563             // to how much we can further increase this latency before
27564             // it seriously harms performance.
27565             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
27566                             utf8_end_of_code_point_mask, utf16_output);
27567             pos += consumed;
27568             utf8_end_of_code_point_mask >>= consumed;
27569           }
27570           // At this point there may remain between 0 and 12 bytes in the
27571           // 64-byte block. These bytes will be processed again. So we have an
27572           // 80% efficiency (in the worst case). In practice we expect an
27573           // 85% to 90% efficiency.
27574         }
27575       }
27576       if(errors()) { return 0; }
27577       if(pos < size) {
27578         size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
27579         if(howmany == 0) { return 0; }
27580         utf16_output += howmany;
27581       }
27582       return utf16_output - start;
27583     }
27584 
27585     template <endianness endian>
27586     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
27587       size_t pos = 0;
27588       char16_t* start{utf16_output};
27589       // In the worst case, we have the haswell kernel which can cause an overflow of
27590       // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
27591       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27592       // much more than 8 bytes. However, you cannot generally assume that you have valid
27593       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27594       // to give us a good margin.
27595       size_t leading_byte = 0;
27596       size_t margin = size;
27597       for(; margin > 0 && leading_byte < 8; margin--) {
27598         leading_byte += (int8_t(in[margin-1]) > -65);
27599       }
27600       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27601       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27602       while(pos + 64 + safety_margin <= size) {
27603         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27604         if(input.is_ascii()) {
27605           input.store_ascii_as_utf16<endian>(utf16_output);
27606           utf16_output += 64;
27607           pos += 64;
27608         } else {
27609           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27610           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27611               "We support either two or four chunks per 64-byte block.");
27612           auto zero = simd8<uint8_t>{uint8_t(0)};
27613           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27614             this->check_utf8_bytes(input.chunks[0], zero);
27615             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27616           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27617             this->check_utf8_bytes(input.chunks[0], zero);
27618             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27619             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27620             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27621           }
27622           if (errors()) {
27623             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27624             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27625             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27626             res.count += pos;
27627             return res;
27628           }
27629           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27630           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27631           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27632           // We process in blocks of up to 12 bytes except possibly
27633           // for fast paths which may process up to 16 bytes. For the
27634           // slow path to work, we should have at least 12 input bytes left.
27635           size_t max_starting_point = (pos + 64) - 12;
27636           // Next loop is going to run at least five times.
27637           while(pos < max_starting_point) {
27638             // Performance note: our ability to compute 'consumed' and
27639             // then shift and recompute is critical. If there is a
27640             // latency of, say, 4 cycles on getting 'consumed', then
27641             // the inner loop might have a total latency of about 6 cycles.
27642             // Yet we process between 6 to 12 inputs bytes, thus we get
27643             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27644             // for this section of the code. Hence, there is a limit
27645             // to how much we can further increase this latency before
27646             // it seriously harms performance.
27647             size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
27648                             utf8_end_of_code_point_mask, utf16_output);
27649             pos += consumed;
27650             utf8_end_of_code_point_mask >>= consumed;
27651           }
27652           // At this point there may remain between 0 and 12 bytes in the
27653           // 64-byte block. These bytes will be processed again. So we have an
27654           // 80% efficiency (in the worst case). In practice we expect an
27655           // 85% to 90% efficiency.
27656         }
27657       }
27658       if(errors()) {
27659         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27660         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27661         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27662         res.count += pos;
27663         return res;
27664       }
27665       if(pos < size) {
27666         // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
27667         // with the ability to go back up to pos bytes, and read size-pos bytes forward.
27668         result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
27669         if (res.error) {    // In case of error, we want the error position
27670           res.count += pos;
27671           return res;
27672         } else {    // In case of success, we want the number of word written
27673           utf16_output += res.count;
27674         }
27675       }
27676       return result(error_code::SUCCESS, utf16_output - start);
27677     }
27678 
27679     simdutf_really_inline bool errors() const {
27680       return this->error.any_bits_set_anywhere();
27681     }
27682 
27683   }; // struct utf8_checker
27684 } // utf8_to_utf16 namespace
27685 } // unnamed namespace
27686 } // namespace ppc64
27687 } // namespace simdutf
27688 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
27689 // transcoding from UTF-8 to UTF-32
27690 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
27691 
27692 namespace simdutf {
27693 namespace ppc64 {
27694 namespace {
27695 namespace utf8_to_utf32 {
27696 
27697 using namespace simd;
27698 
27699 
27700 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27701     char32_t* utf32_output) noexcept {
27702   size_t pos = 0;
27703   char32_t* start{utf32_output};
27704   const size_t safety_margin = 16; // to avoid overruns!
27705   while(pos + 64 + safety_margin <= size) {
27706     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27707     if(in.is_ascii()) {
27708       in.store_ascii_as_utf32(utf32_output);
27709       utf32_output += 64;
27710       pos += 64;
27711     } else {
27712     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
27713     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
27714     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27715     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27716     size_t max_starting_point = (pos + 64) - 12;
27717     while(pos < max_starting_point) {
27718       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
27719                           utf8_end_of_code_point_mask, utf32_output);
27720       pos += consumed;
27721       utf8_end_of_code_point_mask >>= consumed;
27722       }
27723     }
27724   }
27725   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
27726   return utf32_output - start;
27727 }
27728 
27729 
27730 } // namespace utf8_to_utf32
27731 } // unnamed namespace
27732 } // namespace ppc64
27733 } // namespace simdutf
27734 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
27735 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
27736 
27737 
27738 namespace simdutf {
27739 namespace ppc64 {
27740 namespace {
27741 namespace utf8_to_utf32 {
27742 using namespace simd;
27743 
27744 
  /**
   * Classify every (previous byte, current byte) pair against the UTF-8
   * error classes listed below.
   *
   * Three 16-entry lookups are made: on the high nibble of `prev1`, on the low
   * nibble of `prev1`, and on the high nibble of `input`. The three results
   * are AND-ed, so an error bit survives only when all three tables agree
   * that the pair belongs to that class.
   *
   * @param input  bytes being validated ("byte 2" in the table comments)
   * @param prev1  the byte preceding each lane of `input` ("byte 1")
   * @return per-lane bit set of the error classes that matched
   */
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) and Overlong 4-byte, which share the bit
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    // TWO_CONTS is 0x80, the same bit that check_multibyte_lengths XORs
    // against its must-be-continuation mask.
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: the low-nibble table of byte 1 tells the two apart.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table 1: indexed by the high nibble of byte 1.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    // The classes in CARRY do not depend on the low nibble of byte 1, so every
    // entry of the low-nibble table lets them through.
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table 2: indexed by the low nibble of byte 1.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table 3: indexed by the high nibble of byte 2.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // A class is reported only if all three nibble lookups flag it.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
27835   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27836       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
27837     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27838     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27839     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
27840     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
27841     return must23_80 ^ sc;
27842   }
27843 
27844 
27845   struct validating_transcoder {
27846     // If this is nonzero, there has been a UTF-8 error.
27847     simd8<uint8_t> error;
27848 
27849     validating_transcoder() : error(uint8_t(0)) {}
27850     //
27851     // Check whether the current bytes are valid UTF-8.
27852     //
27853     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27854       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
27855       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
27856       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27857       simd8<uint8_t> sc = check_special_cases(input, prev1);
27858       this->error |= check_multibyte_lengths(input, prev_input, sc);
27859     }
27860 
27861 
27862 
27863     simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
27864       size_t pos = 0;
27865       char32_t* start{utf32_output};
27866       // In the worst case, we have the haswell kernel which can cause an overflow of
27867       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
27868       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27869       // much more than 8 bytes. However, you cannot generally assume that you have valid
27870       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27871       // to give us a good margin.
27872       size_t leading_byte = 0;
27873       size_t margin = size;
27874       for(; margin > 0 && leading_byte < 4; margin--) {
27875         leading_byte += (int8_t(in[margin-1]) > -65);
27876       }
27877       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27878       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27879       while(pos + 64 + safety_margin <= size) {
27880         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27881         if(input.is_ascii()) {
27882           input.store_ascii_as_utf32(utf32_output);
27883           utf32_output += 64;
27884           pos += 64;
27885         } else {
27886           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27887           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27888               "We support either two or four chunks per 64-byte block.");
27889           auto zero = simd8<uint8_t>{uint8_t(0)};
27890           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27891             this->check_utf8_bytes(input.chunks[0], zero);
27892             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27893           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27894             this->check_utf8_bytes(input.chunks[0], zero);
27895             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27896             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27897             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27898           }
27899           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27900           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27901           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27902           // We process in blocks of up to 12 bytes except possibly
27903           // for fast paths which may process up to 16 bytes. For the
27904           // slow path to work, we should have at least 12 input bytes left.
27905           size_t max_starting_point = (pos + 64) - 12;
27906           // Next loop is going to run at least five times.
27907           while(pos < max_starting_point) {
27908             // Performance note: our ability to compute 'consumed' and
27909             // then shift and recompute is critical. If there is a
27910             // latency of, say, 4 cycles on getting 'consumed', then
27911             // the inner loop might have a total latency of about 6 cycles.
27912             // Yet we process between 6 to 12 inputs bytes, thus we get
27913             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27914             // for this section of the code. Hence, there is a limit
27915             // to how much we can further increase this latency before
27916             // it seriously harms performance.
27917             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
27918                             utf8_end_of_code_point_mask, utf32_output);
27919             pos += consumed;
27920             utf8_end_of_code_point_mask >>= consumed;
27921           }
27922           // At this point there may remain between 0 and 12 bytes in the
27923           // 64-byte block. These bytes will be processed again. So we have an
27924           // 80% efficiency (in the worst case). In practice we expect an
27925           // 85% to 90% efficiency.
27926         }
27927       }
27928       if(errors()) { return 0; }
27929       if(pos < size) {
27930         size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
27931         if(howmany == 0) { return 0; }
27932         utf32_output += howmany;
27933       }
27934       return utf32_output - start;
27935     }
27936 
27937     simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
27938       size_t pos = 0;
27939       char32_t* start{utf32_output};
27940       // In the worst case, we have the haswell kernel which can cause an overflow of
27941       // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
27942       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
27943       // much more than 8 bytes. However, you cannot generally assume that you have valid
27944       // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27945       // to give us a good margin.
27946       size_t leading_byte = 0;
27947       size_t margin = size;
27948       for(; margin > 0 && leading_byte < 4; margin--) {
27949         leading_byte += (int8_t(in[margin-1]) > -65);
27950       }
27951       // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27952       const size_t safety_margin = size - margin + 1; // to avoid overruns!
27953       while(pos + 64 + safety_margin <= size) {
27954         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27955         if(input.is_ascii()) {
27956           input.store_ascii_as_utf32(utf32_output);
27957           utf32_output += 64;
27958           pos += 64;
27959         } else {
27960           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
27961           static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
27962               "We support either two or four chunks per 64-byte block.");
27963           auto zero = simd8<uint8_t>{uint8_t(0)};
27964           if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
27965             this->check_utf8_bytes(input.chunks[0], zero);
27966             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27967           } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
27968             this->check_utf8_bytes(input.chunks[0], zero);
27969             this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27970             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27971             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27972           }
27973           if (errors()) {
27974             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
27975             res.count += pos;
27976             return res;
27977           }
27978           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27979           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
27980           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
27981           // We process in blocks of up to 12 bytes except possibly
27982           // for fast paths which may process up to 16 bytes. For the
27983           // slow path to work, we should have at least 12 input bytes left.
27984           size_t max_starting_point = (pos + 64) - 12;
27985           // Next loop is going to run at least five times.
27986           while(pos < max_starting_point) {
27987             // Performance note: our ability to compute 'consumed' and
27988             // then shift and recompute is critical. If there is a
27989             // latency of, say, 4 cycles on getting 'consumed', then
27990             // the inner loop might have a total latency of about 6 cycles.
27991             // Yet we process between 6 to 12 inputs bytes, thus we get
27992             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
27993             // for this section of the code. Hence, there is a limit
27994             // to how much we can further increase this latency before
27995             // it seriously harms performance.
27996             size_t consumed = convert_masked_utf8_to_utf32(in + pos,
27997                             utf8_end_of_code_point_mask, utf32_output);
27998             pos += consumed;
27999             utf8_end_of_code_point_mask >>= consumed;
28000           }
28001           // At this point there may remain between 0 and 12 bytes in the
28002           // 64-byte block. These bytes will be processed again. So we have an
28003           // 80% efficiency (in the worst case). In practice we expect an
28004           // 85% to 90% efficiency.
28005         }
28006       }
28007       if(errors()) {
28008         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
28009         res.count += pos;
28010         return res;
28011       }
28012       if(pos < size) {
28013         result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
28014         if (res.error) {    // In case of error, we want the error position
28015           res.count += pos;
28016           return res;
28017         } else {    // In case of success, we want the number of word written
28018           utf32_output += res.count;
28019         }
28020       }
28021       return result(error_code::SUCCESS, utf32_output - start);
28022     }
28023 
28024     simdutf_really_inline bool errors() const {
28025       return this->error.any_bits_set_anywhere();
28026     }
28027 
28028   }; // struct utf8_checker
28029 } // utf8_to_utf32 namespace
28030 } // unnamed namespace
28031 } // namespace ppc64
28032 } // namespace simdutf
28033 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
28034 // other functions
28035 /* begin file src/generic/utf8.h */
28036 
28037 namespace simdutf {
28038 namespace ppc64 {
28039 namespace {
28040 namespace utf8 {
28041 
28042 using namespace simd;
28043 
28044 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
28045     size_t pos = 0;
28046     size_t count = 0;
28047     for(;pos + 64 <= size; pos += 64) {
28048       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28049       uint64_t utf8_continuation_mask = input.gt(-65);
28050       count += count_ones(utf8_continuation_mask);
28051     }
28052     return count + scalar::utf8::count_code_points(in + pos, size - pos);
28053 }
28054 
28055 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
28056     size_t pos = 0;
28057     size_t count = 0;
28058     // This algorithm could no doubt be improved!
28059     for(;pos + 64 <= size; pos += 64) {
28060       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28061       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
28062       // We count one word for anything that is not a continuation (so
28063       // leading bytes).
28064       count += 64 - count_ones(utf8_continuation_mask);
28065       int64_t utf8_4byte = input.gteq_unsigned(240);
28066       count += count_ones(utf8_4byte);
28067     }
28068     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
28069 }
28070 } // utf8 namespace
28071 } // unnamed namespace
28072 } // namespace ppc64
28073 } // namespace simdutf
28074 /* end file src/generic/utf8.h */
28075 /* begin file src/generic/utf16.h */
28076 namespace simdutf {
28077 namespace ppc64 {
28078 namespace {
28079 namespace utf16 {
28080 
28081 template <endianness big_endian>
28082 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
28083     size_t pos = 0;
28084     size_t count = 0;
28085     for(;pos < size/32*32; pos += 32) {
28086       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28087       if (!match_system(big_endian)) { input.swap_bytes(); }
28088       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
28089       count += count_ones(not_pair) / 2;
28090     }
28091     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
28092 }
28093 
28094 template <endianness big_endian>
28095 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
28096     size_t pos = 0;
28097     size_t count = 0;
28098     // This algorithm could no doubt be improved!
28099     for(;pos < size/32*32; pos += 32) {
28100       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28101       if (!match_system(big_endian)) { input.swap_bytes(); }
28102       uint64_t ascii_mask = input.lteq(0x7F);
28103       uint64_t twobyte_mask = input.lteq(0x7FF);
28104       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
28105 
28106       size_t ascii_count = count_ones(ascii_mask) / 2;
28107       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
28108       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
28109       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
28110       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
28111     }
28112     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
28113 }
28114 
28115 template <endianness big_endian>
28116 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
28117     return count_code_points<big_endian>(in, size);
28118 }
28119 
28120 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
28121   size_t pos = 0;
28122 
28123   while (pos < size/32*32) {
28124     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28125     input.swap_bytes();
28126     input.store(reinterpret_cast<uint16_t *>(output));
28127     pos += 32;
28128     output += 32;
28129   }
28130 
28131   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
28132 }
28133 
28134 } // utf16
28135 } // unnamed namespace
28136 } // namespace ppc64
28137 } // namespace simdutf
28138 /* end file src/generic/utf16.h */
28139 
28140 //
28141 // Implementation-specific overrides
28142 //
28143 namespace simdutf {
28144 namespace ppc64 {
28145 
28146 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
28147   // If there is a BOM, then we trust it.
28148   auto bom_encoding = simdutf::BOM::check_bom(input, length);
28149   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
28150   int out = 0;
28151   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
28152   if((length % 2) == 0) {
28153     if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
28154   }
28155   if((length % 4) == 0) {
28156     if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
28157   }
28158 
28159   return out;
28160 }
28161 
28162 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
28163   return ppc64::utf8_validation::generic_validate_utf8(buf,len);
28164 }
28165 
28166 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
28167   return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
28168 }
28169 
28170 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
28171   return ppc64::utf8_validation::generic_validate_ascii(buf,len);
28172 }
28173 
28174 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
28175   return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
28176 }
28177 
28178 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
28179   return scalar::utf16::validate<endianness::LITTLE>(buf, len);
28180 }
28181 
28182 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
28183   return scalar::utf16::validate<endianness::BIG>(buf, len);
28184 }
28185 
28186 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
28187   return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
28188 }
28189 
28190 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
28191   return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
28192 }
28193 
28194 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
28195   return scalar::utf32::validate_with_errors(buf, len);
28196 }
28197 
28198 simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
28199   return scalar::utf32::validate(buf, len);
28200 }
28201 
28202 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28203   return 0; // stub
28204 }
28205 
28206 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28207   return 0; // stub
28208 }
28209 
28210 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28211   return result(error_code::OTHER, 0); // stub
28212 }
28213 
28214 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28215   return result(error_code::OTHER, 0); // stub
28216 }
28217 
28218 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28219   return 0; // stub
28220 }
28221 
28222 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
28223   return 0; // stub
28224 }
28225 
28226 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
28227   return 0; // stub
28228 }
28229 
28230 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
28231   return result(error_code::OTHER, 0); // stub
28232 }
28233 
28234 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
28235   return 0; // stub
28236 }
28237 
28238 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28239   return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
28240 }
28241 
28242 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28243   return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
28244 }
28245 
28246 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28247   return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
28248 }
28249 
28250 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28251   return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
28252 }
28253 
28254 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28255   return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
28256 }
28257 
28258 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
28259   return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
28260 }
28261 
28262 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28263   return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
28264 }
28265 
28266 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28267   return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
28268 }
28269 
28270 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
28271   return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
28272 }
28273 
28274 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28275   return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
28276 }
28277 
28278 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28279   return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
28280 }
28281 
28282 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28283   return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
28284 }
28285 
28286 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28287   return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
28288 }
28289 
28290 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28291   return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
28292 }
28293 
28294 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
28295   return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
28296 }
28297 
28298 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28299   return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
28300 }
28301 
28302 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28303   return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
28304 }
28305 
28306 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28307   return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
28308 }
28309 
28310 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28311   return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
28312 }
28313 
28314 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28315   return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
28316 }
28317 
28318 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
28319   return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
28320 }
28321 
// Byte-swaps `length` UTF-16 code units from `input` into `output` by
// forwarding to scalar::utf16::change_endianness_utf16.
void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
  scalar::utf16::change_endianness_utf16(input, length, output);
}
28325 
28326 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
28327   return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
28328 }
28329 
28330 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
28331   return scalar::utf16::count_code_points<endianness::BIG>(input, length);
28332 }
28333 
28334 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
28335   return utf8::count_code_points(input, length);
28336 }
28337 
28338 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28339   return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
28340 }
28341 
28342 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28343   return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
28344 }
28345 
28346 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28347   return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
28348 }
28349 
28350 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28351   return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
28352 }
28353 
28354 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
28355   return scalar::utf8::utf16_length_from_utf8(input, length);
28356 }
28357 
28358 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28359   return scalar::utf32::utf8_length_from_utf32(input, length);
28360 }
28361 
28362 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28363   return scalar::utf32::utf16_length_from_utf32(input, length);
28364 }
28365 
28366 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
28367   return scalar::utf8::count_code_points(input, length);
28368 }
28369 
28370 } // namespace ppc64
28371 } // namespace simdutf
28372 
28373 /* begin file src/simdutf/ppc64/end.h */
28374 /* end file src/simdutf/ppc64/end.h */
28375 /* end file src/ppc64/implementation.cpp */
28376 #endif
28377 #if SIMDUTF_IMPLEMENTATION_WESTMERE
28378 /* begin file src/westmere/implementation.cpp */
28379 /* begin file src/simdutf/westmere/begin.h */
28380 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
28381 // #define SIMDUTF_IMPLEMENTATION westmere
28382 
28383 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
28384 // nothing needed.
28385 #else
28386 SIMDUTF_TARGET_WESTMERE
28387 #endif
28388 /* end file src/simdutf/westmere/begin.h */
28389 namespace simdutf {
28390 namespace westmere {
28391 namespace {
28392 #ifndef SIMDUTF_WESTMERE_H
28393 #error "westmere.h must be included"
28394 #endif
28395 using namespace simd;
28396 
28397 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
28398   return input.reduce_or().is_ascii();
28399 }
28400 
28401 simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
28402   simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
28403   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
28404   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
28405   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
28406   return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
28407 }
28408 
28409 simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
28410   simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
28411   simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
28412   // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
28413   return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
28414 }
28415 
28416 /* begin file src/westmere/internal/loader.cpp */
28417 namespace internal {
28418 namespace westmere {
28419 
28420 /* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
28421 /*
28422 * reads a vector of uint16 values
28423 * bits after 11th are ignored
28424 * first 11 bits are encoded into utf8
28425 * !important! utf8_output must have at least 16 writable bytes
28426 */
28427 
28428 inline void write_v_u16_11bits_to_utf8(
28429   const __m128i v_u16,
28430   char*& utf8_output,
28431   const __m128i one_byte_bytemask,
28432   const uint16_t one_byte_bitmask
28433 ) {
28434   // 0b1100_0000_1000_0000
28435   const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
28436   // 0b0001_1111_0000_0000
28437   const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
28438   // 0b0000_0000_0011_1111
28439   const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
28440 
28441   // 1. prepare 2-byte values
28442           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
28443           // expected output   : [110a|aaaa|10bb|bbbb] x 8
28444 
28445   // t0 = [000a|aaaa|bbbb|bb00]
28446   const __m128i t0 = _mm_slli_epi16(v_u16, 2);
28447   // t1 = [000a|aaaa|0000|0000]
28448   const __m128i t1 = _mm_and_si128(t0, v_1f00);
28449   // t2 = [0000|0000|00bb|bbbb]
28450   const __m128i t2 = _mm_and_si128(v_u16, v_003f);
28451   // t3 = [000a|aaaa|00bb|bbbb]
28452   const __m128i t3 = _mm_or_si128(t1, t2);
28453   // t4 = [110a|aaaa|10bb|bbbb]
28454   const __m128i t4 = _mm_or_si128(t3, v_c080);
28455 
28456   // 2. merge ASCII and 2-byte codewords
28457   const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);
28458 
28459   // 3. prepare bitmask for 8-bit lookup
28460   //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
28461   const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
28462   const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
28463   const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
28464   // 4. pack the bytes
28465   const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
28466   const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
28467   const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
28468 
28469   // 5. store bytes
28470   _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
28471 
28472   // 6. adjust pointers
28473   utf8_output += row[0];
28474 }
28475 
28476 inline void write_v_u16_11bits_to_utf8(
28477   const __m128i v_u16,
28478   char*& utf8_output,
28479   const __m128i v_0000,
28480   const __m128i v_ff80
28481 ) {
28482   // no bits set above 7th bit
28483   const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
28484   const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
28485 
28486   write_v_u16_11bits_to_utf8(
28487     v_u16, utf8_output, one_byte_bytemask, one_byte_bitmask);
28488 }
28489 /* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
28490 
28491 } // namespace westmere
28492 } // namespace internal
28493 /* end file src/westmere/internal/loader.cpp */
28494 /* begin file src/westmere/sse_detect_encodings.cpp */
// Scans `buf` (`len` bytes) in 64-byte blocks and returns a bitwise OR of the
// simdutf::encoding_type flags (UTF8, UTF16_LE, UTF32_LE) that the input is
// found to be valid for; 0 when none applies.
//
// `checker` is the UTF-8 validation state. Only its check_next_input() and
// errors() members are used here.
//
// As long as no surrogate code unit is seen, every block is fed to all three
// checks. The first block containing a surrogate rules out UTF-8 and at least
// one of UTF-16LE / UTF-32, runs a dedicated pass for the remaining candidate
// and leaves the block loop. Whatever follows `buf` is re-checked by the scalar
// validators at the end.
template<class checker>
// len is known to be a multiple of 2 when this is called
int sse_detect_encodings(const char * buf, size_t len) {
    const char* start = buf;
    const char* end = buf + len;

    bool is_utf8 = true;
    bool is_utf16 = true;
    bool is_utf32 = true;

    int out = 0;

    const auto v_d8 = simd8<uint8_t>::splat(0xd8);
    const auto v_f8 = simd8<uint8_t>::splat(0xf8);

    // Running per-lane maximum of the 32-bit words seen so far; compared
    // against 0x10ffff once scanning is over.
    __m128i currentmax = _mm_setzero_si128();

    checker check{};

    while(buf + 64 <= end) {
        // Four unaligned 16-byte loads cover the 64-byte block.
        __m128i in = _mm_loadu_si128((__m128i*)buf);
        __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
        __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
        __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);

        const auto u0 = simd16<uint16_t>(in);
        const auto u1 = simd16<uint16_t>(secondin);
        const auto u2 = simd16<uint16_t>(thirdin);
        const auto u3 = simd16<uint16_t>(fourthin);

        // Keep only the high byte of each 16-bit unit ...
        const auto v0 = u0.shr<8>();
        const auto v1 = u1.shr<8>();
        const auto v2 = u2.shr<8>();
        const auto v3 = u3.shr<8>();

        // ... and pack them: 16 high bytes per vector, two vectors per block.
        const auto in16 = simd16<uint16_t>::pack(v0, v1);
        const auto nextin16 = simd16<uint16_t>::pack(v2, v3);

        // A high byte with (byte & 0xf8) == 0xd8 belongs to a unit in 0xD800..0xDFFF.
        const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
        const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8;
        uint16_t surrogates_bitmask0 = static_cast<uint16_t>(surrogates_wordmask0.to_bitmask());
        uint16_t surrogates_bitmask1 = static_cast<uint16_t>(surrogates_wordmask1.to_bitmask());

        // Check for surrogates
        if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) {
            // Cannot be UTF8
            is_utf8 = false;
            // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
            // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
            // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
            // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
            // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.

            // 0xaaaa selects the odd-indexed 16-bit units, i.e. the upper half of each 32-bit word.
            if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) {
                is_utf32 = false;
                // Code from sse_validate_utf16le.cpp
                // Not efficient, we do not process surrogates_bitmask1
                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;

                const auto v_fc = simd8<uint8_t>::splat(0xfc);
                const auto v_dc = simd8<uint8_t>::splat(0xdc);

                // V0: units that are not surrogates.
                const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);

                // H0: units whose high byte satisfies (byte & 0xfc) == 0xdc, i.e. 0xDC00..0xDFFF.
                const auto    vH0 = (in16 & v_fc) == v_dc;
                const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());

                // L0: the remaining surrogates, i.e. 0xD800..0xDBFF.
                const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);

                // a0: an L unit immediately followed by an H unit.
                const uint16_t a0 = static_cast<uint16_t>(L0 & (H0 >> 1));

                // b0: the H unit of each such pair.
                const uint16_t b0 = static_cast<uint16_t>(a0 << 1);

                // c0: every unit that is either a non-surrogate or part of a proper pair.
                const uint16_t c0 = static_cast<uint16_t>(V0 | a0 | b0);

                if (c0 == 0xffff) {
                    input += 16;
                } else if (c0 == 0x7fff) {
                    // Only the last unit is unresolved: consume 15 and re-examine it next.
                    input += 15;
                } else {
                    is_utf16 = false;
                    // Leaves the outer 64-byte block loop.
                    break;
                }

                // Same V/H/L test, applied to the following 16-unit groups.
                while (input + simd16<uint16_t>::SIZE * 2 < end16) {
                    const auto in0 = simd16<uint16_t>(input);
                    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));

                    const auto t0 = in0.shr<8>();
                    const auto t1 = in1.shr<8>();

                    const auto in_16 = simd16<uint16_t>::pack(t0, t1);

                    const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
                    const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
                    if (surrogates_bitmask == 0x0) {
                        input += 16;
                    } else {
                        const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);

                        const auto    vH = (in_16 & v_fc) == v_dc;
                        const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());

                        const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);

                        const uint16_t a = static_cast<uint16_t>(L & (H >> 1));

                        const uint16_t b = static_cast<uint16_t>(a << 1);

                        const uint16_t c = static_cast<uint16_t>(V | a | b);

                        if (c == 0xffff) {
                            input += 16;
                        } else if (c == 0x7fff) {
                            input += 15;
                        } else {
                            is_utf16 = false;
                            // Leaves only this inner loop; the outer loop is left by the break further down.
                            break;
                        }
                    }
                }
            } else {
                is_utf16 = false;
                // Check for UTF-32
                if (len % 4 == 0) {
                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;

                    // Must start checking for surrogates
                    // Each word has `offset` added (32-bit wrap-around) and the running
                    // maximum of the sums is tracked; a maximum above standardoffsetmax
                    // marks a forbidden word.
                    __m128i currentoffsetmax = _mm_setzero_si128();
                    const __m128i offset = _mm_set1_epi32(0xffff2000);
                    const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);

                    currentmax = _mm_max_epu32(in, currentmax);
                    currentmax = _mm_max_epu32(secondin, currentmax);
                    currentmax = _mm_max_epu32(thirdin, currentmax);
                    currentmax = _mm_max_epu32(fourthin, currentmax);

                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax);
                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax);
                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);

                    // `input` starts at `buf`, so the current block is visited again here
                    // (harmless for a maximum).
                    while (input + 4 < end32) {
                        const __m128i in32 = _mm_loadu_si128((__m128i *)input);
                        currentmax = _mm_max_epu32(in32,currentmax);
                        currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
                        input += 4;
                    }

                    // Non-zero exactly when some lane of currentoffsetmax exceeds standardoffsetmax.
                    __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
                    if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
                        is_utf32 = false;
                    }
                } else {
                    is_utf32 = false;
                }
            }
            // A surrogate was found: stop scanning blocks. `buf` has not been advanced
            // in this branch, so the scalar validators below start from this block.
            break;
        }
        // If no surrogate, validate under other encodings as well

        // UTF-32 validation
        currentmax = _mm_max_epu32(in, currentmax);
        currentmax = _mm_max_epu32(secondin, currentmax);
        currentmax = _mm_max_epu32(thirdin, currentmax);
        currentmax = _mm_max_epu32(fourthin, currentmax);

        // UTF-8 validation
        // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
        simd::simd8x64<uint8_t> in8(in, secondin, thirdin, fourthin);
        check.check_next_input(in8);

        buf += 64;
    }

    // Check which encodings are possible

    if (is_utf8) {
        if (static_cast<size_t>(buf - start) != len) {
            // Copy the trailing bytes into a 64-byte block pre-filled with
            // spaces (0x20) so the checker receives a full block.
            uint8_t block[64]{};
            std::memset(block, 0x20, 64);
            std::memcpy(block, buf, len - (buf - start));
            simd::simd8x64<uint8_t> in(block);
            check.check_next_input(in);
        }
        if (!check.errors()) {
            out |= simdutf::encoding_type::UTF8;
        }
    }

    // The scalar validator covers everything from `buf` to the end of the input.
    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
        out |= simdutf::encoding_type::UTF16_LE;
    }

    if (is_utf32 && (len % 4 == 0)) {
        // is_zero is all zeroes exactly when no lane of currentmax exceeds 0x10ffff.
        const __m128i standardmax = _mm_set1_epi32(0x10ffff);
        __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
            out |= simdutf::encoding_type::UTF32_LE;
        }
    }

    return out;
}
28701 /* end file src/westmere/sse_detect_encodings.cpp */
28702 
28703 /* begin file src/westmere/sse_validate_utf16.cpp */
28704 /*
28705     In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
28706 
    In a vectorized algorithm we want to examine the most significant
    nibble in order to select a fast path. If none of the highest nibbles
    is 0xD (13), then we are sure that the UTF-16 chunk in a vector
    register is valid.
28711 
28712     Let us analyze what we need to check if the nibble is 0xD. The
28713     value of the preceding nibble determines what we have:
28714 
28715     0xd000 .. 0xd7ff - a valid word
28716     0xd800 .. 0xdbff - low surrogate
28717     0xdc00 .. 0xdfff - high surrogate
28718 
28719     Other constraints we have to consider:
28720     - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
28721     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
28722     - there must not be sole low surrogate nor high surrogate
28723 
28724     We're going to build three bitmasks based on the 3rd nibble:
28725     - V = valid word,
28726     - L = low surrogate (0xd800 .. 0xdbff)
28727     - H = high surrogate (0xdc00 .. 0xdfff)
28728 
28729       0   1   2   3   4   5   6   7    <--- word index
28730     [ V | L | H | L | H | V | V | L ]
28731       1   0   0   0   0   1   1   0     - V = valid masks
28732       0   1   0   1   0   0   0   1     - L = low surrogate
28733       0   0   1   0   1   0   0   0     - H high surrogate
28734 
28735 
28736       1   0   0   0   0   1   1   0   V = valid masks
28737       0   1   0   1   0   0   0   0   a = L & (H >> 1)
28738       0   0   1   0   1   0   0   0   b = a << 1
28739       1   1   1   1   1   1   1   0   c = V | a | b
28740                                   ^
28741                                   the last bit can be zero, we just consume 7 code units
28742                                   and recheck this word in the next iteration
28743 */
28744 
28745 /* Returns:
28746    - pointer to the last unprocessed character (a scalar fallback should check the rest);
28747    - nullptr if an error was detected.
28748 */
28749 template <endianness big_endian>
28750 const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
28751     const char16_t* end = input + size;
28752 
28753     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
28754     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
28755     const auto v_fc = simd8<uint8_t>::splat(0xfc);
28756     const auto v_dc = simd8<uint8_t>::splat(0xdc);
28757 
28758     while (input + simd16<uint16_t>::SIZE * 2 < end) {
28759         // 0. Load data: since the validation takes into account only higher
28760         //    byte of each word, we compress the two vectors into one which
28761         //    consists only the higher bytes.
28762         auto in0 = simd16<uint16_t>(input);
28763         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
28764         if (big_endian) {
28765             in0 = in0.swap_bytes();
28766             in1 = in1.swap_bytes();
28767         }
28768 
28769         const auto t0 = in0.shr<8>();
28770         const auto t1 = in1.shr<8>();
28771 
28772         const auto in = simd16<uint16_t>::pack(t0, t1);
28773 
28774         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28775         const auto surrogates_wordmask = (in & v_f8) == v_d8;
28776         const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
28777         if (surrogates_bitmask == 0x0000) {
28778             input += 16;
28779         } else {
28780             // 2. We have some surrogates that have to be distinguished:
28781             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
28782             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
28783             //
28784             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28785 
28786             // V - non-surrogate code units
28787             //     V = not surrogates_wordmask
28788             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
28789 
28790             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28791             const auto    vH = (in & v_fc) == v_dc;
28792             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
28793 
28794             // L - word mask for low surrogates
28795             //     L = not H and surrogates_wordmask
28796             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
28797 
28798             const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
28799                                               // (A low surrogate placed in the 7th register's word
28800                                               // is an exception we handle.)
28801             const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
28802                                               // thanks to that we have only two masks for valid case.
28803             const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
28804 
28805             if (c == 0xffff) {
28806                 // The whole input register contains valid UTF-16, i.e.,
28807                 // either single code units or proper surrogate pairs.
28808                 input += 16;
28809             } else if (c == 0x7fff) {
28810                 // The 15 lower code units of the input register contains valid UTF-16.
28811                 // The 15th word may be either a low or high surrogate. It the next
28812                 // iteration we 1) check if the low surrogate is followed by a high
28813                 // one, 2) reject sole high surrogate.
28814                 input += 15;
28815             } else {
28816                 return nullptr;
28817             }
28818         }
28819     }
28820 
28821     return input;
28822 }
28823 
28824 
28825 template <endianness big_endian>
28826 const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
28827     const char16_t* start = input;
28828     const char16_t* end = input + size;
28829 
28830     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
28831     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
28832     const auto v_fc = simd8<uint8_t>::splat(0xfc);
28833     const auto v_dc = simd8<uint8_t>::splat(0xdc);
28834 
28835     while (input + simd16<uint16_t>::SIZE * 2 < end) {
28836         // 0. Load data: since the validation takes into account only higher
28837         //    byte of each word, we compress the two vectors into one which
28838         //    consists only the higher bytes.
28839         auto in0 = simd16<uint16_t>(input);
28840         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
28841 
28842         if (big_endian) {
28843             in0 = in0.swap_bytes();
28844             in1 = in1.swap_bytes();
28845         }
28846 
28847         const auto t0 = in0.shr<8>();
28848         const auto t1 = in1.shr<8>();
28849 
28850         const auto in = simd16<uint16_t>::pack(t0, t1);
28851 
28852         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28853         const auto surrogates_wordmask = (in & v_f8) == v_d8;
28854         const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
28855         if (surrogates_bitmask == 0x0000) {
28856             input += 16;
28857         } else {
28858             // 2. We have some surrogates that have to be distinguished:
28859             //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
28860             //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
28861             //
28862             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28863 
28864             // V - non-surrogate code units
28865             //     V = not surrogates_wordmask
28866             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
28867 
28868             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28869             const auto    vH = (in & v_fc) == v_dc;
28870             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
28871 
28872             // L - word mask for low surrogates
28873             //     L = not H and surrogates_wordmask
28874             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
28875 
28876             const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
28877                                               // (A low surrogate placed in the 7th register's word
28878                                               // is an exception we handle.)
28879             const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
28880                                               // thanks to that we have only two masks for valid case.
28881             const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
28882 
28883             if (c == 0xffff) {
28884                 // The whole input register contains valid UTF-16, i.e.,
28885                 // either single code units or proper surrogate pairs.
28886                 input += 16;
28887             } else if (c == 0x7fff) {
28888                 // The 15 lower code units of the input register contains valid UTF-16.
28889                 // The 15th word may be either a low or high surrogate. It the next
28890                 // iteration we 1) check if the low surrogate is followed by a high
28891                 // one, 2) reject sole high surrogate.
28892                 input += 15;
28893             } else {
28894                 return result(error_code::SURROGATE, input - start);
28895             }
28896         }
28897     }
28898 
28899     return result(error_code::SUCCESS, input - start);
28900 }
28901 /* end file src/westmere/sse_validate_utf16.cpp */
28902 /* begin file src/westmere/sse_validate_utf32le.cpp */
28903 /* Returns:
28904    - pointer to the last unprocessed character (a scalar fallback should check the rest);
28905    - nullptr if an error was detected.
28906 */
28907 const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
28908     const char32_t* end = input + size;
28909 
28910     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
28911     const __m128i offset = _mm_set1_epi32(0xffff2000);
28912     const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
28913     __m128i currentmax = _mm_setzero_si128();
28914     __m128i currentoffsetmax = _mm_setzero_si128();
28915 
28916     while (input + 4 < end) {
28917         const __m128i in = _mm_loadu_si128((__m128i *)input);
28918         currentmax = _mm_max_epu32(in,currentmax);
28919         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
28920         input += 4;
28921     }
28922     __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
28923     if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
28924         return nullptr;
28925     }
28926 
28927     is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
28928     if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
28929         return nullptr;
28930     }
28931 
28932     return input;
28933 }
28934 
28935 
28936 const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
28937     const char32_t* start = input;
28938     const char32_t* end = input + size;
28939 
28940     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
28941     const __m128i offset = _mm_set1_epi32(0xffff2000);
28942     const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
28943     __m128i currentmax = _mm_setzero_si128();
28944     __m128i currentoffsetmax = _mm_setzero_si128();
28945 
28946     while (input + 4 < end) {
28947         const __m128i in = _mm_loadu_si128((__m128i *)input);
28948         currentmax = _mm_max_epu32(in,currentmax);
28949         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
28950 
28951         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
28952         if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
28953             return result(error_code::TOO_LARGE, input - start);
28954         }
28955 
28956         is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
28957         if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
28958             return result(error_code::SURROGATE, input - start);
28959         }
28960         input += 4;
28961     }
28962 
28963     return result(error_code::SUCCESS, input - start);
28964 }
28965 /* end file src/westmere/sse_validate_utf32le.cpp */
28966 
28967 /* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */
28968 std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
28969   const char* latin_input,
28970   const size_t latin_input_length,
28971   char* utf8_output) {
28972   const char* end = latin_input + latin_input_length;
28973 
28974   const __m128i v_0000 = _mm_setzero_si128();
28975   // 0b1000_0000
28976   const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
28977   // 0b1111_1111_1000_0000
28978   const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
28979 
28980   const __m128i latin_1_half_into_u16_byte_mask = _mm_setr_epi8(
28981       0, '\x80',
28982       1, '\x80',
28983       2, '\x80',
28984       3, '\x80',
28985       4, '\x80',
28986       5, '\x80',
28987       6, '\x80',
28988       7, '\x80'
28989     );
28990 
28991   const __m128i latin_2_half_into_u16_byte_mask = _mm_setr_epi8(
28992       8, '\x80',
28993       9, '\x80',
28994       10, '\x80',
28995       11, '\x80',
28996       12, '\x80',
28997       13, '\x80',
28998       14, '\x80',
28999       15, '\x80'
29000     );
29001 
29002   // each latin1 takes 1-2 utf8 bytes
29003   // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then adjust the pointer)
29004   // so the last write can exceed the utf8_output size by 8-1 bytes
29005   // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free
29006   while (latin_input + 16 + 8 <= end) {
29007     // Load 16 Latin1 characters (16 bytes) into a 128-bit register
29008     __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
29009 
29010 
29011     if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!!
29012       _mm_storeu_si128((__m128i*)utf8_output, v_latin);
29013       latin_input += 16;
29014       utf8_output += 16;
29015       continue;
29016     }
29017 
29018 
29019     // assuming a/b are bytes and A/B are uint16 of the same value
29020     // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
29021     __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
29022     // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
29023     __m128i v_u16_latin_2_half = _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);
29024 
29025 
29026     internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
29027     internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80);
29028     latin_input += 16;
29029   }
29030 
29031   if (latin_input + 16 <= end) {
29032     // Load 16 Latin1 characters (16 bytes) into a 128-bit register
29033     __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
29034 
29035     if (_mm_testz_si128(v_latin, v_80)) {// ASCII fast path!!!!
29036       _mm_storeu_si128((__m128i*)utf8_output, v_latin);
29037       latin_input += 16;
29038       utf8_output += 16;
29039     } else {
29040       // assuming a/b are bytes and A/B are uint16 of the same value
29041       // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
29042       __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
29043       internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
29044       latin_input += 8;
29045     }
29046   }
29047 
29048   return std::make_pair(latin_input, utf8_output);
29049 }
29050 /* end file src/westmere/sse_convert_latin1_to_utf8.cpp */
29051 /* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */
29052 template <endianness big_endian>
29053 std::pair<const char*, char16_t*> sse_convert_latin1_to_utf16(const char *latin1_input, size_t len,
29054                                                               char16_t *utf16_output) {
29055     size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
29056     for (size_t i = 0; i < rounded_len; i += 16) {
29057         // Load 16 Latin1 characters into a 128-bit register
29058         __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&latin1_input[i]));
29059         __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in)
29060                          : _mm_unpacklo_epi8(in, _mm_setzero_si128());
29061         __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in)
29062                          : _mm_unpackhi_epi8(in, _mm_setzero_si128());
29063         // Zero extend each Latin1 character to 16-bit integers and store the results back to memory
29064         _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i]), out1);
29065         _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i + 8]), out2);
29066     }
29067     // return pointers pointing to where we left off
29068     return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
29069 }
29070 /* end file src/westmere/sse_convert_latin1_to_utf16.cpp */
29071 /* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */
29072 std::pair<const char*, char32_t*> sse_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
29073     const char* end = buf + len;
29074 
29075     while (buf + 16 <= end) {
29076         // Load 16 Latin1 characters (16 bytes) into a 128-bit register
29077         __m128i in = _mm_loadu_si128((__m128i*)buf);
29078 
29079         // Shift input to process next 4 bytes
29080         __m128i in_shifted1 = _mm_srli_si128(in, 4);
29081         __m128i in_shifted2 = _mm_srli_si128(in, 8);
29082         __m128i in_shifted3 = _mm_srli_si128(in, 12);
29083 
29084         // expand 8-bit to 32-bit unit
29085         __m128i out1 = _mm_cvtepu8_epi32(in);
29086         __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
29087         __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
29088         __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);
29089 
29090         _mm_storeu_si128((__m128i*)utf32_output, out1);
29091         _mm_storeu_si128((__m128i*)(utf32_output + 4), out2);
29092         _mm_storeu_si128((__m128i*)(utf32_output + 8), out3);
29093         _mm_storeu_si128((__m128i*)(utf32_output + 12), out4);
29094 
29095         utf32_output += 16;
29096         buf += 16;
29097     }
29098 
29099     return std::make_pair(buf, utf32_output);
29100 }
29101 
29102 /* end file src/westmere/sse_convert_latin1_to_utf32.cpp */
29103 
29104 
29105 /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
29106 // depends on "tables/utf8_to_utf16_tables.h"
29107 
29108 
29109 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
29110 // end of the code points. Only the least significant 12 bits of the mask
29111 // are accessed.
29112 // It returns how many bytes were consumed (up to 12).
29113 template <endianness big_endian>
29114 size_t convert_masked_utf8_to_utf16(const char *input,
29115                            uint64_t utf8_end_of_code_point_mask,
29116                            char16_t *&utf16_output) {
29117   // we use an approach where we try to process up to 12 input bytes.
29118   // Why 12 input bytes and not 16? Because we are concerned with the size of
29119   // the lookup tables. Also 12 is nicely divisible by two and three.
29120   //
29121   //
29122   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
29123   // beneficial to have fast paths that depend on branch prediction but have less latency.
29124   // This results in more instructions but, potentially, also higher speeds.
29125   //
29126   // We first try a few fast paths.
29127   const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29128   const __m128i in = _mm_loadu_si128((__m128i *)input);
29129   const uint16_t input_utf8_end_of_code_point_mask =
29130       utf8_end_of_code_point_mask & 0xfff;
29131   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
29132     // We process the data in chunks of 16 bytes.
29133     __m128i ascii_first = _mm_cvtepu8_epi16(in);
29134     __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
29135     if (big_endian) {
29136       ascii_first = _mm_shuffle_epi8(ascii_first, swap);
29137       ascii_second = _mm_shuffle_epi8(ascii_second, swap);
29138     }
29139     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
29140     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
29141     utf16_output += 16; // We wrote 16 16-bit characters.
29142     return 16; // We consumed 16 bytes.
29143   }
29144   if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
29145     // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units.
29146     // There is probably a more efficient sequence, but the following might do.
29147     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29148     const __m128i perm = _mm_shuffle_epi8(in, sh);
29149     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29150     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29151     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29152     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
29153     _mm_storeu_si128((__m128i *)utf16_output, composed);
29154     utf16_output += 8; // We wrote 16 bytes, 8 code points.
29155     return 16;
29156   }
29157   if(input_utf8_end_of_code_point_mask == 0x924) {
29158     // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte UTF-16 code units.
29159     // There is probably a more efficient sequence, but the following might do.
29160     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
29161     const __m128i perm = _mm_shuffle_epi8(in, sh);
29162     const __m128i ascii =
29163         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
29164     const __m128i middlebyte =
29165         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
29166     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29167     const __m128i highbyte =
29168         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
29169     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29170     const __m128i composed =
29171         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
29172     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
29173     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
29174     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
29175     utf16_output += 4;
29176     return 12;
29177   }
29178   /// We do not have a fast path available, so we fallback.
29179 
29180   const uint8_t idx =
29181       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
29182   const uint8_t consumed =
29183       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
29184   if (idx < 64) {
29185     // SIX (6) input code-code units
29186     // this is a relatively easy scenario
29187     // we process SIX (6) input code-code units. The max length in bytes of six code
29188     // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
29189     // where pdep/pext is fast, we might be able to use a small lookup table.
29190     const __m128i sh =
29191         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29192     const __m128i perm = _mm_shuffle_epi8(in, sh);
29193     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29194     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29195     __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29196     if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
29197     _mm_storeu_si128((__m128i *)utf16_output, composed);
29198     utf16_output += 6; // We wrote 12 bytes, 6 code points.
29199   } else if (idx < 145) {
29200     // FOUR (4) input code-code units
29201     const __m128i sh =
29202         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29203     const __m128i perm = _mm_shuffle_epi8(in, sh);
29204     const __m128i ascii =
29205         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
29206     const __m128i middlebyte =
29207         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
29208     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29209     const __m128i highbyte =
29210         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
29211     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29212     const __m128i composed =
29213         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
29214      __m128i composed_repacked = _mm_packus_epi32(composed, composed);
29215     if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
29216     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
29217     utf16_output += 4;
29218   } else if (idx < 209) {
29219     // TWO (2) input code-code units
29220     //////////////
29221     // There might be garbage inputs where a leading byte mascarades as a four-byte
29222     // leading byte (by being followed by 3 continuation byte), but is not greater than
29223     // 0xf0. This could trigger a buffer overflow if we only counted leading
29224     // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
29225     // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
29226     // We do as at the cost of an extra mask.
29227     /////////////
29228     const __m128i sh =
29229         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29230     const __m128i perm = _mm_shuffle_epi8(in, sh);
29231     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
29232     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
29233     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29234     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
29235     // correct for spurious high bit
29236     const __m128i correct =
29237         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
29238     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
29239     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
29240     // We deliberately carry the leading four bits in highbyte if they are present,
29241     // we remove them later when computing hightenbits.
29242     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
29243     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
29244     // When we need to generate a surrogate pair (leading byte > 0xF0), then
29245     // the corresponding 32-bit value in 'composed'  will be greater than
29246     // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
29247     // location of the surrogate pairs.
29248     const __m128i composed =
29249         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
29250                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
29251     const __m128i composedminus =
29252         _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
29253     const __m128i lowtenbits =
29254         _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
29255     // Notice the 0x3ff mask:
29256     const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
29257     const __m128i lowtenbitsadd =
29258         _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
29259     const __m128i hightenbitsadd =
29260         _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
29261     const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
29262     __m128i surrogates =
29263         _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
29264     uint32_t basic_buffer[4];
29265     uint32_t basic_buffer_swap[4];
29266     if (big_endian) {
29267       _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
29268       surrogates = _mm_shuffle_epi8(surrogates, swap);
29269     }
29270     _mm_storeu_si128((__m128i *)basic_buffer, composed);
29271     uint32_t surrogate_buffer[4];
29272     _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
29273     for (size_t i = 0; i < 3; i++) {
29274       if(basic_buffer[i] > 0x3c00000) {
29275         utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
29276         utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
29277         utf16_output += 2;
29278       } else {
29279         utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
29280         utf16_output++;
29281       }
29282     }
29283   } else {
29284     // here we know that there is an error but we do not handle errors
29285   }
29286   return consumed;
29287 }
29288 /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
29289 /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
29290 // depends on "tables/utf8_to_utf16_tables.h"
29291 
29292 
29293 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
29294 // end of the code points. Only the least significant 12 bits of the mask
29295 // are accessed.
29296 // It returns how many bytes were consumed (up to 12).
29297 size_t convert_masked_utf8_to_utf32(const char *input,
29298                            uint64_t utf8_end_of_code_point_mask,
29299                            char32_t *&utf32_output) {
29300   // we use an approach where we try to process up to 12 input bytes.
29301   // Why 12 input bytes and not 16? Because we are concerned with the size of
29302   // the lookup tables. Also 12 is nicely divisible by two and three.
29303   //
29304   //
29305   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
29306   // beneficial to have fast paths that depend on branch prediction but have less latency.
29307   // This results in more instructions but, potentially, also higher speeds.
29308   //
29309   // We first try a few fast paths.
29310   const __m128i in = _mm_loadu_si128((__m128i *)input);
29311   const uint16_t input_utf8_end_of_code_point_mask =
29312       utf8_end_of_code_point_mask & 0xfff;
29313   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
29314     // We process the data in chunks of 16 bytes.
29315     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
29316     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
29317     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
29318     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
29319     utf32_output += 16; // We wrote 16 32-bit characters.
29320     return 16; // We consumed 16 bytes.
29321   }
29322   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
29323     // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units.
29324     // There is probably a more efficient sequence, but the following might do.
29325     const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29326     const __m128i perm = _mm_shuffle_epi8(in, sh);
29327     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29328     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29329     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29330     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
29331     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
29332     utf32_output += 8; // We wrote 32 bytes, 8 code points.
29333     return 16;
29334   }
29335   if(input_utf8_end_of_code_point_mask == 0x924) {
29336     // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units.
29337     // There is probably a more efficient sequence, but the following might do.
29338     const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
29339     const __m128i perm = _mm_shuffle_epi8(in, sh);
29340     const __m128i ascii =
29341         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
29342     const __m128i middlebyte =
29343         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
29344     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29345     const __m128i highbyte =
29346         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
29347     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29348     const __m128i composed =
29349         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
29350     _mm_storeu_si128((__m128i *)utf32_output, composed);
29351     utf32_output += 4;
29352     return 12;
29353   }
29354   /// We do not have a fast path available, so we fallback.
29355 
29356   const uint8_t idx =
29357       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
29358   const uint8_t consumed =
29359       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
29360   if (idx < 64) {
29361     // SIX (6) input code-code units
29362     // this is a relatively easy scenario
29363     // we process SIX (6) input code-code units. The max length in bytes of six code
29364     // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
29365     // where pdep/pext is fast, we might be able to use a small lookup table.
29366     const __m128i sh =
29367         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29368     const __m128i perm = _mm_shuffle_epi8(in, sh);
29369     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29370     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29371     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29372     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
29373     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
29374     utf32_output += 6; // We wrote 12 bytes, 6 code points.
29375   } else if (idx < 145) {
29376     // FOUR (4) input code-code units
29377     const __m128i sh =
29378         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29379     const __m128i perm = _mm_shuffle_epi8(in, sh);
29380     const __m128i ascii =
29381         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
29382     const __m128i middlebyte =
29383         _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
29384     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29385     const __m128i highbyte =
29386         _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
29387     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29388     const __m128i composed =
29389         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
29390     _mm_storeu_si128((__m128i *)utf32_output, composed);
29391     utf32_output += 4;
29392   } else if (idx < 209) {
29393     // TWO (2) input code-code units
29394     const __m128i sh =
29395         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29396     const __m128i perm = _mm_shuffle_epi8(in, sh);
29397     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
29398     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
29399     const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29400     __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
29401     // correct for spurious high bit
29402     const __m128i correct =
29403         _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
29404     middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
29405     const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
29406     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
29407     const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
29408     const __m128i composed =
29409         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
29410                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
29411     _mm_storeu_si128((__m128i *)utf32_output, composed);
29412     utf32_output += 3;
29413   } else {
29414     // here we know that there is an error but we do not handle errors
29415   }
29416   return consumed;
29417 }
29418 /* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
29419 /* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
29420 // depends on "tables/utf8_to_utf16_tables.h"
29421 
29422 
29423 // Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
29424 // end of the code points. Only the least significant 12 bits of the mask
29425 // are accessed.
29426 // It returns how many bytes were consumed (up to 12).
29427 size_t convert_masked_utf8_to_latin1(const char *input,
29428                            uint64_t utf8_end_of_code_point_mask,
29429                            char *&latin1_output) {
29430   // we use an approach where we try to process up to 12 input bytes.
29431   // Why 12 input bytes and not 16? Because we are concerned with the size of
29432   // the lookup tables. Also 12 is nicely divisible by two and three.
29433   //
29434   //
29435   // Optimization note: our main path below is load-latency dependent. Thus it is maybe
29436   // beneficial to have fast paths that depend on branch prediction but have less latency.
29437   // This results in more instructions but, potentially, also higher speeds.
29438   //
29439   const __m128i in = _mm_loadu_si128((__m128i *)input);
29440   const uint16_t input_utf8_end_of_code_point_mask =
29441       utf8_end_of_code_point_mask & 0xfff; //we're only processing 12 bytes in case it`s not all ASCII
29442   if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
29443     // We process the data in chunks of 16 bytes.
29444     _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
29445     latin1_output += 16; // We wrote 16 characters.
29446     return 16; // We consumed 16 bytes.
29447   }
29448   /// We do not have a fast path available, so we fallback.
29449   const uint8_t idx =
29450       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
29451   const uint8_t consumed =
29452       tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
29453   // this indicates an invalid input:
29454   if(idx >= 64) { return consumed; }
29455   // Here we should have (idx < 64), if not, there is a bug in the validation or elsewhere.
29456   // SIX (6) input code-code units
29457   // this is a relatively easy scenario
29458   // we process SIX (6) input code-code units. The max length in bytes of six code
29459   // code units spanning between 1 and 2 bytes each is 12 bytes. On processors
29460   // where pdep/pext is fast, we might be able to use a small lookup table.
29461   const __m128i sh =
29462         _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29463   const __m128i perm = _mm_shuffle_epi8(in, sh);
29464   const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29465   const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29466   __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29467   const __m128i latin1_packed = _mm_packus_epi16(composed,composed);
29468   // writing 8 bytes even though we only care about the first 6 bytes.
29469   // performance note: it would be faster to use _mm_storeu_si128, we should investigate.
29470   _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
29471   latin1_output += 6; // We wrote 6 bytes.
29472   return consumed;
29473 }
29474 /* end file src/westmere/sse_convert_utf8_to_latin1.cpp */
29475 
29476 /* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */
29477 template <endianness big_endian>
29478 std::pair<const char16_t*, char*> sse_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) {
29479   const char16_t* end = buf + len;
29480   while (buf + 8 <= end) {
29481     // Load 8 UTF-16 characters into 128-bit SSE register
29482     __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
29483 
29484     if (!match_system(big_endian)) {
29485       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29486       in = _mm_shuffle_epi8(in, swap);
29487     }
29488 
29489     __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
29490     if (_mm_testz_si128(in, high_byte_mask)) {
29491       // Pack 16-bit characters into 8-bit and store in latin1_output
29492       __m128i latin1_packed = _mm_packus_epi16(in, in);
29493       _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed);
29494       // Adjust pointers for next iteration
29495       buf += 8;
29496       latin1_output += 8;
29497     } else {
29498       return std::make_pair(nullptr, reinterpret_cast<char*>(latin1_output));
29499     }
29500   } // while
29501   return std::make_pair(buf, latin1_output);
29502 }
29503 
29504 template <endianness big_endian>
29505 std::pair<result, char*> sse_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) {
29506   const char16_t* start = buf;
29507   const char16_t* end = buf + len;
29508   while (buf + 8 <= end) {
29509     __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
29510 
29511     if (!big_endian) {
29512       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29513       in = _mm_shuffle_epi8(in, swap);
29514     }
29515 
29516     __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
29517     if (_mm_testz_si128(in, high_byte_mask)) {
29518       __m128i latin1_packed = _mm_packus_epi16(in, in);
29519       _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed);
29520       buf += 8;
29521       latin1_output += 8;
29522     } else {
29523       // Fallback to scalar code for handling errors
29524       for(int k = 0; k < 8; k++) {
29525         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29526         if(word <= 0xff) {
29527           *latin1_output++ = char(word);
29528         } else {
29529           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output);
29530         }
29531       }
29532       buf += 8;
29533     }
29534   } // while
29535   return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output);
29536 }
29537 /* end file src/westmere/sse_convert_utf16_to_latin1.cpp */
29538 /* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
29539 /*
29540     The vectorized algorithm works on single SSE register i.e., it
29541     loads eight 16-bit code units.
29542 
29543     We consider three cases:
29544     1. an input register contains no surrogates and each value
29545        is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
29548     3. an input register contains surrogates --- i.e. codepoints
29549        can have 16 or 32 bits.
29550 
29551     Ad 1.
29552 
29553     When values are less than 0x0800, it means that a 16-bit code unit
29554     can be converted into: 1) single UTF8 byte (when it's an ASCII
29555     char) or 2) two UTF8 bytes.
29556 
29557     For this case we do only some shuffle to obtain these 2-byte
29558     codes and finally compress the whole SSE register with a single
29559     shuffle.
29560 
29561     We need 256-entry lookup table to get a compression pattern
29562     and the number of output bytes in the compressed vector register.
29563     Each entry occupies 17 bytes.
29564 
29565     Ad 2.
29566 
29567     When values fit in 16-bit code units, but are above 0x07ff, then
29568     a single word may produce one, two or three UTF8 bytes.
29569 
29570     We prepare data for all these three cases in two registers.
29571     The first register contains lower two UTF8 bytes (used in all
29572     cases), while the second one contains just the third byte for
29573     the three-UTF8-bytes case.
29574 
29575     Finally these two registers are interleaved forming eight-element
29576     array of 32-bit values. The array spans two SSE registers.
29577     The bytes from the registers are compressed using two shuffles.
29578 
29579     We need 256-entry lookup table to get a compression pattern
29580     and the number of output bytes in the compressed vector register.
29581     Each entry occupies 17 bytes.
29582 
29583 
29584     To summarize:
29585     - We need two 256-entry tables that have 8704 bytes in total.
29586 */
29587 
29588 /*
29589   Returns a pair: the first unprocessed byte from buf and utf8_output
29590   A scalar routing should carry on the conversion of the tail.
29591 */
29592 template <endianness big_endian>
29593 std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
29594 
29595   const char16_t* end = buf + len;
29596 
29597   const __m128i v_0000 = _mm_setzero_si128();
29598   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
29599   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
29600   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
29601 
29602   while (buf + 16 + safety_margin <= end) {
29603     __m128i in = _mm_loadu_si128((__m128i*)buf);
29604     if (big_endian) {
29605       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29606       in = _mm_shuffle_epi8(in, swap);
29607     }
29608     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29609     const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
29610     if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
29611         __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
29612         if (big_endian) {
29613           const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29614           nextin = _mm_shuffle_epi8(nextin, swap);
29615         }
29616         if(!_mm_testz_si128(nextin, v_ff80)) {
29617           // 1. pack the bytes
29618           // obviously suboptimal.
29619           const __m128i utf8_packed = _mm_packus_epi16(in,in);
29620           // 2. store (16 bytes)
29621           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29622           // 3. adjust pointers
29623           buf += 8;
29624           utf8_output += 8;
29625           in = nextin;
29626         } else {
29627           // 1. pack the bytes
29628           // obviously suboptimal.
29629           const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
29630           // 2. store (16 bytes)
29631           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29632           // 3. adjust pointers
29633           buf += 16;
29634           utf8_output += 16;
29635           continue; // we are done for this round!
29636         }
29637     }
29638 
29639     // no bits set above 7th bit
29640     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
29641     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
29642 
29643     // no bits set above 11th bit
29644     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
29645     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
29646 
29647     if (one_or_two_bytes_bitmask == 0xffff) {
29648       internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask);
29649       buf += 8;
29650       continue;
29651     }
29652 
29653     // 1. Check if there are any surrogate word in the input chunk.
29654     //    We have also deal with situation when there is a surrogate word
29655     //    at the end of a chunk.
29656     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
29657 
29658     // bitmask = 0x0000 if there are no surrogates
29659     //         = 0xc000 if the last word is a surrogate
29660     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
29661     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
29662     // it is likely an uncommon occurrence.
29663     if (surrogates_bitmask == 0x0000) {
29664       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
29665         const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
29666                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
29667 
29668         /* In this branch we handle three cases:
29669            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
29670            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
29671            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
29672 
29673           We expand the input word (16-bit) into two code units (32-bit), thus
29674           we have room for four bytes. However, we need five distinct bit
29675           layouts. Note that the last byte in cases #2 and #3 is the same.
29676 
29677           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
29678           in register t2.
29679 
29680           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
29681           either byte 1 for case #2 or byte 2 for case #3. Note that they
29682           differ by exactly one bit.
29683 
29684           Finally from these two code units we build proper UTF-8 sequence, taking
29685           into account the case (i.e, the number of bytes to write).
29686         */
29687         /**
29688          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
29689          * t2 => [0ccc|cccc] [10cc|cccc]
29690          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
29691          */
29692 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
29693         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
29694         const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
29695         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
29696         const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
29697         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
29698         const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
29699 
29700         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
29701         const __m128i s0 = _mm_srli_epi16(in, 4);
29702         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
29703         const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
29704         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
29705         const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
29706         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
29707         const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
29708         const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
29709         const __m128i s4 = _mm_xor_si128(s3, m0);
29710 #undef simdutf_vec
29711 
29712         // 4. expand code units 16-bit => 32-bit
29713         const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
29714         const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
29715 
29716         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
29717         const uint16_t mask = (one_byte_bitmask & 0x5555) |
29718                               (one_or_two_bytes_bitmask & 0xaaaa);
29719         if(mask == 0) {
29720           // We only have three-byte code units. Use fast path.
29721           const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
29722           const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
29723           const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
29724           _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29725           utf8_output += 12;
29726           _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29727           utf8_output += 12;
29728           buf += 8;
29729           continue;
29730         }
29731         const uint8_t mask0 = uint8_t(mask);
29732 
29733         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
29734         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
29735         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
29736 
29737         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
29738 
29739         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
29740         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
29741         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
29742 
29743         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29744         utf8_output += row0[0];
29745         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29746         utf8_output += row1[0];
29747 
29748         buf += 8;
29749     // surrogate pair(s) in a register
29750     } else {
29751       // Let us do a scalar fallback.
29752       // It may seem wasteful to use scalar code, but being efficient with SIMD
29753       // in the presence of surrogate pairs may require non-trivial tables.
29754       size_t forward = 15;
29755       size_t k = 0;
29756       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
29757       for(; k < forward; k++) {
29758         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29759         if((word & 0xFF80)==0) {
29760           *utf8_output++ = char(word);
29761         } else if((word & 0xF800)==0) {
29762           *utf8_output++ = char((word>>6) | 0b11000000);
29763           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29764         } else if((word &0xF800 ) != 0xD800) {
29765           *utf8_output++ = char((word>>12) | 0b11100000);
29766           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29767           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29768         } else {
29769           // must be a surrogate pair
29770           uint16_t diff = uint16_t(word - 0xD800);
29771           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
29772           k++;
29773           uint16_t diff2 = uint16_t(next_word - 0xDC00);
29774           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
29775           uint32_t value = (diff << 10) + diff2 + 0x10000;
29776           *utf8_output++ = char((value>>18) | 0b11110000);
29777           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
29778           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
29779           *utf8_output++ = char((value & 0b111111) | 0b10000000);
29780         }
29781       }
29782       buf += k;
29783     }
29784   } // while
29785 
29786   return std::make_pair(buf, utf8_output);
29787 }
29788 
29789 
29790 /*
29791   Returns a pair: a result struct and utf8_output.
29792   If there is an error, the count field of the result is the position of the error.
29793   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
29794   A scalar routing should carry on the conversion of the tail if needed.
29795 */
29796 template <endianness big_endian>
29797 std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
29798   const char16_t* start = buf;
29799   const char16_t* end = buf + len;
29800 
29801   const __m128i v_0000 = _mm_setzero_si128();
29802   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
29803   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
29804   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
29805 
29806   while (buf + 16 + safety_margin <= end) {
29807     __m128i in = _mm_loadu_si128((__m128i*)buf);
29808     if (big_endian) {
29809       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29810       in = _mm_shuffle_epi8(in, swap);
29811     }
29812     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29813     const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
29814     if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
29815         __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
29816         if (big_endian) {
29817           const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29818           nextin = _mm_shuffle_epi8(nextin, swap);
29819         }
29820         if(!_mm_testz_si128(nextin, v_ff80)) {
29821           // 1. pack the bytes
29822           // obviously suboptimal.
29823           const __m128i utf8_packed = _mm_packus_epi16(in,in);
29824           // 2. store (16 bytes)
29825           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29826           // 3. adjust pointers
29827           buf += 8;
29828           utf8_output += 8;
29829           in = nextin;
29830         } else {
29831           // 1. pack the bytes
29832           // obviously suboptimal.
29833           const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
29834           // 2. store (16 bytes)
29835           _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29836           // 3. adjust pointers
29837           buf += 16;
29838           utf8_output += 16;
29839           continue; // we are done for this round!
29840         }
29841     }
29842 
29843     // no bits set above 7th bit
29844     const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
29845     const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
29846 
29847     // no bits set above 11th bit
29848     const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
29849     const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
29850 
29851     if (one_or_two_bytes_bitmask == 0xffff) {
29852       internal::westmere::write_v_u16_11bits_to_utf8(in, utf8_output, one_byte_bytemask, one_byte_bitmask);
29853       buf += 8;
29854       continue;
29855     }
29856 
29857     // 1. Check if there are any surrogate word in the input chunk.
29858     //    We have also deal with situation when there is a surrogate word
29859     //    at the end of a chunk.
29860     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
29861 
29862     // bitmask = 0x0000 if there are no surrogates
29863     //         = 0xc000 if the last word is a surrogate
29864     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
29865     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
29866     // it is likely an uncommon occurrence.
29867     if (surrogates_bitmask == 0x0000) {
29868       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
29869         const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
29870                                                 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
29871 
29872         /* In this branch we handle three cases:
29873            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
29874            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
29875            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
29876 
29877           We expand the input word (16-bit) into two code units (32-bit), thus
29878           we have room for four bytes. However, we need five distinct bit
29879           layouts. Note that the last byte in cases #2 and #3 is the same.
29880 
29881           We precompute byte 1 for case #1 and the common byte for cases #2 & #3
29882           in register t2.
29883 
29884           We precompute byte 1 for case #3 and -- **conditionally** -- precompute
29885           either byte 1 for case #2 or byte 2 for case #3. Note that they
29886           differ by exactly one bit.
29887 
29888           Finally from these two code units we build proper UTF-8 sequence, taking
29889           into account the case (i.e, the number of bytes to write).
29890         */
29891         /**
29892          * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
29893          * t2 => [0ccc|cccc] [10cc|cccc]
29894          * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
29895          */
29896 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
29897         // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
29898         const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
29899         // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
29900         const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
29901         // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
29902         const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
29903 
29904         // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
29905         const __m128i s0 = _mm_srli_epi16(in, 4);
29906         // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
29907         const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
29908         // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
29909         const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
29910         // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
29911         const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
29912         const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
29913         const __m128i s4 = _mm_xor_si128(s3, m0);
29914 #undef simdutf_vec
29915 
29916         // 4. expand code units 16-bit => 32-bit
29917         const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
29918         const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
29919 
29920         // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
29921         const uint16_t mask = (one_byte_bitmask & 0x5555) |
29922                               (one_or_two_bytes_bitmask & 0xaaaa);
29923         if(mask == 0) {
29924           // We only have three-byte code units. Use fast path.
29925           const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
29926           const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
29927           const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
29928           _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29929           utf8_output += 12;
29930           _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29931           utf8_output += 12;
29932           buf += 8;
29933           continue;
29934         }
29935         const uint8_t mask0 = uint8_t(mask);
29936 
29937         const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
29938         const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
29939         const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
29940 
29941         const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
29942 
29943         const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
29944         const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
29945         const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
29946 
29947         _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29948         utf8_output += row0[0];
29949         _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29950         utf8_output += row1[0];
29951 
29952         buf += 8;
29953     // surrogate pair(s) in a register
29954     } else {
29955       // Let us do a scalar fallback.
29956       // It may seem wasteful to use scalar code, but being efficient with SIMD
29957       // in the presence of surrogate pairs may require non-trivial tables.
29958       size_t forward = 15;
29959       size_t k = 0;
29960       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
29961       for(; k < forward; k++) {
29962         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29963         if((word & 0xFF80)==0) {
29964           *utf8_output++ = char(word);
29965         } else if((word & 0xF800)==0) {
29966           *utf8_output++ = char((word>>6) | 0b11000000);
29967           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29968         } else if((word &0xF800 ) != 0xD800) {
29969           *utf8_output++ = char((word>>12) | 0b11100000);
29970           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29971           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29972         } else {
29973           // must be a surrogate pair
29974           uint16_t diff = uint16_t(word - 0xD800);
29975           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
29976           k++;
29977           uint16_t diff2 = uint16_t(next_word - 0xDC00);
29978           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
29979           uint32_t value = (diff << 10) + diff2 + 0x10000;
29980           *utf8_output++ = char((value>>18) | 0b11110000);
29981           *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
29982           *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
29983           *utf8_output++ = char((value & 0b111111) | 0b10000000);
29984         }
29985       }
29986       buf += k;
29987     }
29988   } // while
29989 
29990   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
29991 }
29992 /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
29993 /* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
29994 /*
29995     The vectorized algorithm works on single SSE register i.e., it
29996     loads eight 16-bit code units.
29997 
29998     We consider three cases:
29999     1. an input register contains no surrogates and each value
30000        is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       in range 0x0000 .. 0xffff.
30003     3. an input register contains surrogates --- i.e. codepoints
30004        can have 16 or 32 bits.
30005 
30006     Ad 1.
30007 
30008     When values are less than 0x0800, it means that a 16-bit code unit
30009     can be converted into: 1) single UTF8 byte (when it's an ASCII
30010     char) or 2) two UTF8 bytes.
30011 
30012     For this case we do only some shuffle to obtain these 2-byte
30013     codes and finally compress the whole SSE register with a single
30014     shuffle.
30015 
30016     We need 256-entry lookup table to get a compression pattern
30017     and the number of output bytes in the compressed vector register.
30018     Each entry occupies 17 bytes.
30019 
30020     Ad 2.
30021 
30022     When values fit in 16-bit code units, but are above 0x07ff, then
30023     a single word may produce one, two or three UTF8 bytes.
30024 
30025     We prepare data for all these three cases in two registers.
30026     The first register contains lower two UTF8 bytes (used in all
30027     cases), while the second one contains just the third byte for
30028     the three-UTF8-bytes case.
30029 
30030     Finally these two registers are interleaved forming eight-element
30031     array of 32-bit values. The array spans two SSE registers.
30032     The bytes from the registers are compressed using two shuffles.
30033 
30034     We need 256-entry lookup table to get a compression pattern
30035     and the number of output bytes in the compressed vector register.
30036     Each entry occupies 17 bytes.
30037 
30038 
30039     To summarize:
30040     - We need two 256-entry tables that have 8704 bytes in total.
30041 */
30042 
30043 /*
30044   Returns a pair: the first unprocessed byte from buf and utf8_output
30045   A scalar routing should carry on the conversion of the tail.
30046 */
30047 template <endianness big_endian>
30048 std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
30049   const char16_t* end = buf + len;
30050 
30051   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
30052   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
30053 
30054   while (buf + 8 <= end) {
30055     __m128i in = _mm_loadu_si128((__m128i*)buf);
30056 
30057     if (big_endian) {
30058       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30059       in = _mm_shuffle_epi8(in, swap);
30060     }
30061 
30062     // 1. Check if there are any surrogate word in the input chunk.
30063     //    We have also deal with situation when there is a surrogate word
30064     //    at the end of a chunk.
30065     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
30066 
30067     // bitmask = 0x0000 if there are no surrogates
30068     //         = 0xc000 if the last word is a surrogate
30069     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
30070     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
30071     // it is likely an uncommon occurrence.
30072     if (surrogates_bitmask == 0x0000) {
30073       // case: no surrogate pair, extend 16-bit code units to 32-bit code units
30074         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
30075         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
30076         utf32_output += 8;
30077         buf += 8;
30078     // surrogate pair(s) in a register
30079     } else {
30080       // Let us do a scalar fallback.
30081       // It may seem wasteful to use scalar code, but being efficient with SIMD
30082       // in the presence of surrogate pairs may require non-trivial tables.
30083       size_t forward = 15;
30084       size_t k = 0;
30085       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
30086       for(; k < forward; k++) {
30087         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30088         if((word &0xF800 ) != 0xD800) {
30089           *utf32_output++ = char32_t(word);
30090         } else {
30091           // must be a surrogate pair
30092           uint16_t diff = uint16_t(word - 0xD800);
30093           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
30094           k++;
30095           uint16_t diff2 = uint16_t(next_word - 0xDC00);
30096           if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
30097           uint32_t value = (diff << 10) + diff2 + 0x10000;
30098           *utf32_output++ = char32_t(value);
30099         }
30100       }
30101       buf += k;
30102     }
30103   } // while
30104   return std::make_pair(buf, utf32_output);
30105 }
30106 
30107 
30108 /*
30109   Returns a pair: a result struct and utf8_output.
30110   If there is an error, the count field of the result is the position of the error.
30111   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
30112   A scalar routing should carry on the conversion of the tail if needed.
30113 */
30114 template <endianness big_endian>
30115 std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
30116   const char16_t* start = buf;
30117   const char16_t* end = buf + len;
30118 
30119   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
30120   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
30121 
30122   while (buf + 8 <= end) {
30123     __m128i in = _mm_loadu_si128((__m128i*)buf);
30124 
30125     if (big_endian) {
30126       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30127       in = _mm_shuffle_epi8(in, swap);
30128     }
30129 
30130     // 1. Check if there are any surrogate word in the input chunk.
30131     //    We have also deal with situation when there is a surrogate word
30132     //    at the end of a chunk.
30133     const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
30134 
30135     // bitmask = 0x0000 if there are no surrogates
30136     //         = 0xc000 if the last word is a surrogate
30137     const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
30138     // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
30139     // it is likely an uncommon occurrence.
30140     if (surrogates_bitmask == 0x0000) {
30141       // case: no surrogate pair, extend 16-bit code units to 32-bit code units
30142         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
30143         _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
30144         utf32_output += 8;
30145         buf += 8;
30146     // surrogate pair(s) in a register
30147     } else {
30148       // Let us do a scalar fallback.
30149       // It may seem wasteful to use scalar code, but being efficient with SIMD
30150       // in the presence of surrogate pairs may require non-trivial tables.
30151       size_t forward = 15;
30152       size_t k = 0;
30153       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
30154       for(; k < forward; k++) {
30155         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30156         if((word &0xF800 ) != 0xD800) {
30157           *utf32_output++ = char32_t(word);
30158         } else {
30159           // must be a surrogate pair
30160           uint16_t diff = uint16_t(word - 0xD800);
30161           uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
30162           k++;
30163           uint16_t diff2 = uint16_t(next_word - 0xDC00);
30164           if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
30165           uint32_t value = (diff << 10) + diff2 + 0x10000;
30166           *utf32_output++ = char32_t(value);
30167         }
30168       }
30169       buf += k;
30170     }
30171   } // while
30172   return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
30173 }
30174 /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
30175 
30176 /* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */
/*
  Narrows UTF-32 code units to Latin-1 bytes, sixteen code units per step.

  Returns a pair: a pointer to the first code unit of buf that has not been
  converted, and the position in latin1_output just past the last byte written.
  The first member is nullptr as soon as a group of sixteen contains a value
  above 0xFF; nothing is written for that group.
  Only whole groups of sixteen are handled; the remaining len % 16 code units
  are left for the caller.
*/
std::pair<const char32_t *, char *>
sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  // End of the part of the input that consists of whole 16-unit groups.
  const char32_t *const simd_end = buf + (len & ~size_t(0xF));

  // Any bit above the low byte means the value does not fit in Latin-1.
  const __m128i above_latin1 = _mm_set1_epi32(static_cast<int>(0xFFFFFF00));
  // Moves the low byte of each 32-bit lane into bytes 0..3 and zeroes the rest.
  const __m128i low_bytes = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);

  while (buf != simd_end) {
    const __m128i *const chunk = reinterpret_cast<const __m128i *>(buf);
    const __m128i q0 = _mm_loadu_si128(chunk);
    const __m128i q1 = _mm_loadu_si128(chunk + 1);
    const __m128i q2 = _mm_loadu_si128(chunk + 2);
    const __m128i q3 = _mm_loadu_si128(chunk + 3);

    // OR the four registers so that a single test covers all sixteen values.
    const __m128i any_bits = _mm_or_si128(_mm_or_si128(_mm_or_si128(q0, q1), q2), q3);
    if (_mm_testz_si128(any_bits, above_latin1) == 0) {
      return std::make_pair(nullptr, latin1_output);
    }

    // Each shuffle yields four bytes; interleave them into one 16-byte vector.
    const __m128i first_eight = _mm_unpacklo_epi32(_mm_shuffle_epi8(q0, low_bytes),
                                                   _mm_shuffle_epi8(q1, low_bytes));
    const __m128i last_eight = _mm_unpacklo_epi32(_mm_shuffle_epi8(q2, low_bytes),
                                                  _mm_shuffle_epi8(q3, low_bytes));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output),
                     _mm_unpacklo_epi64(first_eight, last_eight));

    buf += 16;
    latin1_output += 16;
  }

  return std::make_pair(buf, latin1_output);
}
30209 
30210 std::pair<result, char *>
30211 sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
30212                                         char *latin1_output) {
30213   const char32_t *start = buf;
30214   const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
30215 
30216   __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
30217   __m128i shufmask =
30218       _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
30219 
30220   for (size_t i = 0; i < rounded_len; i += 16) {
30221     __m128i in1 = _mm_loadu_si128((__m128i *)buf);
30222     __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
30223     __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
30224     __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
30225 
30226     __m128i check_combined = _mm_or_si128(in1, in2);
30227     check_combined = _mm_or_si128(check_combined, in3);
30228     check_combined = _mm_or_si128(check_combined, in4);
30229 
30230     if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
30231       // Fallback to scalar code for handling errors
30232       for (int k = 0; k < 16; k++) {
30233         char32_t codepoint = buf[k];
30234         if (codepoint <= 0xff) {
30235           *latin1_output++ = char(codepoint);
30236         } else {
30237           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
30238                                 latin1_output);
30239         }
30240       }
30241       buf += 16;
30242       continue;
30243     }
30244     __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), _mm_shuffle_epi8(in2, shufmask));
30245     __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), _mm_shuffle_epi8(in4, shufmask));
30246     __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
30247     _mm_storeu_si128((__m128i *)latin1_output, pack);
30248     latin1_output += 16;
30249     buf += 16;
30250   }
30251 
30252   return std::make_pair(result(error_code::SUCCESS, buf - start),
30253                         latin1_output);
30254 }
30255 /* end file src/westmere/sse_convert_utf32_to_latin1.cpp */
30256 /* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
/*
  Converts UTF-32 code units to UTF-8, normally eight input code units per
  iteration (sixteen on the all-ASCII fast path).

  Returns a pair: a pointer to the first code unit of buf that has not been
  converted, and the position in utf8_output just past the last byte written.
  The first member is nullptr when the input is invalid, i.e. it contains a
  value above 0x10FFFF or a value in the surrogate range 0xD800..0xDFFF.

  In the vectorized branches validation is deferred: the element-wise maximum
  of the loaded registers (running_max) and the accumulated surrogate mask
  (forbidden_bytemask) are only inspected after the main loop, so output may
  already have been written when nullptr is returned. The scalar fallback
  checks each code unit immediately.

  The main loop only runs while at least 16 + safety_margin code units remain,
  so a tail is always left for the caller.
*/
std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
  const char32_t* end = buf + len;

  const __m128i v_0000 = _mm_setzero_si128();// all-zero vector used as comparison target
  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 0000 in every 16-bit lane
  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 0000 in every 16-bit lane
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 0000 in every 16-bit lane
  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); // upper 16 bits of every 32-bit lane
  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // everything but the top bit of every 32-bit lane
  // Element-wise maximum of the input registers loaded in the main loop; checked against 0x10FFFF after the loop.
  __m128i running_max = _mm_setzero_si128();
  // Accumulates 16-bit lanes that held a surrogate value in the 1/2/3-byte branch; checked after the loop.
  __m128i forbidden_bytemask = _mm_setzero_si128();
  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin <= end) { // buf is a char32_t pointer, so this requires at least 16 + safety_margin code units of input to remain
    // We load two 16-byte registers for a total of 32 bytes, i.e. 8 UTF-32 code units.
    __m128i in = _mm_loadu_si128((__m128i*)buf);
    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);// the cast binds before +1, so this is the next 16 bytes (code units 4..7)
    running_max = _mm_max_epu32(
                                _mm_max_epu32(in, running_max), // element-wise unsigned max of in and the running maximum
                                 nextin); // ... then folded with nextin as well

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
    __m128i in_16 = _mm_packus_epi32(
                                      _mm_and_si128(in, v_7fffffff),
                                      _mm_and_si128(nextin, v_7fffffff)
                                      );// eight 16-bit lanes: four from in followed by four from nextin
    // _mm_packus_epi32 interprets its inputs as signed 32-bit integers and saturates them to the
    // unsigned 16-bit range. Clearing the top bit first (& v_7fffffff) keeps every lane non-negative,
    // so a value that does not fit in 16 bits saturates to 0xffff instead of being clamped to zero.
    // Values above 0xffff are detected separately below (see saturation_bytemask).

    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path

    // ASCII fast path!!!!
    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
    // The intuition is that we try to collect 16 ASCII characters which requires
    // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
    // as our new inputs.
    if(_mm_testz_si128(in_16, v_ff80)) {  // true when all of the first eight code units are ASCII
      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
      running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);// fold the two extra registers into the running maximum
      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));// same 32 => 16 bit packing for code units 8..15
      if(!_mm_testz_si128(nextin_16, v_ff80)) {  // the second group of eight is NOT all ASCII
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); // the eight ASCII bytes, duplicated in both halves of the register
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); // 16 bytes are stored, but only the first 8 are kept (see pointer update)
        // 3. adjust pointers
        buf += 8; // eight input code units consumed
        utf8_output += 8; // eight output bytes produced
        // Proceed with next input
        in_16 = nextin_16;
        // We need to update in and nextin because they are used later.
        in = thirdin;
        nextin = fourthin;
      } else {
        // 1. pack the bytes
        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    // no bits set above 7th bit -- find out all the ASCII characters
    const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // compares each 16-bit lane:
                                                      _mm_and_si128(in_16, v_ff80), // the upper 9 bits of the lane
                                                       v_0000 // against zero
                                                       ); // the upper 9 bits are all zero exactly when the lane holds an ASCII value
    // _mm_cmpeq_epi16 yields 0xffff in a lane where the comparison holds and 0x0000 otherwise
    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask)); // one bit per byte, hence two identical bits per 16-bit lane

    // no bits set above 11th bit
    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift every 16-bit lane left by two bits
      // t1 = [000a|aaaa|0000|0000]
      const __m128i t1 = _mm_and_si128(t0, v_1f00); // payload bits of the leading UTF-8 byte
      // t2 = [0000|0000|00bb|bbbb]
      const __m128i t2 = _mm_and_si128(in_16, v_003f);// payload bits of the continuation byte
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m128i t3 = _mm_or_si128(t1, t2); // both payloads combined
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m128i t4 = _mm_or_si128(t3, v_c080); // add the 110xxxxx / 10xxxxxx marker bits

      // 2. merge ASCII and 2-byte codewords
      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
      // 4. pack the bytes
      // Each table row holds the number of output bytes in row[0], followed by a 16-byte shuffle pattern.
      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

      // 5. store bytes
      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }

    // Check for overflow in packing

    // A 32-bit lane compares equal to zero here only if its upper 16 bits are clear in both in and nextin.
    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
      // Remember any lane in the surrogate range 0xD800..0xDFFF; it is reported after the main loop.
      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));

      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      // For lanes that need three bytes, flip one bit so that [11bb|bbbb] becomes the continuation byte [10bb|bbbb].
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      // Two bits per code unit: the even bit says "one byte", the odd bit says "one or two bytes".
      const uint16_t mask = (one_byte_bitmask & 0x5555) |
                            (one_or_two_bytes_bitmask & 0xaaaa);
      if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        // Each store writes 16 bytes, of which 12 (four code units x three bytes) are kept.
        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      // The low byte of mask describes the first four code units, the high byte the last four.
      const uint8_t mask0 = uint8_t(mask);

      // Each table row holds the number of output bytes in row[0], followed by a 16-byte shuffle pattern.
      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
    } else {
      // case: at least one of the eight 32-bit code units has bits set above bit 15, so it needs
      // four UTF-8 bytes (it would be a surrogate pair in UTF-16) or is out of range.
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      // Never walk past the last code unit of the input.
      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
      for(; k < forward; k++) {
        uint32_t word = buf[k];
        if((word & 0xFFFFFF80)==0) {
          // one byte: 0xxxxxxx
          *utf8_output++ = char(word);
        } else if((word & 0xFFFFF800)==0) {
          // two bytes: 110xxxxx 10xxxxxx
          *utf8_output++ = char((word>>6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if((word &0xFFFF0000 )==0) {
          // three bytes: 1110xxxx 10xxxxxx 10xxxxxx; surrogate values are rejected
          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
          *utf8_output++ = char((word>>12) | 0b11100000);
          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; values above 0x10FFFF are rejected
          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
          *utf8_output++ = char((word>>18) | 0b11110000);
          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  // max(running_max, 0x10ffff) equals 0x10ffff in every lane only if no loaded value exceeded 0x10ffff.
  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
  if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  // Any bit set here means a surrogate value was seen in the 1/2/3-byte branch.
  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }

  return std::make_pair(buf, utf8_output);
}
30514 
30515 
/**
 * SSE kernel converting UTF-32 code units to UTF-8 with error reporting.
 *
 * The main loop only runs while at least 16 + safety_margin (= 28) code units
 * remain, so the tail of the input is left unconverted; the count in the
 * returned result tells how many input code units were consumed.
 * NOTE(review): the caller presumably finishes the tail (and refines error
 * positions) with a scalar routine -- confirm at the call site.
 *
 * Each iteration examines 8 code units and picks one of these paths:
 *   - ASCII fast path (emits 16 bytes when the following 8 units are ASCII too),
 *   - all units need 1 or 2 UTF-8 bytes,
 *   - all units need 1, 2 or 3 UTF-8 bytes,
 *   - scalar fallback (15 units) when some unit needs 4 UTF-8 bytes.
 *
 * @param buf          input UTF-32 code units
 * @param len          number of code units in buf
 * @param utf8_output  destination buffer; the vector paths always store whole
 *                     16-byte registers, so writes can extend past the bytes
 *                     that are actually valid output
 * @return pair of (result, pointer just past the last valid output byte).
 *         result is SUCCESS with the number of consumed code units, or
 *         TOO_LARGE / SURROGATE with an input position. In the vector paths
 *         that position is the start of the group being examined
 *         (buf - start); only the scalar fallback reports the exact index.
 */
std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {

  const char32_t* end = buf + len;
  const char32_t* start = buf;

  const __m128i v_0000 = _mm_setzero_si128();
  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin <= end) {
    // We load two 16 bytes registers for a total of 32 bytes or 8 UTF-32 code units.
    __m128i in = _mm_loadu_si128((__m128i*)buf);
    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);

    // Check for too large input
    // (unsigned max against 0x10ffff: any lane above the limit survives the max
    // and then fails the equality test below)
    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
    if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
    }

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));

    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path
    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
      // We eagerly load another 32 bytes, hoping that they will be ASCII too.
      // The intuition is that we try to collect 16 ASCII characters which requires
      // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
      // as our new inputs.
      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
      if(!_mm_testz_si128(nextin_16, v_ff80)) {
        // Only the first 8 code units are ASCII: emit them and fall through
        // to the general code below with the second group as the new input.
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
        // 2. store (16 bytes, of which only the first 8 are kept)
        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        // Proceed with next input
        in_16 = nextin_16;
        // The second group has not been range-checked yet, do it now.
        __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
        if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
        }
        // We need to update in and nextin because they are used later.
        in = thirdin;
        nextin = fourthin;
      } else {
        // All 16 code units are ASCII.
        // 1. pack the bytes
        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    // no bits set above 7th bit
    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m128i t0 = _mm_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m128i t1 = _mm_and_si128(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m128i t2 = _mm_and_si128(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m128i t3 = _mm_or_si128(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m128i t4 = _mm_or_si128(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
      // 4. pack the bytes
      // row[0] holds the number of output bytes, row[1..16] the shuffle mask.
      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

      // 5. store bytes
      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }


    // Check for overflow in packing
    // (all upper 16 bits zero in every lane <=> no code unit above U+FFFF)
    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));

    if (saturation_bitmask == 0xffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

      // Check for illegal surrogate code units
      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
      }

      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      // Flip bit 14 only in lanes that need three bytes, turning 11bb|bbbb into 10bb|bbbb.
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16_t mask = (one_byte_bitmask & 0x5555) |
                            (one_or_two_bytes_bitmask & 0xaaaa);
      if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        // Each store writes 16 bytes but only 12 are kept (4 units x 3 bytes).
        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      // Low byte of mask describes the first four code units, high byte the last four.
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
    } else {
      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      // NOTE(review): the loop condition leaves at least 20 code units here,
      // so this clamp looks unreachable in this function -- confirm.
      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
      for(; k < forward; k++) {
        uint32_t word = buf[k];
        if((word & 0xFFFFFF80)==0) {
          // 1 byte (ASCII)
          *utf8_output++ = char(word);
        } else if((word & 0xFFFFF800)==0) {
          // 2 bytes
          *utf8_output++ = char((word>>6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if((word &0xFFFF0000 )==0) {
          // 3 bytes; surrogate code points are rejected with their exact index
          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
          *utf8_output++ = char((word>>12) | 0b11100000);
          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          // 4 bytes; values above U+10FFFF are rejected with their exact index
          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
          *utf8_output++ = char((word>>18) | 0b11110000);
          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // Success so far: report how many input code units were consumed.
  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
30768 /* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
30769 /* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
30770 template <endianness big_endian>
30771 std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
30772 
30773   const char32_t* end = buf + len;
30774 
30775   const __m128i v_0000 = _mm_setzero_si128();
30776   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
30777   __m128i forbidden_bytemask = _mm_setzero_si128();
30778 
30779   while (buf + 8 <= end) {
30780     __m128i in = _mm_loadu_si128((__m128i*)buf);
30781     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
30782     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30783     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
30784 
30785     // Check if no bits set above 16th
30786     if (saturation_bitmask == 0xffff) {
30787       // Pack UTF-32 to UTF-16
30788       __m128i utf16_packed = _mm_packus_epi32(in, nextin);
30789 
30790       const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
30791       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30792       forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
30793 
30794       if (big_endian) {
30795         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30796         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
30797       }
30798 
30799       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
30800       utf16_output += 8;
30801       buf += 8;
30802     } else {
30803       size_t forward = 7;
30804       size_t k = 0;
30805       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
30806       for(; k < forward; k++) {
30807         uint32_t word = buf[k];
30808         if((word & 0xFFFF0000)==0) {
30809           // will not generate a surrogate pair
30810           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
30811           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30812         } else {
30813           // will generate a surrogate pair
30814           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
30815           word -= 0x10000;
30816           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30817           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
30818           if (big_endian) {
30819             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
30820             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
30821           }
30822           *utf16_output++ = char16_t(high_surrogate);
30823           *utf16_output++ = char16_t(low_surrogate);
30824         }
30825       }
30826       buf += k;
30827     }
30828   }
30829 
30830   // check for invalid input
30831   if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
30832 
30833   return std::make_pair(buf, utf16_output);
30834 }
30835 
30836 
30837 template <endianness big_endian>
30838 std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
30839   const char32_t* start = buf;
30840   const char32_t* end = buf + len;
30841 
30842   const __m128i v_0000 = _mm_setzero_si128();
30843   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
30844 
30845   while (buf + 8 <= end) {
30846     __m128i in = _mm_loadu_si128((__m128i*)buf);
30847     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
30848     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30849     const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
30850 
30851     // Check if no bits set above 16th
30852     if (saturation_bitmask == 0xffff) {
30853       // Pack UTF-32 to UTF-16
30854       __m128i utf16_packed = _mm_packus_epi32(in, nextin);
30855 
30856       const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
30857       const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30858       const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
30859       if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
30860         return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
30861       }
30862 
30863       if (big_endian) {
30864         const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30865         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
30866       }
30867 
30868       _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
30869       utf16_output += 8;
30870       buf += 8;
30871     } else {
30872       size_t forward = 7;
30873       size_t k = 0;
30874       if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
30875       for(; k < forward; k++) {
30876         uint32_t word = buf[k];
30877         if((word & 0xFFFF0000)==0) {
30878           // will not generate a surrogate pair
30879           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
30880           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30881         } else {
30882           // will generate a surrogate pair
30883           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
30884           word -= 0x10000;
30885           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30886           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
30887           if (big_endian) {
30888             high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
30889             low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
30890           }
30891           *utf16_output++ = char16_t(high_surrogate);
30892           *utf16_output++ = char16_t(low_surrogate);
30893         }
30894       }
30895       buf += k;
30896     }
30897   }
30898 
30899   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
30900 }
30901 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
30902 
30903 } // unnamed namespace
30904 } // namespace westmere
30905 } // namespace simdutf
30906 
30907 /* begin file src/generic/buf_block_reader.h */
30908 namespace simdutf {
30909 namespace westmere {
30910 namespace {
30911 
// Walks through a buffer in block-sized increments, loading the last part with spaces
template<size_t STEP_SIZE>
struct buf_block_reader {
public:
  // Starts a reader at offset 0 of the _len bytes at _buf.
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  // Current byte offset into the buffer.
  simdutf_really_inline size_t block_index();
  // True while the current offset is strictly below len - STEP_SIZE, i.e. a
  // full block AND at least one more byte after it are available.
  simdutf_really_inline bool has_full_block() const;
  // Pointer to the block at the current offset.
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless no bytes remain
   * (len == idx, e.g. len == 0): in that case dst is left untouched and 0 is returned.
   * In particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @param dst destination buffer; STEP_SIZE bytes are written when bytes remain.
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  // Moves the current offset forward by STEP_SIZE bytes.
  simdutf_really_inline void advance();
private:
  const uint8_t *buf;          // start of the input
  const size_t len;            // total input length in bytes
  const size_t lenminusstep;   // len - STEP_SIZE, or 0 when len < STEP_SIZE
  size_t idx;                  // current byte offset
};
30937 
30938 // Routines to print masks and text for debugging bitmask operations
30939 simdutf_unused static char * format_input_text_64(const uint8_t *text) {
30940   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
30941   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
30942     buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
30943   }
30944   buf[sizeof(simd8x64<uint8_t>)] = '\0';
30945   return buf;
30946 }
30947 
30948 // Routines to print masks and text for debugging bitmask operations
30949 simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
30950   static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
30951   in.store(reinterpret_cast<uint8_t*>(buf));
30952   for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
30953     if (buf[i] < ' ') { buf[i] = '_'; }
30954   }
30955   buf[sizeof(simd8x64<uint8_t>)] = '\0';
30956   return buf;
30957 }
30958 
30959 simdutf_unused static char * format_mask(uint64_t mask) {
30960   static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
30961   for (size_t i=0; i<64; i++) {
30962     buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
30963   }
30964   buf[64] = '\0';
30965   return buf;
30966 }
30967 
30968 template<size_t STEP_SIZE>
30969 simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
30970 
30971 template<size_t STEP_SIZE>
30972 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
30973 
30974 template<size_t STEP_SIZE>
30975 simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
30976   return idx < lenminusstep;
30977 }
30978 
30979 template<size_t STEP_SIZE>
30980 simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
30981   return &buf[idx];
30982 }
30983 
30984 template<size_t STEP_SIZE>
30985 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
30986   if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
30987   std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
30988   std::memcpy(dst, buf + idx, len - idx);
30989   return len - idx;
30990 }
30991 
30992 template<size_t STEP_SIZE>
30993 simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
30994   idx += STEP_SIZE;
30995 }
30996 
30997 } // unnamed namespace
30998 } // namespace westmere
30999 } // namespace simdutf
31000 /* end file src/generic/buf_block_reader.h */
31001 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
31002 namespace simdutf {
31003 namespace westmere {
31004 namespace {
31005 namespace utf8_validation {
31006 
31007 using namespace simd;
31008 
  // Classifies every (previous byte, current byte) pair with three 16-entry
  // nibble lookups: high nibble of prev1, low nibble of prev1 and high nibble
  // of input. Each lookup yields the set of error classes that nibble is
  // compatible with; an error bit survives the final AND only if all three
  // lookups agree on it.
  //
  // input: current bytes; prev1: the same bytes shifted back by one position.
  // Returns one byte per lane holding the error bits listed below.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte (shared bit)
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    // Same bit as TOO_LARGE_1000: both are keyed on a second byte of 1000____.
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Lookup 1: error classes possible given the HIGH nibble of the previous byte.
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Lookup 2: error classes possible given the LOW nibble of the previous byte.
    // CARRY is set everywhere because those classes do not depend on this nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Lookup 3: error classes possible given the HIGH nibble of the current byte.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only error classes confirmed by all three nibbles remain set.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
31099   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31100       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
31101     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31102     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31103     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
31104     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
31105     return must23_80 ^ sc;
31106   }
31107 
31108   //
31109   // Return nonzero if there are incomplete multibyte characters at the end of the block:
31110   // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
31111   //
31112   simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
31113     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
31114     // ... 1111____ 111_____ 11______
31115     static const uint8_t max_array[32] = {
31116       255, 255, 255, 255, 255, 255, 255, 255,
31117       255, 255, 255, 255, 255, 255, 255, 255,
31118       255, 255, 255, 255, 255, 255, 255, 255,
31119       255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
31120     };
31121     const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
31122     return input.gt_bits(max_value);
31123   }
31124 
31125   struct utf8_checker {
31126     // If this is nonzero, there has been a UTF-8 error.
31127     simd8<uint8_t> error;
31128     // The last input we received
31129     simd8<uint8_t> prev_input_block;
31130     // Whether the last input we received was incomplete (used for ASCII fast path)
31131     simd8<uint8_t> prev_incomplete;
31132 
31133     //
31134     // Check whether the current bytes are valid UTF-8.
31135     //
31136     simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
31137       // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
31138       // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
31139       simd8<uint8_t> prev1 = input.prev<1>(prev_input);
31140       simd8<uint8_t> sc = check_special_cases(input, prev1);
31141       this->error |= check_multibyte_lengths(input, prev_input, sc);
31142     }
31143 
31144     // The only problem that can happen at EOF is that a multibyte character is too short
31145     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
31146     // too large in the first of two bytes.
31147     simdutf_really_inline void check_eof() {
31148       // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
31149       // possibly finish them.
31150       this->error |= this->prev_incomplete;
31151     }
31152 
31153     simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
31154       if(simdutf_likely(is_ascii(input))) {
31155         this->error |= this->prev_incomplete;
31156       } else {
31157         // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
31158         static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
31159             "We support either two or four chunks per 64-byte block.");
31160         if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
31161           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
31162           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31163         } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
31164           this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
31165           this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31166           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31167           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31168         }
31169         this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
31170         this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
31171 
31172       }
31173     }
31174 
31175     // do not forget to call check_eof!
31176     simdutf_really_inline bool errors() const {
31177       return this->error.any_bits_set_anywhere();
31178     }
31179 
31180   }; // struct utf8_checker
31181 } // namespace utf8_validation
31182 
31183 using utf8_validation::utf8_checker;
31184 
31185 } // unnamed namespace
31186 } // namespace westmere
31187 } // namespace simdutf
31188 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
31189 /* begin file src/generic/utf8_validation/utf8_validator.h */
31190 namespace simdutf {
31191 namespace westmere {
31192 namespace {
31193 namespace utf8_validation {
31194 
31195 /**
31196  * Validates that the string is actual UTF-8.
31197  */
31198 template<class checker>
31199 bool generic_validate_utf8(const uint8_t * input, size_t length) {
31200     checker c{};
31201     buf_block_reader<64> reader(input, length);
31202     while (reader.has_full_block()) {
31203       simd::simd8x64<uint8_t> in(reader.full_block());
31204       c.check_next_input(in);
31205       reader.advance();
31206     }
31207     uint8_t block[64]{};
31208     reader.get_remainder(block);
31209     simd::simd8x64<uint8_t> in(block);
31210     c.check_next_input(in);
31211     reader.advance();
31212     c.check_eof();
31213     return !c.errors();
31214 }
31215 
31216 bool generic_validate_utf8(const char * input, size_t length) {
31217   return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31218 }
31219 
31220 /**
31221  * Validates that the string is actual UTF-8 and stops on errors.
31222  */
31223 template<class checker>
31224 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
31225     checker c{};
31226     buf_block_reader<64> reader(input, length);
31227     size_t count{0};
31228     while (reader.has_full_block()) {
31229       simd::simd8x64<uint8_t> in(reader.full_block());
31230       c.check_next_input(in);
31231       if(c.errors()) {
31232         if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
31233         result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
31234         res.count += count;
31235         return res;
31236       }
31237       reader.advance();
31238       count += 64;
31239     }
31240     uint8_t block[64]{};
31241     reader.get_remainder(block);
31242     simd::simd8x64<uint8_t> in(block);
31243     c.check_next_input(in);
31244     reader.advance();
31245     c.check_eof();
31246     if (c.errors()) {
31247       if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
31248       result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
31249       res.count += count;
31250       return res;
31251     } else {
31252       return result(error_code::SUCCESS, length);
31253     }
31254 }
31255 
31256 result generic_validate_utf8_with_errors(const char * input, size_t length) {
31257   return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31258 }
31259 
31260 template<class checker>
31261 bool generic_validate_ascii(const uint8_t * input, size_t length) {
31262     buf_block_reader<64> reader(input, length);
31263     uint8_t blocks[64]{};
31264     simd::simd8x64<uint8_t> running_or(blocks);
31265     while (reader.has_full_block()) {
31266       simd::simd8x64<uint8_t> in(reader.full_block());
31267       running_or |= in;
31268       reader.advance();
31269     }
31270     uint8_t block[64]{};
31271     reader.get_remainder(block);
31272     simd::simd8x64<uint8_t> in(block);
31273     running_or |= in;
31274     return running_or.is_ascii();
31275 }
31276 
31277 bool generic_validate_ascii(const char * input, size_t length) {
31278   return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31279 }
31280 
31281 template<class checker>
31282 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
31283   buf_block_reader<64> reader(input, length);
31284   size_t count{0};
31285   while (reader.has_full_block()) {
31286     simd::simd8x64<uint8_t> in(reader.full_block());
31287     if (!in.is_ascii()) {
31288       result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
31289       return result(res.error, count + res.count);
31290     }
31291     reader.advance();
31292 
31293     count += 64;
31294   }
31295   uint8_t block[64]{};
31296   reader.get_remainder(block);
31297   simd::simd8x64<uint8_t> in(block);
31298   if (!in.is_ascii()) {
31299     result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
31300     return result(res.error, count + res.count);
31301   } else {
31302     return result(error_code::SUCCESS, length);
31303   }
31304 }
31305 
31306 result generic_validate_ascii_with_errors(const char * input, size_t length) {
31307   return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31308 }
31309 
31310 } // namespace utf8_validation
31311 } // unnamed namespace
31312 } // namespace westmere
31313 } // namespace simdutf
31314 /* end file src/generic/utf8_validation/utf8_validator.h */
31315 // transcoding from UTF-8 to UTF-16
31316 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
31317 
31318 
31319 namespace simdutf {
31320 namespace westmere {
31321 namespace {
31322 namespace utf8_to_utf16 {
31323 
31324 using namespace simd;
31325 
31326 template <endianness endian>
31327 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
31328     char16_t* utf16_output) noexcept {
31329   // The implementation is not specific to haswell and should be moved to the generic directory.
31330   size_t pos = 0;
31331   char16_t* start{utf16_output};
31332   const size_t safety_margin = 16; // to avoid overruns!
31333   while(pos + 64 + safety_margin <= size) {
31334     // this loop could be unrolled further. For example, we could process the mask
31335     // far more than 64 bytes.
31336     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
31337     if(in.is_ascii()) {
31338       in.store_ascii_as_utf16<endian>(utf16_output);
31339       utf16_output += 64;
31340       pos += 64;
31341     } else {
31342       // Slow path. We hope that the compiler will recognize that this is a slow path.
31343       // Anything that is not a continuation mask is a 'leading byte', that is, the
31344       // start of a new code point.
31345       uint64_t utf8_continuation_mask = in.lt(-65 + 1);
31346       // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
31347       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
31348       // The *start* of code points is not so useful, rather, we want the *end* of code points.
31349       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
31350       // We process in blocks of up to 12 bytes except possibly
31351       // for fast paths which may process up to 16 bytes. For the
31352       // slow path to work, we should have at least 12 input bytes left.
31353       size_t max_starting_point = (pos + 64) - 12;
31354       // Next loop is going to run at least five times when using solely
31355       // the slow/regular path, and at least four times if there are fast paths.
31356       while(pos < max_starting_point) {
31357         // Performance note: our ability to compute 'consumed' and
31358         // then shift and recompute is critical. If there is a
31359         // latency of, say, 4 cycles on getting 'consumed', then
31360         // the inner loop might have a total latency of about 6 cycles.
31361         // Yet we process between 6 to 12 inputs bytes, thus we get
31362         // a speed limit between 1 cycle/byte and 0.5 cycle/byte
31363         // for this section of the code. Hence, there is a limit
31364         // to how much we can further increase this latency before
31365         // it seriously harms performance.
31366         //
31367         // Thus we may allow convert_masked_utf8_to_utf16 to process
31368         // more bytes at a time under a fast-path mode where 16 bytes
31369         // are consumed at once (e.g., when encountering ASCII).
31370         size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
31371                             utf8_end_of_code_point_mask, utf16_output);
31372         pos += consumed;
31373         utf8_end_of_code_point_mask >>= consumed;
31374       }
31375       // At this point there may remain between 0 and 12 bytes in the
31376       // 64-byte block. These bytes will be processed again. So we have an
31377       // 80% efficiency (in the worst case). In practice we expect an
31378       // 85% to 90% efficiency.
31379     }
31380   }
31381   utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
31382   return utf16_output - start;
31383 }
31384 
31385 } // namespace utf8_to_utf16
31386 } // unnamed namespace
31387 } // namespace westmere
31388 } // namespace simdutf
31389 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
31390 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
31391 
31392 
31393 namespace simdutf {
31394 namespace westmere {
31395 namespace {
31396 namespace utf8_to_utf16 {
31397 using namespace simd;
31398 
31399 
  // Classifies every (prev1, input) byte pair against a set of UTF-8 error
  // patterns using three 16-entry table lookups:
  //   - the high nibble of prev1  (byte_1_high),
  //   - the low nibble of prev1   (byte_1_low),
  //   - the high nibble of input  (byte_2_high).
  // The three lookups are AND-ed, so a flag bit survives in a lane only when
  // all three nibbles are compatible with that pattern.
  //
  // input: the bytes being checked ("byte 2" of each pair).
  // prev1: the byte preceding each input byte ("byte 1" of each pair).
  // Returns one flag byte per lane (see the bit legend below).
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) and Overlong 4-byte; the two
//         share one bit and are told apart by the low nibble of byte 1
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table indexed by the high nibble of byte 1 (16 entries, index 0 first).
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table indexed by the low nibble of byte 1. CARRY is present in every
    // entry because those three patterns do not constrain the low nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table indexed by the high nibble of byte 2.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the flags on which all three lookups agree are kept.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
31490   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31491       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
31492     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31493     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31494     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
31495     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
31496     return must23_80 ^ sc;
31497   }
31498 
31499 
  // Validates UTF-8 input while transcoding it to UTF-16.
  // Errors found by the vectorized checks are accumulated in `error`;
  // convert() reports them by returning 0, convert_with_errors() by returning
  // a `result` produced by the scalar rewind_and_convert_with_errors routine.
  struct validating_transcoder {
    // If this is nonzero, there has been a UTF-8 error.
    simd8<uint8_t> error;

    // Starts with no error recorded.
    validating_transcoder() : error(uint8_t(0)) {}
    //
    // Check whether the current bytes are valid UTF-8.
    // Any problem found is OR-ed into this->error.
    //
    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
      // prev1 is derived from input and prev_input via prev<1>; it is the
      // "byte 1" operand of check_special_cases.
      // NOTE(review): presumably each lane holds the byte preceding the
      // corresponding input byte - confirm against simd8::prev.
      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
      simd8<uint8_t> sc = check_special_cases(input, prev1);
      this->error |= check_multibyte_lengths(input, prev_input, sc);
    }


    // Transcodes `size` bytes of UTF-8 at `in` into UTF-16 at utf16_output.
    // Returns the number of char16_t code units written, or 0 when an error
    // was detected (either by the vectorized checks or by the scalar tail
    // conversion). Note that an empty input also yields 0.
    template <endianness endian>
    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
      size_t pos = 0;
      char16_t* start{utf16_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      // Walk backward from the end, counting bytes that are not continuation
      // bytes (as signed values, continuation bytes are at most -65).
      for(; margin > 0 && leading_byte < 8; margin--) {
        leading_byte += (int8_t(in[margin-1]) > -65);
      }
      // If the input is long enough, in[margin] is now the eighth leading byte
      // counted from the end (margin is decremented once more after that byte
      // has been counted).
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Fast path: 64 ASCII bytes become 64 code units.
          input.store_ascii_as_utf16<endian>(utf16_output);
          utf16_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          // The first chunk is checked against an all-zero "previous" vector.
          // NOTE(review): this presumably relies on pos being at a code point
          // boundary at the start of each block - confirm.
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          // Unlike convert_with_errors, errors are not inspected here; the
          // block is transcoded regardless and errors() is consulted once
          // after the loop.
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
                            utf8_end_of_code_point_mask, utf16_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      if(errors()) { return 0; }
      // The tail is handled by the scalar converter, which signals an error
      // by returning 0.
      if(pos < size) {
        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
        if(howmany == 0) { return 0; }
        utf16_output += howmany;
      }
      return utf16_output - start;
    }

    // Same as convert(), but reports failures through a `result`:
    //  - on error, the result comes from the scalar
    //    rewind_and_convert_with_errors routine, with `pos` added to its count;
    //  - on success, the result is (error_code::SUCCESS, number of char16_t
    //    code units written).
    template <endianness endian>
    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
      size_t pos = 0;
      char16_t* start{utf16_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      // Walk backward from the end, counting bytes that are not continuation
      // bytes (as signed values, continuation bytes are at most -65).
      for(; margin > 0 && leading_byte < 8; margin--) {
        leading_byte += (int8_t(in[margin-1]) > -65);
      }
      // If the input is long enough, in[margin] is now the eighth leading byte
      // counted from the end (margin is decremented once more after that byte
      // has been counted).
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Fast path: 64 ASCII bytes become 64 code units.
          input.store_ascii_as_utf16<endian>(utf16_output);
          utf16_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          // The first chunk is checked against an all-zero "previous" vector.
          // NOTE(review): this presumably relies on pos being at a code point
          // boundary at the start of each block - confirm.
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          // Stop before transcoding a block in which an error was detected.
          if (errors()) {
            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
            res.count += pos;
            return res;
          }
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
                            utf8_end_of_code_point_mask, utf16_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      if(errors()) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
        res.count += pos;
        return res;
      }
      if(pos < size) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
        if (res.error) {    // In case of error, we want the error position
          res.count += pos;
          return res;
        } else {    // In case of success, we want the number of words written
          utf16_output += res.count;
        }
      }
      return result(error_code::SUCCESS, utf16_output - start);
    }

    // Reports whether any error bit has been accumulated in `error`.
    simdutf_really_inline bool errors() const {
      return this->error.any_bits_set_anywhere();
    }

  }; // struct validating_transcoder
31691 } // utf8_to_utf16 namespace
31692 } // unnamed namespace
31693 } // namespace westmere
31694 } // namespace simdutf
31695 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
31696 // transcoding from UTF-8 to UTF-32
31697 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
31698 
31699 namespace simdutf {
31700 namespace westmere {
31701 namespace {
31702 namespace utf8_to_utf32 {
31703 
31704 using namespace simd;
31705 
31706 
31707 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
31708     char32_t* utf32_output) noexcept {
31709   size_t pos = 0;
31710   char32_t* start{utf32_output};
31711   const size_t safety_margin = 16; // to avoid overruns!
31712   while(pos + 64 + safety_margin <= size) {
31713     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
31714     if(in.is_ascii()) {
31715       in.store_ascii_as_utf32(utf32_output);
31716       utf32_output += 64;
31717       pos += 64;
31718     } else {
31719     // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
31720     uint64_t utf8_continuation_mask = in.lt(-65 + 1);
31721     uint64_t utf8_leading_mask = ~utf8_continuation_mask;
31722     uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
31723     size_t max_starting_point = (pos + 64) - 12;
31724     while(pos < max_starting_point) {
31725       size_t consumed = convert_masked_utf8_to_utf32(input + pos,
31726                           utf8_end_of_code_point_mask, utf32_output);
31727       pos += consumed;
31728       utf8_end_of_code_point_mask >>= consumed;
31729       }
31730     }
31731   }
31732   utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
31733   return utf32_output - start;
31734 }
31735 
31736 
31737 } // namespace utf8_to_utf32
31738 } // unnamed namespace
31739 } // namespace westmere
31740 } // namespace simdutf
31741 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
31742 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
31743 
31744 
31745 namespace simdutf {
31746 namespace westmere {
31747 namespace {
31748 namespace utf8_to_utf32 {
31749 using namespace simd;
31750 
31751 
  // Classifies every (prev1, input) byte pair against a set of UTF-8 error
  // patterns using three 16-entry table lookups:
  //   - the high nibble of prev1  (byte_1_high),
  //   - the low nibble of prev1   (byte_1_low),
  //   - the high nibble of input  (byte_2_high).
  // The three lookups are AND-ed, so a flag bit survives in a lane only when
  // all three nibbles are compatible with that pattern.
  //
  // input: the bytes being checked ("byte 2" of each pair).
  // prev1: the byte preceding each input byte ("byte 1" of each pair).
  // Returns one flag byte per lane (see the bit legend below).
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large (second byte 1001____ or 101_____)
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (second byte 1000____) and Overlong 4-byte; the two
//         share one bit and are told apart by the low nibble of byte 1
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

    // Table indexed by the high nibble of byte 1 (16 entries, index 0 first).
    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    // Table indexed by the low nibble of byte 1. CARRY is present in every
    // entry because those three patterns do not constrain the low nibble.
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      CARRY,
      CARRY,

      // ____0100 ________
      CARRY | TOO_LARGE,
      // ____0101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____011_ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,

      // ____1___ ________
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      // ____1101 ________
      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
      CARRY | TOO_LARGE | TOO_LARGE_1000,
      CARRY | TOO_LARGE | TOO_LARGE_1000
    );
    // Table indexed by the high nibble of byte 2.
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the flags on which all three lookups agree are kept.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
31842   simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31843       const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
31844     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31845     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31846     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
31847     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
31848     return must23_80 ^ sc;
31849   }
31850 
31851 
  // Validates UTF-8 input while transcoding it to UTF-32.
  // Validation findings from the SIMD checks are OR-ed into `error`; the
  // convert* entry points consult errors() to decide how to report failure.
  struct validating_transcoder {
    // If this is nonzero, there has been a UTF-8 error.
    simd8<uint8_t> error;

    validating_transcoder() : error(uint8_t(0)) {}
    //
    // Check whether the current bytes are valid UTF-8.
    //
    // `input` is the chunk being checked and `prev_input` the chunk that precedes it
    // (callers pass an all-zero vector for the first chunk of a 64-byte block).
    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
      // prev1 comes from input.prev<1>(prev_input); presumably this is the input shifted
      // by one byte with the last byte of prev_input carried in -- confirm against simd8::prev.
      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
      // Classify each (previous byte, current byte) pair, then fold in the
      // multi-byte length check; any resulting set bit is recorded as an error.
      simd8<uint8_t> sc = check_special_cases(input, prev1);
      this->error |= check_multibyte_lengths(input, prev_input, sc);
    }



    // Validates and transcodes `size` bytes of UTF-8 starting at `in`, writing UTF-32 to `utf32_output`.
    // Returns the number of char32_t values written (utf32_output - start), or 0 when a
    // validation error was recorded or when the scalar tail conversion returned 0.
    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
      size_t pos = 0;
      char32_t* start{utf32_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 4; margin--) {
        // A byte counts as "leading" when, read as int8_t, it is greater than -65,
        // i.e. it is not a continuation byte (0x80..0xBF).
        leading_byte += (int8_t(in[margin-1]) > -65);
      }
      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      // SIMD main loop: runs only while a full 64-byte block plus the safety margin remains.
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Pure ASCII block: store it directly as UTF-32 and advance by 64 on both sides.
          input.store_ascii_as_utf32(utf32_output);
          utf32_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          // The first chunk of every block is validated against an all-zero "previous" chunk.
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          // Bytes below -64 (as int8_t) are continuation bytes. A byte ends a code point
          // when the byte that follows it is not a continuation, hence the shift by one.
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            // NOTE(review): utf32_output is not advanced here explicitly; presumably
            // convert_masked_utf8_to_utf32 takes it by reference and advances it -- confirm.
            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
                            utf8_end_of_code_point_mask, utf32_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      // Errors are only inspected once, after the SIMD loop has finished.
      if(errors()) { return 0; }
      if(pos < size) {
        // Hand the remaining bytes to the scalar converter; a return of 0 is treated as failure.
        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
        if(howmany == 0) { return 0; }
        utf32_output += howmany;
      }
      return utf32_output - start;
    }

    // Same processing as convert(), but failures are reported through a `result`.
    // On success the return value is result(error_code::SUCCESS, number of char32_t written).
    // On failure it is the result produced by scalar rewind_and_convert_with_errors
    // with `pos` added to its count.
    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
      size_t pos = 0;
      char32_t* start{utf32_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 4; margin--) {
        // Non-continuation bytes compare greater than -65 when read as int8_t.
        leading_byte += (int8_t(in[margin-1]) > -65);
      }
      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          input.store_ascii_as_utf32(utf32_output);
          utf32_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          // Unlike convert(), the error state is checked per block, before any of this
          // block is transcoded; the scalar routine is then asked to produce the result.
          if (errors()) {
            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
            res.count += pos;
            return res;
          }
          // Continuation bytes are those below -64 as int8_t; a code point ends
          // wherever the following byte is not a continuation.
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
                            utf8_end_of_code_point_mask, utf32_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      // NOTE(review): the loop above already returns as soon as errors() is true, so this
      // branch looks reachable only if `error` was already set before this call -- confirm.
      if(errors()) {
        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
        res.count += pos;
        return res;
      }
      if(pos < size) {
        // Scalar handling of the remaining bytes (including the safety margin).
        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
        if (res.error) {    // In case of error, we want the error position
          res.count += pos;
          return res;
        } else {    // In case of success, we want the number of words written
          utf32_output += res.count;
        }
      }
      return result(error_code::SUCCESS, utf32_output - start);
    }

    // True when any bit of the accumulated error vector is set.
    simdutf_really_inline bool errors() const {
      return this->error.any_bits_set_anywhere();
    }

  }; // struct validating_transcoder
32036 } // utf8_to_utf32 namespace
32037 } // unnamed namespace
32038 } // namespace westmere
32039 } // namespace simdutf
32040 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
32041 // other functions
32042 /* begin file src/generic/utf8.h */
32043 
32044 namespace simdutf {
32045 namespace westmere {
32046 namespace {
32047 namespace utf8 {
32048 
32049 using namespace simd;
32050 
32051 simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
32052     size_t pos = 0;
32053     size_t count = 0;
32054     for(;pos + 64 <= size; pos += 64) {
32055       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32056       uint64_t utf8_continuation_mask = input.gt(-65);
32057       count += count_ones(utf8_continuation_mask);
32058     }
32059     return count + scalar::utf8::count_code_points(in + pos, size - pos);
32060 }
32061 
32062 simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
32063     size_t pos = 0;
32064     size_t count = 0;
32065     // This algorithm could no doubt be improved!
32066     for(;pos + 64 <= size; pos += 64) {
32067       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32068       uint64_t utf8_continuation_mask = input.lt(-65 + 1);
32069       // We count one word for anything that is not a continuation (so
32070       // leading bytes).
32071       count += 64 - count_ones(utf8_continuation_mask);
32072       int64_t utf8_4byte = input.gteq_unsigned(240);
32073       count += count_ones(utf8_4byte);
32074     }
32075     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
32076 }
32077 } // utf8 namespace
32078 } // unnamed namespace
32079 } // namespace westmere
32080 } // namespace simdutf
32081 /* end file src/generic/utf8.h */
32082 /* begin file src/generic/utf16.h */
32083 namespace simdutf {
32084 namespace westmere {
32085 namespace {
32086 namespace utf16 {
32087 
32088 template <endianness big_endian>
32089 simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
32090     size_t pos = 0;
32091     size_t count = 0;
32092     for(;pos < size/32*32; pos += 32) {
32093       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32094       if (!match_system(big_endian)) { input.swap_bytes(); }
32095       uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
32096       count += count_ones(not_pair) / 2;
32097     }
32098     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
32099 }
32100 
32101 template <endianness big_endian>
32102 simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
32103     size_t pos = 0;
32104     size_t count = 0;
32105     // This algorithm could no doubt be improved!
32106     for(;pos < size/32*32; pos += 32) {
32107       simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32108       if (!match_system(big_endian)) { input.swap_bytes(); }
32109       uint64_t ascii_mask = input.lteq(0x7F);
32110       uint64_t twobyte_mask = input.lteq(0x7FF);
32111       uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
32112 
32113       size_t ascii_count = count_ones(ascii_mask) / 2;
32114       size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
32115       size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
32116       size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
32117       count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
32118     }
32119     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
32120 }
32121 
32122 template <endianness big_endian>
32123 simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
32124     return count_code_points<big_endian>(in, size);
32125 }
32126 
32127 simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
32128   size_t pos = 0;
32129 
32130   while (pos < size/32*32) {
32131     simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32132     input.swap_bytes();
32133     input.store(reinterpret_cast<uint16_t *>(output));
32134     pos += 32;
32135     output += 32;
32136   }
32137 
32138   scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
32139 }
32140 
32141 } // utf16
32142 } // unnamed namespace
32143 } // namespace westmere
32144 } // namespace simdutf
32145 /* end file src/generic/utf16.h */
32146 // transcoding from UTF-8 to Latin 1
32147 /* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
32148 
32149 
32150 namespace simdutf {
32151 namespace westmere {
32152 namespace {
32153 namespace utf8_to_latin1 {
32154 using namespace simd;
32155 
32156 
  // Classifies every (prev1, input) byte pair with three 16-entry nibble lookups
  // (high nibble of prev1, low nibble of prev1, high nibble of input) and returns
  // the bitwise AND of the three lookups. A bit survives the AND only when all
  // three tables agree that the corresponding error class applies to the pair.
  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
// For UTF-8 to Latin 1, we can allow any ASCII character, and any continuation byte,
// but the non-ASCII leading bytes must be 0b11000011 or 0b11000010 and nothing else.
//
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 3 = Too Large
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 6 = Too Large (1000____ second byte) / Overlong 4-byte (the two share a bit)
// Bit 7 = Two Continuations
    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                // 11______ 11______
    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                // 11110100 101_____
                                                // 11110101 1001____
                                                // 11110101 101_____
                                                // 1111011_ 1001____
                                                // 1111011_ 101_____
                                                // 11111___ 1001____
                                                // 11111___ 101_____
    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                // 11110101 1000____
                                                // 1111011_ 1000____
                                                // 11111___ 1000____
    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
    // All bits set: in a table entry this leaves the decision entirely to the
    // other two lookups, because it never clears a bit in the final AND.
    constexpr const uint8_t FORBIDDEN  = 0xff;

    const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      // Every byte-1-low entry contains CARRY and every byte-2-high entry contains
      // TOO_SHORT or TOO_LONG, so a FORBIDDEN lead nibble always yields a nonzero result.
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN
    );
    constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
    const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
      // ____0000 ________
      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
      // ____0001 ________
      CARRY | OVERLONG_2,
      // ____001_ ________
      // Low nibbles 2 and 3 carry only CARRY: with a 1100 high nibble (0xC2, 0xC3)
      // the pair is flagged only when the second byte is not a continuation.
      CARRY,
      CARRY,

      // Low nibbles 4..15 are FORBIDDEN: with a 1100 high nibble the byte-1-high
      // bits (TOO_SHORT | OVERLONG_2) pass through, and one of those two bits is
      // present in every byte-2-high entry, so the pair is always flagged.
      // ____0100 ________
      FORBIDDEN,
      // ____0101 ________
      FORBIDDEN,
      // ____011_ ________
      FORBIDDEN,
      FORBIDDEN,

      // ____1___ ________
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN,
      // ____1101 ________
      FORBIDDEN,
      FORBIDDEN,
      FORBIDDEN
    );
    const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
    );
    // Only the error classes on which all three lookups agree remain set.
    return (byte_1_high & byte_1_low & byte_2_high);
  }
32251 
  // Validates UTF-8 input while transcoding it to Latin 1.
  // Validation findings from check_special_cases are OR-ed into `error`; the
  // convert* entry points consult errors() to decide how to report failure.
  struct validating_transcoder {
    // If this is nonzero, there has been a UTF-8 error.
    simd8<uint8_t> error;

    validating_transcoder() : error(uint8_t(0)) {}
    //
    // Check whether the current bytes are valid UTF-8.
    //
    // `input` is the chunk being checked and `prev_input` the chunk that precedes it
    // (callers pass an all-zero vector for the first chunk of a 64-byte block).
    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
      // prev1 comes from input.prev<1>(prev_input); presumably this is the input shifted
      // by one byte with the last byte of prev_input carried in -- confirm against simd8::prev.
      // Only the pairwise special-case check is applied here.
      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
      this->error |= check_special_cases(input, prev1);
    }


    // Validates and transcodes `size` bytes of UTF-8 starting at `in`, writing Latin 1 to `latin1_output`.
    // Returns the number of bytes written (latin1_output - start), or 0 when a
    // validation error was recorded or when the scalar tail conversion returned 0.
    simdutf_really_inline size_t convert(const char* in, size_t size, char* latin1_output) {
      size_t pos = 0;
      char* start{latin1_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 8; margin--) {
        leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ...
      }
      // If the input is long enough, then we have that margin-1 is the eight last leading byte.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      // SIMD main loop: runs only while a full 64-byte block plus the safety margin remains.
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          // Pure ASCII block: the bytes are stored to the output unchanged.
          input.store((int8_t*)latin1_output);
          latin1_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          // The first chunk of every block is validated against an all-zero "previous" chunk.
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
          // A byte ends a code point when the byte that follows it is not a continuation.
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            // NOTE(review): latin1_output is not advanced here explicitly; presumably
            // convert_masked_utf8_to_latin1 takes it by reference and advances it -- confirm.
            size_t consumed = convert_masked_utf8_to_latin1(in + pos,
                            utf8_end_of_code_point_mask, latin1_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      // Errors are only inspected once, after the SIMD loop has finished.
      if(errors()) { return 0; }
      if(pos < size) {
        // Hand the remaining bytes to the scalar converter; a return of 0 is treated as failure.
        size_t howmany  = scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
        if(howmany == 0) { return 0; }
        latin1_output += howmany;
      }
      return latin1_output - start;
    }

    // Same processing as convert(), but failures are reported through a `result`.
    // On success the return value is result(error_code::SUCCESS, number of bytes written).
    // On failure it is the result produced by scalar rewind_and_convert_with_errors
    // with `pos` added to its count.
    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char* latin1_output) {
      size_t pos = 0;
      char* start{latin1_output};
      // In the worst case, we have the haswell kernel which can cause an overflow of
      // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
      // much more than 8 bytes. However, you cannot generally assume that you have valid
      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
      // to give us a good margin.
      size_t leading_byte = 0;
      size_t margin = size;
      for(; margin > 0 && leading_byte < 8; margin--) {
        // Non-continuation bytes compare greater than -65 when read as int8_t.
        leading_byte += (int8_t(in[margin-1]) > -65);
      }
      // If the input is long enough, then we have that margin-1 is the eight last leading byte.
      const size_t safety_margin = size - margin + 1; // to avoid overruns!
      while(pos + 64 + safety_margin <= size) {
        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
        if(input.is_ascii()) {
          input.store((int8_t*)latin1_output);
          latin1_output += 64;
          pos += 64;
        } else {
          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
              "We support either two or four chunks per 64-byte block.");
          auto zero = simd8<uint8_t>{uint8_t(0)};
          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
            this->check_utf8_bytes(input.chunks[0], zero);
            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
          }
          // Unlike convert(), the error state is checked per block, before any of this
          // block is transcoded.
          if (errors()) {
            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
            result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
            res.count += pos;
            return res;
          }
          // Continuation bytes are those below -64 as int8_t; a code point ends
          // wherever the following byte is not a continuation.
          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
          // We process in blocks of up to 12 bytes except possibly
          // for fast paths which may process up to 16 bytes. For the
          // slow path to work, we should have at least 12 input bytes left.
          size_t max_starting_point = (pos + 64) - 12;
          // Next loop is going to run at least five times.
          while(pos < max_starting_point) {
            // Performance note: our ability to compute 'consumed' and
            // then shift and recompute is critical. If there is a
            // latency of, say, 4 cycles on getting 'consumed', then
            // the inner loop might have a total latency of about 6 cycles.
            // Yet we process between 6 to 12 inputs bytes, thus we get
            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
            // for this section of the code. Hence, there is a limit
            // to how much we can further increase this latency before
            // it seriously harms performance.
            size_t consumed = convert_masked_utf8_to_latin1(in + pos,
                            utf8_end_of_code_point_mask, latin1_output);
            pos += consumed;
            utf8_end_of_code_point_mask >>= consumed;
          }
          // At this point there may remain between 0 and 12 bytes in the
          // 64-byte block. These bytes will be processed again. So we have an
          // 80% efficiency (in the worst case). In practice we expect an
          // 85% to 90% efficiency.
        }
      }
      // NOTE(review): the loop above already returns as soon as errors() is true, so this
      // branch looks reachable only if `error` was already set before this call -- confirm.
      if(errors()) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
        res.count += pos;
        return res;
      }
      if(pos < size) {
        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
        result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, in + pos, size - pos, latin1_output);
        if (res.error) {    // In case of error, we want the error position
          res.count += pos;
          return res;
        } else {    // In case of success, we want the number of bytes written
          latin1_output += res.count;
        }
      }
      return result(error_code::SUCCESS, latin1_output - start);
    }

    // True when any bit of the accumulated error vector is set.
    simdutf_really_inline bool errors() const {
      return this->error.any_bits_set_anywhere();
    }

  }; // struct validating_transcoder
32440 } // utf8_to_latin1 namespace
32441 } // unnamed namespace
32442 } // namespace westmere
32443 } // namespace simdutf
32444 /* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
32445 /* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
32446 
32447 
32448 namespace simdutf {
32449 namespace westmere {
32450 namespace {
32451 namespace utf8_to_latin1 {
32452 using namespace simd;
32453 
32454 
32455     simdutf_really_inline size_t convert_valid(const char* in, size_t size, char* latin1_output) {
32456       size_t pos = 0;
32457       char* start{latin1_output};
32458       // In the worst case, we have the haswell kernel which can cause an overflow of
32459       // 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last 16 bytes,
32460       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
32461       // much more than 8 bytes. However, you cannot generally assume that you have valid
32462       // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
32463       // to give us a good margin.
32464       size_t leading_byte = 0;
32465       size_t margin = size;
32466       for(; margin > 0 && leading_byte < 8; margin--) {
32467         leading_byte += (int8_t(in[margin-1]) > -65); //twos complement of -65 is 1011 1111 ...
32468       }
32469       // If the input is long enough, then we have that margin-1 is the eight last leading byte.
32470       const size_t safety_margin = size - margin + 1; // to avoid overruns!
32471       while(pos + 64 + safety_margin <= size) {
32472         simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32473         if(input.is_ascii()) {
32474           input.store((int8_t*)latin1_output);
32475           latin1_output += 64;
32476           pos += 64;
32477         } else {
32478           // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
32479           uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
32480           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
32481           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
32482           // We process in blocks of up to 12 bytes except possibly
32483           // for fast paths which may process up to 16 bytes. For the
32484           // slow path to work, we should have at least 12 input bytes left.
32485           size_t max_starting_point = (pos + 64) - 12;
32486           // Next loop is going to run at least five times.
32487           while(pos < max_starting_point) {
32488             // Performance note: our ability to compute 'consumed' and
32489             // then shift and recompute is critical. If there is a
32490             // latency of, say, 4 cycles on getting 'consumed', then
32491             // the inner loop might have a total latency of about 6 cycles.
32492             // Yet we process between 6 to 12 inputs bytes, thus we get
32493             // a speed limit between 1 cycle/byte and 0.5 cycle/byte
32494             // for this section of the code. Hence, there is a limit
32495             // to how much we can further increase this latency before
32496             // it seriously harms performance.
32497             size_t consumed = convert_masked_utf8_to_latin1(in + pos,
32498                             utf8_end_of_code_point_mask, latin1_output);
32499             pos += consumed;
32500             utf8_end_of_code_point_mask >>= consumed;
32501           }
32502           // At this point there may remain between 0 and 12 bytes in the
32503           // 64-byte block. These bytes will be processed again. So we have an
32504           // 80% efficiency (in the worst case). In practice we expect an
32505           // 85% to 90% efficiency.
32506         }
32507       }
32508       if(pos < size) {
32509         size_t howmany  = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, latin1_output);
32510         latin1_output += howmany;
32511       }
32512       return latin1_output - start;
32513     }
32514 
32515   }
32516 }   // utf8_to_latin1 namespace
32517 }   // unnamed namespace
32518 }   // namespace westmere
32519  // namespace simdutf
32520 /* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
32521 
32522 
32523 //
32524 // Implementation-specific overrides
32525 //
32526 
32527 namespace simdutf {
32528 namespace westmere {
32529 
32530 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
32531   // If there is a BOM, then we trust it.
32532   auto bom_encoding = simdutf::BOM::check_bom(input, length);
32533   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
32534   if (length % 2 == 0) {
32535     return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
32536   } else {
32537     if (implementation::validate_utf8(input, length)) {
32538       return simdutf::encoding_type::UTF8;
32539     } else {
32540       return simdutf::encoding_type::unspecified;
32541     }
32542   }
32543 }
32544 
32545 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
32546   return westmere::utf8_validation::generic_validate_utf8(buf, len);
32547 }
32548 
32549 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
32550   return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
32551 }
32552 
32553 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
32554   return westmere::utf8_validation::generic_validate_ascii(buf, len);
32555 }
32556 
32557 simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
32558   return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len);
32559 }
32560 
32561 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
32562   const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
32563   if (tail) {
32564     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
32565   } else {
32566     return false;
32567   }
32568 }
32569 
32570 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
32571   const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
32572   if (tail) {
32573     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
32574   } else {
32575     return false;
32576   }
32577 }
32578 
32579 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
32580   result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
32581   if (res.count != len) {
32582     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
32583     return result(scalar_res.error, res.count + scalar_res.count);
32584   } else {
32585     return res;
32586   }
32587 }
32588 
32589 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
32590   result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
32591   if (res.count != len) {
32592     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
32593     return result(scalar_res.error, res.count + scalar_res.count);
32594   } else {
32595     return res;
32596   }
32597 }
32598 
32599 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
32600   const char32_t* tail = sse_validate_utf32le(buf, len);
32601   if (tail) {
32602     return scalar::utf32::validate(tail, len - (tail - buf));
32603   } else {
32604     return false;
32605   }
32606 }
32607 
32608 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
32609   result res = sse_validate_utf32le_with_errors(buf, len);
32610   if (res.count != len) {
32611     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
32612     return result(scalar_res.error, res.count + scalar_res.count);
32613   } else {
32614     return res;
32615   }
32616 }
32617 
32618 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
32619 
32620   std::pair<const char*, char*> ret = sse_convert_latin1_to_utf8(buf, len, utf8_output);
32621   size_t converted_chars = ret.second - utf8_output;
32622 
32623   if (ret.first != buf + len) {
32624     const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
32625       ret.first, len - (ret.first - buf), ret.second);
32626     converted_chars += scalar_converted_chars;
32627   }
32628 
32629   return converted_chars;
32630 }
32631 
32632 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32633     std::pair<const char*, char16_t*> ret = sse_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
32634     if (ret.first == nullptr) { return 0; }
32635     size_t converted_chars = ret.second - utf16_output;
32636     if (ret.first != buf + len) {
32637         const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::LITTLE>(
32638                                               ret.first, len - (ret.first - buf), ret.second);
32639         if (scalar_converted_chars == 0) { return 0; }
32640         converted_chars += scalar_converted_chars;
32641     }
32642     return converted_chars;
32643 }
32644 
32645 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32646     std::pair<const char*, char16_t*> ret = sse_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
32647     if (ret.first == nullptr) { return 0; }
32648     size_t converted_chars = ret.second - utf16_output;
32649     if (ret.first != buf + len) {
32650         const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert<endianness::BIG>(
32651                                               ret.first, len - (ret.first - buf), ret.second);
32652         if (scalar_converted_chars == 0) { return 0; }
32653         converted_chars += scalar_converted_chars;
32654     }
32655     return converted_chars;
32656 }
32657 
32658 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
32659     std::pair<const char*, char32_t*> ret = sse_convert_latin1_to_utf32(buf, len, utf32_output);
32660     if (ret.first == nullptr) { return 0; }
32661     size_t converted_chars = ret.second - utf32_output;
32662     if (ret.first != buf + len) {
32663         const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
32664                                               ret.first, len - (ret.first - buf), ret.second);
32665         if (scalar_converted_chars == 0) { return 0; }
32666         converted_chars += scalar_converted_chars;
32667     }
32668     return converted_chars;
32669 }
32670 
32671 
32672 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
32673   utf8_to_latin1::validating_transcoder converter;
32674   return converter.convert(buf, len, latin1_output);
32675 }
32676 
32677 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
32678   utf8_to_latin1::validating_transcoder converter;
32679   return converter.convert_with_errors(buf, len, latin1_output);
32680 }
32681 
32682 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
32683   return westmere::utf8_to_latin1::convert_valid(buf,len,latin1_output);
32684 }
32685 
32686 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32687   utf8_to_utf16::validating_transcoder converter;
32688   return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
32689 }
32690 
32691 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32692   utf8_to_utf16::validating_transcoder converter;
32693   return converter.convert<endianness::BIG>(buf, len, utf16_output);
32694 }
32695 
32696 simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32697   utf8_to_utf16::validating_transcoder converter;
32698   return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
32699 }
32700 
32701 simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
32702   utf8_to_utf16::validating_transcoder converter;
32703   return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
32704 }
32705 
32706 
32707 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
32708     char16_t* utf16_output) const noexcept {
32709   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
32710 }
32711 
32712 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
32713     char16_t* utf16_output) const noexcept {
32714   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
32715 }
32716 
32717 simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
32718   utf8_to_utf32::validating_transcoder converter;
32719   return converter.convert(buf, len, utf32_output);
32720 }
32721 
32722 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
32723   utf8_to_utf32::validating_transcoder converter;
32724   return converter.convert_with_errors(buf, len, utf32_output);
32725 }
32726 
32727 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
32728     char32_t* utf32_output) const noexcept {
32729   return utf8_to_utf32::convert_valid(input, size,  utf32_output);
32730 }
32731 
32732 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32733   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
32734   if (ret.first == nullptr) { return 0; }
32735   size_t saved_bytes = ret.second - latin1_output;
32736 
32737   if (ret.first != buf + len) {
32738     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::LITTLE>(
32739                                         ret.first, len - (ret.first - buf), ret.second);
32740     if (scalar_saved_bytes == 0) { return 0; }
32741     saved_bytes += scalar_saved_bytes;
32742   }
32743   return saved_bytes;
32744 }
32745 
32746 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32747   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
32748   if (ret.first == nullptr) { return 0; }
32749   size_t saved_bytes = ret.second - latin1_output;
32750 
32751   if (ret.first != buf + len) {
32752     const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert<endianness::BIG>(
32753                                         ret.first, len - (ret.first - buf), ret.second);
32754     if (scalar_saved_bytes == 0) { return 0; }
32755     saved_bytes += scalar_saved_bytes;
32756   }
32757   return saved_bytes;
32758 }
32759 
32760 simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32761   std::pair<result, char*> ret = sse_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(buf, len, latin1_output);
32762   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32763   if (ret.first.count != len) { // All good so far, but not finished
32764     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
32765                                         buf + ret.first.count, len - ret.first.count, ret.second);
32766     if (scalar_res.error) {
32767       scalar_res.count += ret.first.count;
32768       return scalar_res;
32769     } else {
32770       ret.second += scalar_res.count;
32771     }
32772   }
32773   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
32774   return ret.first;
32775 }
32776 
32777 simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32778   std::pair<result, char*> ret = sse_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len, latin1_output);
32779   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32780   if (ret.first.count != len) { // All good so far, but not finished
32781     result scalar_res = scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
32782                                         buf + ret.first.count, len - ret.first.count, ret.second);
32783     if (scalar_res.error) {
32784       scalar_res.count += ret.first.count;
32785       return scalar_res;
32786     } else {
32787       ret.second += scalar_res.count;
32788     }
32789   }
32790   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
32791   return ret.first;
32792 }
32793 
32794 
32795 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32796   // optimization opportunity: we could provide an optimized function.
32797   return convert_utf16be_to_latin1(buf, len, latin1_output);
32798 }
32799 
32800 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
32801   // optimization opportunity: we could provide an optimized function.
32802   return convert_utf16le_to_latin1(buf, len, latin1_output);
32803 }
32804 
32805 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32806   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
32807   if (ret.first == nullptr) { return 0; }
32808   size_t saved_bytes = ret.second - utf8_output;
32809   if (ret.first != buf + len) {
32810     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
32811                                         ret.first, len - (ret.first - buf), ret.second);
32812     if (scalar_saved_bytes == 0) { return 0; }
32813     saved_bytes += scalar_saved_bytes;
32814   }
32815   return saved_bytes;
32816 }
32817 
32818 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32819   std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
32820   if (ret.first == nullptr) { return 0; }
32821   size_t saved_bytes = ret.second - utf8_output;
32822   if (ret.first != buf + len) {
32823     const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
32824                                         ret.first, len - (ret.first - buf), ret.second);
32825     if (scalar_saved_bytes == 0) { return 0; }
32826     saved_bytes += scalar_saved_bytes;
32827   }
32828   return saved_bytes;
32829 }
32830 
32831 simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32832   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32833   std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
32834   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32835   if (ret.first.count != len) { // All good so far, but not finished
32836     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
32837                                         buf + ret.first.count, len - ret.first.count, ret.second);
32838     if (scalar_res.error) {
32839       scalar_res.count += ret.first.count;
32840       return scalar_res;
32841     } else {
32842       ret.second += scalar_res.count;
32843     }
32844   }
32845   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
32846   return ret.first;
32847 }
32848 
32849 simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32850   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32851   std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
32852   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32853   if (ret.first.count != len) { // All good so far, but not finished
32854     result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
32855                                         buf + ret.first.count, len - ret.first.count, ret.second);
32856     if (scalar_res.error) {
32857       scalar_res.count += ret.first.count;
32858       return scalar_res;
32859     } else {
32860       ret.second += scalar_res.count;
32861     }
32862   }
32863   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
32864   return ret.first;
32865 }
32866 
32867 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32868   return convert_utf16le_to_utf8(buf, len, utf8_output);
32869 }
32870 
32871 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
32872   return convert_utf16be_to_utf8(buf, len, utf8_output);
32873 }
32874 
32875 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
32876   std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_latin1(buf, len, latin1_output);
32877   if (ret.first == nullptr) { return 0; }
32878   size_t saved_bytes = ret.second - latin1_output;
32879   // if (ret.first != buf + len) {
32880   if (ret.first < buf + len) {
32881     const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
32882                                         ret.first, len - (ret.first - buf), ret.second);
32883     if (scalar_saved_bytes == 0) { return 0; }
32884     saved_bytes += scalar_saved_bytes;
32885   }
32886   return saved_bytes;
32887 }
32888 
32889 
32890 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
32891   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32892   std::pair<result, char*> ret = westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
32893   if (ret.first.count != len) {
32894     result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
32895                                         buf + ret.first.count, len - ret.first.count, ret.second);
32896     if (scalar_res.error) {
32897       scalar_res.count += ret.first.count;
32898       return scalar_res;
32899     } else {
32900       ret.second += scalar_res.count;
32901     }
32902   }
32903   ret.first.count = ret.second - latin1_output;   // Set count to the number of 8-bit code units written
32904   return ret.first;
32905 }
32906 
32907 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
32908   // optimization opportunity: we could provide an optimized function.
32909   return convert_utf32_to_latin1(buf,len,latin1_output);
32910 }
32911 
32912 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
32913   std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
32914   if (ret.first == nullptr) { return 0; }
32915   size_t saved_bytes = ret.second - utf8_output;
32916   if (ret.first != buf + len) {
32917     const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
32918                                         ret.first, len - (ret.first - buf), ret.second);
32919     if (scalar_saved_bytes == 0) { return 0; }
32920     saved_bytes += scalar_saved_bytes;
32921   }
32922   return saved_bytes;
32923 }
32924 
32925 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
32926   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32927   std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
32928   if (ret.first.count != len) {
32929     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
32930                                         buf + ret.first.count, len - ret.first.count, ret.second);
32931     if (scalar_res.error) {
32932       scalar_res.count += ret.first.count;
32933       return scalar_res;
32934     } else {
32935       ret.second += scalar_res.count;
32936     }
32937   }
32938   ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit code units written
32939   return ret.first;
32940 }
32941 
32942 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
32943   std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
32944   if (ret.first == nullptr) { return 0; }
32945   size_t saved_bytes = ret.second - utf32_output;
32946   if (ret.first != buf + len) {
32947     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
32948                                         ret.first, len - (ret.first - buf), ret.second);
32949     if (scalar_saved_bytes == 0) { return 0; }
32950     saved_bytes += scalar_saved_bytes;
32951   }
32952   return saved_bytes;
32953 }
32954 
32955 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
32956   std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
32957   if (ret.first == nullptr) { return 0; }
32958   size_t saved_bytes = ret.second - utf32_output;
32959   if (ret.first != buf + len) {
32960     const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
32961                                         ret.first, len - (ret.first - buf), ret.second);
32962     if (scalar_saved_bytes == 0) { return 0; }
32963     saved_bytes += scalar_saved_bytes;
32964   }
32965   return saved_bytes;
32966 }
32967 
32968 simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
32969   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32970   std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
32971   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32972   if (ret.first.count != len) { // All good so far, but not finished
32973     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
32974                                         buf + ret.first.count, len - ret.first.count, ret.second);
32975     if (scalar_res.error) {
32976       scalar_res.count += ret.first.count;
32977       return scalar_res;
32978     } else {
32979       ret.second += scalar_res.count;
32980     }
32981   }
32982   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
32983   return ret.first;
32984 }
32985 
32986 simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
32987   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
32988   std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
32989   if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
32990   if (ret.first.count != len) { // All good so far, but not finished
32991     result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
32992                                         buf + ret.first.count, len - ret.first.count, ret.second);
32993     if (scalar_res.error) {
32994       scalar_res.count += ret.first.count;
32995       return scalar_res;
32996     } else {
32997       ret.second += scalar_res.count;
32998     }
32999   }
33000   ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit code units written
33001   return ret.first;
33002 }
33003 
33004 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
33005   return convert_utf32_to_utf8(buf, len, utf8_output);
33006 }
33007 
33008 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33009   std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
33010   if (ret.first == nullptr) { return 0; }
33011   size_t saved_bytes = ret.second - utf16_output;
33012   if (ret.first != buf + len) {
33013     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
33014                                         ret.first, len - (ret.first - buf), ret.second);
33015     if (scalar_saved_bytes == 0) { return 0; }
33016     saved_bytes += scalar_saved_bytes;
33017   }
33018   return saved_bytes;
33019 }
33020 
33021 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33022   std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
33023   if (ret.first == nullptr) { return 0; }
33024   size_t saved_bytes = ret.second - utf16_output;
33025   if (ret.first != buf + len) {
33026     const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
33027                                         ret.first, len - (ret.first - buf), ret.second);
33028     if (scalar_saved_bytes == 0) { return 0; }
33029     saved_bytes += scalar_saved_bytes;
33030   }
33031   return saved_bytes;
33032 }
33033 
33034 simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33035   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
33036   std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
33037   if (ret.first.count != len) {
33038     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
33039                                         buf + ret.first.count, len - ret.first.count, ret.second);
33040     if (scalar_res.error) {
33041       scalar_res.count += ret.first.count;
33042       return scalar_res;
33043     } else {
33044       ret.second += scalar_res.count;
33045     }
33046   }
33047   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
33048   return ret.first;
33049 }
33050 
33051 simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33052   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
33053   std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
33054   if (ret.first.count != len) {
33055     result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
33056                                         buf + ret.first.count, len - ret.first.count, ret.second);
33057     if (scalar_res.error) {
33058       scalar_res.count += ret.first.count;
33059       return scalar_res;
33060     } else {
33061       ret.second += scalar_res.count;
33062     }
33063   }
33064   ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit code units written
33065   return ret.first;
33066 }
33067 
33068 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33069   return convert_utf32_to_utf16le(buf, len, utf16_output);
33070 }
33071 
33072 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
33073   return convert_utf32_to_utf16be(buf, len, utf16_output);
33074 }
33075 
33076 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
33077   return convert_utf16le_to_utf32(buf, len, utf32_output);
33078 }
33079 
33080 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
33081   return convert_utf16be_to_utf32(buf, len, utf32_output);
33082 }
33083 
33084 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
33085   utf16::change_endianness_utf16(input, length, output);
33086 }
33087 
33088 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
33089   return utf16::count_code_points<endianness::LITTLE>(input, length);
33090 }
33091 
33092 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
33093   return utf16::count_code_points<endianness::BIG>(input, length);
33094 }
33095 
33096 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
33097   return utf8::count_code_points(input, length);
33098 }
33099 
33100 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
33101   return count_utf8(buf,len);
33102 }
33103 
33104 simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
33105   return scalar::utf16::latin1_length_from_utf16(length);
33106 }
33107 
33108 simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept {
33109   return scalar::utf32::latin1_length_from_utf32(length);
33110 }
33111 
33112 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
33113   return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
33114 }
33115 
33116 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
33117   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
33118 }
33119 
33120 simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
33121   return scalar::latin1::utf16_length_from_latin1(length);
33122 }
33123 
33124 simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
33125   return scalar::latin1::utf32_length_from_latin1(length);
33126 }
33127 
33128 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t len) const noexcept {
33129   const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
33130   size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
33131   size_t i = 0;
33132   __m128i two_64bits = _mm_setzero_si128();
33133   while (i + sizeof(__m128i) <= len) {
33134     __m128i runner = _mm_setzero_si128();
33135     size_t iterations = (len - i) / sizeof(__m128i);
33136     if (iterations > 255) {
33137       iterations = 255;
33138     }
33139     size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
33140     for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) {
33141       __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
33142       __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
33143       __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i)));
33144       __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
33145       __m128i input12 = _mm_add_epi8(
33146                                       _mm_cmpgt_epi8(
33147                                                     _mm_setzero_si128(),
33148                                                     input1),
33149                                       _mm_cmpgt_epi8(
33150                                                     _mm_setzero_si128(),
33151                                                     input2));
33152       __m128i input34 = _mm_add_epi8(
33153                                       _mm_cmpgt_epi8(
33154                                                     _mm_setzero_si128(),
33155                                                     input3),
33156                                       _mm_cmpgt_epi8(
33157                                                     _mm_setzero_si128(),
33158                                                     input4));
33159       __m128i input1234 = _mm_add_epi8(input12, input34);
33160       runner = _mm_sub_epi8(runner, input1234);
33161     }
33162     for (; i <= max_i; i += sizeof(__m128i)) {
33163       __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
33164       runner = _mm_sub_epi8(
33165           runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
33166     }
33167     two_64bits = _mm_add_epi64(
33168         two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
33169   }
33170   answer += _mm_extract_epi64(two_64bits, 0) +
33171             _mm_extract_epi64(two_64bits, 1);
33172   return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), len - i);
33173 }
33174 
33175 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
33176   return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
33177 }
33178 
33179 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
33180   return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
33181 }
33182 
33183 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
33184   return utf8::utf16_length_from_utf8(input, length);
33185 }
33186 
33187 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
33188   const __m128i v_00000000 = _mm_setzero_si128();
33189   const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
33190   const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
33191   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
33192   size_t pos = 0;
33193   size_t count = 0;
33194   for(;pos + 4 <= length; pos += 4) {
33195     __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33196     const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
33197     const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
33198     const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
33199     const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
33200     const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
33201     const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
33202     const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
33203     const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
33204 
33205     size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
33206     size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
33207     size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
33208     count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
33209   }
33210   return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
33211 }
33212 
33213 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
33214   const __m128i v_00000000 = _mm_setzero_si128();
33215   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
33216   size_t pos = 0;
33217   size_t count = 0;
33218   for(;pos + 4 <= length; pos += 4) {
33219     __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33220     const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
33221     const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
33222     size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4;
33223     count += 4 + surrogate_count;
33224   }
33225   return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
33226 }
33227 
33228 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
33229   return utf8::count_code_points(input, length);
33230 }
33231 
33232 } // namespace westmere
33233 } // namespace simdutf
33234 
33235 /* begin file src/simdutf/westmere/end.h */
33236 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
33237 // nothing needed.
33238 #else
33239 SIMDUTF_UNTARGET_REGION
33240 #endif
33241 
33242 /* end file src/simdutf/westmere/end.h */
33243 /* end file src/westmere/implementation.cpp */
33244 #endif
33245 
33246 SIMDUTF_POP_DISABLE_WARNINGS
33247 /* end file src/simdutf.cpp */
33248