1 /* auto-generated on 2023-12-01 13:59:01 -0500. Do not edit! */
2 /* begin file include/simdutf.h */
3 #ifndef SIMDUTF_H
4 #define SIMDUTF_H
5 #include <cstring>
6
7 /* begin file include/simdutf/compiler_check.h */
#ifndef SIMDUTF_COMPILER_CHECK_H
#define SIMDUTF_COMPILER_CHECK_H

// simdutf is a C++ library: stop right away under a C compiler.
#if !defined(__cplusplus)
#error simdutf requires a C++ compiler
#endif

// SIMDUTF_CPLUSPLUS holds the language-standard value used by the checks
// below. A value supplied by the user is left untouched. When _MSVC_LANG is
// available (and the compiler is not clang) it is used instead of
// __cplusplus, except that _MSC_VER == 1900 is mapped to C++11.
#if !defined(SIMDUTF_CPLUSPLUS)
#if !defined(__clang__) && defined(_MSVC_LANG)
#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
#else
#define SIMDUTF_CPLUSPLUS __cplusplus
#endif
#endif

// C++ 11
#if (SIMDUTF_CPLUSPLUS >= 201103L) && !defined(SIMDUTF_CPLUSPLUS11)
#define SIMDUTF_CPLUSPLUS11 1
#endif

// C++ 14
#if (SIMDUTF_CPLUSPLUS >= 201402L) && !defined(SIMDUTF_CPLUSPLUS14)
#define SIMDUTF_CPLUSPLUS14 1
#endif

// C++ 17
#if (SIMDUTF_CPLUSPLUS >= 201703L) && !defined(SIMDUTF_CPLUSPLUS17)
#define SIMDUTF_CPLUSPLUS17 1
#endif

// C++11 is the minimum supported standard.
#if !defined(SIMDUTF_CPLUSPLUS11)
#error simdutf requires a compiler compliant with the C++11 standard
#endif

#endif // SIMDUTF_COMPILER_CHECK_H
43 /* end file include/simdutf/compiler_check.h */
44 /* begin file include/simdutf/common_defs.h */
45 #ifndef SIMDUTF_COMMON_DEFS_H
46 #define SIMDUTF_COMMON_DEFS_H
47
48 #include <cassert>
49 /* begin file include/simdutf/portability.h */
50 #ifndef SIMDUTF_PORTABILITY_H
51 #define SIMDUTF_PORTABILITY_H
52
53 #include <cstddef>
54 #include <cstdint>
55 #include <cstdlib>
56 #include <cfloat>
57 #include <cassert>
58 #ifndef _WIN32
59 // strcasecmp, strncasecmp
60 #include <strings.h>
61 #endif
62
63 /**
64 * We want to check that it is actually a little endian system at
65 * compile-time.
66 */
67
// Define SIMDUTF_IS_BIG_ENDIAN (0 or 1) exactly once.
//
// 1. When the compiler provides __BYTE_ORDER__ and __ORDER_BIG_ENDIAN__, use them.
// 2. Otherwise, under _WIN32, little endian is taken for granted.
// 3. Otherwise, pull in the platform's endianness header and look again;
//    when the macros are still missing, little endian is assumed.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#elif defined(_WIN32)
#define SIMDUTF_IS_BIG_ENDIAN 0
#else
#if defined(__APPLE__) || defined(__FreeBSD__)
#include <machine/endian.h>
#elif defined(sun) || defined(__sun)
#include <sys/byteorder.h>
#else // neither Apple/FreeBSD nor Solaris

#ifdef __has_include
#if __has_include(<endian.h>)
#include <endian.h>
#endif //__has_include(<endian.h>)
#endif //__has_include

#endif // platform-specific endianness header

// A single #if/#elif/#else chain guarantees one definition only. (The
// previous form used `#ifndef` with an expression, which is not a valid
// directive, and could then define the macro a second time.)
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
#define SIMDUTF_IS_BIG_ENDIAN 0
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 0
#else // __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 1
#endif

#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
99
100
101 /**
102 * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
103 */
104
#if defined(_MSC_VER)
#define SIMDUTF_VISUAL_STUDIO 1
/**
 * Two flavours of "Visual Studio" are told apart:
 * clang running under Visual Studio, and the regular
 * Visual Studio compiler.
 *
 * Under clang for Windows, we enable:
 *  * target pragmas so that part and only part of the
 *    code gets compiled for advanced instructions.
 *
 */
#if !defined(__clang__)
// just regular visual studio (best guess)
#define SIMDUTF_REGULAR_VISUAL_STUDIO 1
#else
// clang under visual studio
#define SIMDUTF_CLANG_VISUAL_STUDIO 1
#endif // !__clang__
#endif // _MSC_VER

#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
// https://en.wikipedia.org/wiki/C_alternative_tokens
// Including this header should change nothing, except
// possibly under Visual Studio.
#include <iso646.h>
#endif
132
// Classify the target processor. At most one of the SIMDUTF_IS_* family
// macros below gets defined.
#if defined(_M_AMD64) || defined(__x86_64__)
#define SIMDUTF_IS_X86_64 1
#elif defined(_M_ARM64) || defined(__aarch64__)
#define SIMDUTF_IS_ARM64 1
#elif defined(_M_PPC64) || defined(__PPC64__)
//#define SIMDUTF_IS_PPC64 1
// The simdutf library does not yet support SIMD acceleration under
// POWER processors. Please see https://github.com/lemire/simdutf/issues/51
#elif defined(__s390__)
// s390 IBM system. Big endian.
#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
// RISC-V 64-bit
#else
// None of the known 64-bit targets matched. The simdutf library is
// designed for 64-bit processors; please use a 64-bit target such as
// x64 or 64-bit ARM for best performance.
#define SIMDUTF_IS_32BITS 1

// 32-bit platforms are not supported, but identifying them
// can still be handy.
#if defined(__i386__) || defined(_M_IX86)
#define SIMDUTF_IS_X86_32BITS 1
#elif defined(_M_ARM) || defined(__arm__)
#define SIMDUTF_IS_ARM_32BITS 1
#elif defined(_M_PPC) || defined(__PPC__)
#define SIMDUTF_IS_PPC_32BITS 1
#endif

#endif // processor family

#if defined(SIMDUTF_IS_32BITS)
#if !defined(SIMDUTF_NO_PORTABILITY_WARNING)
// Placeholder: in the future, users of 32-bit systems may be warned that
// simdutf does not provide accelerated kernels for such systems.
#endif // SIMDUTF_NO_PORTABILITY_WARNING
#endif // SIMDUTF_IS_32BITS
170
171 // this is almost standard?
// Two-step stringification: the outer macro lets its argument be
// macro-expanded before the inner one turns it into a string literal.
#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)

// Our fast kernels require 64-bit systems.
//
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
// Furthermore, the number of SIMD registers is reduced.
//
// On 32-bit ARM, we would have smaller registers.
//
// The simdutf users should still have the fallback kernel. It is
// slower, but it should run everywhere.

//
// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION
//

// We are going to use runtime dispatch: on x64, SIMDUTF_TARGET_REGION(T)
// opens a region compiled for target T and SIMDUTF_UNTARGET_REGION closes it.
#if defined(SIMDUTF_IS_X86_64) && defined(__clang__)
// clang does not have GCC push pop
// warning: clang attribute push can't be used within a namespace in clang up
// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
// namespace.
#define SIMDUTF_TARGET_REGION(T)                                               \
  _Pragma(SIMDUTF_STRINGIFY(                                                   \
      clang attribute push(__attribute__((target(T))), apply_to = function)))
#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(SIMDUTF_IS_X86_64) && defined(__GNUC__)
// GCC is easier
#define SIMDUTF_TARGET_REGION(T)                                               \
  _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
#endif // x64: clang then gcc

// Everywhere else the target region macros expand to nothing.
#if !defined(SIMDUTF_TARGET_REGION)
#define SIMDUTF_TARGET_REGION(T)
#define SIMDUTF_UNTARGET_REGION
#endif
214
215 // Is threading enabled?
// Is threading enabled? _REENTRANT or _MT switches SIMDUTF_THREADS_ENABLED on
// (unless it is defined already).
#if (defined(_REENTRANT) || defined(_MT)) && !defined(SIMDUTF_THREADS_ENABLED)
#define SIMDUTF_THREADS_ENABLED
#endif

// workaround for large stack sizes under -O0.
// https://github.com/simdutf/simdutf/issues/691
//
// Apple systems have small stack sizes in secondary threads.
// Lack of compiler optimization may generate high stack usage.
// Users may want to disable threads for safety, but only when
// in debug mode which we detect by the fact that the __OPTIMIZE__
// macro is not defined.
#if defined(__APPLE__) && !defined(__OPTIMIZE__)
#undef SIMDUTF_THREADS_ENABLED
#endif
234
// Case-insensitive comparison aliases.
#if !defined(SIMDUTF_VISUAL_STUDIO)
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
// So they are only useful for ASCII in our context.
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
#define simdutf_strcasecmp strcasecmp
#define simdutf_strncasecmp strncasecmp
#else
// Here regular visual studio and clang under visual studio are treated alike:
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
#define simdutf_strcasecmp _stricmp
#define simdutf_strncasecmp _strnicmp
#endif
248
// SIMDUTF_UNREACHABLE() / SIMDUTF_ASSUME(COND):
//  - without NDEBUG they are plain assert()s;
//  - with NDEBUG they become compiler hints (__assume under Visual Studio,
//    __builtin_unreachable otherwise).
// Note that the non-Visual-Studio and assert forms of SIMDUTF_UNREACHABLE()
// already end with a ';' whereas the Visual Studio form does not.
#ifndef NDEBUG

#define SIMDUTF_UNREACHABLE() assert(0);
#define SIMDUTF_ASSUME(COND) assert(COND)

#else // NDEBUG

#if !defined(SIMDUTF_VISUAL_STUDIO)
#define SIMDUTF_UNREACHABLE() __builtin_unreachable();
#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#else
#define SIMDUTF_UNREACHABLE() __assume(0)
#define SIMDUTF_ASSUME(COND) __assume(COND)
#endif

#endif // NDEBUG


// SIMDUTF_GCC11ORMORE flags genuine GCC (not clang) at major version 11 or above.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 11)
#define SIMDUTF_GCC11ORMORE 1
#endif // GCC 11 or more
272
273
274 #endif // SIMDUTF_PORTABILITY_H
275 /* end file include/simdutf/portability.h */
276 /* begin file include/simdutf/avx512.h */
#ifndef SIMDUTF_AVX512_H_
#define SIMDUTF_AVX512_H_

/*
    It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.

    All preprocessor directives have the form `SIMDUTF_HAS_AVX512{feature}`,
    where a feature is a code name for extensions.

    Each macro is defined to 1 when the matching compiler macro
    (`__AVX512{feature}__`) equals 1, unless the user defined it already.

    Please see the listing below to find which are supported.
*/

#if !defined(SIMDUTF_HAS_AVX512F) && defined(__AVX512F__) && __AVX512F__ == 1
#define SIMDUTF_HAS_AVX512F 1
#endif

#if !defined(SIMDUTF_HAS_AVX512DQ) && defined(__AVX512DQ__) && __AVX512DQ__ == 1
#define SIMDUTF_HAS_AVX512DQ 1
#endif

#if !defined(SIMDUTF_HAS_AVX512IFMA) && defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
#define SIMDUTF_HAS_AVX512IFMA 1
#endif

#if !defined(SIMDUTF_HAS_AVX512CD) && defined(__AVX512CD__) && __AVX512CD__ == 1
#define SIMDUTF_HAS_AVX512CD 1
#endif

#if !defined(SIMDUTF_HAS_AVX512BW) && defined(__AVX512BW__) && __AVX512BW__ == 1
#define SIMDUTF_HAS_AVX512BW 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VL) && defined(__AVX512VL__) && __AVX512VL__ == 1
#define SIMDUTF_HAS_AVX512VL 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VBMI) && defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
#define SIMDUTF_HAS_AVX512VBMI 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VBMI2) && defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
#define SIMDUTF_HAS_AVX512VBMI2 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VNNI) && defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
#define SIMDUTF_HAS_AVX512VNNI 1
#endif

#if !defined(SIMDUTF_HAS_AVX512BITALG) && defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
#define SIMDUTF_HAS_AVX512BITALG 1
#endif

#if !defined(SIMDUTF_HAS_AVX512VPOPCNTDQ) && defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
#define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
#endif

#endif // SIMDUTF_AVX512_H_
356 /* end file include/simdutf/avx512.h */
357
358
// Debug-block markers.
//
// SIMDUTF_BEGIN_DEBUG_BLOCK(name) / SIMDUTF_END_DEBUG_BLOCK(name) emit
// assembly comments so that MCA analysis can see the named region.
// SIMDUTF_DEBUG_BLOCK(name, block) wraps `block` between the two markers.
// On compilers without __GNUC__ the markers vanish and `block` is emitted
// as is, so the wrapped code behaves the same everywhere.
#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
#define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
// The helpers must be referenced by their full SIMDUTF_-prefixed names: no
// unprefixed BEGIN_DEBUG_BLOCK / END_DEBUG_BLOCK macros exist.
#define SIMDUTF_DEBUG_BLOCK(name, block) SIMDUTF_BEGIN_DEBUG_BLOCK(name); block; SIMDUTF_END_DEBUG_BLOCK(name);
#else
#define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
#define SIMDUTF_END_DEBUG_BLOCK(name)
// Keep the wrapped code; only the markers are dropped.
#define SIMDUTF_DEBUG_BLOCK(name, block) block;
#endif
369
// Align to N-byte boundary.
// The masks below only produce multiples of n when n is a power of two.
// SIMDUTF_ROUNDDOWN_N clears the low bits of a; SIMDUTF_ROUNDUP_N first adds
// n-1 and then rounds down.
#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1))
#define SIMDUTF_ROUNDUP_N(a, n) SIMDUTF_ROUNDDOWN_N((a) + ((n)-1), n)

// True when the address held by ptr has none of the low bits of n-1 set.
#define SIMDUTF_ISALIGNED_N(ptr, n) (0 == ((uintptr_t)(ptr) & ((n)-1)))
375
// Compiler-specific spellings for inlining hints, attributes, branch hints
// and warning control. The regular Visual Studio compiler gets its own set;
// every other compiler uses the GCC-style one.
#if !defined(SIMDUTF_REGULAR_VISUAL_STUDIO)

  #define simdutf_really_inline inline __attribute__((always_inline))
  #define simdutf_never_inline inline __attribute__((noinline))

  #define simdutf_unused __attribute__((unused))
  #define simdutf_warn_unused __attribute__((warn_unused_result))

  // Branch prediction hints (user-provided definitions take precedence).
  #if !defined(simdutf_likely)
    #define simdutf_likely(x) __builtin_expect(!!(x), 1)
  #endif
  #if !defined(simdutf_unlikely)
    #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
  #endif

  // Building blocks: turn a warning flag into a "diagnostic ignored" pragma.
  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)

  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
  // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
  #else
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")

#else // SIMDUTF_REGULAR_VISUAL_STUDIO

  #define simdutf_really_inline __forceinline
  #define simdutf_never_inline __declspec(noinline)

  // No equivalent is used here: both expand to nothing.
  #define simdutf_unused
  #define simdutf_warn_unused

  // Branch hints are pass-through (user-provided definitions take precedence).
  #if !defined(simdutf_likely)
    #define simdutf_likely(x) x
  #endif
  #if !defined(simdutf_unlikely)
    #define simdutf_unlikely(x) x
  #endif

  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
  // Get rid of Intellisense-only warnings (Code Analysis)
  // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
  #if defined(__has_include)
    #if __has_include(<CppCoreCheck\Warnings.h>)
      #include <CppCoreCheck\Warnings.h>
      #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
    #endif
  #endif

  // Fallback when the Code Analysis header is not available.
  #if !defined(SIMDUTF_DISABLE_UNDESIRED_WARNINGS)
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
  #endif

  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))

#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
454
// SIMDUTF_DLLIMPORTEXPORT: DLL linkage decoration. A user-supplied definition
// is respected. Outside Visual Studio it expands to nothing.
#if !defined(SIMDUTF_DLLIMPORTEXPORT)
  #if !defined(SIMDUTF_VISUAL_STUDIO)
    #define SIMDUTF_DLLIMPORTEXPORT
  /**
   * From here on it does not matter whether you are using
   * the regular visual studio or clang under visual
   * studio.
   */
  #elif SIMDUTF_USING_LIBRARY
    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
  #else
    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
  #endif
#endif
471
/// Evaluates EXPR once; if the result is an error (converts to true), the
/// enclosing function returns it. Expands to a brace-enclosed block.
#define SIMDUTF_TRY(EXPR)                                                      \
  {                                                                            \
    auto _err = (EXPR);                                                        \
    if (_err) {                                                                \
      return _err;                                                             \
    }                                                                          \
  }
474
475
476 #endif // SIMDUTF_COMMON_DEFS_H
477 /* end file include/simdutf/common_defs.h */
478 /* begin file include/simdutf/encoding_types.h */
479 #include <string>
480
namespace simdutf {

/**
 * Supported encodings. Every value is a distinct bit so that several
 * encodings can be combined in a single integer.
 */
enum encoding_type {
        unspecified = 0,

        UTF8 = 1,       // BOM 0xef 0xbb 0xbf
        UTF16_LE = 2,   // BOM 0xff 0xfe
        UTF16_BE = 4,   // BOM 0xfe 0xff
        UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
        UTF32_BE = 16,  // BOM 0x00 0x00 0xfe 0xff
        Latin1 = 32
};

/** Byte order. */
enum endianness {
        LITTLE = 0,
        BIG = 1
};

/** Declared here, defined elsewhere. */
bool match_system(endianness e);

/** Declared here, defined elsewhere. */
std::string to_string(encoding_type bom);

// Note that BOM for UTF8 is discouraged.
namespace BOM {

/**
 * Checks for a BOM. If not, returns unspecified
 * @param byte      the string to process
 * @param length    the length of the string in code units
 * @return the corresponding encoding
 */
encoding_type check_bom(const uint8_t* byte, size_t length);
encoding_type check_bom(const char* byte, size_t length);

/**
 * Returns the size, in bytes, of the BOM for a given encoding type.
 * Note that UTF8 BOM are discouraged.
 * @param bom       the encoding type
 * @return the size in bytes of the corresponding BOM
 */
size_t bom_byte_size(encoding_type bom);

} // BOM namespace
} // simdutf namespace
525 /* end file include/simdutf/encoding_types.h */
526 /* begin file include/simdutf/error.h */
527 #ifndef SIMDUTF_ERROR_H
528 #define SIMDUTF_ERROR_H
529 namespace simdutf {
530
531 enum error_code {
532 SUCCESS = 0,
533 HEADER_BITS, // Any byte must have fewer than 5 header bits.
534 TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length
535 // This is also the error when the input is truncated.
536 TOO_LONG, // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
537 OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
538 // and U+FFFF for four-byte characters.
539 TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF,less than or equal than U+7F for ASCII OR less than equal than U+FF for Latin1
540 SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
541 // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
542 // there must be no surrogate at all (Latin1)
543 OTHER // Not related to validation/transcoding.
544 };
545
546 struct result {
547 error_code error;
548 size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of code units validated/written.
549
550 simdutf_really_inline result();
551
552 simdutf_really_inline result(error_code, size_t);
553 };
554
555 }
556 #endif
557 /* end file include/simdutf/error.h */
558
559 SIMDUTF_PUSH_DISABLE_WARNINGS
560 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
561
562 // Public API
563 /* begin file include/simdutf/simdutf_version.h */
564 // /include/simdutf/simdutf_version.h automatically generated by release.py,
565 // do not change by hand
#ifndef SIMDUTF_SIMDUTF_VERSION_H
#define SIMDUTF_SIMDUTF_VERSION_H

/** The version of simdutf being used (major.minor.revision) */
#define SIMDUTF_VERSION "4.0.8"

namespace simdutf {
/** The three numeric components of SIMDUTF_VERSION. */
enum {
  /** MAJOR component of MAJOR.minor.revision. */
  SIMDUTF_VERSION_MAJOR = 4,
  /** MINOR component of major.MINOR.revision. */
  SIMDUTF_VERSION_MINOR = 0,
  /** REVISION component of major.minor.REVISION. */
  SIMDUTF_VERSION_REVISION = 8
};
} // namespace simdutf

#endif // SIMDUTF_SIMDUTF_VERSION_H
590 /* end file include/simdutf/simdutf_version.h */
591 /* begin file include/simdutf/implementation.h */
592 #ifndef SIMDUTF_IMPLEMENTATION_H
593 #define SIMDUTF_IMPLEMENTATION_H
594 #include <string>
595 #if !defined(SIMDUTF_NO_THREADS)
596 #include <atomic>
597 #endif
598 #include <vector>
599 #include <tuple>
600 /* begin file include/simdutf/internal/isadetection.h */
601 /* From
602 https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
603 Highly modified.
604
605 Copyright (c) 2016- Facebook, Inc (Adam Paszke)
606 Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
607 Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
608 Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
609 Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
610 Copyright (c) 2011-2013 NYU (Clement Farabet)
611 Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
612 Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
613 (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
614 Samy Bengio, Johnny Mariethoz)
615
616 All rights reserved.
617
618 Redistribution and use in source and binary forms, with or without
619 modification, are permitted provided that the following conditions are met:
620
621 1. Redistributions of source code must retain the above copyright
622 notice, this list of conditions and the following disclaimer.
623
624 2. Redistributions in binary form must reproduce the above copyright
625 notice, this list of conditions and the following disclaimer in the
626 documentation and/or other materials provided with the distribution.
627
628 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
629 America and IDIAP Research Institute nor the names of its contributors may be
630 used to endorse or promote products derived from this software without
631 specific prior written permission.
632
633 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
634 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
635 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
636 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
637 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
638 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
639 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
640 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
641 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
642 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
643 POSSIBILITY OF SUCH DAMAGE.
644 */
645
646 #ifndef SIMDutf_INTERNAL_ISADETECTION_H
647 #define SIMDutf_INTERNAL_ISADETECTION_H
648
649 #include <cstdint>
650 #include <cstdlib>
651 #if defined(_MSC_VER)
652 #include <intrin.h>
653 #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
654 #include <cpuid.h>
655 #endif
656
657 namespace simdutf {
658 namespace internal {
659
/**
 * Bit flags naming instruction-set extensions. The values are OR-ed together
 * into a 32-bit mask, hence every flag must occupy its own bit.
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  // Was 0x2000, which is the AVX512CD bit: the two flags were
  // indistinguishable once stored in a mask.
  AVX512VPOPCNTDQ = 0x20000
};
680
681 #if defined(__PPC64__)
682
detect_supported_architectures()683 static inline uint32_t detect_supported_architectures() {
684 return instruction_set::ALTIVEC;
685 }
686
687 #elif defined(__aarch64__) || defined(_M_ARM64)
688
detect_supported_architectures()689 static inline uint32_t detect_supported_architectures() {
690 return instruction_set::NEON;
691 }
692
693 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
694
695
namespace {
// Masks for the CPUID output registers and for XCR0.
namespace cpuid_bit {
    // Can be found on Intel ISA Reference for CPUID

    // EAX = 0x01
    constexpr uint32_t pclmulqdq = uint32_t{1} << 1; ///< @private bit  1 of ECX for EAX=0x1
    constexpr uint32_t sse42 = uint32_t{1} << 20;    ///< @private bit 20 of ECX for EAX=0x1
    constexpr uint32_t osxsave = (uint32_t{1} << 26) | (uint32_t{1} << 27); ///< @private bits 26+27 of ECX for EAX=0x1

    // EAX = 0x7 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
    // See: "Table 3-8. Information Returned by CPUID Instruction"
    namespace ebx {
      constexpr uint32_t bmi1 = uint32_t{1} << 3;
      constexpr uint32_t avx2 = uint32_t{1} << 5;
      constexpr uint32_t bmi2 = uint32_t{1} << 8;
      constexpr uint32_t avx512f = uint32_t{1} << 16;
      constexpr uint32_t avx512dq = uint32_t{1} << 17;
      constexpr uint32_t avx512ifma = uint32_t{1} << 21;
      constexpr uint32_t avx512cd = uint32_t{1} << 28;
      constexpr uint32_t avx512bw = uint32_t{1} << 30;
      constexpr uint32_t avx512vl = uint32_t{1} << 31;
    }

    namespace ecx {
      constexpr uint32_t avx512vbmi = uint32_t{1} << 1;
      constexpr uint32_t avx512vbmi2 = uint32_t{1} << 6;
      constexpr uint32_t avx512vnni = uint32_t{1} << 11;
      constexpr uint32_t avx512bitalg = uint32_t{1} << 12;
      constexpr uint32_t avx512vpopcnt = uint32_t{1} << 14;
    }

    namespace edx {
      constexpr uint32_t avx512vp2intersect = uint32_t{1} << 8;
    }

    // Masks applied to the value returned by xgetbv().
    namespace xcr0_bit {
      constexpr uint64_t avx256_saved = uint64_t{1} << 2; ///< @private bit 2 = AVX
      constexpr uint64_t avx512_saved = uint64_t{7} << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
    }
  }
}
735
736
737
/**
 * Executes the CPUID instruction.
 *
 * On entry, *eax holds the leaf and *ecx the sub-leaf. On return, the four
 * pointees hold the values of the EAX, EBX, ECX and EDX registers.
 *
 * @param eax in: leaf; out: EAX
 * @param ebx out: EBX
 * @param ecx in: sub-leaf; out: ECX
 * @param edx out: EDX
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, static_cast<int>(*eax), static_cast<int>(*ecx));
  *eax = static_cast<uint32_t>(cpu_info[0]);
  *ebx = static_cast<uint32_t>(cpu_info[1]);
  *ecx = static_cast<uint32_t>(cpu_info[2]);
  *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  const uint32_t level = *eax;
  const uint32_t subleaf = *ecx;
  // An unsupported leaf used to leave the outputs untouched (stale values
  // from an earlier query); report "no feature" instead.
  const uint32_t max_level = __get_cpuid_max(level & 0x80000000u, nullptr);
  if (max_level == 0 || max_level < level) {
    *eax = 0;
    *ebx = 0;
    *ecx = 0;
    *edx = 0;
    return;
  }
  // __cpuid_count forwards the sub-leaf in ECX, like the other two code paths.
  __cpuid_count(level, subleaf, *eax, *ebx, *ecx, *edx);
#else
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
759
/**
 * Executes XGETBV with index 0 and returns the 64-bit result
 * (EDX is the upper half, EAX the lower half).
 */
static inline uint64_t xgetbv() {
#if !defined(_MSC_VER)
  uint32_t low_half, high_half;
  asm volatile("xgetbv\n\t" : "=a" (low_half), "=d" (high_half) : "c" (0));
  return (static_cast<uint64_t>(high_half) << 32) | low_half;
#else
  return _xgetbv(0);
#endif
}
769
detect_supported_architectures()770 static inline uint32_t detect_supported_architectures() {
771 uint32_t eax;
772 uint32_t ebx = 0;
773 uint32_t ecx = 0;
774 uint32_t edx = 0;
775 uint32_t host_isa = 0x0;
776
777 // EBX for EAX=0x1
778 eax = 0x1;
779 cpuid(&eax, &ebx, &ecx, &edx);
780
781 if (ecx & cpuid_bit::sse42) {
782 host_isa |= instruction_set::SSE42;
783 }
784
785 if (ecx & cpuid_bit::pclmulqdq) {
786 host_isa |= instruction_set::PCLMULQDQ;
787 }
788
789 if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
790 return host_isa;
791 }
792
793 // xgetbv for checking if the OS saves registers
794 uint64_t xcr0 = xgetbv();
795
796 if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
797 return host_isa;
798 }
799 // ECX for EAX=0x7
800 eax = 0x7;
801 ecx = 0x0; // Sub-leaf = 0
802 cpuid(&eax, &ebx, &ecx, &edx);
803 if (ebx & cpuid_bit::ebx::avx2) {
804 host_isa |= instruction_set::AVX2;
805 }
806 if (ebx & cpuid_bit::ebx::bmi1) {
807 host_isa |= instruction_set::BMI1;
808 }
809 if (ebx & cpuid_bit::ebx::bmi2) {
810 host_isa |= instruction_set::BMI2;
811 }
812 if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
813 return host_isa;
814 }
815 if (ebx & cpuid_bit::ebx::avx512f) {
816 host_isa |= instruction_set::AVX512F;
817 }
818 if (ebx & cpuid_bit::ebx::avx512bw) {
819 host_isa |= instruction_set::AVX512BW;
820 }
821 if (ebx & cpuid_bit::ebx::avx512cd) {
822 host_isa |= instruction_set::AVX512CD;
823 }
824 if (ebx & cpuid_bit::ebx::avx512dq) {
825 host_isa |= instruction_set::AVX512DQ;
826 }
827 if (ebx & cpuid_bit::ebx::avx512vl) {
828 host_isa |= instruction_set::AVX512VL;
829 }
830 if (ecx & cpuid_bit::ecx::avx512vbmi2) {
831 host_isa |= instruction_set::AVX512VBMI2;
832 }
833 if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
834 host_isa |= instruction_set::AVX512VPOPCNTDQ;
835 }
836 return host_isa;
837 }
838 #else // fallback
839
840 // includes 32-bit ARM.
detect_supported_architectures()841 static inline uint32_t detect_supported_architectures() {
842 return instruction_set::DEFAULT;
843 }
844
845
846 #endif // end SIMD extension detection code
847
848 } // namespace internal
849 } // namespace simdutf
850
851 #endif // SIMDutf_INTERNAL_ISADETECTION_H
852 /* end file include/simdutf/internal/isadetection.h */
853
854
855 namespace simdutf {
856
857 /**
858 * Autodetect the encoding of the input, a single encoding is recommended.
859 * E.g., the function might return simdutf::encoding_type::UTF8,
860 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
861 * simdutf::encoding_type::UTF32_LE.
862 *
863 * @param input the string to analyze.
864 * @param length the length of the string in bytes.
865 * @return the detected encoding type
866 */
867 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
868 simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
869 return autodetect_encoding(reinterpret_cast<const char *>(input), length);
870 }
871
872 /**
873 * Autodetect the possible encodings of the input in one pass.
874 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
875 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
876 *
 * Overridden by each implementation.
878 *
879 * @param input the string to analyze.
880 * @param length the length of the string in bytes.
881 * @return the detected encoding type
882 */
883 simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept;
884 simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept {
885 return detect_encodings(reinterpret_cast<const char *>(input), length);
886 }
887
888 /**
889 * Validate the UTF-8 string. This function may be best when you expect
890 * the input to be almost always valid. Otherwise, consider using
891 * validate_utf8_with_errors.
892 *
893 * Overridden by each implementation.
894 *
895 * @param buf the UTF-8 string to validate.
896 * @param len the length of the string in bytes.
897 * @return true if and only if the string is valid UTF-8.
898 */
899 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
900
901 /**
902 * Validate the UTF-8 string and stop on error.
903 *
904 * Overridden by each implementation.
905 *
906 * @param buf the UTF-8 string to validate.
907 * @param len the length of the string in bytes.
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
909 */
910 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept;
911
912 /**
913 * Validate the ASCII string.
914 *
915 * Overridden by each implementation.
916 *
917 * @param buf the ASCII string to validate.
918 * @param len the length of the string in bytes.
919 * @return true if and only if the string is valid ASCII.
920 */
921 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
922
923 /**
 * Validate the ASCII string and stop on error. It might be faster than
 * validate_ascii when an error is expected to occur early.
926 *
927 * Overridden by each implementation.
928 *
929 * @param buf the ASCII string to validate.
930 * @param len the length of the string in bytes.
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
932 */
933 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept;
934
935 /**
936 * Using native endianness, validate the UTF-16 string.
937 * This function may be best when you expect the input to be almost always valid.
938 * Otherwise, consider using validate_utf16_with_errors, which stops on error and reports its position.
939 *
940 * Overridden by each implementation.
941 *
942 * This function is not BOM-aware.
943 *
944 * @param buf the UTF-16 string to validate.
945 * @param len the length of the string in number of 2-byte code units (char16_t).
946 * @return true if and only if the string is valid UTF-16.
947 */
948 simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
949
950 /**
951 * Validate the UTF-16LE string. This function may be best when you expect
952 * the input to be almost always valid. Otherwise, consider using
953 * validate_utf16le_with_errors, which stops on error and reports its position.
954 *
955 * Overridden by each implementation.
956 *
957 * This function is not BOM-aware.
958 *
959 * @param buf the UTF-16LE string to validate.
960 * @param len the length of the string in number of 2-byte code units (char16_t).
961 * @return true if and only if the string is valid UTF-16LE.
962 */
963 simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept;
964
965 /**
966 * Validate the UTF-16BE string. This function may be best when you expect
967 * the input to be almost always valid. Otherwise, consider using
968 * validate_utf16be_with_errors, which stops on error and reports its position.
969 *
970 * Overridden by each implementation.
971 *
972 * This function is not BOM-aware.
973 *
974 * @param buf the UTF-16BE string to validate.
975 * @param len the length of the string in number of 2-byte code units (char16_t).
976 * @return true if and only if the string is valid UTF-16BE.
977 */
978 simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept;
979
980 /**
981 * Using native endianness, validate the UTF-16 string and stop on error.
982 * It might be faster than validate_utf16 when an error is expected to occur early.
983 *
984 * Overridden by each implementation.
985 *
986 * This function is not BOM-aware.
987 *
988 * @param buf the UTF-16 string to validate.
989 * @param len the length of the string in number of 2-byte code units (char16_t).
990 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of code units validated if successful.
991 */
992 simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept;
993
994 /**
995 * Validate the UTF-16LE string and stop on error. It might be faster than
996 * validate_utf16le when an error is expected to occur early.
997 *
998 * Overridden by each implementation.
999 *
1000 * This function is not BOM-aware.
1001 *
1002 * @param buf the UTF-16LE string to validate.
1003 * @param len the length of the string in number of 2-byte code units (char16_t).
1004 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of code units validated if successful.
1005 */
1006 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept;
1007
1008 /**
1009 * Validate the UTF-16BE string and stop on error. It might be faster than
1010 * validate_utf16be when an error is expected to occur early.
1011 *
1012 * Overridden by each implementation.
1013 *
1014 * This function is not BOM-aware.
1015 *
1016 * @param buf the UTF-16BE string to validate.
1017 * @param len the length of the string in number of 2-byte code units (char16_t).
1018 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of code units validated if successful.
1019 */
1020 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept;
1021
1022 /**
1023 * Validate the UTF-32 string. This function may be best when you expect
1024 * the input to be almost always valid. Otherwise, consider using
1025 * validate_utf32_with_errors, which stops on error and reports its position.
1026 *
1027 * Overridden by each implementation.
1028 *
1029 * This function is not BOM-aware.
1030 *
1031 * @param buf the UTF-32 string to validate.
1032 * @param len the length of the string in number of 4-byte code units (char32_t).
1033 * @return true if and only if the string is valid UTF-32.
1034 */
1035 simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept;
1036
1037 /**
1038 * Validate the UTF-32 string and stop on error. It might be faster than
1039 * validate_utf32 when an error is expected to occur early.
1040 *
1041 * Overridden by each implementation.
1042 *
1043 * This function is not BOM-aware.
1044 *
1045 * @param buf the UTF-32 string to validate.
1046 * @param len the length of the string in number of 4-byte code units (char32_t).
1047 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of code units validated if successful.
1048 */
1049 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
1050
1051 /**
1052 * Convert Latin1 string into UTF-8 string.
1053 *
1054 * This function is suitable to work with inputs from untrusted sources.
1055 *
1056 * @param input the Latin1 string to convert
1057 * @param length the length of the string in bytes
1058 * @param utf8_output pointer to a buffer that can hold the conversion result
1059 * @return the number of written char; 0 if conversion is not possible
1060 */
1061 simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
1062
1063
1064 /**
1065 * Convert Latin1 string into UTF-16LE string.
1066 *
1067 * This function is suitable to work with inputs from untrusted sources.
1068 *
1069 * @param input the Latin1 string to convert
1070 * @param length the length of the string in bytes
1071 * @param utf16_output pointer to a buffer that can hold the conversion result
1072 * @return the number of written char16_t; 0 if conversion is not possible
1073 */
1074 simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1075
1076 /**
1077 * Convert Latin1 string into UTF-16BE string.
1078 *
1079 * This function is suitable to work with inputs from untrusted sources.
1080 *
1081 * @param input the Latin1 string to convert
1082 * @param length the length of the string in bytes
1083 * @param utf16_output pointer to a buffer that can hold the conversion result
1084 * @return the number of written char16_t; 0 if conversion is not possible
1085 */
1086 simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1087
1088 /**
1089 * Convert Latin1 string into UTF-32 string.
1090 *
1091 * This function is suitable to work with inputs from untrusted sources.
1092 *
1093 * @param input the Latin1 string to convert
1094 * @param length the length of the string in bytes
1095 * @param utf32_buffer pointer to a buffer that can hold the conversion result
1096 * @return the number of written char32_t; 0 if conversion is not possible
1097 */
1098 simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1099
1100 /**
1101 * Convert possibly broken UTF-8 string into Latin1 string.
1102 *
1103 * The input string is also validated during the conversion.
1104 * This function is suitable to work with inputs from untrusted sources.
1105 *
1106 * @param input the UTF-8 string to convert
1107 * @param length the length of the string in bytes
1108 * @param latin1_output pointer to a buffer that can hold the conversion result
1109 * @return the number of written char; 0 if the input was not valid UTF-8 string
1110 */
1111 simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
1112
1113 /**
1114 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string.
1115 *
1116 * The input string is also validated during the conversion.
1117 * This function is suitable to work with inputs from untrusted sources.
1118 *
1119 * @param input the UTF-8 string to convert
1120 * @param length the length of the string in bytes
1121 * @param utf16_output pointer to a buffer that can hold the conversion result
1122 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1123 */
1124 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1125
1126
1127 /**
1128 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1129 *
1130 * @param input the Latin1 string to convert
1131 * @param length the length of the string in bytes
1132 * @param utf16_output pointer to a buffer that can hold the conversion result
1133 * @return the number of written char16_t.
1134 */
1135 simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1136
1137 /**
1138 * Convert possibly broken UTF-8 string into UTF-16LE string.
1139 *
1140 * The input string is also validated during the conversion.
1141 * This function is suitable to work with inputs from untrusted sources.
1142 *
1143 * @param input the UTF-8 string to convert
1144 * @param length the length of the string in bytes
1145 * @param utf16_output pointer to a buffer that can hold the conversion result
1146 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1147 */
1148 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1149
1150 /**
1151 * Convert possibly broken UTF-8 string into UTF-16BE string.
1152 *
1153 * The input string is also validated during the conversion.
1154 * This function is suitable to work with inputs from untrusted sources.
1155 *
1156 * @param input the UTF-8 string to convert
1157 * @param length the length of the string in bytes
1158 * @param utf16_output pointer to a buffer that can hold the conversion result
1159 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1160 */
1161 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1162
1163
1164 /**
1165 * Convert possibly broken UTF-8 string into Latin1 string, reporting an error code and position on failure.
1166 *
1167 * The input string is also validated during the conversion.
1168 * This function is suitable to work with inputs from untrusted sources.
1169 *
1170 * @param input the UTF-8 string to convert
1171 * @param length the length of the string in bytes
1172 * @param latin1_output pointer to a buffer that can hold the conversion result
1173 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of code units validated if successful.
1174 */
1175 simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept;
1176
1177 /**
1178 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1179 * string and stop on error.
1180 *
1181 * The input string is also validated during the conversion.
1182 * This function is suitable to work with inputs from untrusted sources.
1183 *
1184 * @param input the UTF-8 string to convert
1185 * @param length the length of the string in bytes
1186 * @param utf16_output pointer to a buffer that can hold the conversion result
1187 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1188 */
1189 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1190
1191 /**
1192 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1193 *
1194 * The input string is also validated during the conversion.
1195 * This function is suitable to work with inputs from untrusted sources.
1196 *
1197 * @param input the UTF-8 string to convert
1198 * @param length the length of the string in bytes
1199 * @param utf16_output pointer to a buffer that can hold the conversion result
1200 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1201 */
1202 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1203
1204 /**
1205 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1206 *
1207 * The input string is also validated during the conversion.
1208 * This function is suitable to work with inputs from untrusted sources.
1209 *
1210 * @param input the UTF-8 string to convert
1211 * @param length the length of the string in bytes
1212 * @param utf16_output pointer to a buffer that can hold the conversion result
1213 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1214 */
1215 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1216
1217 /**
1218 * Convert possibly broken UTF-8 string into UTF-32 string.
1219 *
1220 * The input string is also validated during the conversion.
1221 * This function is suitable to work with inputs from untrusted sources.
1222 *
1223 * @param input the UTF-8 string to convert
1224 * @param length the length of the string in bytes
1225 * @param utf32_output pointer to a buffer that can hold the conversion result
1226 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
1227 */
1228 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept;
1229
1230 /**
1231 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1232 *
1233 * The input string is also validated during the conversion.
1234 * This function is suitable to work with inputs from untrusted sources.
1235 *
1236 * @param input the UTF-8 string to convert
1237 * @param length the length of the string in bytes
1238 * @param utf32_output pointer to a buffer that can hold the conversion result
1239 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char32_t written if successful.
1240 */
1241 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
1242
1243 /**
1244 * Convert valid UTF-8 string into Latin1 string.
1245 *
1246 * This function assumes that the input string is valid UTF-8.
1247 *
1248 * This function is not BOM-aware.
1249 *
1250 * @param input the UTF-8 string to convert
1251 * @param length the length of the string in bytes
1252 * @param latin1_output pointer to a buffer that can hold the conversion result
1253 * @return the number of written char; 0 if the input was not valid UTF-8 string
1254 */
1255 simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
1256
1257
1258 /**
1259 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1260 *
1261 * This function assumes that the input string is valid UTF-8.
1262 *
1263 * @param input the UTF-8 string to convert
1264 * @param length the length of the string in bytes
1265 * @param utf16_buffer pointer to a buffer that can hold the conversion result
1266 * @return the number of written char16_t
1267 */
1268 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1269
1270 /**
1271 * Convert valid UTF-8 string into UTF-16LE string.
1272 *
1273 * This function assumes that the input string is valid UTF-8.
1274 *
1275 * @param input the UTF-8 string to convert
1276 * @param length the length of the string in bytes
1277 * @param utf16_buffer pointer to a buffer that can hold the conversion result
1278 * @return the number of written char16_t
1279 */
1280 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1281
1282 /**
1283 * Convert valid UTF-8 string into UTF-16BE string.
1284 *
1285 * This function assumes that the input string is valid UTF-8.
1286 *
1287 * @param input the UTF-8 string to convert
1288 * @param length the length of the string in bytes
1289 * @param utf16_buffer pointer to a buffer that can hold the conversion result
1290 * @return the number of written char16_t
1291 */
1292 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1293
1294 /**
1295 * Convert valid UTF-8 string into UTF-32 string.
1296 *
1297 * This function assumes that the input string is valid UTF-8.
1298 *
1299 * @param input the UTF-8 string to convert
1300 * @param length the length of the string in bytes
1301 * @param utf32_buffer pointer to a buffer that can hold the conversion result
1302 * @return the number of written char32_t
1303 */
1304 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1305
1306
1307 /**
1308 * Return the number of bytes that this Latin1 string would require in UTF-8 format.
1309 *
1310 * @param input the Latin1 string to convert
1311 * @param length the length of the string in bytes
1312 * @return the number of bytes required to encode the Latin1 string as UTF-8
1313 */
1314 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept;
1315
1316 /**
1317 * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
1318 *
1319 * This function does not validate the input.
1320 *
1321 * This function is not BOM-aware.
1322 *
1323 * @param input the UTF-8 string to convert
1324 * @param length the length of the string in bytes
1325 * @return the number of bytes required to encode the UTF-8 string as Latin1
1326 */
1327 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept;
1328
1329 /**
1330 * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16 format.
1331 *
1332 * This function does not validate the input.
1333 *
1334 * This function is not BOM-aware.
1335 *
1336 * @param input the UTF-8 string to process
1337 * @param length the length of the string in bytes
1338 * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16 (the count does not depend on byte order)
1339 */
1340 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
1341
1342 /**
1343 * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
1344 *
1345 * This function is equivalent to count_utf8.
1346 *
1347 * This function does not validate the input.
1348 *
1349 * This function is not BOM-aware.
1350 *
1351 * @param input the UTF-8 string to process
1352 * @param length the length of the string in bytes
1353 * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32
1354 */
1355 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
1356
1357 /**
1358 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string.
1359 *
1360 * The input string is also validated during the conversion.
1361 * This function is suitable to work with inputs from untrusted sources.
1362 *
1363 * This function is not BOM-aware.
1364 *
1365 * @param input the UTF-16 string to convert
1366 * @param length the length of the string in 2-byte code units (char16_t)
1367 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1368 * @return number of written code units; 0 if input is not a valid UTF-16 string
1369 */
1370 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1371
1372
1373
1374 /**
1375 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string.
1376 *
1377 * The input string is also validated during the conversion.
1378 * This function is suitable to work with inputs from untrusted sources.
1379 *
1380 * This function is not BOM-aware.
1381 *
1382 * @param input the UTF-16 string to convert
1383 * @param length the length of the string in 2-byte code units (char16_t)
1384 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1385 * @return number of written code units; 0 if input is not a valid UTF-16 string
1386 */
1387 simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1388
1389 /**
1390 * Convert possibly broken UTF-16LE string into Latin1 string.
1391 *
1392 * The input string is also validated during the conversion.
1393 * This function is suitable to work with inputs from untrusted sources.
1394 *
1395 * This function is not BOM-aware.
1396 *
1397 * @param input the UTF-16LE string to convert
1398 * @param length the length of the string in 2-byte code units (char16_t)
1399 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1400 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1401 */
1402 simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1403
1404 /**
1405 * Convert possibly broken UTF-16BE string into Latin1 string.
1406 *
1407 * The input string is also validated during the conversion.
1408 * This function is suitable to work with inputs from untrusted sources.
1409 *
1410 * This function is not BOM-aware.
1411 *
1412 * @param input the UTF-16BE string to convert
1413 * @param length the length of the string in 2-byte code units (char16_t)
1414 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1415 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1416 */
1417 simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1418
1419
1420 /**
1421 * Convert possibly broken UTF-16LE string into UTF-8 string.
1422 *
1423 * The input string is also validated during the conversion.
1424 * This function is suitable to work with inputs from untrusted sources.
1425 *
1426 * This function is not BOM-aware.
1427 *
1428 * @param input the UTF-16LE string to convert
1429 * @param length the length of the string in 2-byte code units (char16_t)
1430 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1431 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1432 */
1433 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1434
1435 /**
1436 * Convert possibly broken UTF-16BE string into UTF-8 string.
1437 *
1438 * The input string is also validated during the conversion.
1439 * This function is suitable to work with inputs from untrusted sources.
1440 *
1441 * This function is not BOM-aware.
1442 *
1443 * @param input the UTF-16BE string to convert
1444 * @param length the length of the string in 2-byte code units (char16_t)
1445 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1446 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1447 */
1448 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1449
1450 /**
1451 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string, reporting an error code and position on failure.
1452 *
1453 * The input string is also validated during the conversion.
1454 * This function is suitable to work with inputs from untrusted sources.
1455 * This function is not BOM-aware.
1456 *
1457 * @param input the UTF-16 string to convert
1458 * @param length the length of the string in 2-byte code units (char16_t)
1459 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1460 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1461 */
1462 simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1463
1464 /**
1465 * Convert possibly broken UTF-16LE string into Latin1 string, reporting an error code and position on failure.
1466 *
1467 * The input string is also validated during the conversion.
1468 * This function is suitable to work with inputs from untrusted sources.
1469 * This function is not BOM-aware.
1470 *
1471 * @param input the UTF-16LE string to convert
1472 * @param length the length of the string in 2-byte code units (char16_t)
1473 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1474 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1475 */
1476 simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1477
1478 /**
1479 * Convert possibly broken UTF-16BE string into Latin1 string, reporting an error code and position on failure.
1480 *
1481 * The input string is also validated during the conversion.
1482 * This function is suitable to work with inputs from untrusted sources.
1483 * This function is not BOM-aware.
1484 *
1485 * @param input the UTF-16BE string to convert
1486 * @param length the length of the string in 2-byte code units (char16_t)
1487 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1488 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1489 */
1490 simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1491
1492
1493 /**
1494 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string and stop on error.
1495 *
1496 * The input string is also validated during the conversion.
1497 * This function is suitable to work with inputs from untrusted sources.
1498 *
1499 * This function is not BOM-aware.
1500 *
1501 * @param input the UTF-16 string to convert
1502 * @param length the length of the string in 2-byte code units (char16_t)
1503 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1504 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1505 */
1506 simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1507
1508 /**
1509 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1510 *
1511 * The input string is also validated during the conversion.
1512 * This function is suitable to work with inputs from untrusted sources.
1513 *
1514 * This function is not BOM-aware.
1515 *
1516 * @param input the UTF-16LE string to convert
1517 * @param length the length of the string in 2-byte code units (char16_t)
1518 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1519 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1520 */
1521 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1522
1523 /**
1524 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1525 *
1526 * The input string is also validated during the conversion.
1527 * This function is suitable to work with inputs from untrusted sources.
1528 *
1529 * This function is not BOM-aware.
1530 *
1531 * @param input the UTF-16BE string to convert
1532 * @param length the length of the string in 2-byte code units (char16_t)
1533 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1534 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either the position of the error (in the input in code units) if any, or the number of char written if successful.
1535 */
1536 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1537
1538 /**
1539 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1540 *
1541 * This function assumes that the input string is valid UTF-16 (native endianness).
1542 *
1543 * This function is not BOM-aware.
1544 *
1545 * @param input the UTF-16 string to convert
1546 * @param length the length of the string in 2-byte code units (char16_t)
1547 * @param utf8_buffer pointer to a buffer that can hold the conversion result
1548 * @return number of written code units; 0 if conversion is not possible
1549 */
1550 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1551
1552
1553 /**
1554 * Using native endianness, convert valid UTF-16 string into Latin1 string.
1555 *
1556 * This function assumes that the input string is valid UTF-16 (native endianness).
1557 *
1558 * This function is not BOM-aware.
1559 *
1560 * @param input the UTF-16 string to convert
1561 * @param length the length of the string in 2-byte code units (char16_t)
1562 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1563 * @return number of written code units; 0 if conversion is not possible
1564 */
1565 simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1566
1567 /**
1568 * Convert valid UTF-16LE string into Latin1 string.
1569 *
1570 * This function assumes that the input string is valid UTF-16LE.
1571 *
1572 * This function is not BOM-aware.
1573 *
1574 * @param input the UTF-16LE string to convert
1575 * @param length the length of the string in 2-byte code units (char16_t)
1576 * @param latin1_buffer pointer to a buffer that can hold the conversion result
1577 * @return number of written code units; 0 if conversion is not possible
1578 */
1579 simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1580
1581 /**
1582 * Convert valid UTF-16BE string into Latin1 string.
1583 *
1584 * This function assumes that the input string is valid UTF-16BE.
1585 *
1586 * This function is not BOM-aware.
1587 *
1588 * @param input the UTF-16BE string to convert
1589 * @param length the length of the string in 2-byte code units (char16_t)
1590 * @param latin1_buffer the pointer to a buffer that can hold the conversion result
1591 * @return number of written code units; 0 if conversion is not possible
1592 */
1593 simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1594
1595
1596 /**
1597 * Convert valid UTF-16LE string into UTF-8 string.
1598 *
1599 * This function assumes that the input string is valid UTF-16LE.
1600 *
1601 * This function is not BOM-aware.
1602 *
1603 * @param input the UTF-16LE string to convert
1604 * @param length the length of the string in 2-byte code units (char16_t)
1605 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
1606 * @return number of written code units; 0 if conversion is not possible
1607 */
1608 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1609
1610 /**
1611 * Convert valid UTF-16BE string into UTF-8 string.
1612 *
1613 * This function assumes that the input string is valid UTF-16BE.
1614 *
1615 * This function is not BOM-aware.
1616 *
1617 * @param input the UTF-16BE string to convert
1618 * @param length the length of the string in 2-byte code units (char16_t)
1619 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
1620 * @return number of written code units; 0 if conversion is not possible
1621 */
1622 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1623
1624 /**
1625 * Using native endianness, convert possibly broken UTF-16 string into UTF-32 string.
1626 *
1627 * The input string is also validated during the conversion.
1628 * This function is suitable to work with inputs from untrusted sources.
1629 *
1630 * This function is not BOM-aware.
1631 *
1632 * @param input the UTF-16 string to convert
1633 * @param length the length of the string in 2-byte code units (char16_t)
1634 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1635 * @return number of written code units; 0 if input is not a valid UTF-16 string (native endianness)
1636 */
1637 simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1638
1639 /**
1640 * Convert possibly broken UTF-16LE string into UTF-32 string.
1641 *
1642 * The input string is also validated during the conversion.
1643 * This function is suitable to work with inputs from untrusted sources.
1644 *
1645 * This function is not BOM-aware.
1646 *
1647 * @param input the UTF-16LE string to convert
1648 * @param length the length of the string in 2-byte code units (char16_t)
1649 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1650 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1651 */
1652 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1653
1654 /**
1655 * Convert possibly broken UTF-16BE string into UTF-32 string.
1656 *
1657 * The input string is also validated during the conversion.
1658 * This function is suitable to work with inputs from untrusted sources.
1659 *
1660 * This function is not BOM-aware.
1661 *
1662 * @param input the UTF-16BE string to convert
1663 * @param length the length of the string in 2-byte code units (char16_t)
1664 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1665 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1666 */
1667 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1668
1669 /**
1670 * Using native endianness, convert possibly broken UTF-16 string into
1671 * UTF-32 string and stop on error.
1672 *
1673 * The input string is also validated during the conversion.
1674 * This function is suitable to work with inputs from untrusted sources.
1675 *
1676 * This function is not BOM-aware.
1677 *
1678 * @param input the UTF-16 string to convert
1679 * @param length the length of the string in 2-byte code units (char16_t)
1680 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1681 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char32_t written if successful.
1682 */
1683 simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1684
1685 /**
1686 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1687 *
1688 * The input string is also validated during the conversion.
1689 * This function is suitable to work with inputs from untrusted sources.
1690 *
1691 * This function is not BOM-aware.
1692 *
1693 * @param input the UTF-16LE string to convert
1694 * @param length the length of the string in 2-byte code units (char16_t)
1695 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1696 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char32_t written if successful.
1697 */
1698 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1699
1700 /**
1701 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1702 *
1703 * The input string is also validated during the conversion.
1704 * This function is suitable to work with inputs from untrusted sources.
1705 *
1706 * This function is not BOM-aware.
1707 *
1708 * @param input the UTF-16BE string to convert
1709 * @param length the length of the string in 2-byte code units (char16_t)
1710 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1711 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char32_t written if successful.
1712 */
1713 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1714
1715 /**
1716 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
1717 *
1718 * This function assumes that the input string is valid UTF-16 (native endianness).
1719 *
1720 * This function is not BOM-aware.
1721 *
1722 * @param input the UTF-16 string to convert
1723 * @param length the length of the string in 2-byte code units (char16_t)
1724 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1725 * @return number of written code units; 0 if conversion is not possible
1726 */
1727 simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1728
1729 /**
1730 * Convert valid UTF-16LE string into UTF-32 string.
1731 *
1732 * This function assumes that the input string is valid UTF-16LE.
1733 *
1734 * This function is not BOM-aware.
1735 *
1736 * @param input the UTF-16LE string to convert
1737 * @param length the length of the string in 2-byte code units (char16_t)
1738 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1739 * @return number of written code units; 0 if conversion is not possible
1740 */
1741 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1742
1743 /**
1744 * Convert valid UTF-16BE string into UTF-32 string.
1745 *
1746 * This function assumes that the input string is valid UTF-16BE.
1747 *
1748 * This function is not BOM-aware.
1749 *
1750 * @param input the UTF-16BE string to convert
1751 * @param length the length of the string in 2-byte code units (char16_t)
1752 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
1753 * @return number of written code units; 0 if conversion is not possible
1754 */
1755 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1756
1757
1758 /**
1759 * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
1760 *
1761 * This function does not validate the input.
1762 *
1763 * This function is not BOM-aware.
1764 *
1765 * @param length the length of the string in 2-byte code units (char16_t)
1766 * @return the number of bytes required to encode the UTF-16 string as Latin1
1767 */
1768 simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
1769
1770
1771 /**
1772 * Using native endianness, compute the number of bytes that this UTF-16
1773 * string would require in UTF-8 format.
1774 *
1775 * This function does not validate the input.
1776 *
1777 * @param input the UTF-16 string to process
1778 * @param length the length of the string in 2-byte code units (char16_t)
1779 * @return the number of bytes required to encode the UTF-16 string as UTF-8
1780 */
1781 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept;
1782
1783 /**
1784 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
1785 *
1786 * This function does not validate the input.
1787 *
1788 * @param input the UTF-16LE string to process
1789 * @param length the length of the string in 2-byte code units (char16_t)
1790 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
1791 */
1792 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1793
1794 /**
1795 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
1796 *
1797 * This function does not validate the input.
1798 *
1799 * @param input the UTF-16BE string to process
1800 * @param length the length of the string in 2-byte code units (char16_t)
1801 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
1802 */
1803 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1804
1805 /**
1806 * Convert possibly broken UTF-32 string into UTF-8 string.
1807 *
1808 * The input string is also validated during the conversion.
1809 * This function is suitable to work with inputs from untrusted sources.
1810 *
1811 * This function is not BOM-aware.
1812 *
1813 * @param input the UTF-32 string to convert
1814 * @param length the length of the string in 4-byte code units (char32_t)
1815 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
1816 * @return number of written code units; 0 if input is not a valid UTF-32 string
1817 */
1818 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1819
1820 /**
1821 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
1822 *
1823 * The input string is also validated during the conversion.
1824 * This function is suitable to work with inputs from untrusted sources.
1825 *
1826 * This function is not BOM-aware.
1827 *
1828 * @param input the UTF-32 string to convert
1829 * @param length the length of the string in 4-byte code units (char32_t)
1830 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
1831 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char written if successful.
1832 */
1833 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1834
1835 /**
1836 * Convert valid UTF-32 string into UTF-8 string.
1837 *
1838 * This function assumes that the input string is valid UTF-32.
1839 *
1840 * This function is not BOM-aware.
1841 *
1842 * @param input the UTF-32 string to convert
1843 * @param length the length of the string in 4-byte code units (char32_t)
1844 * @param utf8_buffer the pointer to a buffer that can hold the conversion result
1845 * @return number of written code units; 0 if conversion is not possible
1846 */
1847 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1848
1849 /**
1850 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 string.
1851 *
1852 * The input string is also validated during the conversion.
1853 * This function is suitable to work with inputs from untrusted sources.
1854 *
1855 * This function is not BOM-aware.
1856 *
1857 * @param input the UTF-32 string to convert
1858 * @param length the length of the string in 4-byte code units (char32_t)
1859 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1860 * @return number of written code units; 0 if input is not a valid UTF-32 string
1861 */
1862 simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1863
1864 /**
1865 * Convert possibly broken UTF-32 string into UTF-16LE string.
1866 *
1867 * The input string is also validated during the conversion.
1868 * This function is suitable to work with inputs from untrusted sources.
1869 *
1870 * This function is not BOM-aware.
1871 *
1872 * @param input the UTF-32 string to convert
1873 * @param length the length of the string in 4-byte code units (char32_t)
1874 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1875 * @return number of written code units; 0 if input is not a valid UTF-32 string
1876 */
1877 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1878
1879 /**
1880 * Convert possibly broken UTF-32 string into Latin1 string.
1881 *
1882 * The input string is also validated during the conversion.
1883 * This function is suitable to work with inputs from untrusted sources.
1884 *
1885 * This function is not BOM-aware.
1886 *
1887 * @param input the UTF-32 string to convert
1888 * @param length the length of the string in 4-byte code units (char32_t)
1889 * @param latin1_buffer the pointer to a buffer that can hold the conversion result
1890 * @return number of written code units; 0 if input is not a valid UTF-32 string
1891 */
1892 simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1893
1894
1895 /**
1896 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
1897 *
1898 * The input string is also validated during the conversion.
1899 * This function is suitable to work with inputs from untrusted sources.
1900 *
1901 * This function is not BOM-aware.
1902 *
1903 * @param input the UTF-32 string to convert
1904 * @param length the length of the string in 4-byte code units (char32_t)
1905 * @param latin1_buffer the pointer to a buffer that can hold the conversion result
1906 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char written if successful.
1907 */
1908 simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1909
1910 /**
1911 * Convert valid UTF-32 string into Latin1 string.
1912 *
1913 * This function assumes that the input string is valid UTF-32.
1914 *
1915 * This function is not BOM-aware.
1916 *
1917 * @param input the UTF-32 string to convert
1918 * @param length the length of the string in 4-byte code units (char32_t)
1919 * @param latin1_buffer the pointer to a buffer that can hold the conversion result
1920 * @return number of written code units; 0 if conversion is not possible
1921 */
1922 simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1923
1924 /**
1925 * Convert possibly broken UTF-32 string into UTF-16BE string.
1926 *
1927 * The input string is also validated during the conversion.
1928 * This function is suitable to work with inputs from untrusted sources.
1929 *
1930 * This function is not BOM-aware.
1931 *
1932 * @param input the UTF-32 string to convert
1933 * @param length the length of the string in 4-byte code units (char32_t)
1934 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1935 * @return number of written code units; 0 if input is not a valid UTF-32 string
1936 */
1937 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1938
1939 /**
1940 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
1941 * string and stop on error.
1942 *
1943 * The input string is also validated during the conversion.
1944 * This function is suitable to work with inputs from untrusted sources.
1945 *
1946 * This function is not BOM-aware.
1947 *
1948 * @param input the UTF-32 string to convert
1949 * @param length the length of the string in 4-byte code units (char32_t)
1950 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1951 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char16_t written if successful.
1952 */
1953 simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1954
1955 /**
1956 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
1957 *
1958 * The input string is also validated during the conversion.
1959 * This function is suitable to work with inputs from untrusted sources.
1960 *
1961 * This function is not BOM-aware.
1962 *
1963 * @param input the UTF-32 string to convert
1964 * @param length the length of the string in 4-byte code units (char32_t)
1965 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1966 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char16_t written if successful.
1967 */
1968 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1969
1970 /**
1971 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
1972 *
1973 * The input string is also validated during the conversion.
1974 * This function is suitable to work with inputs from untrusted sources.
1975 *
1976 * This function is not BOM-aware.
1977 *
1978 * @param input the UTF-32 string to convert
1979 * @param length the length of the string in 4-byte code units (char32_t)
1980 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1981 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char16_t written if successful.
1982 */
1983 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1984
1985 /**
1986 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
1987 *
1988 * This function assumes that the input string is valid UTF-32.
1989 *
1990 * This function is not BOM-aware.
1991 *
1992 * @param input the UTF-32 string to convert
1993 * @param length the length of the string in 4-byte code units (char32_t)
1994 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
1995 * @return number of written code units; 0 if conversion is not possible
1996 */
1997 simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1998
1999 /**
2000 * Convert valid UTF-32 string into UTF-16LE string.
2001 *
2002 * This function assumes that the input string is valid UTF-32.
2003 *
2004 * This function is not BOM-aware.
2005 *
2006 * @param input the UTF-32 string to convert
2007 * @param length the length of the string in 4-byte code units (char32_t)
2008 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2009 * @return number of written code units; 0 if conversion is not possible
2010 */
2011 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2012
2013 /**
2014 * Convert valid UTF-32 string into UTF-16BE string.
2015 *
2016 * This function assumes that the input string is valid UTF-32.
2017 *
2018 * This function is not BOM-aware.
2019 *
2020 * @param input the UTF-32 string to convert
2021 * @param length the length of the string in 4-byte code units (char32_t)
2022 * @param utf16_buffer the pointer to a buffer that can hold the conversion result
2023 * @return number of written code units; 0 if conversion is not possible
2024 */
2025 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2026
2027 /**
2028 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
2029 * from UTF-16BE to UTF-16LE.
2030 *
2031 * This function does not validate the input.
2032 *
2033 * This function is not BOM-aware.
2034 *
2035 * @param input the UTF-16 string to process
2036 * @param length the length of the string in 2-byte code units (char16_t)
2037 * @param output the pointer to a buffer that can hold the conversion result
2038 */
2039 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept;
2040
2041 /**
2042 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
2043 *
2044 * This function does not validate the input.
2045 *
2046 * @param input the UTF-32 string to process
2047 * @param length the length of the string in 4-byte code units (char32_t)
2048 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2049 */
2050 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept;
2051
2052 /**
2053 * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
2054 *
2055 * This function does not validate the input.
2056 *
2057 * @param input the UTF-32 string to process
2058 * @param length the length of the string in 4-byte code units (char32_t)
2059 * @return the number of two-byte code units (char16_t), not bytes, required to encode the UTF-32 string as UTF-16
2060 */
2061 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept;
2062
2063 /**
2064 * Using native endianness, compute the number of 4-byte code units (char32_t)
2065 * that this UTF-16 string would require in UTF-32 format.
2066 *
2067 * This function is equivalent to count_utf16.
2068 *
2069 * This function does not validate the input.
2070 *
2071 * This function is not BOM-aware.
2072 *
2073 * @param input the UTF-16 string to process
2074 * @param length the length of the string in 2-byte code units (char16_t)
2075 * @return the number of 4-byte code units (char32_t), not bytes, required to encode the UTF-16 string as UTF-32
2076 */
2077 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept;
2078
2079 /**
2080 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format.
2081 *
2082 * This function is equivalent to count_utf16le.
2083 *
2084 * This function does not validate the input.
2085 *
2086 * This function is not BOM-aware.
2087 *
2088 * @param input the UTF-16LE string to process
2089 * @param length the length of the string in 2-byte code units (char16_t)
2090 * @return the number of 4-byte code units (char32_t), not bytes, required to encode the UTF-16LE string as UTF-32
2091 */
2092 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept;
2093
2094 /**
2095 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format.
2096 *
2097 * This function is equivalent to count_utf16be.
2098 *
2099 * This function does not validate the input.
2100 *
2101 * This function is not BOM-aware.
2102 *
2103 * @param input the UTF-16BE string to process
2104 * @param length the length of the string in 2-byte code units (char16_t)
2105 * @return the number of 4-byte code units (char32_t), not bytes, required to encode the UTF-16BE string as UTF-32
2106 */
2107 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept;
2108
2109 /**
2110 * Using native endianness, count the number of code points (characters) in
2111 * the string, assuming that it is valid.
2112 *
2113 * This function assumes that the input string is valid UTF-16 (native endianness).
2114 *
2115 * This function is not BOM-aware.
2116 *
2117 * @param input the UTF-16 string to process
2118 * @param length the length of the string in 2-byte code units (char16_t)
2119 * @return the number of code points in the string
2120 */
2121 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept;
2122
2123 /**
2124 * Count the number of code points (characters) in the UTF-16LE string,
2125 * assuming that it is valid.
2126 *
2127 * This function assumes that the input string is valid UTF-16LE.
2128 *
2129 * This function is not BOM-aware.
2130 *
2131 * @param input the UTF-16LE string to process
2132 * @param length the length of the string in 2-byte code units (char16_t)
2133 * @return the number of code points in the string
2134 */
2135 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept;
2136
2137 /**
2138 * Count the number of code points (characters) in the UTF-16BE string,
2139 * assuming that it is valid.
2140 *
2141 * This function assumes that the input string is valid UTF-16BE.
2142 *
2143 * This function is not BOM-aware.
2144 *
2145 * @param input the UTF-16BE string to process
2146 * @param length the length of the string in 2-byte code units (char16_t)
2147 * @return the number of code points in the string
2148 */
2149 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept;
2150
2151 /**
2152 * Count the number of code points (characters) in the UTF-8 string,
2153 * assuming that it is valid.
2154 *
2155 * This function assumes that the input string is valid UTF-8.
2156 *
2157 * @param input the UTF-8 string to process
2158 * @param length the length of the string in bytes
2159 * @return the number of code points in the string
2160 */
2161 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept;
2162
2163 /**
2164 * Given a valid UTF-8 string having a possibly truncated last character,
2165 * this function checks the end of the string. If the last character is truncated (or partial),
2166 * then it returns a shorter length (shorter by 1 to 3 bytes) so that the shortened UTF-8
2167 * string only contains complete characters. If there is no truncated character,
2168 * the original length is returned.
2169 *
2170 * This function assumes that the input string is valid UTF-8, but possibly truncated.
2171 *
2172 * @param input the UTF-8 string to process
2173 * @param length the length of the string in bytes
2174 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2175 */
2176 simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2177
2178 /**
2179 * Given a valid UTF-16BE string having a possibly truncated last character,
2180 * this function checks the end of the string. If the last character is truncated (or partial),
2181 * then it returns a shorter length (shorter by 1 unit) so that the shortened UTF-16BE
2182 * string only contains complete characters. If there is no truncated character,
2183 * the original length is returned.
2184 *
2185 * This function assumes that the input string is valid UTF-16BE, but possibly truncated.
2186 *
2187 * @param input the UTF-16BE string to process
2188 * @param length the length of the string in 2-byte code units (char16_t)
2189 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2190 */
2191 simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length);
2192
2193 /**
2194 * Given a valid UTF-16LE string having a possibly truncated last character,
2195 * this function checks the end of the string. If the last character is truncated (or partial),
2196 * then it returns a shorter length (shorter by 1 unit) so that the shortened UTF-16LE
2197 * string only contains complete characters. If there is no truncated character,
2198 * the original length is returned.
2199 *
2200 * This function assumes that the input string is valid UTF-16LE, but possibly truncated.
2201 *
2202 * @param input the UTF-16LE string to process
2203 * @param length the length of the string in 2-byte code units (char16_t)
2204 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2205 */
2206 simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length);
2207
2208
2209 /**
2210 * Given a valid UTF-16 string having a possibly truncated last character,
2211 * this function checks the end of the string. If the last character is truncated (or partial),
2212 * then it returns a shorter length (shorter by 1 unit) so that the shortened UTF-16
2213 * string only contains complete characters. If there is no truncated character,
2214 * the original length is returned.
2215 *
2216 * This function assumes that the input string is valid UTF-16, but possibly truncated.
2217 * We use the native endianness.
2218 *
2219 * @param input the UTF-16 string to process
2220 * @param length the length of the string in 2-byte code units (char16_t)
2221 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2222 */
2223 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
2224
2225 /**
2226 * An implementation of simdutf for a particular CPU architecture.
2227 *
2228 * Also used to maintain the currently active implementation. The active implementation is
2229 * automatically initialized on first use to the most advanced implementation supported by the host.
2230 */
2231 class implementation {
2232 public:
2233
2234 /**
2235 * The name of this implementation.
2236 *
2237 * const implementation *impl = simdutf::active_implementation;
2238 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
2239 *
2240 * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
2241 */
name() const2242 virtual const std::string &name() const { return _name; }
2243
2244 /**
2245 * The description of this implementation.
2246 *
2247 * const implementation *impl = simdutf::active_implementation;
2248 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
2249 *
2250 * @return the human-readable description of the implementation
2251 */
description() const2252 virtual const std::string &description() const { return _description; }
2253
2254 /**
2255 * The instruction sets this implementation is compiled against
2256 * and the current CPU match. This function may poll the current CPU/system
2257 * and should therefore not be called too often if performance is a concern.
2258 *
2259 *
2260 * @return true if the implementation can be safely used on the current system (determined at runtime)
2261 */
2262 bool supported_by_runtime_system() const;
2263
2264 /**
2265 * This function will try to detect the encoding
2266 * @param input the string to identify
2267 * @param length the length of the string in bytes.
2268 * @return the encoding type detected
2269 */
2270 virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept;
2271
2272 /**
2273 * This function will try to detect the possible encodings in one pass
2274 * @param input the string to identify
2275 * @param length the length of the string in bytes.
2276 * @return the encoding type detected
2277 */
2278 virtual int detect_encodings(const char * input, size_t length) const noexcept = 0;
2279
2280 /**
2281 * @private For internal implementation use
2282 *
2283 * The instruction sets this implementation is compiled against.
2284 *
2285 * @return a mask of all required `internal::instruction_set::` values
2286 */
2287 virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
2288
2289
2290 /**
2291 * Validate the UTF-8 string.
2292 *
2293 * Overridden by each implementation.
2294 *
2295 * @param buf the UTF-8 string to validate.
2296 * @param len the length of the string in bytes.
2297 * @return true if and only if the string is valid UTF-8.
2298 */
2299 simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
2300
2301 /**
2302 * Validate the UTF-8 string and stop on errors.
2303 *
2304 * Overridden by each implementation.
2305 *
2306 * @param buf the UTF-8 string to validate.
2307 * @param len the length of the string in bytes.
2308 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2309 */
2310 simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
2311
2312 /**
2313 * Validate the ASCII string.
2314 *
2315 * Overridden by each implementation.
2316 *
2317 * @param buf the ASCII string to validate.
2318 * @param len the length of the string in bytes.
2319 * @return true if and only if the string is valid ASCII.
2320 */
2321 simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0;
2322
2323 /**
2324 * Validate the ASCII string and stop on error.
2325 *
2326 * Overridden by each implementation.
2327 *
2328 * @param buf the ASCII string to validate.
2329 * @param len the length of the string in bytes.
2330 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2331 */
2332 simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
2333
2334 /**
2335 * Validate the UTF-16LE string. This function may be best when you expect
2336 * the input to be almost always valid. Otherwise, consider using
2337 * validate_utf16le_with_errors.
2338 *
2339 * Overridden by each implementation.
2340 *
2341 * This function is not BOM-aware.
2342 *
2343 * @param buf the UTF-16LE string to validate.
2344 * @param len the length of the string in number of 2-byte code units (char16_t).
2345 * @return true if and only if the string is valid UTF-16LE.
2346 */
2347 simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
2348
2349 /**
2350 * Validate the UTF-16BE string. This function may be best when you expect
2351 * the input to be almost always valid. Otherwise, consider using
2352 * validate_utf16be_with_errors.
2353 *
2354 * Overridden by each implementation.
2355 *
2356 * This function is not BOM-aware.
2357 *
2358 * @param buf the UTF-16BE string to validate.
2359 * @param len the length of the string in number of 2-byte code units (char16_t).
2360 * @return true if and only if the string is valid UTF-16BE.
2361 */
2362 simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
2363
2364 /**
2365 * Validate the UTF-16LE string and stop on error. It might be faster than
2366 * validate_utf16le when an error is expected to occur early.
2367 *
2368 * Overridden by each implementation.
2369 *
2370 * This function is not BOM-aware.
2371 *
2372 * @param buf the UTF-16LE string to validate.
2373 * @param len the length of the string in number of 2-byte code units (char16_t).
2374 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2375 */
2376 simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2377
2378 /**
2379 * Validate the UTF-16BE string and stop on error. It might be faster than
2380 * validate_utf16be when an error is expected to occur early.
2381 *
2382 * Overridden by each implementation.
2383 *
2384 * This function is not BOM-aware.
2385 *
2386 * @param buf the UTF-16BE string to validate.
2387 * @param len the length of the string in number of 2-byte code units (char16_t).
2388 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2389 */
2390 simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2391
2392 /**
2393 * Validate the UTF-32 string.
2394 *
2395 * Overridden by each implementation.
2396 *
2397 * This function is not BOM-aware.
2398 *
2399 * @param buf the UTF-32 string to validate.
2400 * @param len the length of the string in number of 4-byte code units (char32_t).
2401 * @return true if and only if the string is valid UTF-32.
2402 */
2403 simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
2404
2405 /**
2406 * Validate the UTF-32 string and stop on error.
2407 *
2408 * Overridden by each implementation.
2409 *
2410 * This function is not BOM-aware.
2411 *
2412 * @param buf the UTF-32 string to validate.
2413 * @param len the length of the string in number of 4-byte code units (char32_t).
2414 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2415 */
2416 simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
2417
2418 /**
2419 * Convert Latin1 string into UTF8 string.
2420 *
2421 * This function is suitable to work with inputs from untrusted sources.
2422 *
2423 * @param input the Latin1 string to convert
2424 * @param length the length of the string in bytes
2425 * @param utf8_output the pointer to buffer that can hold conversion result
2426 * @return the number of written char; 0 if conversion is not possible
2427 */
2428 simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
2429
2430
2431 /**
2432 * Convert possibly Latin1 string into UTF-16LE string.
2433 *
2434 * This function is suitable to work with inputs from untrusted sources.
2435 *
2436 * @param input the Latin1 string to convert
2437 * @param length the length of the string in bytes
2438 * @param utf16_output the pointer to buffer that can hold conversion result
2439 * @return the number of written char16_t; 0 if conversion is not possible
2440 */
2441 simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2442
2443 /**
2444 * Convert Latin1 string into UTF-16BE string.
2445 *
2446 * This function is suitable to work with inputs from untrusted sources.
2447 *
2448 * @param input the Latin1 string to convert
2449 * @param length the length of the string in bytes
2450 * @param utf16_output the pointer to buffer that can hold conversion result
2451 * @return the number of written char16_t; 0 if conversion is not possible
2452 */
2453 simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2454
2455 /**
2456 * Convert Latin1 string into UTF-32 string.
2457 *
2458 * This function is suitable to work with inputs from untrusted sources.
2459 *
2460 * @param input the Latin1 string to convert
2461 * @param length the length of the string in bytes
2462 * @param utf32_buffer the pointer to buffer that can hold conversion result
2463 * @return the number of written char32_t; 0 if conversion is not possible
2464 */
2465 simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2466
2467 /**
2468 * Convert possibly broken UTF-8 string into latin1 string.
2469 *
2470 * During the conversion also validation of the input string is done.
2471 * This function is suitable to work with inputs from untrusted sources.
2472 *
2473 * @param input the UTF-8 string to convert
2474 * @param length the length of the string in bytes
2475 * @param latin1_output the pointer to buffer that can hold conversion result
2476 * @return the number of written char; 0 if the input was not valid UTF-8 string
2477 */
2478 simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2479
2480 /**
2481 * Convert possibly broken UTF-8 string into latin1 string with errors
2482 *
2483 * During the conversion also validation of the input string is done.
2484 * This function is suitable to work with inputs from untrusted sources.
2485 *
2486 * @param input the UTF-8 string to convert
2487 * @param length the length of the string in bytes
2488 * @param latin1_output the pointer to buffer that can hold conversion result
2489 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2490 */
2491 simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2492
2493 /**
2494 * Convert valid UTF-8 string into latin1 string.
2495 *
2496 * This function assumes that the input string is valid UTF-8.
2497 *
2498 * This function is not BOM-aware.
2499 *
2500 * @param input the UTF-8 string to convert
2501 * @param length the length of the string in bytes
2502 * @param latin1_output the pointer to buffer that can hold conversion result
2503 * @return the number of written char; 0 if the input was not valid UTF-8 string
2504 */
2505 simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2506
2507
2508 /**
2509 * Convert possibly broken UTF-8 string into UTF-16LE string.
2510 *
2511 * During the conversion also validation of the input string is done.
2512 * This function is suitable to work with inputs from untrusted sources.
2513 *
2514 * @param input the UTF-8 string to convert
2515 * @param length the length of the string in bytes
2516 * @param utf16_output the pointer to buffer that can hold conversion result
2517 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2518 */
2519 simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2520
2521 /**
2522 * Convert possibly broken UTF-8 string into UTF-16BE string.
2523 *
2524 * During the conversion also validation of the input string is done.
2525 * This function is suitable to work with inputs from untrusted sources.
2526 *
2527 * @param input the UTF-8 string to convert
2528 * @param length the length of the string in bytes
2529 * @param utf16_output the pointer to buffer that can hold conversion result
2530 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2531 */
2532 simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2533
2534 /**
2535 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
2536 *
2537 * During the conversion also validation of the input string is done.
2538 * This function is suitable to work with inputs from untrusted sources.
2539 *
2540 * @param input the UTF-8 string to convert
2541 * @param length the length of the string in bytes
2542 * @param utf16_output the pointer to buffer that can hold conversion result
2543 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2544 */
2545 simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2546
2547 /**
2548 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
2549 *
2550 * During the conversion also validation of the input string is done.
2551 * This function is suitable to work with inputs from untrusted sources.
2552 *
2553 * @param input the UTF-8 string to convert
2554 * @param length the length of the string in bytes
2555 * @param utf16_output the pointer to buffer that can hold conversion result
2556 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2557 */
2558 simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2559
2560 /**
2561 * Convert possibly broken UTF-8 string into UTF-32 string.
2562 *
2563 * During the conversion also validation of the input string is done.
2564 * This function is suitable to work with inputs from untrusted sources.
2565 *
2566 * @param input the UTF-8 string to convert
2567 * @param length the length of the string in bytes
2568 * @param utf32_output the pointer to buffer that can hold conversion result
2569 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
2570 */
2571 simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2572
2573 /**
2574 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
2575 *
2576 * During the conversion also validation of the input string is done.
2577 * This function is suitable to work with inputs from untrusted sources.
2578 *
2579 * @param input the UTF-8 string to convert
2580 * @param length the length of the string in bytes
2581 * @param utf32_output the pointer to buffer that can hold conversion result
2582 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2583 */
2584 simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2585
2586 /**
2587 * Convert valid UTF-8 string into UTF-16LE string.
2588 *
2589 * This function assumes that the input string is valid UTF-8.
2590 *
2591 * @param input the UTF-8 string to convert
2592 * @param length the length of the string in bytes
2593 * @param utf16_buffer the pointer to buffer that can hold conversion result
2594 * @return the number of written char16_t
2595 */
2596 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2597
2598 /**
2599 * Convert valid UTF-8 string into UTF-16BE string.
2600 *
2601 * This function assumes that the input string is valid UTF-8.
2602 *
2603 * @param input the UTF-8 string to convert
2604 * @param length the length of the string in bytes
2605 * @param utf16_buffer the pointer to buffer that can hold conversion result
2606 * @return the number of written char16_t
2607 */
2608 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2609
2610 /**
2611 * Convert valid UTF-8 string into UTF-32 string.
2612 *
2613 * This function assumes that the input string is valid UTF-8.
2614 *
2615 * @param input the UTF-8 string to convert
2616 * @param length the length of the string in bytes
2617 * @param utf32_buffer the pointer to buffer that can hold conversion result
2618 * @return the number of written char32_t
2619 */
2620 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2621
2622 /**
2623 * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
2624 *
2625 * This function does not validate the input.
2626 *
2627 * @param input the UTF-8 string to process
2628 * @param length the length of the string in bytes
2629 * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE
2630 */
2631 simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2632
2633 /**
2634 * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
2635 *
2636 * This function is equivalent to count_utf8.
2637 *
2638 * This function does not validate the input.
2639 *
2640 * @param input the UTF-8 string to process
2641 * @param length the length of the string in bytes
2642 * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32
2643 */
2644 simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2645
2646 /**
2647 * Convert possibly broken UTF-16LE string into Latin1 string.
2648 *
2649 * During the conversion also validation of the input string is done.
2650 * This function is suitable to work with inputs from untrusted sources.
2651 *
2652 * This function is not BOM-aware.
2653 *
2654 * @param input the UTF-16LE string to convert
2655 * @param length the length of the string in 2-byte code units (char16_t)
2656 * @param latin1_buffer the pointer to buffer that can hold conversion result
2657 * @return number of written code units; 0 if input is not a valid UTF-16LE string
2658 */
2659 simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2660
2661 /**
2662 * Convert possibly broken UTF-16BE string into Latin1 string.
2663 *
2664 * During the conversion also validation of the input string is done.
2665 * This function is suitable to work with inputs from untrusted sources.
2666 *
2667 * This function is not BOM-aware.
2668 *
2669 * @param input the UTF-16BE string to convert
2670 * @param length the length of the string in 2-byte code units (char16_t)
2671 * @param latin1_buffer the pointer to buffer that can hold conversion result
2672 * @return number of written code units; 0 if input is not a valid UTF-16BE string
2673 */
2674 simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2675
2676 /**
2677 * Convert possibly broken UTF-16LE string into Latin1 string.
2678 *
2679 * During the conversion also validation of the input string is done.
2680 * This function is suitable to work with inputs from untrusted sources.
2681 * This function is not BOM-aware.
2682 *
2683 * @param input the UTF-16LE string to convert
2684 * @param length the length of the string in 2-byte code units (char16_t)
2685 * @param latin1_buffer the pointer to buffer that can hold conversion result
2686 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2687 */
2688 simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2689
2690 /**
2691 * Convert possibly broken UTF-16BE string into Latin1 string.
2692 *
2693 * During the conversion also validation of the input string is done.
2694 * This function is suitable to work with inputs from untrusted sources.
2695 * This function is not BOM-aware.
2696 *
2697 * @param input the UTF-16BE string to convert
2698 * @param length the length of the string in 2-byte code units (char16_t)
2699 * @param latin1_buffer the pointer to buffer that can hold conversion result
2700 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2701 */
2702 simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2703
2704 /**
2705 * Convert valid UTF-16LE string into Latin1 string.
2706 *
2707 * This function assumes that the input string is valid UTF-16LE.
2708 *
2709 * This function is not BOM-aware.
2710 *
2711 * @param input the UTF-16LE string to convert
2712 * @param length the length of the string in 2-byte code units (char16_t)
2713 * @param latin1_buffer the pointer to buffer that can hold conversion result
2714 * @return number of written code units; 0 if conversion is not possible
2715 */
2716 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2717
2718 /**
2719 * Convert valid UTF-16BE string into Latin1 string.
2720 *
2721 * This function assumes that the input string is valid UTF-16BE.
2722 *
2723 * This function is not BOM-aware.
2724 *
2725 * @param input the UTF-16BE string to convert
2726 * @param length the length of the string in 2-byte code units (char16_t)
2727 * @param latin1_buffer the pointer to buffer that can hold conversion result
2728 * @return number of written code units; 0 if conversion is not possible
2729 */
2730 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2731
2732 /**
2733 * Convert possibly broken UTF-16LE string into UTF-8 string.
2734 *
2735 * During the conversion also validation of the input string is done.
2736 * This function is suitable to work with inputs from untrusted sources.
2737 *
2738 * This function is not BOM-aware.
2739 *
2740 * @param input the UTF-16LE string to convert
2741 * @param length the length of the string in 2-byte code units (char16_t)
2742 * @param utf8_buffer the pointer to buffer that can hold conversion result
2743 * @return number of written code units; 0 if input is not a valid UTF-16LE string
2744 */
2745 simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2746
2747 /**
2748 * Convert possibly broken UTF-16BE string into UTF-8 string.
2749 *
2750 * During the conversion also validation of the input string is done.
2751 * This function is suitable to work with inputs from untrusted sources.
2752 *
2753 * This function is not BOM-aware.
2754 *
2755 * @param input the UTF-16BE string to convert
2756 * @param length the length of the string in 2-byte code units (char16_t)
2757 * @param utf8_buffer the pointer to buffer that can hold conversion result
2758 * @return number of written code units; 0 if input is not a valid UTF-16BE string
2759 */
2760 simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2761
2762 /**
2763 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2764 *
2765 * During the conversion also validation of the input string is done.
2766 * This function is suitable to work with inputs from untrusted sources.
2767 *
2768 * This function is not BOM-aware.
2769 *
2770 * @param input the UTF-16LE string to convert
2771 * @param length the length of the string in 2-byte code units (char16_t)
2772 * @param utf8_buffer the pointer to buffer that can hold conversion result
2773 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2774 */
2775 simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2776
2777 /**
2778 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2779 *
2780 * During the conversion also validation of the input string is done.
2781 * This function is suitable to work with inputs from untrusted sources.
2782 *
2783 * This function is not BOM-aware.
2784 *
2785 * @param input the UTF-16BE string to convert
2786 * @param length the length of the string in 2-byte code units (char16_t)
2787 * @param utf8_buffer the pointer to buffer that can hold conversion result
2788 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2789 */
2790 simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2791
2792 /**
2793 * Convert valid UTF-16LE string into UTF-8 string.
2794 *
2795 * This function assumes that the input string is valid UTF-16LE.
2796 *
2797 * This function is not BOM-aware.
2798 *
2799 * @param input the UTF-16LE string to convert
2800 * @param length the length of the string in 2-byte code units (char16_t)
2801 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2802 * @return number of written code units; 0 if conversion is not possible
2803 */
2804 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2805
2806 /**
2807 * Convert valid UTF-16BE string into UTF-8 string.
2808 *
2809 * This function assumes that the input string is valid UTF-16BE.
2810 *
2811 * This function is not BOM-aware.
2812 *
2813 * @param input the UTF-16BE string to convert
2814 * @param length the length of the string in 2-byte code units (char16_t)
2815 * @param utf8_buffer the pointer to buffer that can hold the conversion result
2816 * @return number of written code units; 0 if conversion is not possible
2817 */
2818 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2819
2820 /**
2821 * Convert possibly broken UTF-16LE string into UTF-32 string.
2822 *
2823 * During the conversion also validation of the input string is done.
2824 * This function is suitable to work with inputs from untrusted sources.
2825 *
2826 * This function is not BOM-aware.
2827 *
2828 * @param input the UTF-16LE string to convert
2829 * @param length the length of the string in 2-byte code units (char16_t)
2830 * @param utf32_buffer the pointer to buffer that can hold conversion result
2831 * @return number of written code units; 0 if input is not a valid UTF-16LE string
2832 */
2833 simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2834
2835 /**
2836 * Convert possibly broken UTF-16BE string into UTF-32 string.
2837 *
2838 * During the conversion also validation of the input string is done.
2839 * This function is suitable to work with inputs from untrusted sources.
2840 *
2841 * This function is not BOM-aware.
2842 *
2843 * @param input the UTF-16BE string to convert
2844 * @param length the length of the string in 2-byte code units (char16_t)
2845 * @param utf32_buffer the pointer to buffer that can hold conversion result
2846 * @return number of written code units; 0 if input is not a valid UTF-16BE string
2847 */
2848 simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2849
2850 /**
2851 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2852 *
2853 * During the conversion also validation of the input string is done.
2854 * This function is suitable to work with inputs from untrusted sources.
2855 *
2856 * This function is not BOM-aware.
2857 *
2858 * @param input the UTF-16LE string to convert
2859 * @param length the length of the string in 2-byte code units (char16_t)
2860 * @param utf32_buffer the pointer to buffer that can hold conversion result
2861 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2862 */
2863 simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2864
2865 /**
2866 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2867 *
2868 * During the conversion also validation of the input string is done.
2869 * This function is suitable to work with inputs from untrusted sources.
2870 *
2871 * This function is not BOM-aware.
2872 *
2873 * @param input the UTF-16BE string to convert
2874 * @param length the length of the string in 2-byte code units (char16_t)
2875 * @param utf32_buffer the pointer to buffer that can hold conversion result
2876 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2877 */
2878 simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2879
2880 /**
2881 * Convert valid UTF-16LE string into UTF-32 string.
2882 *
2883 * This function assumes that the input string is valid UTF-16LE.
2884 *
2885 * This function is not BOM-aware.
2886 *
2887 * @param input the UTF-16LE string to convert
2888 * @param length the length of the string in 2-byte code units (char16_t)
2889 * @param utf32_buffer the pointer to buffer that can hold the conversion result
2890 * @return number of written code units; 0 if conversion is not possible
2891 */
2892 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2893
2894 /**
2895 * Convert valid UTF-16BE string into UTF-32 string.
2896 *
2897 * This function assumes that the input string is valid UTF-16BE.
2898 *
2899 * This function is not BOM-aware.
2900 *
2901 * @param input the UTF-16BE string to convert
2902 * @param length the length of the string in 2-byte code units (char16_t)
2903 * @param utf32_buffer the pointer to buffer that can hold the conversion result
2904 * @return number of written code units; 0 if conversion is not possible
2905 */
2906 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2907
2908 /**
2909 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
2910 *
2911 * This function does not validate the input.
2912 *
2913 * This function is not BOM-aware.
2914 *
2915 * @param input the UTF-16LE string to convert
2916 * @param length the length of the string in 2-byte code units (char16_t)
2917 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2918 */
2919 simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
2920
2921 /**
2922 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
2923 *
2924 * This function does not validate the input.
2925 *
2926 * This function is not BOM-aware.
2927 *
2928 * @param input the UTF-16BE string to convert
2929 * @param length the length of the string in 2-byte code units (char16_t)
2930 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2931 */
2932 simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
2933
2934 /**
2935 * Convert possibly broken UTF-32 string into Latin1 string.
2936 *
2937 * During the conversion also validation of the input string is done.
2938 * This function is suitable to work with inputs from untrusted sources.
2939 *
2940 * This function is not BOM-aware.
2941 *
2942 * @param input the UTF-32 string to convert
2943 * @param length the length of the string in 4-byte code units (char32_t)
2944 * @param latin1_buffer the pointer to buffer that can hold conversion result
2945 * @return number of written code units; 0 if input is not a valid UTF-32 string
2946 */
2947
2948 simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2949
2950 /**
2951 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2952 *
2953 * During the conversion also validation of the input string is done.
2954 * This function is suitable to work with inputs from untrusted sources.
2955 *
2956 * This function is not BOM-aware.
2957 *
2958 * @param input the UTF-32 string to convert
2959 * @param length the length of the string in 4-byte code units (char32_t)
2960 * @param latin1_buffer the pointer to buffer that can hold conversion result
2961 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2962 */
2963
2964 simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2965
2966 /**
2967 * Convert valid UTF-32 string into Latin1 string.
2968 *
2969 * This function assumes that the input string is valid UTF-32.
2970 *
2971 * This function is not BOM-aware.
2972 *
2973 * @param input the UTF-32 string to convert
2974 * @param length the length of the string in 4-byte code units (char32_t)
2975 * @param latin1_buffer the pointer to buffer that can hold the conversion result
2976 * @return number of written code units; 0 if conversion is not possible
2977 */
2978 simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2979
2980 /**
2981 * Convert possibly broken UTF-32 string into UTF-8 string.
2982 *
2983 * During the conversion also validation of the input string is done.
2984 * This function is suitable to work with inputs from untrusted sources.
2985 *
2986 * This function is not BOM-aware.
2987 *
2988 * @param input the UTF-32 string to convert
2989 * @param length the length of the string in 4-byte code units (char32_t)
2990 * @param utf8_buffer the pointer to buffer that can hold conversion result
2991 * @return number of written code units; 0 if input is not a valid UTF-32 string
2992 */
2993 simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2994
2995 /**
2996 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2997 *
2998 * During the conversion also validation of the input string is done.
2999 * This function is suitable to work with inputs from untrusted sources.
3000 *
3001 * This function is not BOM-aware.
3002 *
3003 * @param input the UTF-32 string to convert
3004 * @param length the length of the string in 4-byte code units (char32_t)
3005 * @param utf8_buffer the pointer to buffer that can hold conversion result
3006 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
3007 */
3008 simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
3009
3010 /**
3011 * Convert valid UTF-32 string into UTF-8 string.
3012 *
3013 * This function assumes that the input string is valid UTF-32.
3014 *
3015 * This function is not BOM-aware.
3016 *
3017 * @param input the UTF-32 string to convert
3018 * @param length the length of the string in 4-byte code units (char32_t)
3019 * @param utf8_buffer the pointer to buffer that can hold the conversion result
3020 * @return number of written code units; 0 if conversion is not possible
3021 */
3022 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
3023
3024
3025 /**
3026 * Return the number of 2-byte code units (char16_t) that a Latin1 string of the given length would require in UTF-16 format.
3027 *
3028 * Only the length is needed: this function takes no pointer to the string contents.
3029 *
3030 * @param length the length of the Latin1 string in bytes
3031 * @return the number of 2-byte code units (char16_t) required to encode the Latin1 string as UTF-16
3032 */
3033 simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
3034
3035 /**
3036 * Convert possibly broken UTF-32 string into UTF-16LE string.
3037 *
3038 * During the conversion also validation of the input string is done.
3039 * This function is suitable to work with inputs from untrusted sources.
3040 *
3041 * This function is not BOM-aware.
3042 *
3043 * @param input the UTF-32 string to convert
3044 * @param length the length of the string in 4-byte code units (char32_t)
3045 * @param utf16_buffer the pointer to buffer that can hold conversion result
3046 * @return number of written code units; 0 if input is not a valid UTF-32 string
3047 */
3048 simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3049
3050 /**
3051 * Convert possibly broken UTF-32 string into UTF-16BE string.
3052 *
3053 * During the conversion also validation of the input string is done.
3054 * This function is suitable to work with inputs from untrusted sources.
3055 *
3056 * This function is not BOM-aware.
3057 *
3058 * @param input the UTF-32 string to convert
3059 * @param length the length of the string in 4-byte code units (char32_t)
3060 * @param utf16_buffer the pointer to buffer that can hold conversion result
3061 * @return number of written code units; 0 if input is not a valid UTF-32 string
3062 */
3063 simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3064
3065 /**
3066 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3067 *
3068 * During the conversion also validation of the input string is done.
3069 * This function is suitable to work with inputs from untrusted sources.
3070 *
3071 * This function is not BOM-aware.
3072 *
3073 * @param input the UTF-32 string to convert
3074 * @param length the length of the string in 4-byte code units (char32_t)
3075 * @param utf16_buffer the pointer to buffer that can hold conversion result
3076 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
3077 */
3078 simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3079
3080 /**
3081 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3082 *
3083 * During the conversion also validation of the input string is done.
3084 * This function is suitable to work with inputs from untrusted sources.
3085 *
3086 * This function is not BOM-aware.
3087 *
3088 * @param input the UTF-32 string to convert
3089 * @param length the length of the string in 4-byte code units (char32_t)
3090 * @param utf16_buffer the pointer to buffer that can hold conversion result
3091 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
3092 */
3093 simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3094
3095 /**
3096 * Convert valid UTF-32 string into UTF-16LE string.
3097 *
3098 * This function assumes that the input string is valid UTF-32.
3099 *
3100 * This function is not BOM-aware.
3101 *
3102 * @param input the UTF-32 string to convert
3103 * @param length the length of the string in 4-byte code units (char32_t)
3104 * @param utf16_buffer the pointer to buffer that can hold the conversion result
3105 * @return number of written code units; 0 if conversion is not possible
3106 */
3107 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3108
3109 /**
3110 * Convert valid UTF-32 string into UTF-16BE string.
3111 *
3112 * This function assumes that the input string is valid UTF-32.
3113 *
3114 * This function is not BOM-aware.
3115 *
3116 * @param input the UTF-32 string to convert
3117 * @param length the length of the string in 4-byte code units (char32_t)
3118 * @param utf16_buffer the pointer to buffer that can hold the conversion result
3119 * @return number of written code units; 0 if conversion is not possible
3120 */
3121 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3122
3123 /**
3124 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
3125 * from UTF-16BE to UTF-16LE.
3126 *
3127 * This function does not validate the input.
3128 *
3129 * This function is not BOM-aware.
3130 *
3131 * @param input the UTF-16 string to process
3132 * @param length the length of the string in 2-byte code units (char16_t)
3133 * @param output the pointer to buffer that can hold the conversion result
3134 */
3135 virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
3136
3137 /**
3138 * Return the number of bytes that this Latin1 string would require in UTF-8 format.
3139 *
3140 * @param input the Latin1 string to convert
3141 * @param length the length of the string in bytes
3142 * @return the number of bytes required to encode the Latin1 string as UTF-8
3143 */
3144 simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0;
3145
3146 /**
3147 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
3148 *
3149 * This function does not validate the input.
3150 *
3151 * @param input the UTF-32 string to convert
3152 * @param length the length of the string in 4-byte code units (char32_t)
3153 * @return the number of bytes required to encode the UTF-32 string as UTF-8
3154 */
3155 simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
3156
3157 /**
3158 * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
3159 *
3160 * This function does not validate the input.
3161 *
3162 * @param length the length of the string in 4-byte code units (char32_t)
3163 * @return the number of bytes required to encode the UTF-32 string as Latin1
3164 */
3165 simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0;
3166
3167 /**
3168 * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
3169 *
3170 * This function does not validate the input.
3171 *
3172 * @param input the UTF-8 string to convert
3173 * @param length the length of the string in bytes
3174 * @return the number of bytes required to encode the UTF-8 string as Latin1
3175 */
3176 simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0;
3177
3178 /**
3179 * Compute the number of bytes that a UTF-16 (LE or BE) string of the given length would require in Latin1 format.
3180 *
3181 * This function does not validate the input.
3182 *
3183 * This function is not BOM-aware.
3184 *
3185 * Only the length is needed: this function takes no pointer to the string contents.
3186 * @param length the length of the string in 2-byte code units (char16_t)
3187 * @return the number of bytes required to encode the UTF-16 string as Latin1
3188 */
3189 simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
3190
3191 /**
3192 * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
3193 *
3194 * This function does not validate the input.
3195 *
3196 * @param input the UTF-32 string to convert
3197 * @param length the length of the string in 4-byte code units (char32_t)
3198 * @return the number of 2-byte code units (char16_t) required to encode the UTF-32 string as UTF-16
3199 */
3200 simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
3201
3202
3203 /**
3204 * Return the number of 4-byte code units (char32_t) that a Latin1 string of the given length would require in UTF-32 format.
3205 *
3206 * This function does not validate the input.
3207 *
3208 * Only the length is needed: this function takes no pointer to the string contents.
3209 * @param length the length of the Latin1 string in bytes
3210 * @return the number of 4-byte code units (char32_t) required to encode the Latin1 string as UTF-32
3211 */
3212 simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0;
3213
3214 /**
3215 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format.
3216 *
3217 * This function is equivalent to count_utf16le.
3218 *
3219 * This function does not validate the input.
3220 *
3221 * This function is not BOM-aware.
3222 *
3223 * @param input the UTF-16LE string to convert
3224 * @param length the length of the string in 2-byte code units (char16_t)
3225 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16LE string as UTF-32
3226 */
3227 simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
3228
3229 /**
3230 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format.
3231 *
3232 * This function is equivalent to count_utf16be.
3233 *
3234 * This function does not validate the input.
3235 *
3236 * This function is not BOM-aware.
3237 *
3238 * @param input the UTF-16BE string to convert
3239 * @param length the length of the string in 2-byte code units (char16_t)
3240 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16BE string as UTF-32
3241 */
3242 simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
3243
3244 /**
3245 * Count the number of code points (characters) in the string assuming that
3246 * it is valid.
3247 *
3248 * This function assumes that the input string is valid UTF-16LE.
3249 *
3250 * This function is not BOM-aware.
3251 *
3252 * @param input the UTF-16LE string to process
3253 * @param length the length of the string in 2-byte code units (char16_t)
3254 * @return number of code points
3255 */
3256 simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0;
3257
3258 /**
3259 * Count the number of code points (characters) in the string assuming that
3260 * it is valid.
3261 *
3262 * This function assumes that the input string is valid UTF-16BE.
3263 *
3264 * This function is not BOM-aware.
3265 *
3266 * @param input the UTF-16BE string to process
3267 * @param length the length of the string in 2-byte code units (char16_t)
3268 * @return number of code points
3269 */
3270 simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0;
3271
3272
3273 /**
3274 * Count the number of code points (characters) in the string assuming that
3275 * it is valid.
3276 *
3277 * This function assumes that the input string is valid UTF-8.
3278 *
3279 * @param input the UTF-8 string to process
3280 * @param length the length of the string in bytes
3281 * @return number of code points
3282 */
3283 simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
3284
3285
3286
3287 protected:
3288 /** @private Construct an implementation with the given name and description. For subclasses. */
implementation( std::string name, std::string description, uint32_t required_instruction_sets )3289 simdutf_really_inline implementation(
3290 std::string name,
3291 std::string description,
3292 uint32_t required_instruction_sets
3293 ) :
3294 _name(name),
3295 _description(description),
3296 _required_instruction_sets(required_instruction_sets)
3297 {
3298 }
3299 virtual ~implementation()=default;
3300
3301 private:
3302 /**
3303 * The name of this implementation.
3304 */
3305 const std::string _name;
3306
3307 /**
3308 * The description of this implementation.
3309 */
3310 const std::string _description;
3311
3312 /**
3313 * Instruction sets required for this implementation.
3314 */
3315 const uint32_t _required_instruction_sets;
3316 };
3317
3318 /** @private */
3319 namespace internal {
3320
3321 /**
3322 * The list of available implementations compiled into simdutf.
3323 */
3324 class available_implementation_list {
3325 public:
3326 /** Get the list of available implementations compiled into simdutf */
available_implementation_list()3327 simdutf_really_inline available_implementation_list() {}
3328 /** Number of implementations */
3329 size_t size() const noexcept;
3330 /** STL const begin() iterator */
3331 const implementation * const *begin() const noexcept;
3332 /** STL const end() iterator */
3333 const implementation * const *end() const noexcept;
3334
3335 /**
3336 * Get the implementation with the given name.
3337 *
3338 * Case sensitive.
3339 *
3340 * const implementation *impl = simdutf::available_implementations["westmere"];
3341 * if (!impl) { exit(1); }
3342 * if (!imp->supported_by_runtime_system()) { exit(1); }
3343 * simdutf::active_implementation = impl;
3344 *
3345 * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
3346 * @return the implementation, or nullptr if the parse failed.
3347 */
3348 const implementation * operator[](const std::string &name) const noexcept {
3349 for (const implementation * impl : *this) {
3350 if (impl->name() == name) { return impl; }
3351 }
3352 return nullptr;
3353 }
3354
3355 /**
3356 * Detect the most advanced implementation supported by the current host.
3357 *
3358 * This is used to initialize the implementation on startup.
3359 *
3360 * const implementation *impl = simdutf::available_implementation::detect_best_supported();
3361 * simdutf::active_implementation = impl;
3362 *
3363 * @return the most advanced supported implementation for the current host, or an
3364 * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
3365 * implementation. Will never return nullptr.
3366 */
3367 const implementation *detect_best_supported() const noexcept;
3368 };
3369
/**
 * Thin wrapper around a raw pointer to T.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain T*.
 * Otherwise it is stored in a std::atomic<T*>: reads go through load() and
 * writes go through the atomic's assignment operator.
 *
 * The wrapper converts implicitly to T* / const T* and offers the usual
 * dereference (*) and member-access (->) operators.
 */
template<typename T>
class atomic_ptr {
public:
  /** Wrap the given raw pointer. */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}

  /** Read-only access to the currently stored pointer. */
  operator const T*() const { return get(); }
  const T& operator*() const { return *get(); }
  const T* operator->() const { return get(); }

  /** Mutable access to the currently stored pointer. */
  operator T*() { return get(); }
  T& operator*() { return *get(); }
  T* operator->() { return get(); }

  /** Replace the stored pointer and return this wrapper. */
  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }

private:
  /** Single place that reads the stored pointer for both build configurations. */
  T* get() const {
#if defined(SIMDUTF_NO_THREADS)
    return ptr;
#else
    return ptr.load();
#endif
  }

#if defined(SIMDUTF_NO_THREADS)
  T* ptr;
#else
  std::atomic<T*> ptr;
#endif
};
3404
3405 class detect_best_supported_implementation_on_first_use;
3406
3407 } // namespace internal
3408
3409 /**
3410 * The list of available implementations compiled into simdutf.
3411 */
3412 extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
3413
3414 /**
3415 * The active implementation.
3416 *
3417 * Automatically initialized on first use to the most advanced implementation supported by this hardware.
3418 */
3419 extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
3420
3421
3422 } // namespace simdutf
3423
3424 #endif // SIMDUTF_IMPLEMENTATION_H
3425 /* end file include/simdutf/implementation.h */
3426
3427
3428 // Implementation-internal files (must be included before the implementations themselves, to keep
3429 // amalgamation working--otherwise, the first time a file is included, it might be put inside the
3430 // #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
3431 // compile unless that implementation is turned on).
3432
3433
3434 SIMDUTF_POP_DISABLE_WARNINGS
3435
3436 #endif // SIMDUTF_H
3437 /* end file include/simdutf.h */
3438