1/* auto-generated on 2023-12-01 13:59:01 -0500. Do not edit! */ 2/* begin file include/simdutf.h */ 3#ifndef SIMDUTF_H 4#define SIMDUTF_H 5#include <cstring> 6 7/* begin file include/simdutf/compiler_check.h */ 8#ifndef SIMDUTF_COMPILER_CHECK_H 9#define SIMDUTF_COMPILER_CHECK_H 10 11#ifndef __cplusplus 12#error simdutf requires a C++ compiler 13#endif 14 15#ifndef SIMDUTF_CPLUSPLUS 16#if defined(_MSVC_LANG) && !defined(__clang__) 17#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG) 18#else 19#define SIMDUTF_CPLUSPLUS __cplusplus 20#endif 21#endif 22 23// C++ 17 24#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L) 25#define SIMDUTF_CPLUSPLUS17 1 26#endif 27 28// C++ 14 29#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L) 30#define SIMDUTF_CPLUSPLUS14 1 31#endif 32 33// C++ 11 34#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L) 35#define SIMDUTF_CPLUSPLUS11 1 36#endif 37 38#ifndef SIMDUTF_CPLUSPLUS11 39#error simdutf requires a compiler compliant with the C++11 standard 40#endif 41 42#endif // SIMDUTF_COMPILER_CHECK_H 43/* end file include/simdutf/compiler_check.h */ 44/* begin file include/simdutf/common_defs.h */ 45#ifndef SIMDUTF_COMMON_DEFS_H 46#define SIMDUTF_COMMON_DEFS_H 47 48#include <cassert> 49/* begin file include/simdutf/portability.h */ 50#ifndef SIMDUTF_PORTABILITY_H 51#define SIMDUTF_PORTABILITY_H 52 53#include <cstddef> 54#include <cstdint> 55#include <cstdlib> 56#include <cfloat> 57#include <cassert> 58#ifndef _WIN32 59// strcasecmp, strncasecmp 60#include <strings.h> 61#endif 62 63/** 64 * We want to check that it is actually a little endian system at 65 * compile-time. 
66 */ 67 68#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) 69#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 70#elif defined(_WIN32) 71#define SIMDUTF_IS_BIG_ENDIAN 0 72#else 73#if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ 74#include <machine/endian.h> 75#elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) 76#include <sys/byteorder.h> 77#else // defined(__APPLE__) || defined(__FreeBSD__) 78 79#ifdef __has_include 80#if __has_include(<endian.h>) 81#include <endian.h> 82#endif //__has_include(<endian.h>) 83#endif //__has_include 84 85#endif // defined(__APPLE__) || defined(__FreeBSD__) 86 87 88#ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) 89#define SIMDUTF_IS_BIG_ENDIAN 0 90#endif 91 92#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 93#define SIMDUTF_IS_BIG_ENDIAN 0 94#else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 95#define SIMDUTF_IS_BIG_ENDIAN 1 96#endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 97 98#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ 99 100 101/** 102 * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined. 103 */ 104 105#ifdef _MSC_VER 106#define SIMDUTF_VISUAL_STUDIO 1 107/** 108 * We want to differentiate carefully between 109 * clang under visual studio and regular visual 110 * studio. 111 * 112 * Under clang for Windows, we enable: 113 * * target pragmas so that part and only part of the 114 * code gets compiled for advanced instructions. 115 * 116 */ 117#ifdef __clang__ 118// clang under visual studio 119#define SIMDUTF_CLANG_VISUAL_STUDIO 1 120#else 121// just regular visual studio (best guess) 122#define SIMDUTF_REGULAR_VISUAL_STUDIO 1 123#endif // __clang__ 124#endif // _MSC_VER 125 126#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO 127// https://en.wikipedia.org/wiki/C_alternative_tokens 128// This header should have no effect, except maybe 129// under Visual Studio. 
130#include <iso646.h> 131#endif 132 133#if defined(__x86_64__) || defined(_M_AMD64) 134#define SIMDUTF_IS_X86_64 1 135#elif defined(__aarch64__) || defined(_M_ARM64) 136#define SIMDUTF_IS_ARM64 1 137#elif defined(__PPC64__) || defined(_M_PPC64) 138//#define SIMDUTF_IS_PPC64 1 139// The simdutf library does yet support SIMD acceleration under 140// POWER processors. Please see https://github.com/lemire/simdutf/issues/51 141#elif defined(__s390__) 142// s390 IBM system. Big endian. 143#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64 144// RISC-V 64-bit 145#else 146// The simdutf library is designed 147// for 64-bit processors and it seems that you are not 148// compiling for a known 64-bit platform. Please 149// use a 64-bit target such as x64 or 64-bit ARM for best performance. 150#define SIMDUTF_IS_32BITS 1 151 152// We do not support 32-bit platforms, but it can be 153// handy to identify them. 154#if defined(_M_IX86) || defined(__i386__) 155#define SIMDUTF_IS_X86_32BITS 1 156#elif defined(__arm__) || defined(_M_ARM) 157#define SIMDUTF_IS_ARM_32BITS 1 158#elif defined(__PPC__) || defined(_M_PPC) 159#define SIMDUTF_IS_PPC_32BITS 1 160#endif 161 162#endif // defined(__x86_64__) || defined(_M_AMD64) 163 164#ifdef SIMDUTF_IS_32BITS 165#ifndef SIMDUTF_NO_PORTABILITY_WARNING 166// In the future, we may want to warn users of 32-bit systems that 167// the simdutf does not support accelerated kernels for such systems. 168#endif // SIMDUTF_NO_PORTABILITY_WARNING 169#endif // SIMDUTF_IS_32BITS 170 171// this is almost standard? 172#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a 173#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) 174 175// Our fast kernels require 64-bit systems. 176// 177// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. 178// Furthermore, the number of SIMD registers is reduced. 179// 180// On 32-bit ARM, we would have smaller registers. 
//
// The simdutf users should still have the fallback kernel. It is
// slower, but it should run everywhere.

//
// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION
//

// We are going to use runtime dispatch.
// SIMDUTF_TARGET_REGION(T) / SIMDUTF_UNTARGET_REGION bracket a region of code
// with compiler pragmas that apply the target attribute T to it.
#ifdef SIMDUTF_IS_X86_64
#ifdef __clang__
// clang does not have GCC push pop
// warning: clang attribute push can't be used within a namespace in clang up
// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
// namespace.
#define SIMDUTF_TARGET_REGION(T)                                                       \
  _Pragma(SIMDUTF_STRINGIFY(                                                           \
      clang attribute push(__attribute__((target(T))), apply_to = function)))
#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
#elif defined(__GNUC__)
// GCC is easier
#define SIMDUTF_TARGET_REGION(T)                                                       \
  _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
#endif // clang then gcc

#endif // x86

// Default target region macros don't do anything.
#ifndef SIMDUTF_TARGET_REGION
#define SIMDUTF_TARGET_REGION(T)
#define SIMDUTF_UNTARGET_REGION
#endif

// Is threading enabled?
// SIMDUTF_THREADS_ENABLED is defined when the toolchain defines _REENTRANT or _MT.
#if defined(_REENTRANT) || defined(_MT)
#ifndef SIMDUTF_THREADS_ENABLED
#define SIMDUTF_THREADS_ENABLED
#endif
#endif

// workaround for large stack sizes under -O0.
// https://github.com/simdutf/simdutf/issues/691
#ifdef __APPLE__
#ifndef __OPTIMIZE__
// Apple systems have small stack sizes in secondary threads.
// Lack of compiler optimization may generate high stack usage.
// Users may want to disable threads for safety, but only when
// in debug mode which we detect by the fact that the __OPTIMIZE__
// macro is not defined.
#undef SIMDUTF_THREADS_ENABLED
#endif
#endif

#ifdef SIMDUTF_VISUAL_STUDIO
// This is one case where we do not distinguish between
// regular visual studio and clang under visual studio.
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
#define simdutf_strcasecmp _stricmp
#define simdutf_strncasecmp _strnicmp
#else
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
// So they are only useful for ASCII in our context.
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
#define simdutf_strcasecmp strcasecmp
#define simdutf_strncasecmp strncasecmp
#endif

// SIMDUTF_UNREACHABLE() marks a code path that must never execute and
// SIMDUTF_ASSUME(COND) states a condition the caller guarantees. With NDEBUG
// they become optimizer hints; otherwise they are plain assert() checks.
// NOTE(review): the non-MSVC SIMDUTF_UNREACHABLE() variants already end with
// ';' while the MSVC variant does not -- callers should always write
// `SIMDUTF_UNREACHABLE();` as a statement of its own to stay portable.
#ifdef NDEBUG

#ifdef SIMDUTF_VISUAL_STUDIO
#define SIMDUTF_UNREACHABLE() __assume(0)
#define SIMDUTF_ASSUME(COND) __assume(COND)
#else
#define SIMDUTF_UNREACHABLE() __builtin_unreachable();
#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#endif

#else // NDEBUG

#define SIMDUTF_UNREACHABLE() assert(0);
#define SIMDUTF_ASSUME(COND) assert(COND)

#endif


// SIMDUTF_GCC11ORMORE is set for genuine GCC (not clang) version 11 or newer.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ >= 11
#define SIMDUTF_GCC11ORMORE 1
#endif // __GNUC__ >= 11
#endif // defined(__GNUC__) && !defined(__clang__)


#endif // SIMDUTF_PORTABILITY_H
/* end file include/simdutf/portability.h */
/* begin file include/simdutf/avx512.h */
#ifndef SIMDUTF_AVX512_H_
#define SIMDUTF_AVX512_H_

/*
    It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.

    All preprocessor directives have the form `SIMDUTF_HAS_AVX512{feature}`,
    where a feature is a code name for extensions.

    Please see the listing below to find which are supported.

    Each macro is only derived from the matching compiler macro
    (`__AVX512{feature}__ == 1`) when the user has not already defined it.
*/

#ifndef SIMDUTF_HAS_AVX512F
# if defined(__AVX512F__) && __AVX512F__ == 1
#   define SIMDUTF_HAS_AVX512F 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512DQ
# if defined(__AVX512DQ__) && __AVX512DQ__ == 1
#   define SIMDUTF_HAS_AVX512DQ 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512IFMA
# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
#   define SIMDUTF_HAS_AVX512IFMA 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512CD
# if defined(__AVX512CD__) && __AVX512CD__ == 1
#   define SIMDUTF_HAS_AVX512CD 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512BW
# if defined(__AVX512BW__) && __AVX512BW__ == 1
#   define SIMDUTF_HAS_AVX512BW 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VL
# if defined(__AVX512VL__) && __AVX512VL__ == 1
#   define SIMDUTF_HAS_AVX512VL 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VBMI
# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
#   define SIMDUTF_HAS_AVX512VBMI 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VBMI2
# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
#   define SIMDUTF_HAS_AVX512VBMI2 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VNNI
# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
#   define SIMDUTF_HAS_AVX512VNNI 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512BITALG
# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
#   define SIMDUTF_HAS_AVX512BITALG 1
# endif
#endif

#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
#   define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
# endif
#endif

#endif // SIMDUTF_AVX512_H_
/* end file include/simdutf/avx512.h */


#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
361 #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); 362 #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); 363 #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); 364#else 365 #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) 366 #define SIMDUTF_END_DEBUG_BLOCK(name) 367 #define SIMDUTF_DEBUG_BLOCK(name, block) 368#endif 369 370// Align to N-byte boundary 371#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) 372#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) 373 374#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) 375 376#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) 377 378 #define simdutf_really_inline __forceinline 379 #define simdutf_never_inline __declspec(noinline) 380 381 #define simdutf_unused 382 #define simdutf_warn_unused 383 384 #ifndef simdutf_likely 385 #define simdutf_likely(x) x 386 #endif 387 #ifndef simdutf_unlikely 388 #define simdutf_unlikely(x) x 389 #endif 390 391 #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push )) 392 #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) 393 #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) 394 // Get rid of Intellisense-only warnings (Code Analysis) 395 // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). 
#ifdef __has_include
#if __has_include(<CppCoreCheck\Warnings.h>)
#include <CppCoreCheck\Warnings.h>
#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
#endif
#endif

// Fallback: nothing to disable when the CppCoreCheck header is unavailable.
#ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
#endif

#define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
#define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
#define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))

#else // SIMDUTF_REGULAR_VISUAL_STUDIO

#define simdutf_really_inline inline __attribute__((always_inline))
#define simdutf_never_inline inline __attribute__((noinline))

#define simdutf_unused __attribute__((unused))
#define simdutf_warn_unused __attribute__((warn_unused_result))

#ifndef simdutf_likely
#define simdutf_likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef simdutf_unlikely
#define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
#endif

#define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
// gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
// (SIMDUTF_DISABLE_GCC_WARNING is defined just below; macros are only expanded at the point of use.)
#define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
  SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
  SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
#define SIMDUTF_PRAGMA(P) _Pragma(#P)
#define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
#if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
#else
#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
#endif
#define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
#define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
#define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")



#endif // SIMDUTF_REGULAR_VISUAL_STUDIO

// SIMDUTF_DLLIMPORTEXPORT may be predefined by the user; otherwise it is a
// dllimport/dllexport attribute under Visual Studio and empty elsewhere.
#ifndef SIMDUTF_DLLIMPORTEXPORT
#if defined(SIMDUTF_VISUAL_STUDIO)
/**
 * It does not matter here whether you are using
 * the regular visual studio or clang under visual
 * studio.
 */
#if SIMDUTF_USING_LIBRARY
#define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
#else
#define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
#endif
#else
#define SIMDUTF_DLLIMPORTEXPORT
#endif
#endif

/// If EXPR is an error, returns it.
/// The result of EXPR is evaluated once, stored in a local named `_err`, and
/// returned from the enclosing function when it converts to true.
#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }


#endif // SIMDUTF_COMMON_DEFS_H
/* end file include/simdutf/common_defs.h */
/* begin file include/simdutf/encoding_types.h */
#include <string>

namespace simdutf {

// Each encoding has its own bit so that several candidates can be OR-ed
// together into a single integer.
enum encoding_type {
        UTF8 = 1,       // BOM 0xef 0xbb 0xbf
        UTF16_LE = 2,   // BOM 0xff 0xfe
        UTF16_BE = 4,   // BOM 0xfe 0xff
        UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
        UTF32_BE = 16,   // BOM 0x00 0x00 0xfe 0xff
        Latin1 = 32,

        unspecified = 0
};

enum endianness {
        LITTLE = 0,
        BIG = 1
};

bool match_system(endianness e);

std::string to_string(encoding_type bom);

// Note that BOM for UTF8 is discouraged.
namespace BOM {

/**
 * Checks for a BOM. If none is found, returns unspecified
 * @param byte          the string to process
 * @param length        the length of the string in code units
 * @return the corresponding encoding
 */

encoding_type check_bom(const uint8_t* byte, size_t length);
encoding_type check_bom(const char* byte, size_t length);
/**
 * Returns the size, in bytes, of the BOM for a given encoding type.
 * Note that UTF8 BOM are discouraged.
 * @param bom         the encoding type
 * @return the size in bytes of the corresponding BOM
 */
size_t bom_byte_size(encoding_type bom);

} // BOM namespace
} // simdutf namespace
/* end file include/simdutf/encoding_types.h */
/* begin file include/simdutf/error.h */
#ifndef SIMDUTF_ERROR_H
#define SIMDUTF_ERROR_H
namespace simdutf {

enum error_code {
  SUCCESS = 0,
  HEADER_BITS,  // Any byte must have fewer than 5 header bits.
  TOO_SHORT,    // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length
                // This is also the error when the input is truncated.
  TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
  OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
                // and U+FFFF for four-byte characters.
  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF, less than or equal to U+7F for ASCII, OR less than or equal to U+FF for Latin1
  SURROGATE,    // The decoded character must not be in U+D800...DFFF (UTF-8 or UTF-32) OR
                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
                // there must be no surrogate at all (Latin1)
  OTHER         // Not related to validation/transcoding.
};

// Pairs an error code with a position/count; the constructors are only
// declared here.
struct result {
  error_code error;
  size_t count;     // In case of error, indicates the position of the error. In case of success, indicates the number of code units validated/written.

  simdutf_really_inline result();

  simdutf_really_inline result(error_code, size_t);
};

}
#endif
/* end file include/simdutf/error.h */

SIMDUTF_PUSH_DISABLE_WARNINGS
SIMDUTF_DISABLE_UNDESIRED_WARNINGS

// Public API
/* begin file include/simdutf/simdutf_version.h */
// /include/simdutf/simdutf_version.h automatically generated by release.py,
// do not change by hand
#ifndef SIMDUTF_SIMDUTF_VERSION_H
#define SIMDUTF_SIMDUTF_VERSION_H

/** The version of simdutf being used (major.minor.revision) */
#define SIMDUTF_VERSION "4.0.8"

namespace simdutf {
enum {
  /**
   * The major version (MAJOR.minor.revision) of simdutf being used.
   */
  SIMDUTF_VERSION_MAJOR = 4,
  /**
   * The minor version (major.MINOR.revision) of simdutf being used.
   */
  SIMDUTF_VERSION_MINOR = 0,
  /**
   * The revision (major.minor.REVISION) of simdutf being used.
   */
  SIMDUTF_VERSION_REVISION = 8
};
} // namespace simdutf

#endif // SIMDUTF_SIMDUTF_VERSION_H
/* end file include/simdutf/simdutf_version.h */
/* begin file include/simdutf/implementation.h */
#ifndef SIMDUTF_IMPLEMENTATION_H
#define SIMDUTF_IMPLEMENTATION_H
#include <string>
#if !defined(SIMDUTF_NO_THREADS)
#include <atomic>
#endif
#include <vector>
#include <tuple>
/* begin file include/simdutf/internal/isadetection.h */
/* From
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
Highly modified.
604 605Copyright (c) 2016- Facebook, Inc (Adam Paszke) 606Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 607Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 608Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 609Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 610Copyright (c) 2011-2013 NYU (Clement Farabet) 611Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, 612Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute 613(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, 614Samy Bengio, Johnny Mariethoz) 615 616All rights reserved. 617 618Redistribution and use in source and binary forms, with or without 619modification, are permitted provided that the following conditions are met: 620 6211. Redistributions of source code must retain the above copyright 622 notice, this list of conditions and the following disclaimer. 623 6242. Redistributions in binary form must reproduce the above copyright 625 notice, this list of conditions and the following disclaimer in the 626 documentation and/or other materials provided with the distribution. 627 6283. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 629America and IDIAP Research Institute nor the names of its contributors may be 630 used to endorse or promote products derived from this software without 631 specific prior written permission. 632 633THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 634AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 635IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 636ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 637LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 638CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 639SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 640INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 641CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 642ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 643POSSIBILITY OF SUCH DAMAGE. 644*/ 645 646#ifndef SIMDutf_INTERNAL_ISADETECTION_H 647#define SIMDutf_INTERNAL_ISADETECTION_H 648 649#include <cstdint> 650#include <cstdlib> 651#if defined(_MSC_VER) 652#include <intrin.h> 653#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) 654#include <cpuid.h> 655#endif 656 657namespace simdutf { 658namespace internal { 659 660enum instruction_set { 661 DEFAULT = 0x0, 662 NEON = 0x1, 663 AVX2 = 0x4, 664 SSE42 = 0x8, 665 PCLMULQDQ = 0x10, 666 BMI1 = 0x20, 667 BMI2 = 0x40, 668 ALTIVEC = 0x80, 669 AVX512F = 0x100, 670 AVX512DQ = 0x200, 671 AVX512IFMA = 0x400, 672 AVX512PF = 0x800, 673 AVX512ER = 0x1000, 674 AVX512CD = 0x2000, 675 AVX512BW = 0x4000, 676 AVX512VL = 0x8000, 677 AVX512VBMI2 = 0x10000, 678 AVX512VPOPCNTDQ = 0x2000 679}; 680 681#if defined(__PPC64__) 682 683static inline uint32_t detect_supported_architectures() { 684 return instruction_set::ALTIVEC; 685} 686 687#elif defined(__aarch64__) || defined(_M_ARM64) 688 689static inline uint32_t detect_supported_architectures() { 690 return instruction_set::NEON; 691} 692 693#elif defined(__x86_64__) || defined(_M_AMD64) // x64 694 695 696namespace { 697namespace cpuid_bit { 698 // Can be found on Intel ISA Reference for CPUID 699 700 // EAX = 0x01 701 constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit 1 of ECX for EAX=0x1 702 constexpr uint32_t sse42 = uint32_t(1) << 20; ///< @private bit 20 of ECX for EAX=0x1 703 constexpr uint32_t osxsave = 
(uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1 704 705 // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf) 706 // See: "Table 3-8. Information Returned by CPUID Instruction" 707 namespace ebx { 708 constexpr uint32_t bmi1 = uint32_t(1) << 3; 709 constexpr uint32_t avx2 = uint32_t(1) << 5; 710 constexpr uint32_t bmi2 = uint32_t(1) << 8; 711 constexpr uint32_t avx512f = uint32_t(1) << 16; 712 constexpr uint32_t avx512dq = uint32_t(1) << 17; 713 constexpr uint32_t avx512ifma = uint32_t(1) << 21; 714 constexpr uint32_t avx512cd = uint32_t(1) << 28; 715 constexpr uint32_t avx512bw = uint32_t(1) << 30; 716 constexpr uint32_t avx512vl = uint32_t(1) << 31; 717 } 718 719 namespace ecx { 720 constexpr uint32_t avx512vbmi = uint32_t(1) << 1; 721 constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6; 722 constexpr uint32_t avx512vnni = uint32_t(1) << 11; 723 constexpr uint32_t avx512bitalg = uint32_t(1) << 12; 724 constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14; 725 } 726 namespace edx { 727 constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8; 728 } 729 namespace xcr0_bit { 730 constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX 731 constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM 732 } 733 } 734} 735 736 737 738static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, 739 uint32_t *edx) { 740#if defined(_MSC_VER) 741 int cpu_info[4]; 742 __cpuidex(cpu_info, *eax, *ecx); 743 *eax = cpu_info[0]; 744 *ebx = cpu_info[1]; 745 *ecx = cpu_info[2]; 746 *edx = cpu_info[3]; 747#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) 748 uint32_t level = *eax; 749 __get_cpuid(level, eax, ebx, ecx, edx); 750#else 751 uint32_t a = *eax, b, c = *ecx, d; 752 asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); 753 *eax = a; 754 *ebx = b; 755 *ecx = c; 756 *edx = d; 757#endif 758} 759 760static inline uint64_t 
xgetbv() { 761 #if defined(_MSC_VER) 762 return _xgetbv(0); 763 #else 764 uint32_t xcr0_lo, xcr0_hi; 765 asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); 766 return xcr0_lo | ((uint64_t)xcr0_hi << 32); 767 #endif 768 } 769 770static inline uint32_t detect_supported_architectures() { 771 uint32_t eax; 772 uint32_t ebx = 0; 773 uint32_t ecx = 0; 774 uint32_t edx = 0; 775 uint32_t host_isa = 0x0; 776 777 // EBX for EAX=0x1 778 eax = 0x1; 779 cpuid(&eax, &ebx, &ecx, &edx); 780 781 if (ecx & cpuid_bit::sse42) { 782 host_isa |= instruction_set::SSE42; 783 } 784 785 if (ecx & cpuid_bit::pclmulqdq) { 786 host_isa |= instruction_set::PCLMULQDQ; 787 } 788 789 if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) { 790 return host_isa; 791 } 792 793 // xgetbv for checking if the OS saves registers 794 uint64_t xcr0 = xgetbv(); 795 796 if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) { 797 return host_isa; 798 } 799 // ECX for EAX=0x7 800 eax = 0x7; 801 ecx = 0x0; // Sub-leaf = 0 802 cpuid(&eax, &ebx, &ecx, &edx); 803 if (ebx & cpuid_bit::ebx::avx2) { 804 host_isa |= instruction_set::AVX2; 805 } 806 if (ebx & cpuid_bit::ebx::bmi1) { 807 host_isa |= instruction_set::BMI1; 808 } 809 if (ebx & cpuid_bit::ebx::bmi2) { 810 host_isa |= instruction_set::BMI2; 811 } 812 if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) { 813 return host_isa; 814 } 815 if (ebx & cpuid_bit::ebx::avx512f) { 816 host_isa |= instruction_set::AVX512F; 817 } 818 if (ebx & cpuid_bit::ebx::avx512bw) { 819 host_isa |= instruction_set::AVX512BW; 820 } 821 if (ebx & cpuid_bit::ebx::avx512cd) { 822 host_isa |= instruction_set::AVX512CD; 823 } 824 if (ebx & cpuid_bit::ebx::avx512dq) { 825 host_isa |= instruction_set::AVX512DQ; 826 } 827 if (ebx & cpuid_bit::ebx::avx512vl) { 828 host_isa |= instruction_set::AVX512VL; 829 } 830 if (ecx & cpuid_bit::ecx::avx512vbmi2) { 831 host_isa |= instruction_set::AVX512VBMI2; 832 } 833 if (ecx & 
cpuid_bit::ecx::avx512vpopcnt) { 834 host_isa |= instruction_set::AVX512VPOPCNTDQ; 835 } 836 return host_isa; 837} 838#else // fallback 839 840// includes 32-bit ARM. 841static inline uint32_t detect_supported_architectures() { 842 return instruction_set::DEFAULT; 843} 844 845 846#endif // end SIMD extension detection code 847 848} // namespace internal 849} // namespace simdutf 850 851#endif // SIMDutf_INTERNAL_ISADETECTION_H 852/* end file include/simdutf/internal/isadetection.h */ 853 854 855namespace simdutf { 856 857/** 858 * Autodetect the encoding of the input, a single encoding is recommended. 859 * E.g., the function might return simdutf::encoding_type::UTF8, 860 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or 861 * simdutf::encoding_type::UTF32_LE. 862 * 863 * @param input the string to analyze. 864 * @param length the length of the string in bytes. 865 * @return the detected encoding type 866 */ 867simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept; 868simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept { 869 return autodetect_encoding(reinterpret_cast<const char *>(input), length); 870} 871 872/** 873 * Autodetect the possible encodings of the input in one pass. 874 * E.g., if the input might be UTF-16LE or UTF-8, this function returns 875 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE). 876 * 877 * Overriden by each implementation. 878 * 879 * @param input the string to analyze. 880 * @param length the length of the string in bytes. 
881 * @return the detected encoding type 882 */ 883simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept; 884simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept { 885 return detect_encodings(reinterpret_cast<const char *>(input), length); 886} 887 888/** 889 * Validate the UTF-8 string. This function may be best when you expect 890 * the input to be almost always valid. Otherwise, consider using 891 * validate_utf8_with_errors. 892 * 893 * Overridden by each implementation. 894 * 895 * @param buf the UTF-8 string to validate. 896 * @param len the length of the string in bytes. 897 * @return true if and only if the string is valid UTF-8. 898 */ 899simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; 900 901/** 902 * Validate the UTF-8 string and stop on error. 903 * 904 * Overridden by each implementation. 905 * 906 * @param buf the UTF-8 string to validate. 907 * @param len the length of the string in bytes. 908 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 909 */ 910simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept; 911 912/** 913 * Validate the ASCII string. 914 * 915 * Overridden by each implementation. 916 * 917 * @param buf the ASCII string to validate. 918 * @param len the length of the string in bytes. 919 * @return true if and only if the string is valid ASCII. 920 */ 921simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; 922 923/** 924 * Validate the ASCII string and stop on error. It might be faster than 925 * validate_utf8 when an error is expected to occur early. 926 * 927 * Overridden by each implementation. 928 * 929 * @param buf the ASCII string to validate. 
930 * @param len the length of the string in bytes. 931 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 932 */ 933simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; 934 935/** 936 * Using native endianness; Validate the UTF-16 string. 937 * This function may be best when you expect the input to be almost always valid. 938 * Otherwise, consider using validate_utf16_with_errors. 939 * 940 * Overridden by each implementation. 941 * 942 * This function is not BOM-aware. 943 * 944 * @param buf the UTF-16 string to validate. 945 * @param len the length of the string in number of 2-byte code units (char16_t). 946 * @return true if and only if the string is valid UTF-16. 947 */ 948simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; 949 950/** 951 * Validate the UTF-16LE string. This function may be best when you expect 952 * the input to be almost always valid. Otherwise, consider using 953 * validate_utf16le_with_errors. 954 * 955 * Overridden by each implementation. 956 * 957 * This function is not BOM-aware. 958 * 959 * @param buf the UTF-16LE string to validate. 960 * @param len the length of the string in number of 2-byte code units (char16_t). 961 * @return true if and only if the string is valid UTF-16LE. 962 */ 963simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept; 964 965/** 966 * Validate the UTF-16BE string. This function may be best when you expect 967 * the input to be almost always valid. Otherwise, consider using 968 * validate_utf16be_with_errors. 969 * 970 * Overridden by each implementation. 971 * 972 * This function is not BOM-aware. 973 * 974 * @param buf the UTF-16BE string to validate. 
975 * @param len the length of the string in number of 2-byte code units (char16_t). 976 * @return true if and only if the string is valid UTF-16BE. 977 */ 978simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; 979 980/** 981 * Using native endianness, validate the UTF-16 string and stop on error. 982 * It might be faster than validate_utf16 when an error is expected to occur early. 983 * 984 * Overridden by each implementation. 985 * 986 * This function is not BOM-aware. 987 * 988 * @param buf the UTF-16 string to validate. 989 * @param len the length of the string in number of 2-byte code units (char16_t). 990 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 991 */ 992simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept; 993 994/** 995 * Validate the UTF-16LE string and stop on error. It might be faster than 996 * validate_utf16le when an error is expected to occur early. 997 * 998 * Overridden by each implementation. 999 * 1000 * This function is not BOM-aware. 1001 * 1002 * @param buf the UTF-16LE string to validate. 1003 * @param len the length of the string in number of 2-byte code units (char16_t). 1004 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 1005 */ 1006simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept; 1007 1008/** 1009 * Validate the UTF-16BE string and stop on error. It might be faster than 1010 * validate_utf16be when an error is expected to occur early. 1011 * 1012 * Overridden by each implementation. 1013 * 1014 * This function is not BOM-aware. 
1015 * 1016 * @param buf the UTF-16BE string to validate. 1017 * @param len the length of the string in number of 2-byte code units (char16_t). 1018 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 1019 */ 1020simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; 1021 1022/** 1023 * Validate the UTF-32 string. This function may be best when you expect 1024 * the input to be almost always valid. Otherwise, consider using 1025 * validate_utf32_with_errors. 1026 * 1027 * Overridden by each implementation. 1028 * 1029 * This function is not BOM-aware. 1030 * 1031 * @param buf the UTF-32 string to validate. 1032 * @param len the length of the string in number of 4-byte code units (char32_t). 1033 * @return true if and only if the string is valid UTF-32. 1034 */ 1035simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; 1036 1037/** 1038 * Validate the UTF-32 string and stop on error. It might be faster than 1039 * validate_utf32 when an error is expected to occur early. 1040 * 1041 * Overridden by each implementation. 1042 * 1043 * This function is not BOM-aware. 1044 * 1045 * @param buf the UTF-32 string to validate. 1046 * @param len the length of the string in number of 4-byte code units (char32_t). 1047 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 1048 */ 1049simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; 1050 1051 /** 1052 * Convert Latin1 string into UTF-8 string. 1053 * 1054 * This function is suitable to work with inputs from untrusted sources. 
1055 * 1056 * @param input the Latin1 string to convert 1057 * @param length the length of the string in bytes 1058 * @param utf8_output the pointer to buffer that can hold conversion result 1059 * @return the number of written char; 0 if conversion is not possible 1060 */ 1061 simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept; 1062 1063 1064 /** 1065 * Convert Latin1 string into UTF-16LE string. 1066 * 1067 * This function is suitable to work with inputs from untrusted sources. 1068 * 1069 * @param input the Latin1 string to convert 1070 * @param length the length of the string in bytes 1071 * @param utf16_output the pointer to buffer that can hold conversion result 1072 * @return the number of written char16_t; 0 if conversion is not possible 1073 */ 1074 simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; 1075 1076 /** 1077 * Convert Latin1 string into UTF-16BE string. 1078 * 1079 * This function is suitable to work with inputs from untrusted sources. 1080 * 1081 * @param input the Latin1 string to convert 1082 * @param length the length of the string in bytes 1083 * @param utf16_output the pointer to buffer that can hold conversion result 1084 * @return the number of written char16_t; 0 if conversion is not possible 1085 */ 1086 simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; 1087 1088 /** 1089 * Convert Latin1 string into UTF-32 string. 1090 * 1091 * This function is suitable to work with inputs from untrusted sources. 
1092 * 1093 * @param input the Latin1 string to convert 1094 * @param length the length of the string in bytes 1095 * @param utf32_buffer the pointer to buffer that can hold conversion result 1096 * @return the number of written char32_t; 0 if conversion is not possible 1097 */ 1098 simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; 1099 1100 /** 1101 * Convert possibly broken UTF-8 string into Latin1 string. 1102 * 1103 * During the conversion also validation of the input string is done. 1104 * This function is suitable to work with inputs from untrusted sources. 1105 * 1106 * @param input the UTF-8 string to convert 1107 * @param length the length of the string in bytes 1108 * @param latin1_output the pointer to buffer that can hold conversion result 1109 * @return the number of written char; 0 if the input was not valid UTF-8 string 1110 */ 1111 simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; 1112 1113/** 1114 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string. 1115 * 1116 * During the conversion also validation of the input string is done. 1117 * This function is suitable to work with inputs from untrusted sources. 1118 * 1119 * @param input the UTF-8 string to convert 1120 * @param length the length of the string in bytes 1121 * @param utf16_output the pointer to buffer that can hold conversion result 1122 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string 1123 */ 1124simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; 1125 1126 1127/** 1128 * Using native endianness, convert a Latin1 string into a UTF-16 string. 
1129 * 1130 * @param input the Latin1 string to convert 1131 * @param length the length of the string in bytes 1132 * @param utf16_output the pointer to buffer that can hold conversion result 1133 * @return the number of written char16_t. 1134 */ 1135simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; 1136 1137/** 1138 * Convert possibly broken UTF-8 string into UTF-16LE string. 1139 * 1140 * During the conversion also validation of the input string is done. 1141 * This function is suitable to work with inputs from untrusted sources. 1142 * 1143 * @param input the UTF-8 string to convert 1144 * @param length the length of the string in bytes 1145 * @param utf16_output the pointer to buffer that can hold conversion result 1146 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string 1147 */ 1148simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; 1149 1150/** 1151 * Convert possibly broken UTF-8 string into UTF-16BE string. 1152 * 1153 * During the conversion also validation of the input string is done. 1154 * This function is suitable to work with inputs from untrusted sources. 1155 * 1156 * @param input the UTF-8 string to convert 1157 * @param length the length of the string in bytes 1158 * @param utf16_output the pointer to buffer that can hold conversion result 1159 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string 1160 */ 1161simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; 1162 1163 1164 /** 1165 * Convert possibly broken UTF-8 string into Latin1 string and stop on error. 1166 * 1167 * During the conversion also validation of the input string is done. 1168 * This function is suitable to work with inputs from untrusted sources. 
1169 * 1170 * @param input the UTF-8 string to convert 1171 * @param length the length of the string in bytes 1172 * @param latin1_output the pointer to buffer that can hold conversion result 1173 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1174 */ 1175 simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept; 1176 1177/** 1178 * Using native endianness, convert possibly broken UTF-8 string into UTF-16 1179 * string and stop on error. 1180 * 1181 * During the conversion also validation of the input string is done. 1182 * This function is suitable to work with inputs from untrusted sources. 1183 * 1184 * @param input the UTF-8 string to convert 1185 * @param length the length of the string in bytes 1186 * @param utf16_output the pointer to buffer that can hold conversion result 1187 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 1188 */ 1189simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; 1190 1191/** 1192 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. 1193 * 1194 * During the conversion also validation of the input string is done. 1195 * This function is suitable to work with inputs from untrusted sources. 
1196 * 1197 * @param input the UTF-8 string to convert 1198 * @param length the length of the string in bytes 1199 * @param utf16_buffer the pointer to buffer that can hold conversion result 1200 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 1201 */ 1202simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; 1203 1204/** 1205 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. 1206 * 1207 * During the conversion also validation of the input string is done. 1208 * This function is suitable to work with inputs from untrusted sources. 1209 * 1210 * @param input the UTF-8 string to convert 1211 * @param length the length of the string in bytes 1212 * @param utf16_output the pointer to buffer that can hold conversion result 1213 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 1214 */ 1215simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; 1216 1217/** 1218 * Convert possibly broken UTF-8 string into UTF-32 string. 1219 * 1220 * During the conversion also validation of the input string is done. 1221 * This function is suitable to work with inputs from untrusted sources. 
1222 * 1223 * @param input the UTF-8 string to convert 1224 * @param length the length of the string in bytes 1225 * @param utf32_output the pointer to buffer that can hold conversion result 1226 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string 1227 */ 1228simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept; 1229 1230/** 1231 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. 1232 * 1233 * During the conversion also validation of the input string is done. 1234 * This function is suitable to work with inputs from untrusted sources. 1235 * 1236 * @param input the UTF-8 string to convert 1237 * @param length the length of the string in bytes 1238 * @param utf32_output the pointer to buffer that can hold conversion result 1239 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 1240 */ 1241simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; 1242 1243 /** 1244 * Convert valid UTF-8 string into Latin1 string. 1245 * 1246 * This function assumes that the input string is valid UTF-8. 1247 * 1248 * This function is not BOM-aware. 1249 * 1250 * @param input the UTF-8 string to convert 1251 * @param length the length of the string in bytes 1252 * @param latin1_output the pointer to buffer that can hold conversion result 1253 * @return the number of written char; 0 if the input was not valid UTF-8 string 1254 */ 1255 simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; 1256 1257 1258/** 1259 * Using native endianness, convert valid UTF-8 string into a UTF-16 string. 
1260 * 1261 * This function assumes that the input string is valid UTF-8. 1262 * 1263 * @param input the UTF-8 string to convert 1264 * @param length the length of the string in bytes 1265 * @param utf16_buffer the pointer to buffer that can hold conversion result 1266 * @return the number of written char16_t 1267 */ 1268simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept; 1269 1270/** 1271 * Convert valid UTF-8 string into UTF-16LE string. 1272 * 1273 * This function assumes that the input string is valid UTF-8. 1274 * 1275 * @param input the UTF-8 string to convert 1276 * @param length the length of the string in bytes 1277 * @param utf16_buffer the pointer to buffer that can hold conversion result 1278 * @return the number of written char16_t 1279 */ 1280simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept; 1281 1282/** 1283 * Convert valid UTF-8 string into UTF-16BE string. 1284 * 1285 * This function assumes that the input string is valid UTF-8. 1286 * 1287 * @param input the UTF-8 string to convert 1288 * @param length the length of the string in bytes 1289 * @param utf16_buffer the pointer to buffer that can hold conversion result 1290 * @return the number of written char16_t 1291 */ 1292simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept; 1293 1294/** 1295 * Convert valid UTF-8 string into UTF-32 string. 1296 * 1297 * This function assumes that the input string is valid UTF-8. 
1298 * 1299 * @param input the UTF-8 string to convert 1300 * @param length the length of the string in bytes 1301 * @param utf32_buffer the pointer to buffer that can hold conversion result 1302 * @return the number of written char32_t 1303 */ 1304simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; 1305 1306 1307/** 1308 * Return the number of bytes that this Latin1 string would require in UTF-8 format. 1309 * 1310 * @param input the Latin1 string to convert 1311 * @param length the length of the string in bytes 1312 * @return the number of bytes required to encode the Latin1 string as UTF-8 1313 */ 1314simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept; 1315 1316/** 1317 * Compute the number of bytes that this UTF-8 string would require in Latin1 format. 1318 * 1319 * This function does not validate the input. 1320 * 1321 * This function is not BOM-aware. 1322 * 1323 * @param input the UTF-8 string to convert 1324 * @param length the length of the string in bytes 1325 * @return the number of bytes required to encode the UTF-8 string as Latin1 1326 */ 1327simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept; 1328 1329/** 1330 * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. 1331 * 1332 * This function does not validate the input. 1333 * 1334 * This function is not BOM-aware. 1335 * 1336 * @param input the UTF-8 string to process 1337 * @param length the length of the string in bytes 1338 * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE 1339 */ 1340simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; 1341 1342/** 1343 * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. 
1344 * 1345 * This function is equivalent to count_utf8 1346 * 1347 * This function does not validate the input. 1348 * 1349 * This function is not BOM-aware. 1350 * 1351 * @param input the UTF-8 string to process 1352 * @param length the length of the string in bytes 1353 * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32 1354 */ 1355simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept; 1356 1357/** 1358 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string. 1359 * 1360 * During the conversion also validation of the input string is done. 1361 * This function is suitable to work with inputs from untrusted sources. 1362 * 1363 * This function is not BOM-aware. 1364 * 1365 * @param input the UTF-16 string to convert 1366 * @param length the length of the string in 2-byte code units (char16_t) 1367 * @param utf8_buffer the pointer to buffer that can hold conversion result 1368 * @return number of written code units; 0 if input is not a valid UTF-16 string 1369 */ 1370simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1371 1372 1373 1374/** 1375 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string. 1376 * 1377 * During the conversion also validation of the input string is done. 1378 * This function is suitable to work with inputs from untrusted sources. 1379 * 1380 * This function is not BOM-aware. 
1381 * 1382 * @param input the UTF-16 string to convert 1383 * @param length the length of the string in 2-byte code units (char16_t) 1384 * @param latin1_buffer the pointer to buffer that can hold conversion result 1385 * @return number of written code units; 0 if input is not a valid UTF-16 string 1386 */ 1387simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1388 1389/** 1390 * Convert possibly broken UTF-16LE string into Latin1 string. 1391 * 1392 * During the conversion also validation of the input string is done. 1393 * This function is suitable to work with inputs from untrusted sources. 1394 * 1395 * This function is not BOM-aware. 1396 * 1397 * @param input the UTF-16LE string to convert 1398 * @param length the length of the string in 2-byte code units (char16_t) 1399 * @param latin1_buffer the pointer to buffer that can hold conversion result 1400 * @return number of written code units; 0 if input is not a valid UTF-16LE string 1401 */ 1402simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1403 1404/** 1405 * Convert possibly broken UTF-16BE string into Latin1 string. 1406 * 1407 * During the conversion also validation of the input string is done. 1408 * This function is suitable to work with inputs from untrusted sources. 1409 * 1410 * This function is not BOM-aware. 1411 * 1412 * @param input the UTF-16BE string to convert 1413 * @param length the length of the string in 2-byte code units (char16_t) 1414 * @param latin1_buffer the pointer to buffer that can hold conversion result 1415 * @return number of written code units; 0 if input is not a valid UTF-16BE string 
1416 */ 1417simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1418 1419 1420/** 1421 * Convert possibly broken UTF-16LE string into UTF-8 string. 1422 * 1423 * During the conversion also validation of the input string is done. 1424 * This function is suitable to work with inputs from untrusted sources. 1425 * 1426 * This function is not BOM-aware. 1427 * 1428 * @param input the UTF-16LE string to convert 1429 * @param length the length of the string in 2-byte code units (char16_t) 1430 * @param utf8_buffer the pointer to buffer that can hold conversion result 1431 * @return number of written code units; 0 if input is not a valid UTF-16LE string 1432 */ 1433simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1434 1435/** 1436 * Convert possibly broken UTF-16BE string into UTF-8 string. 1437 * 1438 * During the conversion also validation of the input string is done. 1439 * This function is suitable to work with inputs from untrusted sources. 1440 * 1441 * This function is not BOM-aware. 1442 * 1443 * @param input the UTF-16BE string to convert 1444 * @param length the length of the string in 2-byte code units (char16_t) 1445 * @param utf8_buffer the pointer to buffer that can hold conversion result 1446 * @return number of written code units; 0 if input is not a valid UTF-16BE string 1447 */ 1448simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1449 1450/** 1451 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string. 1452 * 1453 * During the conversion also validation of the input string is done. 1454 * This function is suitable to work with inputs from untrusted sources. 1455 * This function is not BOM-aware. 
1456 * 1457 * @param input the UTF-16 string to convert 1458 * @param length the length of the string in 2-byte code units (char16_t) 1459 * @param latin1_buffer the pointer to buffer that can hold conversion result 1460 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1461 */ 1462simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1463 1464/** 1465 * Convert possibly broken UTF-16LE string into Latin1 string and stop on error. 1466 * 1467 * During the conversion also validation of the input string is done. 1468 * This function is suitable to work with inputs from untrusted sources. 1469 * This function is not BOM-aware. 1470 * 1471 * @param input the UTF-16LE string to convert 1472 * @param length the length of the string in 2-byte code units (char16_t) 1473 * @param latin1_buffer the pointer to buffer that can hold conversion result 1474 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1475 */ 1476simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1477 1478/** 1479 * Convert possibly broken UTF-16BE string into Latin1 string and stop on error. 1480 * 1481 * During the conversion also validation of the input string is done. 1482 * This function is suitable to work with inputs from untrusted sources. 1483 * This function is not BOM-aware. 
1484 * 1485 * @param input the UTF-16BE string to convert 1486 * @param length the length of the string in 2-byte code units (char16_t) 1487 * @param latin1_buffer the pointer to buffer that can hold conversion result 1488 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1489 */ 1490simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1491 1492 1493/** 1494 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string and stop on error. 1495 * 1496 * During the conversion also validation of the input string is done. 1497 * This function is suitable to work with inputs from untrusted sources. 1498 * 1499 * This function is not BOM-aware. 1500 * 1501 * @param input the UTF-16 string to convert 1502 * @param length the length of the string in 2-byte code units (char16_t) 1503 * @param utf8_buffer the pointer to buffer that can hold conversion result 1504 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1505 */ 1506simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1507 1508/** 1509 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. 1510 * 1511 * During the conversion also validation of the input string is done. 1512 * This function is suitable to work with inputs from untrusted sources. 1513 * 1514 * This function is not BOM-aware. 
1515 * 1516 * @param input the UTF-16LE string to convert 1517 * @param length the length of the string in 2-byte code units (char16_t) 1518 * @param utf8_buffer the pointer to buffer that can hold conversion result 1519 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1520 */ 1521simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1522 1523/** 1524 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. 1525 * 1526 * During the conversion also validation of the input string is done. 1527 * This function is suitable to work with inputs from untrusted sources. 1528 * 1529 * This function is not BOM-aware. 1530 * 1531 * @param input the UTF-16BE string to convert 1532 * @param length the length of the string in 2-byte code units (char16_t) 1533 * @param utf8_buffer the pointer to buffer that can hold conversion result 1534 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1535 */ 1536simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1537 1538/** 1539 * Using native endianness, convert valid UTF-16 string into UTF-8 string. 1540 * 1541 * This function assumes that the input string is valid UTF-16. 1542 * 1543 * This function is not BOM-aware. 
1544 * 1545 * @param input the UTF-16 string to convert 1546 * @param length the length of the string in 2-byte code units (char16_t) 1547 * @param utf8_buffer the pointer to buffer that can hold the conversion result 1548 * @return number of written code units; 0 if conversion is not possible 1549 */ 1550simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1551 1552 1553/** 1554 * Using native endianness, convert valid UTF-16 string into Latin1 string. 1555 * 1556 * This function assumes that the input string is valid UTF-16. 1557 * 1558 * This function is not BOM-aware. 1559 * 1560 * @param input the UTF-16 string to convert 1561 * @param length the length of the string in 2-byte code units (char16_t) 1562 * @param latin1_buffer the pointer to buffer that can hold conversion result 1563 * @return number of written code units; 0 if conversion is not possible 1564 */ 1565simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1566 1567/** 1568 * Convert valid UTF-16LE string into Latin1 string. 1569 * 1570 * This function assumes that the input string is valid UTF-16LE. 1571 * 1572 * This function is not BOM-aware. 1573 * 1574 * @param input the UTF-16LE string to convert 1575 * @param length the length of the string in 2-byte code units (char16_t) 1576 * @param latin1_buffer the pointer to buffer that can hold conversion result 1577 * @return number of written code units; 0 if conversion is not possible 1578 */ 1579simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1580 1581/** 1582 * Convert valid UTF-16BE string into Latin1 string. 1583 * 1584 * This function assumes that the input string is valid UTF-16BE. 1585 * 1586 * This function is not BOM-aware. 
1587 * 1588 * @param input the UTF-16BE string to convert 1589 * @param length the length of the string in 2-byte code units (char16_t) 1590 * @param latin1_buffer the pointer to buffer that can hold conversion result 1591 * @return number of written code units; 0 if conversion is not possible 1592 */ 1593simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept; 1594 1595 1596/** 1597 * Convert valid UTF-16LE string into UTF-8 string. 1598 * 1599 * This function assumes that the input string is valid UTF-16LE. 1600 * 1601 * This function is not BOM-aware. 1602 * 1603 * @param input the UTF-16LE string to convert 1604 * @param length the length of the string in 2-byte code units (char16_t) 1605 * @param utf8_buffer the pointer to buffer that can hold the conversion result 1606 * @return number of written code units; 0 if conversion is not possible 1607 */ 1608simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1609 1610/** 1611 * Convert valid UTF-16BE string into UTF-8 string. 1612 * 1613 * This function assumes that the input string is valid UTF-16BE. 1614 * 1615 * This function is not BOM-aware. 1616 * 1617 * @param input the UTF-16BE string to convert 1618 * @param length the length of the string in 2-byte code units (char16_t) 1619 * @param utf8_buffer the pointer to buffer that can hold the conversion result 1620 * @return number of written code units; 0 if conversion is not possible 1621 */ 1622simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; 1623 1624/** 1625 * Using native endianness, convert possibly broken UTF-16 string into UTF-32 string. 1626 * 1627 * During the conversion also validation of the input string is done. 1628 * This function is suitable to work with inputs from untrusted sources. 1629 * 1630 * This function is not BOM-aware. 
1631 * 1632 * @param input the UTF-16 string to convert 1633 * @param length the length of the string in 2-byte code units (char16_t) 1634 * @param utf32_buffer the pointer to buffer that can hold conversion result 1635 * @return number of written code units; 0 if input is not a valid UTF-16 string 1636 */ 1637simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1638 1639/** 1640 * Convert possibly broken UTF-16LE string into UTF-32 string. 1641 * 1642 * During the conversion also validation of the input string is done. 1643 * This function is suitable to work with inputs from untrusted sources. 1644 * 1645 * This function is not BOM-aware. 1646 * 1647 * @param input the UTF-16LE string to convert 1648 * @param length the length of the string in 2-byte code units (char16_t) 1649 * @param utf32_buffer the pointer to buffer that can hold conversion result 1650 * @return number of written code units; 0 if input is not a valid UTF-16LE string 1651 */ 1652simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1653 1654/** 1655 * Convert possibly broken UTF-16BE string into UTF-32 string. 1656 * 1657 * During the conversion also validation of the input string is done. 1658 * This function is suitable to work with inputs from untrusted sources. 1659 * 1660 * This function is not BOM-aware. 
1661 * 1662 * @param input the UTF-16BE string to convert 1663 * @param length the length of the string in 2-byte code units (char16_t) 1664 * @param utf32_buffer the pointer to buffer that can hold conversion result 1665 * @return number of written code units; 0 if input is not a valid UTF-16BE string 1666 */ 1667simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1668 1669/** 1670 * Using native endianness, convert possibly broken UTF-16 string into 1671 * UTF-32 string and stop on error. 1672 * 1673 * During the conversion also validation of the input string is done. 1674 * This function is suitable to work with inputs from untrusted sources. 1675 * 1676 * This function is not BOM-aware. 1677 * 1678 * @param input the UTF-16 string to convert 1679 * @param length the length of the string in 2-byte code units (char16_t) 1680 * @param utf32_buffer the pointer to buffer that can hold conversion result 1681 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 1682 */ 1683simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1684 1685/** 1686 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. 1687 * 1688 * During the conversion also validation of the input string is done. 1689 * This function is suitable to work with inputs from untrusted sources. 1690 * 1691 * This function is not BOM-aware. 
1692 * 1693 * @param input the UTF-16LE string to convert 1694 * @param length the length of the string in 2-byte code units (char16_t) 1695 * @param utf32_buffer the pointer to buffer that can hold conversion result 1696 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 1697 */ 1698simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1699 1700/** 1701 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. 1702 * 1703 * During the conversion also validation of the input string is done. 1704 * This function is suitable to work with inputs from untrusted sources. 1705 * 1706 * This function is not BOM-aware. 1707 * 1708 * @param input the UTF-16BE string to convert 1709 * @param length the length of the string in 2-byte code units (char16_t) 1710 * @param utf32_buffer the pointer to buffer that can hold conversion result 1711 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 1712 */ 1713simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1714 1715/** 1716 * Using native endianness, convert valid UTF-16 string into UTF-32 string. 1717 * 1718 * This function assumes that the input string is valid UTF-16 (native endianness). 1719 * 1720 * This function is not BOM-aware. 
1721 * 1722 * @param input the UTF-16 string to convert 1723 * @param length the length of the string in 2-byte code units (char16_t) 1724 * @param utf32_buffer the pointer to buffer that can hold the conversion result 1725 * @return number of written code units; 0 if conversion is not possible 1726 */ 1727simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1728 1729/** 1730 * Convert valid UTF-16LE string into UTF-32 string. 1731 * 1732 * This function assumes that the input string is valid UTF-16LE. 1733 * 1734 * This function is not BOM-aware. 1735 * 1736 * @param input the UTF-16LE string to convert 1737 * @param length the length of the string in 2-byte code units (char16_t) 1738 * @param utf32_buffer the pointer to buffer that can hold the conversion result 1739 * @return number of written code units; 0 if conversion is not possible 1740 */ 1741simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1742 1743/** 1744 * Convert valid UTF-16BE string into UTF-32 string. 1745 * 1746 * This function assumes that the input string is valid UTF-16BE. 1747 * 1748 * This function is not BOM-aware. 1749 * 1750 * @param input the UTF-16BE string to convert 1751 * @param length the length of the string in 2-byte code units (char16_t) 1752 * @param utf32_buffer the pointer to buffer that can hold the conversion result 1753 * @return number of written code units; 0 if conversion is not possible 1754 */ 1755simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; 1756 1757 1758/** 1759 * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. 1760 * 1761 * This function does not validate the input. 1762 * 1763 * This function is not BOM-aware.
1764 * 1765 * @param length the length of the string in 2-byte code units (char16_t) 1766 * @return the number of bytes required to encode the UTF-16LE/BE string as Latin1 1767 */ 1768simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept; 1769 1770 1771/** 1772 * Using native endianness, compute the number of bytes that this UTF-16 1773 * string would require in UTF-8 format. 1774 * 1775 * This function does not validate the input. 1776 * 1777 * @param input the UTF-16 string to convert 1778 * @param length the length of the string in 2-byte code units (char16_t) 1779 * @return the number of bytes required to encode the UTF-16 string as UTF-8 1780 */ 1781simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; 1782 1783/** 1784 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. 1785 * 1786 * This function does not validate the input. 1787 * 1788 * @param input the UTF-16LE string to convert 1789 * @param length the length of the string in 2-byte code units (char16_t) 1790 * @return the number of bytes required to encode the UTF-16LE string as UTF-8 1791 */ 1792simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept; 1793 1794/** 1795 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. 1796 * 1797 * This function does not validate the input. 1798 * 1799 * @param input the UTF-16BE string to convert 1800 * @param length the length of the string in 2-byte code units (char16_t) 1801 * @return the number of bytes required to encode the UTF-16BE string as UTF-8 1802 */ 1803simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; 1804 1805/** 1806 * Convert possibly broken UTF-32 string into UTF-8 string. 1807 * 1808 * During the conversion also validation of the input string is done.
1809 * This function is suitable to work with inputs from untrusted sources. 1810 * 1811 * This function is not BOM-aware. 1812 * 1813 * @param input the UTF-32 string to convert 1814 * @param length the length of the string in 4-byte code units (char32_t) 1815 * @param utf8_buffer the pointer to buffer that can hold conversion result 1816 * @return number of written code units; 0 if input is not a valid UTF-32 string 1817 */ 1818simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; 1819 1820/** 1821 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. 1822 * 1823 * During the conversion also validation of the input string is done. 1824 * This function is suitable to work with inputs from untrusted sources. 1825 * 1826 * This function is not BOM-aware. 1827 * 1828 * @param input the UTF-32 string to convert 1829 * @param length the length of the string in 4-byte code units (char32_t) 1830 * @param utf8_buffer the pointer to buffer that can hold conversion result 1831 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1832 */ 1833simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; 1834 1835/** 1836 * Convert valid UTF-32 string into UTF-8 string. 1837 * 1838 * This function assumes that the input string is valid UTF-32. 1839 * 1840 * This function is not BOM-aware. 
1841 * 1842 * @param input the UTF-32 string to convert 1843 * @param length the length of the string in 4-byte code units (char32_t) 1844 * @param utf8_buffer the pointer to buffer that can hold the conversion result 1845 * @return number of written code units; 0 if conversion is not possible 1846 */ 1847simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; 1848 1849/** 1850 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 string. 1851 * 1852 * During the conversion also validation of the input string is done. 1853 * This function is suitable to work with inputs from untrusted sources. 1854 * 1855 * This function is not BOM-aware. 1856 * 1857 * @param input the UTF-32 string to convert 1858 * @param length the length of the string in 4-byte code units (char32_t) 1859 * @param utf16_buffer the pointer to buffer that can hold conversion result 1860 * @return number of written code units; 0 if input is not a valid UTF-32 string 1861 */ 1862simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1863 1864/** 1865 * Convert possibly broken UTF-32 string into UTF-16LE string. 1866 * 1867 * During the conversion also validation of the input string is done. 1868 * This function is suitable to work with inputs from untrusted sources. 1869 * 1870 * This function is not BOM-aware. 1871 * 1872 * @param input the UTF-32 string to convert 1873 * @param length the length of the string in 4-byte code units (char32_t) 1874 * @param utf16_buffer the pointer to buffer that can hold conversion result 1875 * @return number of written code units; 0 if input is not a valid UTF-32 string 1876 */ 1877simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1878 1879/** 1880 * Convert possibly broken UTF-32 string into Latin1 string. 
1881 * 1882 * During the conversion also validation of the input string is done. 1883 * This function is suitable to work with inputs from untrusted sources. 1884 * 1885 * This function is not BOM-aware. 1886 * 1887 * @param input the UTF-32 string to convert 1888 * @param length the length of the string in 4-byte code units (char32_t) 1889 * @param latin1_buffer the pointer to buffer that can hold conversion result 1890 * @return number of written code units; 0 if input is not a valid UTF-32 string 1891 */ 1892simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; 1893 1894 1895/** 1896 * Convert possibly broken UTF-32 string into Latin1 string and stop on error. 1897 * 1898 * During the conversion also validation of the input string is done. 1899 * This function is suitable to work with inputs from untrusted sources. 1900 * 1901 * This function is not BOM-aware. 1902 * 1903 * @param input the UTF-32 string to convert 1904 * @param length the length of the string in 4-byte code units (char32_t) 1905 * @param latin1_buffer the pointer to buffer that can hold conversion result 1906 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 1907 */ 1908simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept; 1909 1910/** 1911 * Convert valid UTF-32 string into Latin1 string. 1912 * 1913 * This function assumes that the input string is valid UTF-32. 1914 * 1915 * This function is not BOM-aware. 
1916 * 1917 * @param input the UTF-32 string to convert 1918 * @param length the length of the string in 4-byte code units (char32_t) 1919 * @param latin1_buffer the pointer to buffer that can hold the conversion result 1920 * @return number of written code units; 0 if conversion is not possible 1921 */ 1922simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept; 1923 1924/** 1925 * Convert possibly broken UTF-32 string into UTF-16BE string. 1926 * 1927 * During the conversion also validation of the input string is done. 1928 * This function is suitable to work with inputs from untrusted sources. 1929 * 1930 * This function is not BOM-aware. 1931 * 1932 * @param input the UTF-32 string to convert 1933 * @param length the length of the string in 4-byte code units (char32_t) 1934 * @param utf16_buffer the pointer to buffer that can hold conversion result 1935 * @return number of written code units; 0 if input is not a valid UTF-32 string 1936 */ 1937simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1938 1939/** 1940 * Using native endianness, convert possibly broken UTF-32 string into UTF-16 1941 * string and stop on error. 1942 * 1943 * During the conversion also validation of the input string is done. 1944 * This function is suitable to work with inputs from untrusted sources. 1945 * 1946 * This function is not BOM-aware. 1947 * 1948 * @param input the UTF-32 string to convert 1949 * @param length the length of the string in 4-byte code units (char32_t) 1950 * @param utf16_buffer the pointer to buffer that can hold conversion result 1951 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 
1952 */ 1953simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1954 1955/** 1956 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. 1957 * 1958 * During the conversion also validation of the input string is done. 1959 * This function is suitable to work with inputs from untrusted sources. 1960 * 1961 * This function is not BOM-aware. 1962 * 1963 * @param input the UTF-32 string to convert 1964 * @param length the length of the string in 4-byte code units (char32_t) 1965 * @param utf16_buffer the pointer to buffer that can hold conversion result 1966 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 1967 */ 1968simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1969 1970/** 1971 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. 1972 * 1973 * During the conversion also validation of the input string is done. 1974 * This function is suitable to work with inputs from untrusted sources. 1975 * 1976 * This function is not BOM-aware. 1977 * 1978 * @param input the UTF-32 string to convert 1979 * @param length the length of the string in 4-byte code units (char32_t) 1980 * @param utf16_buffer the pointer to buffer that can hold conversion result 1981 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful. 
1982 */ 1983simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1984 1985/** 1986 * Using native endianness, convert valid UTF-32 string into a UTF-16 string. 1987 * 1988 * This function assumes that the input string is valid UTF-32. 1989 * 1990 * This function is not BOM-aware. 1991 * 1992 * @param input the UTF-32 string to convert 1993 * @param length the length of the string in 4-byte code units (char32_t) 1994 * @param utf16_buffer the pointer to buffer that can hold the conversion result 1995 * @return number of written code units; 0 if conversion is not possible 1996 */ 1997simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 1998 1999/** 2000 * Convert valid UTF-32 string into UTF-16LE string. 2001 * 2002 * This function assumes that the input string is valid UTF-32. 2003 * 2004 * This function is not BOM-aware. 2005 * 2006 * @param input the UTF-32 string to convert 2007 * @param length the length of the string in 4-byte code units (char32_t) 2008 * @param utf16_buffer the pointer to buffer that can hold the conversion result 2009 * @return number of written code units; 0 if conversion is not possible 2010 */ 2011simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 2012 2013/** 2014 * Convert valid UTF-32 string into UTF-16BE string. 2015 * 2016 * This function assumes that the input string is valid UTF-32. 2017 * 2018 * This function is not BOM-aware. 
2019 * 2020 * @param input the UTF-32 string to convert 2021 * @param length the length of the string in 4-byte code units (char32_t) 2022 * @param utf16_buffer the pointer to buffer that can hold the conversion result 2023 * @return number of written code units; 0 if conversion is not possible 2024 */ 2025simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; 2026 2027/** 2028 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or 2029 * from UTF-16BE to UTF-16LE. 2030 * 2031 * This function does not validate the input. 2032 * 2033 * This function is not BOM-aware. 2034 * 2035 * @param input the UTF-16 string to process 2036 * @param length the length of the string in 2-byte code units (char16_t) 2037 * @param output the pointer to buffer that can hold the conversion result 2038 */ 2039void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; 2040 2041/** 2042 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. 2043 * 2044 * This function does not validate the input. 2045 * 2046 * @param input the UTF-32 string to convert 2047 * @param length the length of the string in 4-byte code units (char32_t) 2048 * @return the number of bytes required to encode the UTF-32 string as UTF-8 2049 */ 2050simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; 2051 2052/** 2053 * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. 2054 * 2055 * This function does not validate the input.
2056 * 2057 * @param input the UTF-32 string to convert 2058 * @param length the length of the string in 4-byte code units (char32_t) 2059 * @return the number of 2-byte code units (char16_t) required to encode the UTF-32 string as UTF-16 2060 */ 2061simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; 2062 2063/** 2064 * Using native endianness, compute the number of 4-byte code units (char32_t) that this UTF-16 2065 * string would require in UTF-32 format. 2066 * 2067 * This function is equivalent to count_utf16. 2068 * 2069 * This function does not validate the input. 2070 * 2071 * This function is not BOM-aware. 2072 * 2073 * @param input the UTF-16 string to convert 2074 * @param length the length of the string in 2-byte code units (char16_t) 2075 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16 string as UTF-32 2076 */ 2077simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept; 2078 2079/** 2080 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format. 2081 * 2082 * This function is equivalent to count_utf16le. 2083 * 2084 * This function does not validate the input. 2085 * 2086 * This function is not BOM-aware. 2087 * 2088 * @param input the UTF-16LE string to convert 2089 * @param length the length of the string in 2-byte code units (char16_t) 2090 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16LE string as UTF-32 2091 */ 2092simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; 2093 2094/** 2095 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format. 2096 * 2097 * This function is equivalent to count_utf16be. 2098 * 2099 * This function does not validate the input. 2100 * 2101 * This function is not BOM-aware.
2102 * 2103 * @param input the UTF-16BE string to convert 2104 * @param length the length of the string in 2-byte code units (char16_t) 2105 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16BE string as UTF-32 2106 */ 2107simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; 2108 2109/** 2110 * Count the number of code points (characters) in the string assuming that 2111 * it is valid. 2112 * 2113 * This function assumes that the input string is valid UTF-16 (native endianness). 2114 * 2115 * This function is not BOM-aware. 2116 * 2117 * @param input the UTF-16 string to process 2118 * @param length the length of the string in 2-byte code units (char16_t) 2119 * @return number of code points 2120 */ 2121simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; 2122 2123/** 2124 * Count the number of code points (characters) in the string assuming that 2125 * it is valid. 2126 * 2127 * This function assumes that the input string is valid UTF-16LE. 2128 * 2129 * This function is not BOM-aware. 2130 * 2131 * @param input the UTF-16LE string to process 2132 * @param length the length of the string in 2-byte code units (char16_t) 2133 * @return number of code points 2134 */ 2135simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept; 2136 2137/** 2138 * Count the number of code points (characters) in the string assuming that 2139 * it is valid. 2140 * 2141 * This function assumes that the input string is valid UTF-16BE. 2142 * 2143 * This function is not BOM-aware. 2144 * 2145 * @param input the UTF-16BE string to process 2146 * @param length the length of the string in 2-byte code units (char16_t) 2147 * @return number of code points 2148 */ 2149simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept; 2150 2151/** 2152 * Count the number of code points (characters) in the string assuming that 2153 * it is valid.
2154 * 2155 * This function assumes that the input string is valid UTF-8. 2156 * 2157 * @param input the UTF-8 string to process 2158 * @param length the length of the string in bytes 2159 * @return number of code points 2160 */ 2161simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept; 2162 2163/** 2164 * Given a valid UTF-8 string having a possibly truncated last character, 2165 * this function checks the end of string. If the last character is truncated (or partial), 2166 * then it returns a shorter length (shorter by 1 to 3 bytes) so that the short UTF-8 2167 * strings only contain complete characters. If there is no truncated character, 2168 * the original length is returned. 2169 * 2170 * This function assumes that the input string is valid UTF-8, but possibly truncated. 2171 * 2172 * @param input the UTF-8 string to process 2173 * @param length the length of the string in bytes 2174 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes 2175 */ 2176simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length); 2177 2178/** 2179 * Given a valid UTF-16BE string having a possibly truncated last character, 2180 * this function checks the end of string. If the last character is truncated (or partial), 2181 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16BE 2182 * strings only contain complete characters. If there is no truncated character, 2183 * the original length is returned. 2184 * 2185 * This function assumes that the input string is valid UTF-16BE, but possibly truncated.
2186 * 2187 * @param input the UTF-16BE string to process 2188 * @param length the length of the string in 2-byte code units (char16_t) 2189 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit 2190 */ 2191simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length); 2192 2193/** 2194 * Given a valid UTF-16LE string having a possibly truncated last character, 2195 * this function checks the end of string. If the last character is truncated (or partial), 2196 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16LE 2197 * strings only contain complete characters. If there is no truncated character, 2198 * the original length is returned. 2199 * 2200 * This function assumes that the input string is valid UTF-16LE, but possibly truncated. 2201 * 2202 * @param input the UTF-16LE string to process 2203 * @param length the length of the string in 2-byte code units (char16_t) 2204 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit 2205 */ 2206simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length); 2207 2208 2209/** 2210 * Given a valid UTF-16 string having a possibly truncated last character, 2211 * this function checks the end of string. If the last character is truncated (or partial), 2212 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16 2213 * strings only contain complete characters. If there is no truncated character, 2214 * the original length is returned. 2215 * 2216 * This function assumes that the input string is valid UTF-16, but possibly truncated. 2217 * We use the native endianness. 2218 * 2219 * @param input the UTF-16 string to process 2220 * @param length the length of the string in 2-byte code units (char16_t) 2221 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit 2222 */ 2223simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length); 2224 2225/** 2226 * An implementation of simdutf for a particular CPU architecture.
2227 * 2228 * Also used to maintain the currently active implementation. The active implementation is 2229 * automatically initialized on first use to the most advanced implementation supported by the host. 2230 */ 2231class implementation { 2232public: 2233 2234 /** 2235 * The name of this implementation. 2236 * 2237 * const implementation *impl = simdutf::active_implementation; 2238 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; 2239 * 2240 * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" 2241 */ 2242 virtual const std::string &name() const { return _name; } 2243 2244 /** 2245 * The description of this implementation. 2246 * 2247 * const implementation *impl = simdutf::active_implementation; 2248 * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; 2249 * 2250 * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" 2251 */ 2252 virtual const std::string &description() const { return _description; } 2253 2254 /** 2255 * The instruction sets this implementation is compiled against 2256 * and the current CPU match. This function may poll the current CPU/system 2257 * and should therefore not be called too often if performance is a concern. 2258 * 2259 * 2260 * @return true if the implementation can be safely used on the current system (determined at runtime) 2261 */ 2262 bool supported_by_runtime_system() const; 2263 2264 /** 2265 * This function will try to detect the encoding 2266 * @param input the string to identify 2267 * @param length the length of the string in bytes. 2268 * @return the encoding type detected 2269 */ 2270 virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept; 2271 2272 /** 2273 * This function will try to detect the possible encodings in one pass 2274 * @param input the string to identify 2275 * @param length the length of the string in bytes. 
2276 * @return the encoding type detected 2277 */ 2278 virtual int detect_encodings(const char * input, size_t length) const noexcept = 0; 2279 2280 /** 2281 * @private For internal implementation use 2282 * 2283 * The instruction sets this implementation is compiled against. 2284 * 2285 * @return a mask of all required `internal::instruction_set::` values 2286 */ 2287 virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; } 2288 2289 2290 /** 2291 * Validate the UTF-8 string. 2292 * 2293 * Overridden by each implementation. 2294 * 2295 * @param buf the UTF-8 string to validate. 2296 * @param len the length of the string in bytes. 2297 * @return true if and only if the string is valid UTF-8. 2298 */ 2299 simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; 2300 2301 /** 2302 * Validate the UTF-8 string and stop on errors. 2303 * 2304 * Overridden by each implementation. 2305 * 2306 * @param buf the UTF-8 string to validate. 2307 * @param len the length of the string in bytes. 2308 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2309 */ 2310 simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0; 2311 2312 /** 2313 * Validate the ASCII string. 2314 * 2315 * Overridden by each implementation. 2316 * 2317 * @param buf the ASCII string to validate. 2318 * @param len the length of the string in bytes. 2319 * @return true if and only if the string is valid ASCII. 2320 */ 2321 simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0; 2322 2323 /** 2324 * Validate the ASCII string and stop on error. 2325 * 2326 * Overridden by each implementation. 2327 * 2328 * @param buf the ASCII string to validate. 
2329 * @param len the length of the string in bytes. 2330 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2331 */ 2332 simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0; 2333 2334 /** 2335 * Validate the UTF-16LE string. This function may be best when you expect 2336 * the input to be almost always valid. Otherwise, consider using 2337 * validate_utf16le_with_errors. 2338 * 2339 * Overridden by each implementation. 2340 * 2341 * This function is not BOM-aware. 2342 * 2343 * @param buf the UTF-16LE string to validate. 2344 * @param len the length of the string in number of 2-byte code units (char16_t). 2345 * @return true if and only if the string is valid UTF-16LE. 2346 */ 2347 simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0; 2348 2349 /** 2350 * Validate the UTF-16BE string. This function may be best when you expect 2351 * the input to be almost always valid. Otherwise, consider using 2352 * validate_utf16be_with_errors. 2353 * 2354 * Overridden by each implementation. 2355 * 2356 * This function is not BOM-aware. 2357 * 2358 * @param buf the UTF-16BE string to validate. 2359 * @param len the length of the string in number of 2-byte code units (char16_t). 2360 * @return true if and only if the string is valid UTF-16BE. 2361 */ 2362 simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0; 2363 2364 /** 2365 * Validate the UTF-16LE string and stop on error. It might be faster than 2366 * validate_utf16le when an error is expected to occur early. 2367 * 2368 * Overridden by each implementation. 2369 * 2370 * This function is not BOM-aware. 2371 * 2372 * @param buf the UTF-16LE string to validate. 
2373 * @param len the length of the string in number of 2-byte code units (char16_t). 2374 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2375 */ 2376 simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0; 2377 2378 /** 2379 * Validate the UTF-16BE string and stop on error. It might be faster than 2380 * validate_utf16be when an error is expected to occur early. 2381 * 2382 * Overridden by each implementation. 2383 * 2384 * This function is not BOM-aware. 2385 * 2386 * @param buf the UTF-16BE string to validate. 2387 * @param len the length of the string in number of 2-byte code units (char16_t). 2388 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2389 */ 2390 simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; 2391 2392 /** 2393 * Validate the UTF-32 string. 2394 * 2395 * Overridden by each implementation. 2396 * 2397 * This function is not BOM-aware. 2398 * 2399 * @param buf the UTF-32 string to validate. 2400 * @param len the length of the string in number of 4-byte code units (char32_t). 2401 * @return true if and only if the string is valid UTF-32. 2402 */ 2403 simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; 2404 2405 /** 2406 * Validate the UTF-32 string and stop on error. 2407 * 2408 * Overridden by each implementation. 2409 * 2410 * This function is not BOM-aware. 2411 * 2412 * @param buf the UTF-32 string to validate. 
2413 * @param len the length of the string in number of 4-byte code units (char32_t). 2414 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2415 */ 2416 simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0; 2417 2418 /** 2419 * Convert Latin1 string into UTF-8 string. 2420 * 2421 * This function is suitable to work with inputs from untrusted sources. 2422 * 2423 * @param input the Latin1 string to convert 2424 * @param length the length of the string in bytes 2425 * @param utf8_output the pointer to buffer that can hold conversion result 2426 * @return the number of written char; 0 if conversion is not possible 2427 */ 2428 simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0; 2429 2430 2431 /** 2432 * Convert Latin1 string into UTF-16LE string. 2433 * 2434 * This function is suitable to work with inputs from untrusted sources. 2435 * 2436 * @param input the Latin1 string to convert 2437 * @param length the length of the string in bytes 2438 * @param utf16_output the pointer to buffer that can hold conversion result 2439 * @return the number of written char16_t; 0 if conversion is not possible 2440 */ 2441 simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2442 2443 /** 2444 * Convert Latin1 string into UTF-16BE string. 2445 * 2446 * This function is suitable to work with inputs from untrusted sources. 
2447 * 2448 * @param input the Latin1 string to convert 2449 * @param length the length of the string in bytes 2450 * @param utf16_output the pointer to buffer that can hold conversion result 2451 * @return the number of written char16_t; 0 if conversion is not possible 2452 */ 2453 simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2454 2455 /** 2456 * Convert Latin1 string into UTF-32 string. 2457 * 2458 * This function is suitable to work with inputs from untrusted sources. 2459 * 2460 * @param input the Latin1 string to convert 2461 * @param length the length of the string in bytes 2462 * @param utf32_buffer the pointer to buffer that can hold conversion result 2463 * @return the number of written char32_t; 0 if conversion is not possible 2464 */ 2465 simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2466 2467 /** 2468 * Convert possibly broken UTF-8 string into Latin1 string. 2469 * 2470 * During the conversion also validation of the input string is done. 2471 * This function is suitable to work with inputs from untrusted sources. 2472 * 2473 * @param input the UTF-8 string to convert 2474 * @param length the length of the string in bytes 2475 * @param latin1_output the pointer to buffer that can hold conversion result 2476 * @return the number of written char; 0 if the input was not valid UTF-8 string 2477 */ 2478 simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; 2479 2480 /** 2481 * Convert possibly broken UTF-8 string into latin1 string with errors 2482 * 2483 * During the conversion also validation of the input string is done. 2484 * This function is suitable to work with inputs from untrusted sources. 
2485 * 2486 * @param input the UTF-8 string to convert 2487 * @param length the length of the string in bytes 2488 * @param latin1_output the pointer to buffer that can hold conversion result 2489 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2490 */ 2491 simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0; 2492 2493 /** 2494 * Convert valid UTF-8 string into Latin1 string. 2495 * 2496 * This function assumes that the input string is valid UTF-8. 2497 * 2498 * This function is not BOM-aware. 2499 * 2500 * @param input the UTF-8 string to convert 2501 * @param length the length of the string in bytes 2502 * @param latin1_output the pointer to buffer that can hold conversion result 2503 * @return the number of written char; 0 if the input was not valid UTF-8 string 2504 */ 2505 simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0; 2506 2507 2508 /** 2509 * Convert possibly broken UTF-8 string into UTF-16LE string. 2510 * 2511 * During the conversion also validation of the input string is done. 2512 * This function is suitable to work with inputs from untrusted sources. 2513 * 2514 * @param input the UTF-8 string to convert 2515 * @param length the length of the string in bytes 2516 * @param utf16_output the pointer to buffer that can hold conversion result 2517 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string 2518 */ 2519 simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2520 2521 /** 2522 * Convert possibly broken UTF-8 string into UTF-16BE string. 
2523 * 2524 * During the conversion also validation of the input string is done. 2525 * This function is suitable to work with inputs from untrusted sources. 2526 * 2527 * @param input the UTF-8 string to convert 2528 * @param length the length of the string in bytes 2529 * @param utf16_output the pointer to buffer that can hold conversion result 2530 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string 2531 */ 2532 simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2533 2534 /** 2535 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. 2536 * 2537 * During the conversion also validation of the input string is done. 2538 * This function is suitable to work with inputs from untrusted sources. 2539 * 2540 * @param input the UTF-8 string to convert 2541 * @param length the length of the string in bytes 2542 * @param utf16_output the pointer to buffer that can hold conversion result 2543 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2544 */ 2545 simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2546 2547 /** 2548 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. 2549 * 2550 * During the conversion also validation of the input string is done. 2551 * This function is suitable to work with inputs from untrusted sources. 
2552 * 2553 * @param input the UTF-8 string to convert 2554 * @param length the length of the string in bytes 2555 * @param utf16_output the pointer to buffer that can hold conversion result 2556 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful. 2557 */ 2558 simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; 2559 2560 /** 2561 * Convert possibly broken UTF-8 string into UTF-32 string. 2562 * 2563 * During the conversion also validation of the input string is done. 2564 * This function is suitable to work with inputs from untrusted sources. 2565 * 2566 * @param input the UTF-8 string to convert 2567 * @param length the length of the string in bytes 2568 * @param utf32_output the pointer to buffer that can hold conversion result 2569 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string 2570 */ 2571 simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; 2572 2573 /** 2574 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. 2575 * 2576 * During the conversion also validation of the input string is done. 2577 * This function is suitable to work with inputs from untrusted sources. 2578 * 2579 * @param input the UTF-8 string to convert 2580 * @param length the length of the string in bytes 2581 * @param utf32_output the pointer to buffer that can hold conversion result 2582 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 
2583 */ 2584 simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; 2585 2586 /** 2587 * Convert valid UTF-8 string into UTF-16LE string. 2588 * 2589 * This function assumes that the input string is valid UTF-8. 2590 * 2591 * @param input the UTF-8 string to convert 2592 * @param length the length of the string in bytes 2593 * @param utf16_buffer the pointer to buffer that can hold conversion result 2594 * @return the number of written char16_t 2595 */ 2596 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 2597 2598/** 2599 * Convert valid UTF-8 string into UTF-16BE string. 2600 * 2601 * This function assumes that the input string is valid UTF-8. 2602 * 2603 * @param input the UTF-8 string to convert 2604 * @param length the length of the string in bytes 2605 * @param utf16_buffer the pointer to buffer that can hold conversion result 2606 * @return the number of written char16_t 2607 */ 2608 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 2609 2610 /** 2611 * Convert valid UTF-8 string into UTF-32 string. 2612 * 2613 * This function assumes that the input string is valid UTF-8. 2614 * 2615 * @param input the UTF-8 string to convert 2616 * @param length the length of the string in bytes 2617 * @param utf32_buffer the pointer to buffer that can hold conversion result 2618 * @return the number of written char32_t 2619 */ 2620 simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2621 2622 /** 2623 * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format. 2624 * 2625 * This function does not validate the input. 
2626 * 2627 * @param input the UTF-8 string to process 2628 * @param length the length of the string in bytes 2629 * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE 2630 */ 2631 simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; 2632 2633 /** 2634 * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format. 2635 * 2636 * This function is equivalent to count_utf8. 2637 * 2638 * This function does not validate the input. 2639 * 2640 * @param input the UTF-8 string to process 2641 * @param length the length of the string in bytes 2642 * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32 2643 */ 2644 simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; 2645 2646 /** 2647 * Convert possibly broken UTF-16LE string into Latin1 string. 2648 * 2649 * During the conversion also validation of the input string is done. 2650 * This function is suitable to work with inputs from untrusted sources. 2651 * 2652 * This function is not BOM-aware. 2653 * 2654 * @param input the UTF-16LE string to convert 2655 * @param length the length of the string in 2-byte code units (char16_t) 2656 * @param latin1_buffer the pointer to buffer that can hold conversion result 2657 * @return number of written code units; 0 if input is not a valid UTF-16LE string 2658 */ 2659 simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2660 2661 /** 2662 * Convert possibly broken UTF-16BE string into Latin1 string. 2663 * 2664 * During the conversion also validation of the input string is done. 2665 * This function is suitable to work with inputs from untrusted sources. 2666 * 2667 * This function is not BOM-aware. 
2668 * 2669 * @param input the UTF-16BE string to convert 2670 * @param length the length of the string in 2-byte code units (char16_t) 2671 * @param latin1_buffer the pointer to buffer that can hold conversion result 2672 * @return number of written code units; 0 if input is not a valid UTF-16BE string 2673 */ 2674 simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2675 2676 /** 2677 * Convert possibly broken UTF-16LE string into Latin1 string. 2678 * 2679 * During the conversion also validation of the input string is done. 2680 * This function is suitable to work with inputs from untrusted sources. 2681 * This function is not BOM-aware. 2682 * 2683 * @param input the UTF-16LE string to convert 2684 * @param length the length of the string in 2-byte code units (char16_t) 2685 * @param latin1_buffer the pointer to buffer that can hold conversion result 2686 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 2687 */ 2688 simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2689 2690 /** 2691 * Convert possibly broken UTF-16BE string into Latin1 string. 2692 * 2693 * During the conversion also validation of the input string is done. 2694 * This function is suitable to work with inputs from untrusted sources. 2695 * This function is not BOM-aware. 
2696 * 2697 * @param input the UTF-16BE string to convert 2698 * @param length the length of the string in 2-byte code units (char16_t) 2699 * @param latin1_buffer the pointer to buffer that can hold conversion result 2700 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 2701 */ 2702 simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2703 2704 /** 2705 * Convert valid UTF-16LE string into Latin1 string. 2706 * 2707 * This function assumes that the input string is valid UTF-16LE. 2708 * 2709 * This function is not BOM-aware. 2710 * 2711 * @param input the UTF-16LE string to convert 2712 * @param length the length of the string in 2-byte code units (char16_t) 2713 * @param latin1_buffer the pointer to buffer that can hold conversion result 2714 * @return number of written code units; 0 if conversion is not possible 2715 */ 2716 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2717 2718 /** 2719 * Convert valid UTF-16BE string into Latin1 string. 2720 * 2721 * This function assumes that the input string is valid UTF-16BE. 2722 * 2723 * This function is not BOM-aware. 
2724 * 2725 * @param input the UTF-16BE string to convert 2726 * @param length the length of the string in 2-byte code units (char16_t) 2727 * @param latin1_buffer the pointer to buffer that can hold conversion result 2728 * @return number of written code units; 0 if conversion is not possible 2729 */ 2730 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2731 2732 /** 2733 * Convert possibly broken UTF-16LE string into UTF-8 string. 
2734 * 2735 * During the conversion also validation of the input string is done. 2736 * This function is suitable to work with inputs from untrusted sources. 2737 * 2738 * This function is not BOM-aware. 2739 * 2740 * @param input the UTF-16LE string to convert 2741 * @param length the length of the string in 2-byte code units (char16_t) 2742 * @param utf8_buffer the pointer to buffer that can hold conversion result 2743 * @return number of written code units; 0 if input is not a valid UTF-16LE string 2744 */ 2745 simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2746 2747 /** 2748 * Convert possibly broken UTF-16BE string into UTF-8 string. 2749 * 2750 * During the conversion also validation of the input string is done. 2751 * This function is suitable to work with inputs from untrusted sources. 2752 * 2753 * This function is not BOM-aware. 2754 * 2755 * @param input the UTF-16BE string to convert 2756 * @param length the length of the string in 2-byte code units (char16_t) 2757 * @param utf8_buffer the pointer to buffer that can hold conversion result 2758 * @return number of written code units; 0 if input is not a valid UTF-16BE string 2759 */ 2760 simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2761 2762 /** 2763 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. 2764 * 2765 * During the conversion also validation of the input string is done. 2766 * This function is suitable to work with inputs from untrusted sources. 2767 * 2768 * This function is not BOM-aware. 
2769 * 2770 * @param input the UTF-16LE string to convert 2771 * @param length the length of the string in 2-byte code units (char16_t) 2772 * @param utf8_buffer the pointer to buffer that can hold conversion result 2773 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 2774 */ 2775 simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2776 2777 /** 2778 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. 2779 * 2780 * During the conversion also validation of the input string is done. 2781 * This function is suitable to work with inputs from untrusted sources. 2782 * 2783 * This function is not BOM-aware. 2784 * 2785 * @param input the UTF-16BE string to convert 2786 * @param length the length of the string in 2-byte code units (char16_t) 2787 * @param utf8_buffer the pointer to buffer that can hold conversion result 2788 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 2789 */ 2790 simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2791 2792 /** 2793 * Convert valid UTF-16LE string into UTF-8 string. 2794 * 2795 * This function assumes that the input string is valid UTF-16LE. 2796 * 2797 * This function is not BOM-aware. 
2798 * 2799 * @param input the UTF-16LE string to convert 2800 * @param length the length of the string in 2-byte code units (char16_t) 2801 * @param utf8_buffer the pointer to buffer that can hold the conversion result 2802 * @return number of written code units; 0 if conversion is not possible 2803 */ 2804 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2805 2806 /** 2807 * Convert valid UTF-16BE string into UTF-8 string. 2808 * 2809 * This function assumes that the input string is valid UTF-16BE. 2810 * 2811 * This function is not BOM-aware. 2812 * 2813 * @param input the UTF-16BE string to convert 2814 * @param length the length of the string in 2-byte code units (char16_t) 2815 * @param utf8_buffer the pointer to buffer that can hold the conversion result 2816 * @return number of written code units; 0 if conversion is not possible 2817 */ 2818 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2819 2820 /** 2821 * Convert possibly broken UTF-16LE string into UTF-32 string. 2822 * 2823 * During the conversion also validation of the input string is done. 2824 * This function is suitable to work with inputs from untrusted sources. 2825 * 2826 * This function is not BOM-aware. 2827 * 2828 * @param input the UTF-16LE string to convert 2829 * @param length the length of the string in 2-byte code units (char16_t) 2830 * @param utf32_buffer the pointer to buffer that can hold conversion result 2831 * @return number of written code units; 0 if input is not a valid UTF-16LE string 2832 */ 2833 simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2834 2835 /** 2836 * Convert possibly broken UTF-16BE string into UTF-32 string. 
2837 * 2838 * During the conversion also validation of the input string is done. 2839 * This function is suitable to work with inputs from untrusted sources. 2840 * 2841 * This function is not BOM-aware. 2842 * 2843 * @param input the UTF-16BE string to convert 2844 * @param length the length of the string in 2-byte code units (char16_t) 2845 * @param utf32_buffer the pointer to buffer that can hold conversion result 2846 * @return number of written code units; 0 if input is not a valid UTF-16BE string 2847 */ 2848 simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2849 2850 /** 2851 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. 2852 * 2853 * During the conversion also validation of the input string is done. 2854 * This function is suitable to work with inputs from untrusted sources. 2855 * 2856 * This function is not BOM-aware. 2857 * 2858 * @param input the UTF-16LE string to convert 2859 * @param length the length of the string in 2-byte code units (char16_t) 2860 * @param utf32_buffer the pointer to buffer that can hold conversion result 2861 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 2862 */ 2863 simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2864 2865 /** 2866 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. 2867 * 2868 * During the conversion also validation of the input string is done. 2869 * This function is suitable to work with inputs from untrusted sources. 2870 * 2871 * This function is not BOM-aware. 
2872 * 2873 * @param input the UTF-16BE string to convert 2874 * @param length the length of the string in 2-byte code units (char16_t) 2875 * @param utf32_buffer the pointer to buffer that can hold conversion result 2876 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful. 2877 */ 2878 simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2879 2880 /** 2881 * Convert valid UTF-16LE string into UTF-32 string. 2882 * 2883 * This function assumes that the input string is valid UTF-16LE. 2884 * 2885 * This function is not BOM-aware. 2886 * 2887 * @param input the UTF-16LE string to convert 2888 * @param length the length of the string in 2-byte code units (char16_t) 2889 * @param utf32_buffer the pointer to buffer that can hold the conversion result 2890 * @return number of written code units; 0 if conversion is not possible 2891 */ 2892 simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2893 2894 /** 2895 * Convert valid UTF-16BE string into UTF-32 string. 2896 * 2897 * This function assumes that the input string is valid UTF-16BE. 2898 * 2899 * This function is not BOM-aware. 
2900 * 2901 * @param input the UTF-16BE string to convert 2902 * @param length the length of the string in 2-byte code units (char16_t) 2903 * @param utf32_buffer the pointer to buffer that can hold the conversion result 2904 * @return number of written code units; 0 if conversion is not possible 2905 */ 2906 simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; 2907 2908 /** 2909 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. 2910 * 2911 * This function does not validate the input. 2912 * 2913 * This function is not BOM-aware. 2914 * 2915 * @param input the UTF-16LE string to process 2916 * @param length the length of the string in 2-byte code units (char16_t) 2917 * @return the number of bytes required to encode the UTF-16LE string as UTF-8 2918 */ 2919 simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; 2920 2921 /** 2922 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. 2923 * 2924 * This function does not validate the input. 2925 * 2926 * This function is not BOM-aware. 2927 * 2928 * @param input the UTF-16BE string to process 2929 * @param length the length of the string in 2-byte code units (char16_t) 2930 * @return the number of bytes required to encode the UTF-16BE string as UTF-8 2931 */ 2932 simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; 2933 2934 /** 2935 * Convert possibly broken UTF-32 string into Latin1 string. 2936 * 2937 * During the conversion also validation of the input string is done. 2938 * This function is suitable to work with inputs from untrusted sources. 2939 * 2940 * This function is not BOM-aware. 
2941 * 2942 * @param input the UTF-32 string to convert 2943 * @param length the length of the string in 4-byte code units (char32_t) 2944 * @param latin1_buffer the pointer to buffer that can hold conversion result 2945 * @return number of written code units; 0 if input is not a valid UTF-32 string 2946 */ 2947 2948 simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2949 2950 /** 2951 * Convert possibly broken UTF-32 string into Latin1 string and stop on error. 2952 * 2953 * During the conversion also validation of the input string is done. 2954 * This function is suitable to work with inputs from untrusted sources. 2955 * 2956 * This function is not BOM-aware. 2957 * 2958 * @param input the UTF-32 string to convert 2959 * @param length the length of the string in 4-byte code units (char32_t) 2960 * @param latin1_buffer the pointer to buffer that can hold conversion result 2961 * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful. 2962 */ 2963 2964 simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2965 2966 /** 2967 * Convert valid UTF-32 string into Latin1 string. 2968 * 2969 * This function assumes that the input string is valid UTF-32. 2970 * 2971 * This function is not BOM-aware. 
2972 * 2973 * @param input the UTF-32 string to convert 2974 * @param length the length of the string in 4-byte code units (char32_t) 2975 * @param latin1_buffer the pointer to buffer that can hold the conversion result 2976 * @return number of written code units; 0 if conversion is not possible 2977 */ 2978 simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0; 2979 2980 /** 2981 * Convert possibly broken UTF-32 string into UTF-8 string. 2982 * 2983 * During the conversion also validation of the input string is done. 2984 * This function is suitable to work with inputs from untrusted sources. 2985 * 2986 * This function is not BOM-aware. 2987 * 2988 * @param input the UTF-32 string to convert 2989 * @param length the length of the string in 4-byte code units (char32_t) 2990 * @param utf8_buffer the pointer to buffer that can hold conversion result 2991 * @return number of written code units; 0 if input is not a valid UTF-32 string 2992 */ 2993 simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 2994 2995 /** 2996 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. 2997 * 2998 * During the conversion also validation of the input string is done. 2999 * This function is suitable to work with inputs from untrusted sources. 3000 * 3001 * This function is not BOM-aware. 3002 * 3003 * @param input the UTF-32 string to convert 3004 * @param length the length of the string in 4-byte code units (char32_t) 3005 * @param utf8_buffer the pointer to buffer that can hold conversion result 3006 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of chars written if successful.
3007 */ 3008 simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 3009 3010 /** 3011 * Convert valid UTF-32 string into UTF-8 string. 3012 * 3013 * This function assumes that the input string is valid UTF-32. 3014 * 3015 * This function is not BOM-aware. 3016 * 3017 * @param input the UTF-32 string to convert 3018 * @param length the length of the string in 4-byte code units (char32_t) 3019 * @param utf8_buffer the pointer to buffer that can hold the conversion result 3020 * @return number of written code units; 0 if conversion is not possible 3021 */ 3022 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; 3023 3024 3025 /** 3026 * Return the number of 2-byte code units (char16_t) that a Latin1 string of the given length would require in UTF-16 format. 3027 * 3028 * Only the length is passed; the Latin1 data itself is not an argument of this function. 3029 * NOTE(review): the direction follows the function name; the unit is assumed to be code units as for utf16_length_from_utf32 -- confirm against an implementation. 3030 * @param length the length of the Latin1 string in bytes 3031 * @return the number of 2-byte code units required to encode the Latin1 string as UTF-16 3032 */ 3033 simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0; 3034 3035 /** 3036 * Convert possibly broken UTF-32 string into UTF-16LE string. 3037 * 3038 * During the conversion also validation of the input string is done. 3039 * This function is suitable to work with inputs from untrusted sources. 3040 * 3041 * This function is not BOM-aware.
3042 * 3043 * @param input the UTF-32 string to convert 3044 * @param length the length of the string in 4-byte code units (char32_t) 3045 * @param utf16_buffer the pointer to buffer that can hold conversion result 3046 * @return number of written code units; 0 if input is not a valid UTF-32 string 3047 */ 3048 simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3049 3050 /** 3051 * Convert possibly broken UTF-32 string into UTF-16BE string. 3052 * 3053 * During the conversion also validation of the input string is done. 3054 * This function is suitable to work with inputs from untrusted sources. 3055 * 3056 * This function is not BOM-aware. 3057 * 3058 * @param input the UTF-32 string to convert 3059 * @param length the length of the string in 4-byte code units (char32_t) 3060 * @param utf16_buffer the pointer to buffer that can hold conversion result 3061 * @return number of written code units; 0 if input is not a valid UTF-32 string 3062 */ 3063 simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3064 3065 /** 3066 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. 3067 * 3068 * During the conversion also validation of the input string is done. 3069 * This function is suitable to work with inputs from untrusted sources. 3070 * 3071 * This function is not BOM-aware. 3072 * 3073 * @param input the UTF-32 string to convert 3074 * @param length the length of the string in 4-byte code units (char32_t) 3075 * @param utf16_buffer the pointer to buffer that can hold conversion result 3076 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char16_t written if successful.
3077 */ 3078 simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3079 3080 /** 3081 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. 3082 * 3083 * During the conversion also validation of the input string is done. 3084 * This function is suitable to work with inputs from untrusted sources. 3085 * 3086 * This function is not BOM-aware. 3087 * 3088 * @param input the UTF-32 string to convert 3089 * @param length the length of the string in 4-byte code units (char32_t) 3090 * @param utf16_buffer the pointer to buffer that can hold conversion result 3091 * @return a result struct (of type simdutf::result, containing the two fields error and count) with an error code and either the position of the error (in the input, in code units) if any, or the number of char16_t written if successful. 3092 */ 3093 simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3094 3095 /** 3096 * Convert valid UTF-32 string into UTF-16LE string. 3097 * 3098 * This function assumes that the input string is valid UTF-32. 3099 * 3100 * This function is not BOM-aware. 3101 * 3102 * @param input the UTF-32 string to convert 3103 * @param length the length of the string in 4-byte code units (char32_t) 3104 * @param utf16_buffer the pointer to buffer that can hold the conversion result 3105 * @return number of written code units; 0 if conversion is not possible 3106 */ 3107 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3108 3109 /** 3110 * Convert valid UTF-32 string into UTF-16BE string. 3111 * 3112 * This function assumes that the input string is valid UTF-32. 3113 * 3114 * This function is not BOM-aware.
3115 * 3116 * @param input the UTF-32 string to convert 3117 * @param length the length of the string in 4-byte code units (char32_t) 3118 * @param utf16_buffer the pointer to buffer that can hold the conversion result 3119 * @return number of written code units; 0 if conversion is not possible 3120 */ 3121 simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; 3122 3123 /** 3124 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or 3125 * from UTF-16BE to UTF-16LE. 3126 * 3127 * This function does not validate the input. 3128 * 3129 * This function is not BOM-aware. 3130 * 3131 * @param input the UTF-16 string to process 3132 * @param length the length of the string in 2-byte code units (char16_t) 3133 * @param output the pointer to buffer that can hold the conversion result 3134 */ 3135 virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; 3136 3137 /** 3138 * Return the number of bytes that this Latin1 string would require in UTF-8 format. 3139 * 3140 * @param input the Latin1 string to convert 3141 * @param length the length of the string in bytes 3142 * @return the number of bytes required to encode the Latin1 string as UTF-8 3143 */ 3144 simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0; 3145 3146 /** 3147 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. 3148 * 3149 * This function does not validate the input.
3150 * 3151 * @param input the UTF-32 string to convert 3152 * @param length the length of the string in 4-byte code units (char32_t) 3153 * @return the number of bytes required to encode the UTF-32 string as UTF-8 3154 */ 3155 simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; 3156 3157 /** 3158 * Compute the number of bytes that this UTF-32 string would require in Latin1 format. 3159 * 3160 * This function does not validate the input. 3161 * 3162 * @param length the length of the string in 4-byte code units (char32_t) 3163 * @return the number of bytes required to encode the UTF-32 string as Latin1 3164 */ 3165 simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0; 3166 3167 /** 3168 * Compute the number of bytes that this UTF-8 string would require in Latin1 format. 3169 * 3170 * This function does not validate the input. 3171 * 3172 * @param input the UTF-8 string to convert 3173 * @param length the length of the string in bytes 3174 * @return the number of bytes required to encode the UTF-8 string as Latin1 3175 */ 3176 simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0; 3177 3178 /** 3179 * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format. 3180 * 3181 * This function does not validate the input; it receives only a length, not the string data. 3182 * 3183 * This function is not BOM-aware. 3184 * 3185 * 3186 * @param length the length of the string in 2-byte code units (char16_t) 3187 * @return the number of bytes required to encode the UTF-16 string as Latin1 3188 */ 3189 simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0; 3190 3191 /** 3192 * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format. 3193 * 3194 * This function does not validate the input.
3195 * 3196 * @param input the UTF-32 string to convert 3197 * @param length the length of the string in 4-byte code units (char32_t) 3198 * @return the number of two-byte code units required to encode the UTF-32 string as UTF-16 3199 */ 3200 simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; 3201 3202 3203 /** 3204 * Return the number of 4-byte code units (char32_t) that a Latin1 string of the given length would require in UTF-32 format. 3205 * 3206 * This function does not validate the input; it receives only a length, not the string data. 3207 * 3208 * NOTE(review): the direction follows the function name; the unit is assumed to be code units as for utf32_length_from_utf16le -- confirm against an implementation. 3209 * @param length the length of the Latin1 string in bytes 3210 * @return the number of 4-byte code units required to encode the Latin1 string as UTF-32 3211 */ 3212 simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0; 3213 3214 /** 3215 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format. 3216 * 3217 * This function is equivalent to count_utf16le. 3218 * 3219 * This function does not validate the input. 3220 * 3221 * This function is not BOM-aware. 3222 * 3223 * @param input the UTF-16LE string to convert 3224 * @param length the length of the string in 2-byte code units (char16_t) 3225 * @return the number of 4-byte code units (not bytes) required to encode the UTF-16LE string as UTF-32 3226 */ 3227 simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; 3228 3229 /** 3230 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format. 3231 * 3232 * This function is equivalent to count_utf16be. 3233 * 3234 * This function does not validate the input. 3235 * 3236 * This function is not BOM-aware.
3237 * 3238 * @param input the UTF-16BE string to convert 3239 * @param length the length of the string in 2-byte code units (char16_t) 3240 * @return the number of 4-byte code units (not bytes) required to encode the UTF-16BE string as UTF-32 3241 */ 3242 simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; 3243 3244 /** 3245 * Count the number of code points (characters) in the string assuming that 3246 * it is valid. 3247 * 3248 * This function assumes that the input string is valid UTF-16LE. 3249 * 3250 * This function is not BOM-aware. 3251 * 3252 * @param input the UTF-16LE string to process 3253 * @param length the length of the string in 2-byte code units (char16_t) 3254 * @return number of code points 3255 */ 3256 simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0; 3257 3258 /** 3259 * Count the number of code points (characters) in the string assuming that 3260 * it is valid. 3261 * 3262 * This function assumes that the input string is valid UTF-16BE. 3263 * 3264 * This function is not BOM-aware. 3265 * 3266 * @param input the UTF-16BE string to process 3267 * @param length the length of the string in 2-byte code units (char16_t) 3268 * @return number of code points 3269 */ 3270 simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0; 3271 3272 3273 /** 3274 * Count the number of code points (characters) in the string assuming that 3275 * it is valid. 3276 * 3277 * This function assumes that the input string is valid UTF-8. 3278 * 3279 * @param input the UTF-8 string to process 3280 * @param length the length of the string in bytes 3281 * @return number of code points 3282 */ 3283 simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0; 3284 3285 3286 3287protected: 3288 /** @private Construct an implementation with the given name and description. For subclasses.
*/ 3289 simdutf_really_inline implementation( 3290 std::string name, 3291 std::string description, 3292 uint32_t required_instruction_sets 3293 ) : 3294 _name(name), 3295 _description(description), 3296 _required_instruction_sets(required_instruction_sets) 3297 { 3298 } 3299 virtual ~implementation()=default; 3300 3301private: 3302 /** 3303 * The name of this implementation. 3304 */ 3305 const std::string _name; 3306 3307 /** 3308 * The description of this implementation. 3309 */ 3310 const std::string _description; 3311 3312 /** 3313 * Instruction sets required for this implementation. 3314 */ 3315 const uint32_t _required_instruction_sets; 3316}; 3317 3318/** @private */ 3319namespace internal { 3320 3321/** 3322 * The list of available implementations compiled into simdutf. 3323 */ 3324class available_implementation_list { 3325public: 3326 /** Get the list of available implementations compiled into simdutf */ 3327 simdutf_really_inline available_implementation_list() {} 3328 /** Number of implementations */ 3329 size_t size() const noexcept; 3330 /** STL const begin() iterator */ 3331 const implementation * const *begin() const noexcept; 3332 /** STL const end() iterator */ 3333 const implementation * const *end() const noexcept; 3334 3335 /** 3336 * Get the implementation with the given name. 3337 * 3338 * Case sensitive. 3339 * 3340 * const implementation *impl = simdutf::get_available_implementations()["westmere"]; 3341 * if (!impl) { exit(1); } 3342 * if (!impl->supported_by_runtime_system()) { exit(1); } 3343 * simdutf::get_active_implementation() = impl; 3344 * 3345 * @param name the implementation to find, e.g. "westmere", "haswell", "arm64" 3346 * @return the implementation, or nullptr if no available implementation has that name.
3347 */ 3348 const implementation * operator[](const std::string &name) const noexcept { 3349 for (const implementation * impl : *this) { 3350 if (impl->name() == name) { return impl; } 3351 } 3352 return nullptr; 3353 } 3354 3355 /** 3356 * Detect the most advanced implementation supported by the current host. 3357 * 3358 * This is used to initialize the implementation on startup. 3359 * 3360 * const implementation *impl = simdutf::get_available_implementations().detect_best_supported(); 3361 * simdutf::get_active_implementation() = impl; 3362 * 3363 * @return the most advanced supported implementation for the current host, or an 3364 * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported 3365 * implementation. Will never return nullptr. 3366 */ 3367 const implementation *detect_best_supported() const noexcept; 3368}; 3369 /** Pointer holder: stores a plain T* when SIMDUTF_NO_THREADS is defined and a std::atomic<T*> otherwise; it converts to T*, supports operator* and operator->, and can be assigned a new T*. */ 3370template<typename T> 3371class atomic_ptr { 3372public: 3373 atomic_ptr(T *_ptr) : ptr{_ptr} {} 3374 3375#if defined(SIMDUTF_NO_THREADS) 3376 operator const T*() const { return ptr; } 3377 const T& operator*() const { return *ptr; } 3378 const T* operator->() const { return ptr; } 3379 3380 operator T*() { return ptr; } 3381 T& operator*() { return *ptr; } 3382 T* operator->() { return ptr; } 3383 atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } 3384 3385#else 3386 operator const T*() const { return ptr.load(); } 3387 const T& operator*() const { return *ptr; } 3388 const T* operator->() const { return ptr.load(); } 3389 3390 operator T*() { return ptr.load(); } 3391 T& operator*() { return *ptr; } 3392 T* operator->() { return ptr.load(); } 3393 atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } 3394 3395#endif 3396 3397private: 3398#if defined(SIMDUTF_NO_THREADS) 3399 T* ptr; 3400#else 3401 std::atomic<T*> ptr; 3402#endif 3403}; 3404 3405class detect_best_supported_implementation_on_first_use; 3406 3407} // namespace internal 3408 3409/** 3410 * The list of available implementations compiled into simdutf.
3411 */ 3412extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); 3413 3414/** 3415 * The active implementation. 3416 * 3417 * Automatically initialized on first use to the most advanced implementation supported by this hardware. The wrapper is returned by non-const reference and internal::atomic_ptr provides operator=(T*), so a caller can assign another implementation pointer to it. 3418 */ 3419extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation(); 3420 3421 3422} // namespace simdutf 3423 3424#endif // SIMDUTF_IMPLEMENTATION_H 3425/* end file include/simdutf/implementation.h */ 3426 3427 3428// Implementation-internal files (must be included before the implementations themselves, to keep 3429// amalgamation working--otherwise, the first time a file is included, it might be put inside the 3430// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't 3431// compile unless that implementation is turned on). 3432 3433 3434SIMDUTF_POP_DISABLE_WARNINGS 3435 3436#endif // SIMDUTF_H 3437/* end file include/simdutf.h */ 3438