Lines Matching defs:input
68 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
129 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
130 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
131 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
132 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
133 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
134 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
135 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
136 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
137 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
142 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
1268 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1329 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1330 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1331 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1332 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1333 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1334 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1335 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1336 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1337 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
1342 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
1475 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
1536 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1537 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1538 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
1539 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
1540 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
1541 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1542 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
1543 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
1544 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
1549 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
2406 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
2467 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2468 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2469 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
2470 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
2471 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
2472 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2473 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
2474 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
2475 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
2480 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
3347 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
3392 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3393 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3394 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
3395 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
3396 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
3397 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3398 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
3399 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
3852 return input.reduce_or().is_ascii();
4004 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
4065 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
4066 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
4067 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
4068 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
4069 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
4070 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
4071 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
4072 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
4073 simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
4078 simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;};
4304 simdutf_warn_unused inline size_t trim_partial_utf8(const char *input, size_t length) {
4308 if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
4309 if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left
4312 if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
4318 if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
4319 if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left
4320 if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left
4430 const uint16_t * input = reinterpret_cast<const uint16_t *>(in);
4433 *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
4439 simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t* input, size_t length) {
4443 uint16_t last_word = uint16_t(input[length-1]);
4464 simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
4466 auto bom_encoding = simdutf::BOM::check_bom(input, length);
4475 if(validate_utf8(input, length)) { return encoding_type::UTF8; }
4480 if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
4483 if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
4540 simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
4541 return set_best()->detect_encodings(input, length);
5252 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
5254 return convert_utf8_to_utf16be(input, length, utf16_output);
5256 return convert_utf8_to_utf16le(input, length, utf16_output);
5280 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
5281 return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
5283 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
5284 return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
5286 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
5288 return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
5290 return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
5293 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
5294 return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
5296 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
5297 return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
5299 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
5300 return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
5302 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
5303 return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
5337 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
5339 return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
5341 return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
5344 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
5345 return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
5347 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
5348 return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
5350 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
5351 return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
5454 simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept {
5455 return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
5528 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
5529 get_active_implementation()->change_endianness_utf16(input, length, output);
5531 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
5533 return count_utf16be(input, length);
5535 return count_utf16le(input, length);
5538 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
5539 return get_active_implementation()->count_utf16le(input, length);
5541 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
5542 return get_active_implementation()->count_utf16be(input, length);
5544 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
5545 return get_active_implementation()->count_utf8(input, length);
5559 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
5561 return utf8_length_from_utf16be(input, length);
5563 return utf8_length_from_utf16le(input, length);
5566 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
5567 return get_active_implementation()->utf8_length_from_utf16le(input, length);
5569 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
5570 return get_active_implementation()->utf8_length_from_utf16be(input, length);
5572 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
5574 return utf32_length_from_utf16be(input, length);
5576 return utf32_length_from_utf16le(input, length);
5579 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
5580 return get_active_implementation()->utf32_length_from_utf16le(input, length);
5582 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
5583 return get_active_implementation()->utf32_length_from_utf16be(input, length);
5585 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
5586 return get_active_implementation()->utf16_length_from_utf8(input, length);
5591 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
5592 return get_active_implementation()->utf8_length_from_utf32(input, length);
5594 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
5595 return get_active_implementation()->utf16_length_from_utf32(input, length);
5597 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
5598 return get_active_implementation()->utf32_length_from_utf8(input, length);
5611 simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
5612 return scalar::utf8::trim_partial_utf8(input, length);
5615 simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length) {
5616 return scalar::utf16::trim_partial_utf16<BIG>(input, length);
5619 simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length) {
5620 return scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
5623 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length) {
5625 return trim_partial_utf16be(input, length);
5627 return trim_partial_utf16le(input, length);
11650 * up to len input bytes left, and we encountered some error. It is possible that
11929 * up to len input bytes left, and we encountered some error. It is possible that
12576 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
12577 simd8<uint8_t> bits = input.reduce_or();
12652 // we process SIX (6) input code units. The max length in bytes of six code
12729 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
12748 input += 16;
12750 input += 15;
12756 while (input + 16 < end16) {
12757 const auto in0 = simd16<uint16_t>(input);
12758 const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12765 input += 16;
12780 input += 16;
12782 input += 15;
12793 const char32_t * input = reinterpret_cast<const char32_t*>(buf);
12816 while (input + 4 < end32) {
12817 const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
12820 input += 4;
12882 const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
12883 const char16_t* end = input + size;
12888 while (input + 16 < end) {
12892 auto in0 = simd16<uint16_t>(input);
12893 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12904 input += 16;
12931 // The whole input register contains valid UTF-16, i.e.,
12933 input += 16;
12935 // The 15 lower code units of the input register contain valid UTF-16.
12939 input += 15;
12945 return input;
12950 const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
12951 const char16_t* start = input;
12952 const char16_t* end = input + size;
12958 while (input + 16 < end) {
12962 auto in0 = simd16<uint16_t>(input);
12963 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
12975 input += 16;
13002 // The whole input register contains valid UTF-16, i.e.,
13004 input += 16;
13006 // The 15 lower code units of the input register contain valid UTF-16.
13010 input += 15;
13012 return result(error_code::SURROGATE, input - start);
13016 return result(error_code::SUCCESS, input - start);
13021 const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
13022 const char32_t* end = input + size;
13030 while (input + 4 < end) {
13031 const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
13034 input += 4;
13047 return input;
13051 const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
13052 const char32_t* start = input;
13053 const char32_t* end = input + size;
13061 while (input + 4 < end) {
13062 const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
13068 return result(error_code::TOO_LARGE, input - start);
13073 return result(error_code::SURROGATE, input - start);
13076 input += 4;
13079 return result(error_code::SUCCESS, input - start);
13109 // input 8-bit word : [aabb|bbbb] x 8
13206 size_t convert_masked_utf8_to_utf16(const char *input,
13209 // we use an approach where we try to process up to 12 input bytes.
13210 // Why 12 input bytes and not 16? Because we are concerned with the size of
13213 uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
13268 // SIX (6) input code units
13280 // FOUR (4) input code units
13325 // THREE (3) input code units
13465 size_t convert_masked_utf8_to_utf32(const char *input,
13468 // we use an approach where we try to process up to 12 input bytes.
13469 // Why 12 input bytes and not 16? Because we are concerned with the size of
13473 uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
13522 // SIX (6) input code units
13531 // FOUR (4) input code units
13560 // THREE (3) input code units
13641 size_t convert_masked_utf8_to_latin1(const char *input,
13644 // we use an approach where we try to process up to 12 input bytes.
13645 // Why 12 input bytes and not 16? Because we are concerned with the size of
13648 uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
13670 // this indicates an invalid input:
13673 // SIX (6) input code units
13675 // we process SIX (6) input code units. The max length in bytes of six code
13679 // we process SIX (6) input code units. The max length in bytes of six code
13765 1. an input register contains no surrogates and each value
13767 2. an input register contains no surrogates and values are
13769 3. an input register contains surrogates --- i.e. codepoints
13853 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13916 We expand the input word (16-bit) into two code units (32-bit), thus
14107 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14170 We expand the input word (16-bit) into two code units (32-bit), thus
14317 1. an input register contains no surrogates and each value
14319 2. an input register contains no surrogates and values are
14321 3. an input register contains surrogates --- i.e. codepoints
14562 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14624 We expand the input word (16-bit) into two code units (32-bit), thus
14757 // check for invalid input
14795 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14842 // check for invalid input
14862 We expand the input word (16-bit) into two code units (32-bit), thus
15050 // check for invalid input
15219 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
15291 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
15309 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
15311 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
15312 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
15322 simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
15323 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
15332 return input.gt_bits(max_value);
15338 // The last input we received
15340 // Whether the last input we received was incomplete (used for ASCII fast path)
15346 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
15349 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
15350 simd8<uint8_t> sc = check_special_cases(input, prev1);
15351 this->error |= check_multibyte_lengths(input, prev_input, sc);
15363 simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
15364 if(simdutf_likely(is_ascii(input))) {
15371 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
15372 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15374 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
15375 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15376 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15377 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15379 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
15380 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
15409 bool generic_validate_utf8(const uint8_t * input, size_t length) {
15411 buf_block_reader<64> reader(input, length);
15426 bool generic_validate_utf8(const char * input, size_t length) {
15427 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15434 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
15436 buf_block_reader<64> reader(input, length);
15443 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
15458 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
15466 result generic_validate_utf8_with_errors(const char * input, size_t length) {
15467 return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15471 bool generic_validate_ascii(const uint8_t * input, size_t length) {
15472 buf_block_reader<64> reader(input, length);
15487 bool generic_validate_ascii(const char * input, size_t length) {
15488 return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15492 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
15493 buf_block_reader<64> reader(input, length);
15498 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
15509 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
15516 result generic_validate_ascii_with_errors(const char * input, size_t length) {
15517 return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
15537 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
15546 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
15562 // slow path to work, we should have at least 12 input bytes left.
15580 size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
15591 utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
15610 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
15682 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
15700 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
15702 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
15703 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
15718 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
15721 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
15722 simd8<uint8_t> sc = check_special_cases(input, prev1);
15723 this->error |= check_multibyte_lengths(input, prev_input, sc);
15735 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
15742 // If the input is long enough, then we have that margin-1 is the eighth last leading byte.
15745 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
15746 if(input.is_ascii()) {
15747 input.store_ascii_as_utf16<endian>(utf16_output);
15756 this->check_utf8_bytes(input.chunks[0], zero);
15757 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15759 this->check_utf8_bytes(input.chunks[0], zero);
15760 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15761 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15762 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15764 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
15769 // slow path to work, we should have at least 12 input bytes left.
15810 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
15817 // If the input is long enough, then we have that margin-1 is the eighth last leading byte.
15820 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
15821 if(input.is_ascii()) {
15822 input.store_ascii_as_utf16<endian>(utf16_output);
15831 this->check_utf8_bytes(input.chunks[0], zero);
15832 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15834 this->check_utf8_bytes(input.chunks[0], zero);
15835 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
15836 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
15837 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
15846 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
15851 // slow path to work, we should have at least 12 input bytes left.
15917 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
15923 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
15935 size_t consumed = convert_masked_utf8_to_utf32(input + pos,
15942 utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
15962 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
16034 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
16052 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
16054 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
16055 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
16070 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
16073 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
16074 simd8<uint8_t> sc = check_special_cases(input, prev1);
16075 this->error |= check_multibyte_lengths(input, prev_input, sc);
16087 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16094 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
16097 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16098 if(input.is_ascii()) {
16099 input.store_ascii_as_utf32(utf32_output);
16108 this->check_utf8_bytes(input.chunks[0], zero);
16109 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16111 this->check_utf8_bytes(input.chunks[0], zero);
16112 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16113 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16114 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16116 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16121 // slow path to work, we should have at least 12 input bytes left.
16161 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16168 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
16171 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16172 if(input.is_ascii()) {
16173 input.store_ascii_as_utf32(utf32_output);
16182 this->check_utf8_bytes(input.chunks[0], zero);
16183 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16185 this->check_utf8_bytes(input.chunks[0], zero);
16186 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16187 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16188 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16195 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16200 // slow path to work, we should have at least 12 input bytes left.
16265 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16266 uint64_t utf8_continuation_mask = input.gt(-65);
16277 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16278 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16282 int64_t utf8_4byte = input.gteq_unsigned(240);
16303 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16304 if (!match_system(big_endian)) { input.swap_bytes(); }
16305 uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
16317 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16318 if (!match_system(big_endian)) { input.swap_bytes(); }
16319 uint64_t ascii_mask = input.lteq(0x7F);
16320 uint64_t twobyte_mask = input.lteq(0x7FF);
16321 uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
16341 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
16342 input.swap_bytes();
16343 input.store(reinterpret_cast<uint16_t *>(output));
16367 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
16443 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
16470 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
16473 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
16474 this->error |= check_special_cases(input, prev1);
16485 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16492 // If the input is long enough, then we have that margin-1 is the eighth last leading byte.
16495 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16496 if(input.is_ascii()) {
16497 input.store((int8_t*)latin1_output);
16506 this->check_utf8_bytes(input.chunks[0], zero);
16507 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16509 this->check_utf8_bytes(input.chunks[0], zero);
16510 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16511 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16512 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16514 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in two's complement. Note: in this case, we also have ASCII to account for.
16519 // slow path to work, we should have at least 12 input bytes left.
16559 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16566 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
16569 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16570 if(input.is_ascii()) {
16571 input.store((int8_t*)latin1_output);
16580 this->check_utf8_bytes(input.chunks[0], zero);
16581 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16583 this->check_utf8_bytes(input.chunks[0], zero);
16584 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
16585 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
16586 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
16595 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
16600 // slow path to work, we should have at least 12 input bytes left.
16672 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
16679 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
16682 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
16683 if(input.is_ascii()) {
16684 input.store((int8_t*)latin1_output);
16689 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
16694 // slow path to work, we should have at least 12 input bytes left.
16740 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
16742 auto bom_encoding = simdutf::BOM::check_bom(input, length);
16745 return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
16747 if (implementation::validate_utf8(input, length)) {
16907 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
16909 return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
16912 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
16914 return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
16927 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
16929 return utf8_to_utf32::convert_valid(input, size, utf32_output);
17291 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
17292 utf16::change_endianness_utf16(input, length, output);
17295 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
17296 return utf16::count_code_points<endianness::LITTLE>(input, length);
17299 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
17300 return utf16::count_code_points<endianness::BIG>(input, length);
17303 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
17304 return utf8::count_code_points(input, length);
17319 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
17322 const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
17339 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17340 return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
17343 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17344 return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
17359 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17360 return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
17363 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17364 return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
17367 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
17368 return utf8::utf16_length_from_utf8(input, length);
17371 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17379 uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
17398 return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
17401 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17407 uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
17414 return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
17417 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
17418 return utf8::count_code_points(input, length);
17446 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
17448 auto bom_encoding = simdutf::BOM::check_bom(input, length);
17451 if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
17453 if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
17456 if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
17562 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
17564 return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
17687 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
17688 scalar::utf16::change_endianness_utf16(input, length, output);
17691 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
17692 return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
17695 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
17696 return scalar::utf16::count_code_points<endianness::BIG>(input, length);
17699 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
17700 return scalar::utf8::count_code_points(input, length);
17715 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
17716 return scalar::latin1::utf8_length_from_latin1(input,length);
17719 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17720 return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
17723 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17724 return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
17727 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
17728 return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
17731 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
17732 return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
17739 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
17740 return scalar::utf8::utf16_length_from_utf8(input, length);
17743 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17744 return scalar::utf32::utf8_length_from_utf32(input, length);
17747 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
17748 return scalar::utf32::utf16_length_from_utf32(input, length);
17755 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
17756 return scalar::utf8::count_code_points(input, length);
17799 to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
17801 indicates how many input bytes are relevant.
17805 The provided in and out pointers are advanced according to how many input
17832 __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
17833 __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
17839 __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17843 __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
17851 __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17856 __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
17860 __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
17869 __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
17870 _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
17871 __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
17872 _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte
17874 __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
17875 _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
17883 __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
17918 __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
17946 // the location of 3-byte sequence start bytes in the input
17980 __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
18029 // the location of 3-byte sequence start bytes in the input
18053 __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
18065 // to increment the input buffer as quickly as possible.
18074 // but it requires loading the input, doing the mask computation, and converting
18082 __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
18107 from input `utf32` into UTF-16. It differs from utf32_to_utf16
18183 from input `utf32` into UTF-16. It may overflow.
18251 * Store the last N bytes of previous followed by 512-N bytes from input.
18254 __m512i prev(__m512i input, __m512i previous) {
18257 const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
18260 return _mm512_alignr_epi8(input, rotated, shift);
18262 return _mm512_alignr_epi8(input, rotated, 16-N);
18286 __m512i rotate_by_N_epi8(const __m512i input) {
18289 const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
18291 return _mm512_alignr_epi8(permuted, input, N);
18436 const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
18438 const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
18442 return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
18445 simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
18446 __m512i char_class = _mm512_srli_epi32(input, 4);
18447 /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */
18451 return expanded_utf8_to_utf32(char_class, input);
18510 const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \
18514 const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \
18519 char_class = _mm512_srli_epi32(input, 4); \
18520 /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \
18526 const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \
18606 - pair.first - the first unprocessed input byte
18731 simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
18767 __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
18772 simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
18774 __m512i prev2 = prev<2>(input, prev_input);
18775 __m512i prev3 = prev<3>(input, prev_input);
18791 simdutf_really_inline __m512i is_incomplete(const __m512i input) {
18792 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
18803 return _mm512_subs_epu8(input, max_value);
18810 // The last input we received
18812 // Whether the last input we received was incomplete (used for ASCII fast path)
18818 simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
18821 __m512i prev1 = prev<1>(input, prev_input);
18822 __m512i sc = check_special_cases(input, prev1);
18823 this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
18836 simdutf_really_inline bool check_next_input(const __m512i input) {
18838 const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
18843 this->check_utf8_bytes(input, this->prev_input_block);
18844 this->prev_incomplete = is_incomplete(input);
18845 this->prev_input_block = input;
18864 * Returns the position of the input and output after the processing is
19172 __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
19173 __mmask64 nonascii = _mm512_movepi8_mask(input);
19177 ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
19178 : _mm512_storeu_si512((__m512i *)latin_output, input);
19182 __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
19184 __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
19200 input =
19201 _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
19205 __m512i output = _mm512_maskz_compress_epi8(retain, input);
19269 __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
19270 __mmask64 nonascii = _mm512_movepi8_mask(input);
19274 ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
19275 : _mm512_storeu_si512((__m512i *)latin_output, input);
19279 __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
19281 __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
19286 input =
19287 _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
19291 __m512i output = _mm512_maskz_compress_epi8(retain, input);
19435 * This function converts the input (inbuf, inlen), assumed to be valid
19437 * is written to 'outlen' and the function reports the number of input word
19862 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
19922 We expand the input word (16-bit) into two code units (32-bit), thus
20047 // check for invalid input
20076 // Check for too large input
20107 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
20172 We expand the input word (16-bit) into two code units (32-bit), thus
20363 // check for invalid input
20486 static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, char *utf8_output, int mask_output) {
20487 __mmask64 nonascii = _mm512_movepi8_mask(input);
20492 _mm512_cmpge_epu8_mask(input, _mm512_set1_epi8(-64)); //binary representation of -64: 1100 0000
20506 ), input);
20520 // in the second 32-bit half, set first or second option based on whether original input is leading byte (second case) or not (first case)
20539 if(input_len > 32) { // is the second half of the input vector used?
20557 static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_output) {
20558 __mmask64 nonascii = _mm512_movepi8_mask(input);
20560 return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
20562 _mm512_storeu_si512(utf8_output, input);
20572 __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
20573 utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
20577 __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
20578 utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
20581 // with the last 64 bytes, the input also needs to be masked
20584 __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
20585 utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
20659 implementation::detect_encodings(const char *input,
20662 auto bom_encoding = simdutf::BOM::check_bom(input, length);
20665 const char *buf = input;
20668 const char *end = input + length;
20782 } else if (implementation::validate_utf8(input, length)) {
20844 const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
20845 __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
20851 const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
20852 __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
21127 // Initialize output length and input length counters
21140 // the input buffer, length, and output buffer, and returns a result object with an error code
21616 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
21629 __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
21636 __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
21643 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
21644 const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21645 const char16_t* ptr = input;
21659 return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
21662 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
21663 const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21664 const char16_t* ptr = input;
21687 return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
21691 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
21692 const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
21762 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
21763 const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21764 const char16_t* ptr = input;
21789 return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
21792 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
21793 const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
21794 const char16_t* ptr = input;
21828 return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
21831 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
21832 return implementation::count_utf16le(input, length);
21835 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
21836 return implementation::count_utf16be(input, length);
21848 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
21849 const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
21909 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
21914 __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
21922 return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
21925 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
21926 const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
21927 const char32_t* ptr = input;
21949 return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
21952 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
21953 const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
21954 const char32_t* ptr = input;
21968 return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
21971 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
21972 return implementation::count_utf8(input, length);
22018 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
22019 return input.reduce_or().is_ascii();
22085 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
22103 input += simd16<uint16_t>::ELEMENTS * 2;
22105 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22110 while (input + simd16<uint16_t>::ELEMENTS * 2 < end16) {
22111 const auto in0 = simd16<uint16_t>(input);
22112 const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
22122 input += simd16<uint16_t>::ELEMENTS * 2;
22138 input += simd16<uint16_t>::ELEMENTS * 2;
22140 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22150 const char32_t * input = reinterpret_cast<const char32_t*>(buf);
22164 while (input + 8 < end32) {
22165 const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
22168 input += 8;
22273 const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
22274 const char16_t* end = input + size;
22281 while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
22285 auto in0 = simd16<uint16_t>(input);
22286 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
22302 input += simd16<uint16_t>::ELEMENTS * 2;
22330 // The whole input register contains valid UTF-16, i.e.,
22332 input += simd16<uint16_t>::ELEMENTS * 2;
22334 // The 31 lower code units of the input register contains valid UTF-16.
22338 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22345 return input;
22350 const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
22351 const char16_t* start = input;
22352 const char16_t* end = input + size;
22359 while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
22363 auto in0 = simd16<uint16_t>(input);
22364 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
22380 input += simd16<uint16_t>::ELEMENTS * 2;
22408 // The whole input register contains valid UTF-16, i.e.,
22410 input += simd16<uint16_t>::ELEMENTS * 2;
22412 // The 31 lower code units of the input register contains valid UTF-16.
22416 input += simd16<uint16_t>::ELEMENTS * 2 - 1;
22418 return result(error_code::SURROGATE, input - start);
22423 return result(error_code::SUCCESS, input - start);
22431 const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
22432 const char32_t* end = input + size;
22440 while (input + 8 < end) {
22441 const __m256i in = _mm256_loadu_si256((__m256i *)input);
22444 input += 8;
22456 return input;
22460 const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
22461 const char32_t* start = input;
22462 const char32_t* end = input + size;
22470 while (input + 8 < end) {
22471 const __m256i in = _mm256_loadu_si256((__m256i *)input);
22477 return result(error_code::TOO_LARGE, input - start);
22482 return result(error_code::SURROGATE, input - start);
22484 input += 8;
22487 return result(error_code::SUCCESS, input - start);
22516 // input 16-bit word : [0000|0000|aabb|bbbb] x 8
22580 // Load 16 bytes from the address (input + i) into a xmm register
22639 size_t convert_masked_utf8_to_utf16(const char *input,
22642 // we use an approach where we try to process up to 12 input bytes.
22643 // Why 12 input bytes and not 16? Because we are concerned with the size of
22653 const __m128i in = _mm_loadu_si128((__m128i *)input);
22708 // SIX (6) input code-code units
22710 // we process SIX (6) input code-code units. The max length in bytes of six code
22723 // FOUR (4) input code-code units
22742 // TWO (2) input code-code units
22820 size_t convert_masked_utf8_to_utf32(const char *input,
22823 // we use an approach where we try to process up to 12 input bytes.
22824 // Why 12 input bytes and not 16? Because we are concerned with the size of
22833 const __m128i in = _mm_loadu_si128((__m128i *)input);
22881 // SIX (6) input code-code units
22883 // we process SIX (6) input code-code units. The max length in bytes of six code
22896 // FOUR (4) input code-code units
22913 // TWO (2) input code-code units
23033 1. an input register contains no surrogates and each value
23035 2. an input register contains no surrogates and values are
23037 3. an input register contains surrogates --- i.e. codepoints
23120 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23162 // 1. Check if there are any surrogate word in the input chunk.
23184 We expand the input word (16-bit) into two code units (32-bit), thus
23363 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23405 // 1. Check if there are any surrogate word in the input chunk.
23427 We expand the input word (16-bit) into two code units (32-bit), thus
23565 1. an input register contains no surrogates and each value
23567 2. an input register contains no surrogates and values are
23569 3. an input register contains surrogates --- i.e. codepoints
23628 // 1. Check if there are any surrogate word in the input chunk.
23696 // 1. Check if there are any surrogate word in the input chunk.
23881 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23941 We expand the input word (16-bit) into two code units (32-bit), thus
24066 // check for invalid input
24095 // Check for too large input
24126 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
24191 We expand the input word (16-bit) into two code units (32-bit), thus
24379 // check for invalid input
24458 size_t convert_masked_utf8_to_latin1(const char *input,
24461 // we use an approach where we try to process up to 12 input bytes.
24462 // Why 12 input bytes and not 16? Because we are concerned with the size of
24470 const __m128i in = _mm_loadu_si128((__m128i *)input);
24471 const __m128i in_second_half = _mm_loadu_si128((__m128i *)(input + 16));
24501 // this indicates an invalid input:
24504 // SIX (6) input code-code units
24506 // we process SIX (6) input code-code units. The max length in bytes of six code
24630 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
24702 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
24720 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
24722 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
24723 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
24733 simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
24734 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
24743 return input.gt_bits(max_value);
24749 // The last input we received
24751 // Whether the last input we received was incomplete (used for ASCII fast path)
24757 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
24760 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
24761 simd8<uint8_t> sc = check_special_cases(input, prev1);
24762 this->error |= check_multibyte_lengths(input, prev_input, sc);
24774 simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
24775 if(simdutf_likely(is_ascii(input))) {
24782 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
24783 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24785 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
24786 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
24787 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
24788 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
24790 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
24791 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
24820 bool generic_validate_utf8(const uint8_t * input, size_t length) {
24822 buf_block_reader<64> reader(input, length);
24837 bool generic_validate_utf8(const char * input, size_t length) {
24838 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24845 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
24847 buf_block_reader<64> reader(input, length);
24854 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
24869 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
24877 result generic_validate_utf8_with_errors(const char * input, size_t length) {
24878 return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24882 bool generic_validate_ascii(const uint8_t * input, size_t length) {
24883 buf_block_reader<64> reader(input, length);
24898 bool generic_validate_ascii(const char * input, size_t length) {
24899 return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24903 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
24904 buf_block_reader<64> reader(input, length);
24909 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
24920 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
24927 result generic_validate_ascii_with_errors(const char * input, size_t length) {
24928 return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
24948 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
24957 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
24973 // slow path to work, we should have at least 12 input bytes left.
24991 size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
25002 utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
25021 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
25093 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
25111 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
25113 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
25114 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
25129 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
25132 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
25133 simd8<uint8_t> sc = check_special_cases(input, prev1);
25134 this->error |= check_multibyte_lengths(input, prev_input, sc);
25146 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25153 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25156 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25157 if(input.is_ascii()) {
25158 input.store_ascii_as_utf16<endian>(utf16_output);
25167 this->check_utf8_bytes(input.chunks[0], zero);
25168 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25170 this->check_utf8_bytes(input.chunks[0], zero);
25171 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25172 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25173 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25175 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25180 // slow path to work, we should have at least 12 input bytes left.
25221 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25228 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25231 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25232 if(input.is_ascii()) {
25233 input.store_ascii_as_utf16<endian>(utf16_output);
25242 this->check_utf8_bytes(input.chunks[0], zero);
25243 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25245 this->check_utf8_bytes(input.chunks[0], zero);
25246 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25247 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25248 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25257 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25262 // slow path to work, we should have at least 12 input bytes left.
25328 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
25334 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
25346 size_t consumed = convert_masked_utf8_to_utf32(input + pos,
25353 utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
25373 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
25445 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
25463 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
25465 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
25466 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
25481 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
25484 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
25485 simd8<uint8_t> sc = check_special_cases(input, prev1);
25486 this->error |= check_multibyte_lengths(input, prev_input, sc);
25498 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25505 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
25508 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25509 if(input.is_ascii()) {
25510 input.store_ascii_as_utf32(utf32_output);
25519 this->check_utf8_bytes(input.chunks[0], zero);
25520 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25522 this->check_utf8_bytes(input.chunks[0], zero);
25523 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25524 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25525 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25527 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25532 // slow path to work, we should have at least 12 input bytes left.
25572 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25579 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
25582 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25583 if(input.is_ascii()) {
25584 input.store_ascii_as_utf32(utf32_output);
25593 this->check_utf8_bytes(input.chunks[0], zero);
25594 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25596 this->check_utf8_bytes(input.chunks[0], zero);
25597 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25598 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25599 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25606 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25611 // slow path to work, we should have at least 12 input bytes left.
25676 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25677 uint64_t utf8_continuation_mask = input.gt(-65);
25688 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25689 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
25693 int64_t utf8_4byte = input.gteq_unsigned(240);
25714 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25715 if (!match_system(big_endian)) { input.swap_bytes(); }
25716 uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
25728 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25729 if (!match_system(big_endian)) { input.swap_bytes(); }
25730 uint64_t ascii_mask = input.lteq(0x7F);
25731 uint64_t twobyte_mask = input.lteq(0x7FF);
25732 uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
25752 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
25753 input.swap_bytes();
25754 input.store(reinterpret_cast<uint16_t *>(output));
25780 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
25856 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
25883 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
25886 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
25887 this->error |= check_special_cases(input, prev1);
25898 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25905 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25908 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25909 if(input.is_ascii()) {
25910 input.store((int8_t*)latin1_output);
25919 this->check_utf8_bytes(input.chunks[0], zero);
25920 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25922 this->check_utf8_bytes(input.chunks[0], zero);
25923 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25924 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25925 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
25927 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
25932 // slow path to work, we should have at least 12 input bytes left.
25972 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
25979 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
25982 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
25983 if(input.is_ascii()) {
25984 input.store((int8_t*)latin1_output);
25993 this->check_utf8_bytes(input.chunks[0], zero);
25994 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25996 this->check_utf8_bytes(input.chunks[0], zero);
25997 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
25998 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
25999 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
26008 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
26013 // slow path to work, we should have at least 12 input bytes left.
26085 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
26092 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
26095 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
26096 if(input.is_ascii()) {
26097 input.store((int8_t*)latin1_output);
26102 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
26107 // slow path to work, we should have at least 12 input bytes left.
26148 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
26150 auto bom_encoding = simdutf::BOM::check_bom(input, length);
26153 return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
26155 if (implementation::validate_utf8(input, length)) {
26298 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* input, size_t size,
26300 return utf8_to_latin1::convert_valid(input, size, latin1_output);
26323 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
26325 return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
26328 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
26330 return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
26343 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
26345 return utf8_to_utf32::convert_valid(input, size, utf32_output);
26695 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
26696 utf16::change_endianness_utf16(input, length, output);
26699 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
26700 return utf16::count_code_points<endianness::LITTLE>(input, length);
26703 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
26704 return utf16::count_code_points<endianness::BIG>(input, length);
26707 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
26708 return utf8::count_code_points(input, length);
26723 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
26724 return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
26727 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
26728 return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
26731 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
26732 return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
26735 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
26736 return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
26744 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
26745 return utf8::utf16_length_from_utf8(input, length);
26753 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *input, size_t len) const noexcept {
26754 const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
26794 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
26802 __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26817 return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
26820 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
26826 __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26832 return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
26835 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
26836 return utf8::count_code_points(input, length);
26876 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
26878 return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
27002 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
27074 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27092 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27094 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27095 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27105 simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
27106 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
27115 return input.gt_bits(max_value);
27121 // The last input we received
27123 // Whether the last input we received was incomplete (used for ASCII fast path)
27129 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27132 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27133 simd8<uint8_t> sc = check_special_cases(input, prev1);
27134 this->error |= check_multibyte_lengths(input, prev_input, sc);
27146 simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
27147 if(simdutf_likely(is_ascii(input))) {
27154 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27155 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27157 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
27158 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27159 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27160 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27162 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
27163 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
27192 bool generic_validate_utf8(const uint8_t * input, size_t length) {
27194 buf_block_reader<64> reader(input, length);
27209 bool generic_validate_utf8(const char * input, size_t length) {
27210 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27217 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
27219 buf_block_reader<64> reader(input, length);
27226 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
27241 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
27249 result generic_validate_utf8_with_errors(const char * input, size_t length) {
27250 return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27254 bool generic_validate_ascii(const uint8_t * input, size_t length) {
27255 buf_block_reader<64> reader(input, length);
27270 bool generic_validate_ascii(const char * input, size_t length) {
27271 return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27275 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
27276 buf_block_reader<64> reader(input, length);
27281 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27292 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
27299 result generic_validate_ascii_with_errors(const char * input, size_t length) {
27300 return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
27320 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27329 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27345 // slow path to work, we should have at least 12 input bytes left.
27363 size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
27374 utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
27393 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
27465 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27483 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27485 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27486 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27501 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27504 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27505 simd8<uint8_t> sc = check_special_cases(input, prev1);
27506 this->error |= check_multibyte_lengths(input, prev_input, sc);
27518 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27525 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27528 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27529 if(input.is_ascii()) {
27530 input.store_ascii_as_utf16<endian>(utf16_output);
27539 this->check_utf8_bytes(input.chunks[0], zero);
27540 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27542 this->check_utf8_bytes(input.chunks[0], zero);
27543 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27544 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27545 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27547 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27552 // slow path to work, we should have at least 12 input bytes left.
27593 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
27600 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
27603 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27604 if(input.is_ascii()) {
27605 input.store_ascii_as_utf16<endian>(utf16_output);
27614 this->check_utf8_bytes(input.chunks[0], zero);
27615 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27617 this->check_utf8_bytes(input.chunks[0], zero);
27618 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27619 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27620 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27629 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27634 // slow path to work, we should have at least 12 input bytes left.
27700 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
27706 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
27718 size_t consumed = convert_masked_utf8_to_utf32(input + pos,
27725 utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
27745 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
27817 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
27835 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
27837 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
27838 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
27853 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
27856 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
27857 simd8<uint8_t> sc = check_special_cases(input, prev1);
27858 this->error |= check_multibyte_lengths(input, prev_input, sc);
27870 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27877 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27880 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27881 if(input.is_ascii()) {
27882 input.store_ascii_as_utf32(utf32_output);
27891 this->check_utf8_bytes(input.chunks[0], zero);
27892 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27894 this->check_utf8_bytes(input.chunks[0], zero);
27895 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27896 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27897 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27899 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27904 // slow path to work, we should have at least 12 input bytes left.
27944 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27951 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
27954 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
27955 if(input.is_ascii()) {
27956 input.store_ascii_as_utf32(utf32_output);
27965 this->check_utf8_bytes(input.chunks[0], zero);
27966 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27968 this->check_utf8_bytes(input.chunks[0], zero);
27969 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
27970 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
27971 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
27978 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
27983 // slow path to work, we should have at least 12 input bytes left.
28048 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28049 uint64_t utf8_continuation_mask = input.gt(-65);
28060 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
28061 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
28065 int64_t utf8_4byte = input.gteq_unsigned(240);
28086 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28087 if (!match_system(big_endian)) { input.swap_bytes(); }
28088 uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
28100 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28101 if (!match_system(big_endian)) { input.swap_bytes(); }
28102 uint64_t ascii_mask = input.lteq(0x7F);
28103 uint64_t twobyte_mask = input.lteq(0x7FF);
28104 uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
28124 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
28125 input.swap_bytes();
28126 input.store(reinterpret_cast<uint16_t *>(output));
28146 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
28148 auto bom_encoding = simdutf::BOM::check_bom(input, length);
28151 if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
28153 if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
28156 if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
28322 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
28323 scalar::utf16::change_endianness_utf16(input, length, output);
28326 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
28327 return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
28330 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
28331 return scalar::utf16::count_code_points<endianness::BIG>(input, length);
28334 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
28335 return utf8::count_code_points(input, length);
28338 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28339 return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
28342 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28343 return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
28346 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
28347 return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
28350 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
28351 return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
28354 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
28355 return scalar::utf8::utf16_length_from_utf8(input, length);
28358 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28359 return scalar::utf32::utf8_length_from_utf32(input, length);
28362 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
28363 return scalar::utf32::utf16_length_from_utf32(input, length);
28366 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
28367 return scalar::utf8::count_code_points(input, length);
28397 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
28398 return input.reduce_or().is_ascii();
28442 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
28552 const char16_t * input = reinterpret_cast<const char16_t*>(buf);
28572 input += 16;
28574 input += 15;
28580 while (input + simd16<uint16_t>::SIZE * 2 < end16) {
28581 const auto in0 = simd16<uint16_t>(input);
28582 const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
28592 input += 16;
28608 input += 16;
28610 input += 15;
28621 const char32_t * input = reinterpret_cast<const char32_t*>(buf);
28639 while (input + 4 < end32) {
28640 const __m128i in32 = _mm_loadu_si128((__m128i *)input);
28643 input += 4;
28750 const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
28751 const char16_t* end = input + size;
28758 while (input + simd16<uint16_t>::SIZE * 2 < end) {
28762 auto in0 = simd16<uint16_t>(input);
28763 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
28778 input += 16;
28806 // The whole input register contains valid UTF-16, i.e.,
28808 input += 16;
28810 // The 15 lower code units of the input register contains valid UTF-16.
28814 input += 15;
28821 return input;
28826 const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
28827 const char16_t* start = input;
28828 const char16_t* end = input + size;
28835 while (input + simd16<uint16_t>::SIZE * 2 < end) {
28839 auto in0 = simd16<uint16_t>(input);
28840 auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
28856 input += 16;
28884 // The whole input register contains valid UTF-16, i.e.,
28886 input += 16;
28888 // The 15 lower code units of the input register contains valid UTF-16.
28892 input += 15;
28894 return result(error_code::SURROGATE, input - start);
28899 return result(error_code::SUCCESS, input - start);
28907 const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
28908 const char32_t* end = input + size;
28916 while (input + 4 < end) {
28917 const __m128i in = _mm_loadu_si128((__m128i *)input);
28920 input += 4;
28932 return input;
28936 const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
28937 const char32_t* start = input;
28938 const char32_t* end = input + size;
28946 while (input + 4 < end) {
28947 const __m128i in = _mm_loadu_si128((__m128i *)input);
28953 return result(error_code::TOO_LARGE, input - start);
28958 return result(error_code::SURROGATE, input - start);
28960 input += 4;
28963 return result(error_code::SUCCESS, input - start);
29005 // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free
29079 // Shift input to process next 4 bytes
29114 size_t convert_masked_utf8_to_utf16(const char *input,
29117 // we use an approach where we try to process up to 12 input bytes.
29118 // Why 12 input bytes and not 16? Because we are concerned with the size of
29128 const __m128i in = _mm_loadu_si128((__m128i *)input);
29185 // SIX (6) input code-code units
29187 // we process SIX (6) input code-code units. The max length in bytes of six code
29200 // FOUR (4) input code-code units
29219 // TWO (2) input code-code units
29297 size_t convert_masked_utf8_to_utf32(const char *input,
29300 // we use an approach where we try to process up to 12 input bytes.
29301 // Why 12 input bytes and not 16? Because we are concerned with the size of
29310 const __m128i in = _mm_loadu_si128((__m128i *)input);
29361 // SIX (6) input code-code units
29363 // we process SIX (6) input code-code units. The max length in bytes of six code
29376 // FOUR (4) input code-code units
29393 // TWO (2) input code-code units
29427 size_t convert_masked_utf8_to_latin1(const char *input,
29430 // we use an approach where we try to process up to 12 input bytes.
29431 // Why 12 input bytes and not 16? Because we are concerned with the size of
29439 const __m128i in = _mm_loadu_si128((__m128i *)input);
29453 // this indicates an invalid input:
29456 // SIX (6) input code-code units
29458 // we process SIX (6) input code-code units. The max length in bytes of six code
29544 1. an input register contains no surrogates and each value
29546 2. an input register contains no surrogates and values are
29548 3. an input register contains surrogates --- i.e. codepoints
29653 // 1. Check if there are any surrogate word in the input chunk.
29673 We expand the input word (16-bit) into two code units (32-bit), thus
29857 // 1. Check if there are any surrogate word in the input chunk.
29877 We expand the input word (16-bit) into two code units (32-bit), thus
29999 1. an input register contains no surrogates and each value
30001 2. an input register contains no surrogates and values are
30003 3. an input register contains surrogates --- i.e. codepoints
30062 // 1. Check if there are any surrogate word in the input chunk.
30130 // 1. Check if there are any surrogate word in the input chunk.
30293 // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
30309 // Proceed with next input
30341 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30396 We expand the input word (16-bit) into two code units (32-bit), thus
30504 // check for invalid input
30536 // Check for too large input
30551 // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
30565 // Proceed with next input
30597 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30658 We expand the input word (16-bit) into two code units (32-bit), thus
30830 // check for invalid input
31009 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
31081 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
31099 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31101 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31102 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31112 simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
31113 // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
31122 return input.gt_bits(max_value);
31128 // The last input we received
31130 // Whether the last input we received was incomplete (used for ASCII fast path)
31136 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
31139 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
31140 simd8<uint8_t> sc = check_special_cases(input, prev1);
31141 this->error |= check_multibyte_lengths(input, prev_input, sc);
31153 simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
31154 if(simdutf_likely(is_ascii(input))) {
31161 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
31162 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31164 this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
31165 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31166 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31167 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31169 this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
31170 this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
31199 bool generic_validate_utf8(const uint8_t * input, size_t length) {
31201 buf_block_reader<64> reader(input, length);
31216 bool generic_validate_utf8(const char * input, size_t length) {
31217 return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31224 result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
31226 buf_block_reader<64> reader(input, length);
31233 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input + count), length - count);
31248 result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input), reinterpret_cast<const char*>(input) + count, length - count);
31256 result generic_validate_utf8_with_errors(const char * input, size_t length) {
31257 return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31261 bool generic_validate_ascii(const uint8_t * input, size_t length) {
31262 buf_block_reader<64> reader(input, length);
31277 bool generic_validate_ascii(const char * input, size_t length) {
31278 return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31282 result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
31283 buf_block_reader<64> reader(input, length);
31288 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
31299 result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
31306 result generic_validate_ascii_with_errors(const char * input, size_t length) {
31307 return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
31327 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
31336 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
31352 // slow path to work, we should have at least 12 input bytes left.
31370 size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
31381 utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
31400 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
31472 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
31490 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31492 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31493 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31508 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
31511 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
31512 simd8<uint8_t> sc = check_special_cases(input, prev1);
31513 this->error |= check_multibyte_lengths(input, prev_input, sc);
31525 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
31532 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
31535 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
31536 if(input.is_ascii()) {
31537 input.store_ascii_as_utf16<endian>(utf16_output);
31546 this->check_utf8_bytes(input.chunks[0], zero);
31547 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31549 this->check_utf8_bytes(input.chunks[0], zero);
31550 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31551 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31552 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31554 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
31559 // slow path to work, we should have at least 12 input bytes left.
31600 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
31607 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
31610 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
31611 if(input.is_ascii()) {
31612 input.store_ascii_as_utf16<endian>(utf16_output);
31621 this->check_utf8_bytes(input.chunks[0], zero);
31622 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31624 this->check_utf8_bytes(input.chunks[0], zero);
31625 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31626 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31627 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31636 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
31641 // slow path to work, we should have at least 12 input bytes left.
31707 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
31713 simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
31725 size_t consumed = convert_masked_utf8_to_utf32(input + pos,
31732 utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
31752 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
31824 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
31842 simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
31844 simd8<uint8_t> prev2 = input.prev<2>(prev_input);
31845 simd8<uint8_t> prev3 = input.prev<3>(prev_input);
31860 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
31863 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
31864 simd8<uint8_t> sc = check_special_cases(input, prev1);
31865 this->error |= check_multibyte_lengths(input, prev_input, sc);
31877 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
31884 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
31887 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
31888 if(input.is_ascii()) {
31889 input.store_ascii_as_utf32(utf32_output);
31898 this->check_utf8_bytes(input.chunks[0], zero);
31899 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31901 this->check_utf8_bytes(input.chunks[0], zero);
31902 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31903 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31904 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31906 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
31911 // slow path to work, we should have at least 12 input bytes left.
31951 // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
31958 // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
31961 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
31962 if(input.is_ascii()) {
31963 input.store_ascii_as_utf32(utf32_output);
31972 this->check_utf8_bytes(input.chunks[0], zero);
31973 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31975 this->check_utf8_bytes(input.chunks[0], zero);
31976 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
31977 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
31978 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
31985 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
31990 // slow path to work, we should have at least 12 input bytes left.
32055 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32056 uint64_t utf8_continuation_mask = input.gt(-65);
32067 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32068 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
32072 int64_t utf8_4byte = input.gteq_unsigned(240);
32093 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32094 if (!match_system(big_endian)) { input.swap_bytes(); }
32095 uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
32107 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32108 if (!match_system(big_endian)) { input.swap_bytes(); }
32109 uint64_t ascii_mask = input.lteq(0x7F);
32110 uint64_t twobyte_mask = input.lteq(0x7FF);
32111 uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
32131 simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
32132 input.swap_bytes();
32133 input.store(reinterpret_cast<uint16_t *>(output));
32157 simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
32233 const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
32260 simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
32263 simd8<uint8_t> prev1 = input.prev<1>(prev_input);
32264 this->error |= check_special_cases(input, prev1);
32275 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
32282 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
32285 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32286 if(input.is_ascii()) {
32287 input.store((int8_t*)latin1_output);
32296 this->check_utf8_bytes(input.chunks[0], zero);
32297 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
32299 this->check_utf8_bytes(input.chunks[0], zero);
32300 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
32301 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
32302 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
32304 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
32309 // slow path to work, we should have at least 12 input bytes left.
32349 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
32356 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
32359 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32360 if(input.is_ascii()) {
32361 input.store((int8_t*)latin1_output);
32370 this->check_utf8_bytes(input.chunks[0], zero);
32371 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
32373 this->check_utf8_bytes(input.chunks[0], zero);
32374 this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
32375 this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
32376 this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
32385 uint64_t utf8_continuation_mask = input.lt(-65 + 1);
32390 // slow path to work, we should have at least 12 input bytes left.
32462 // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
32469 // If the input is long enough, then we have that margin-1 is the eight last leading byte.
32472 simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
32473 if(input.is_ascii()) {
32474 input.store((int8_t*)latin1_output);
32479 uint64_t utf8_continuation_mask = input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in this case, we also have ASCII to account for.
32484 // slow path to work, we should have at least 12 input bytes left.
32530 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
32532 auto bom_encoding = simdutf::BOM::check_bom(input, length);
32535 return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
32537 if (implementation::validate_utf8(input, length)) {
32707 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
32709 return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
32712 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
32714 return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
32727 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
32729 return utf8_to_utf32::convert_valid(input, size, utf32_output);
33084 void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
33085 utf16::change_endianness_utf16(input, length, output);
33088 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
33089 return utf16::count_code_points<endianness::LITTLE>(input, length);
33092 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
33093 return utf16::count_code_points<endianness::BIG>(input, length);
33096 simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
33097 return utf8::count_code_points(input, length);
33112 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
33113 return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
33116 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
33117 return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
33128 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t len) const noexcept {
33129 const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
33175 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
33176 return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
33179 simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
33180 return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
33183 simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
33184 return utf8::utf16_length_from_utf8(input, length);
33187 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
33195 __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33210 return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
33213 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
33219 __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33225 return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
33228 simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
33229 return utf8::count_code_points(input, length);