Lines Matching defs:__m128i

2208     const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
2209 const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
2212 const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
2213 const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
2584 __m128i value;
2587 simdutf_really_inline base() : value{__m128i()} {}
2590 simdutf_really_inline base(const __m128i _value) : value(_value) {}
2592 simdutf_really_inline operator const __m128i&() const { return this->value; }
2593 simdutf_really_inline operator __m128i&() { return this->value; }
2596 __m128i first = _mm_cvtepu8_epi16(*this);
2597 __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
2599 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
2603 _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
2604 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
2607 _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
2608 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
2609 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
2610 _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
2634 simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
2652 simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
2668 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2682 simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
2685 simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
2721 simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
2758 simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
2826 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
2830 simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
3008 simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
3009 simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
3010 simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
3011 simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
3026 simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
3028 simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
3046 simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
3060 return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
3064 simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
3067 simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
3083 simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
3106 simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
3161 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
3470 using __m128i = __vector unsigned char;
3473 __m128i value;
3476 simdutf_really_inline base() : value{__m128i()} {}
3479 simdutf_really_inline base(const __m128i _value) : value(_value) {}
3482 simdutf_really_inline operator const __m128i &() const {
3485 simdutf_really_inline operator __m128i &() { return this->value; }
3489 return vec_or(this->value, (__m128i)other);
3492 return vec_and(this->value, (__m128i)other);
3495 return vec_xor(this->value, (__m128i)other);
3498 return vec_andc(this->value, (__m128i)other);
3526 simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
3529 return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
3536 __m128i chunk = this->value;
3538 chunk = (__m128i)vec_reve(this->value);
3539 prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
3541 chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
3543 chunk = (__m128i)vec_reve((__m128i)chunk);
3552 return (__m128i)vec_splats((unsigned char)(-(!!_value)));
3556 simdutf_really_inline simd8<bool>(const __m128i _value)
3564 const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
3567 result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
3568 (__m128i)perm_mask));
3576 return !vec_all_eq(this->value, (__m128i)vec_splats(0));
3579 return this->value ^ (__m128i)splat(true);
3586 return (__m128i)vec_splats(value);
3590 return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
3602 simdutf_really_inline base8_numeric(const __m128i _value)
3607 vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
3615 return (__m128i)((__m128i)this->value + (__m128i)other);
3618 return (__m128i)((__m128i)this->value - (__m128i)other);
3633 return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
3652 simdutf_really_inline simd8(const __m128i _value)
3664 : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
3679 return (__m128i)vec_max((__vector signed char)this->value,
3680 (__vector signed char)(__m128i)other);
3684 return (__m128i)vec_min((__vector signed char)this->value,
3685 (__vector signed char)(__m128i)other);
3689 return (__m128i)vec_cmpgt((__vector signed char)this->value,
3690 (__vector signed char)(__m128i)other);
3694 return (__m128i)vec_cmplt((__vector signed char)this->value,
3695 (__vector signed char)(__m128i)other);
3702 simdutf_really_inline simd8(const __m128i _value)
3713 : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
3728 return (__m128i)vec_adds(this->value, (__m128i)other);
3732 return (__m128i)vec_subs(this->value, (__m128i)other);
3738 return (__m128i)vec_max(this->value, (__m128i)other);
3742 return (__m128i)vec_min(this->value, (__m128i)other);
3773 return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
3790 return vec_all_eq(this->value, (__m128i)vec_splats(0));
3796 return vec_all_eq(vec_and(this->value, (__m128i)bits),
3797 (__m128i)vec_splats(0));
3804 (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
3808 (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
18569 const __m128i t0 = _mm512_castsi512_si128(utf8); \
18570 const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \
18571 const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \
18572 const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \
19756 _mm_storeu_si128((__m128i *)latin1_output,
19793 _mm_storeu_si128((__m128i *)latin1_output,
19845 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
19847 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
19890 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
19891 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
19895 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
19897 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
19974 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
19976 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
19978 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
19980 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
19987 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
19988 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
19992 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
19993 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
19997 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
19998 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20003 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20004 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20006 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20008 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20010 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20012 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20090 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
20092 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
20135 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
20136 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
20140 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
20142 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
20224 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
20226 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
20228 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
20230 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
20237 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
20238 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
20242 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
20243 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
20247 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
20248 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
20253 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
20254 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
20256 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
20258 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
20260 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
20262 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
20327 __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20329 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
20332 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
20395 __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
20397 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
20400 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
20634 __m128i in = _mm_loadu_si128((__m128i*)&buf[i]);
22501 __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
22503 const __m128i v_80 = _mm_set1_epi8((char)0x80);
22506 _mm_storeu_si128((__m128i *)utf8_output, in8);
22552 const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
22553 const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
22558 _mm_storeu_si128((__m128i *)utf8_output,
22561 _mm_storeu_si128((__m128i *)utf8_output,
22581 __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(latin1_input + i));
22584 __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
22593 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
22599 _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i), xmm1);
22602 _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i + 8), xmm0);
22615 __m128i in = _mm_loadl_epi64((__m128i*)&buf[i]);
22652 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
22653 const __m128i in = _mm_loadu_si128((__m128i *)input);
22671 const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
22672 const __m128i perm = _mm_shuffle_epi8(in, sh);
22673 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
22674 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
22675 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
22677 _mm_storeu_si128((__m128i *)utf16_output, composed);
22684 const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
22685 const __m128i perm = _mm_shuffle_epi8(in, sh);
22686 const __m128i ascii =
22688 const __m128i middlebyte =
22690 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22691 const __m128i highbyte =
22693 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
22694 const __m128i composed =
22696 __m128i composed_repacked = _mm_packus_epi32(composed, composed);
22698 _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
22713 const __m128i sh =
22714 _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
22715 const __m128i perm = _mm_shuffle_epi8(in, sh);
22716 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
22717 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
22718 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
22720 _mm_storeu_si128((__m128i *)utf16_output, composed);
22724 const __m128i sh =
22725 _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
22726 const __m128i perm = _mm_shuffle_epi8(in, sh);
22727 const __m128i ascii =
22729 const __m128i middlebyte =
22731 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22732 const __m128i highbyte =
22734 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
22735 const __m128i composed =
22737 __m128i composed_repacked = _mm_packus_epi32(composed, composed);
22739 _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
22751 const __m128i sh =
22752 _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
22753 const __m128i perm = _mm_shuffle_epi8(in, sh);
22754 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
22755 const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
22756 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22757 __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
22759 const __m128i correct =
22762 const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
22765 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
22766 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
22771 const __m128i composed =
22774 const __m128i composedminus =
22776 const __m128i lowtenbits =
22779 const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
22780 const __m128i lowtenbitsadd =
22782 const __m128i hightenbitsadd =
22784 const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
22785 __m128i surrogates =
22790 _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
22793 _mm_storeu_si128((__m128i *)basic_buffer, composed);
22795 _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
22833 const __m128i in = _mm_loadu_si128((__m128i *)input);
22846 const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
22847 const __m128i perm = _mm_shuffle_epi8(in, sh);
22848 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
22849 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
22850 const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
22858 const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
22859 const __m128i perm = _mm_shuffle_epi8(in, sh);
22860 const __m128i ascii =
22862 const __m128i middlebyte =
22864 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22865 const __m128i highbyte =
22867 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
22868 const __m128i composed =
22870 _mm_storeu_si128((__m128i *)utf32_output, composed);
22886 const __m128i sh =
22887 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
22888 const __m128i perm = _mm_shuffle_epi8(in, sh);
22889 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
22890 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
22891 const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
22897 const __m128i sh =
22898 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
22899 const __m128i perm = _mm_shuffle_epi8(in, sh);
22900 const __m128i ascii =
22902 const __m128i middlebyte =
22904 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22905 const __m128i highbyte =
22907 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
22908 const __m128i composed =
22910 _mm_storeu_si128((__m128i *)utf32_output, composed);
22914 const __m128i sh =
22915 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
22916 const __m128i perm = _mm_shuffle_epi8(in, sh);
22917 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
22918 const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
22919 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
22920 __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
22922 const __m128i correct =
22925 const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
22926 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
22927 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
22928 const __m128i composed =
22931 _mm_storeu_si128((__m128i *)utf32_output, composed);
22960 __m128i lo = _mm256_extractf128_si256(in, 0);
22961 __m128i hi = _mm256_extractf128_si256(in, 1);
22962 __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
22963 __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
22964 _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
22966 _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
22996 __m128i lo = _mm256_extractf128_si256(in, 0);
22997 __m128i hi = _mm256_extractf128_si256(in, 1);
22998 __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
22999 __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
23000 _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
23002 _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
23102 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
23104 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23148 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23149 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23153 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23155 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23236 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23238 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23240 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23242 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
23249 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
23250 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
23254 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
23255 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
23259 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
23260 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
23265 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
23266 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
23268 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
23270 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
23272 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
23274 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
23345 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
23347 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23391 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23392 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23396 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23398 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23479 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23481 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23483 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23485 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
23492 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
23493 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
23497 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
23498 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
23502 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
23503 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
23508 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
23509 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
23511 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
23513 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
23515 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
23517 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
23777 _mm_storeu_si128((__m128i *)latin1_output,
23827 _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
23864 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
23866 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
23909 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
23910 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
23914 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
23916 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
23993 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
23995 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
23997 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
23999 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
24006 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
24007 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
24011 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
24012 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
24016 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
24017 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
24022 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
24023 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
24025 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
24027 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
24029 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
24031 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
24109 const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
24111 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
24154 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
24155 const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
24159 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
24161 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
24243 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
24245 _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
24247 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
24249 _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
24256 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
24257 const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
24261 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
24262 const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
24266 const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
24267 const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
24272 const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
24273 const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
24275 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
24277 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
24279 _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
24281 _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
24343 __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
24345 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
24348 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
24411 __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
24413 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
24416 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
24470 const __m128i in = _mm_loadu_si128((__m128i *)input);
24471 const __m128i in_second_half = _mm_loadu_si128((__m128i *)(input + 16));
24492 _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
24509 const __m128i sh =
24510 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
24511 const __m128i perm = _mm_shuffle_epi8(in, sh);
24512 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
24513 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
24514 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
24515 const __m128i latin1_packed = _mm_packus_epi16(composed,composed);
24518 _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
28429 const __m128i v_u16,
28431 const __m128i one_byte_bytemask,
28435 const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
28437 const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
28439 const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
28446 const __m128i t0 = _mm_slli_epi16(v_u16, 2);
28448 const __m128i t1 = _mm_and_si128(t0, v_1f00);
28450 const __m128i t2 = _mm_and_si128(v_u16, v_003f);
28452 const __m128i t3 = _mm_or_si128(t1, t2);
28454 const __m128i t4 = _mm_or_si128(t3, v_c080);
28457 const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);
28466 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
28467 const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
28470 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
28477 const __m128i v_u16,
28479 const __m128i v_0000,
28480 const __m128i v_ff80
28483 const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
28510 __m128i currentmax = _mm_setzero_si128();
28515 __m128i in = _mm_loadu_si128((__m128i*)buf);
28516 __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
28517 __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
28518 __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
28625 __m128i currentoffsetmax = _mm_setzero_si128();
28626 const __m128i offset = _mm_set1_epi32(0xffff2000);
28627 const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
28640 const __m128i in32 = _mm_loadu_si128((__m128i *)input);
28646 __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
28692 const __m128i standardmax = _mm_set1_epi32(0x10ffff);
28693 __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
28910 const __m128i standardmax = _mm_set1_epi32(0x10ffff);
28911 const __m128i offset = _mm_set1_epi32(0xffff2000);
28912 const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
28913 __m128i currentmax = _mm_setzero_si128();
28914 __m128i currentoffsetmax = _mm_setzero_si128();
28917 const __m128i in = _mm_loadu_si128((__m128i *)input);
28922 __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
28940 const __m128i standardmax = _mm_set1_epi32(0x10ffff);
28941 const __m128i offset = _mm_set1_epi32(0xffff2000);
28942 const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
28943 __m128i currentmax = _mm_setzero_si128();
28944 __m128i currentoffsetmax = _mm_setzero_si128();
28947 const __m128i in = _mm_loadu_si128((__m128i *)input);
28951 __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
28974 const __m128i v_0000 = _mm_setzero_si128();
28976 const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
28978 const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
28980 const __m128i latin_1_half_into_u16_byte_mask = _mm_setr_epi8(
28991 const __m128i latin_2_half_into_u16_byte_mask = _mm_setr_epi8(
29008 __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
29012 _mm_storeu_si128((__m128i*)utf8_output, v_latin);
29021 __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
29023 __m128i v_u16_latin_2_half = _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);
29033 __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
29036 _mm_storeu_si128((__m128i*)utf8_output, v_latin);
29042 __m128i v_u16_latin_1_half = _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
29058 __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&latin1_input[i]));
29059 __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in)
29061 __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in)
29064 _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i]), out1);
29065 _mm_storeu_si128(reinterpret_cast<__m128i*>(&utf16_output[i + 8]), out2);
29077 __m128i in = _mm_loadu_si128((__m128i*)buf);
29080 __m128i in_shifted1 = _mm_srli_si128(in, 4);
29081 __m128i in_shifted2 = _mm_srli_si128(in, 8);
29082 __m128i in_shifted3 = _mm_srli_si128(in, 12);
29085 __m128i out1 = _mm_cvtepu8_epi32(in);
29086 __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
29087 __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
29088 __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);
29090 _mm_storeu_si128((__m128i*)utf32_output, out1);
29091 _mm_storeu_si128((__m128i*)(utf32_output + 4), out2);
29092 _mm_storeu_si128((__m128i*)(utf32_output + 8), out3);
29093 _mm_storeu_si128((__m128i*)(utf32_output + 12), out4);
29127 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29128 const __m128i in = _mm_loadu_si128((__m128i *)input);
29133 __m128i ascii_first = _mm_cvtepu8_epi16(in);
29134 __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
29139 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
29140 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
29147 const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29148 const __m128i perm = _mm_shuffle_epi8(in, sh);
29149 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29150 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29151 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29153 _mm_storeu_si128((__m128i *)utf16_output, composed);
29160 const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
29161 const __m128i perm = _mm_shuffle_epi8(in, sh);
29162 const __m128i ascii =
29164 const __m128i middlebyte =
29166 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29167 const __m128i highbyte =
29169 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29170 const __m128i composed =
29172 __m128i composed_repacked = _mm_packus_epi32(composed, composed);
29174 _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
29190 const __m128i sh =
29191 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29192 const __m128i perm = _mm_shuffle_epi8(in, sh);
29193 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29194 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29195 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29197 _mm_storeu_si128((__m128i *)utf16_output, composed);
29201 const __m128i sh =
29202 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29203 const __m128i perm = _mm_shuffle_epi8(in, sh);
29204 const __m128i ascii =
29206 const __m128i middlebyte =
29208 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29209 const __m128i highbyte =
29211 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29212 const __m128i composed =
29214 __m128i composed_repacked = _mm_packus_epi32(composed, composed);
29216 _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
29228 const __m128i sh =
29229 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29230 const __m128i perm = _mm_shuffle_epi8(in, sh);
29231 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
29232 const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
29233 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29234 __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
29236 const __m128i correct =
29239 const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
29242 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
29243 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
29248 const __m128i composed =
29251 const __m128i composedminus =
29253 const __m128i lowtenbits =
29256 const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
29257 const __m128i lowtenbitsadd =
29259 const __m128i hightenbitsadd =
29261 const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
29262 __m128i surrogates =
29267 _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
29270 _mm_storeu_si128((__m128i *)basic_buffer, composed);
29272 _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
29310 const __m128i in = _mm_loadu_si128((__m128i *)input);
29315 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
29316 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
29317 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
29318 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
29325 const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29326 const __m128i perm = _mm_shuffle_epi8(in, sh);
29327 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29328 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29329 const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29330 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
29331 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
29338 const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
29339 const __m128i perm = _mm_shuffle_epi8(in, sh);
29340 const __m128i ascii =
29342 const __m128i middlebyte =
29344 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29345 const __m128i highbyte =
29347 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29348 const __m128i composed =
29350 _mm_storeu_si128((__m128i *)utf32_output, composed);
29366 const __m128i sh =
29367 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29368 const __m128i perm = _mm_shuffle_epi8(in, sh);
29369 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29370 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29371 const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29372 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
29373 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
29377 const __m128i sh =
29378 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29379 const __m128i perm = _mm_shuffle_epi8(in, sh);
29380 const __m128i ascii =
29382 const __m128i middlebyte =
29384 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29385 const __m128i highbyte =
29387 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
29388 const __m128i composed =
29390 _mm_storeu_si128((__m128i *)utf32_output, composed);
29394 const __m128i sh =
29395 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29396 const __m128i perm = _mm_shuffle_epi8(in, sh);
29397 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
29398 const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
29399 const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
29400 __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
29402 const __m128i correct =
29405 const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
29406 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
29407 const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
29408 const __m128i composed =
29411 _mm_storeu_si128((__m128i *)utf32_output, composed);
29439 const __m128i in = _mm_loadu_si128((__m128i *)input);
29444 _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
29461 const __m128i sh =
29462 _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
29463 const __m128i perm = _mm_shuffle_epi8(in, sh);
29464 const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
29465 const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
29466 __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
29467 const __m128i latin1_packed = _mm_packus_epi16(composed,composed);
29470 _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
29482 __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
29485 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29489 __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
29492 __m128i latin1_packed = _mm_packus_epi16(in, in);
29493 _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed);
29509 __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
29512 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29516 __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
29518 __m128i latin1_packed = _mm_packus_epi16(in, in);
29519 _mm_storel_epi64(reinterpret_cast<__m128i*>(latin1_output), latin1_packed);
29597 const __m128i v_0000 = _mm_setzero_si128();
29598 const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
29599 const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
29603 __m128i in = _mm_loadu_si128((__m128i*)buf);
29605 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29609 const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
29611 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
29613 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29619 const __m128i utf8_packed = _mm_packus_epi16(in,in);
29621 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29629 const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
29631 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29640 const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
29644 const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
29656 const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
29665 const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
29694 const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
29696 const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
29698 const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
29701 const __m128i s0 = _mm_srli_epi16(in, 4);
29703 const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
29705 const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
29707 const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
29708 const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
29709 const __m128i s4 = _mm_xor_si128(s3, m0);
29713 const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
29714 const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
29721 const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
29722 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
29723 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
29724 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29726 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29734 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
29735 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
29740 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
29741 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
29743 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29745 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29801 const __m128i v_0000 = _mm_setzero_si128();
29802 const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
29803 const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
29807 __m128i in = _mm_loadu_si128((__m128i*)buf);
29809 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29813 const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
29815 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
29817 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
29823 const __m128i utf8_packed = _mm_packus_epi16(in,in);
29825 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29833 const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
29835 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
29844 const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
29848 const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
29860 const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
29869 const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
29898 const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
29900 const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
29902 const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
29905 const __m128i s0 = _mm_srli_epi16(in, 4);
29907 const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
29909 const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
29911 const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
29912 const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
29913 const __m128i s4 = _mm_xor_si128(s3, m0);
29917 const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
29918 const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
29925 const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
29926 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
29927 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
29928 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29930 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
29938 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
29939 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
29944 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
29945 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
29947 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
29949 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
30051 const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
30052 const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
30055 __m128i in = _mm_loadu_si128((__m128i*)buf);
30058 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30065 const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
30074 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
30075 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
30119 const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
30120 const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
30123 __m128i in = _mm_loadu_si128((__m128i*)buf);
30126 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30133 const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
30142 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
30143 _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
30182 __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
30183 __m128i shufmask =
30187 __m128i in1 = _mm_loadu_si128((__m128i *)buf);
30188 __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
30189 __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
30190 __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
30192 __m128i check_combined = _mm_or_si128(in1, in2);
30199 __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), _mm_shuffle_epi8(in2, shufmask));
30200 __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), _mm_shuffle_epi8(in4, shufmask));
30201 __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
30202 _mm_storeu_si128((__m128i *)latin1_output, pack);
30216 __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
30217 __m128i shufmask =
30221 __m128i in1 = _mm_loadu_si128((__m128i *)buf);
30222 __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
30223 __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
30224 __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
30226 __m128i check_combined = _mm_or_si128(in1, in2);
30244 __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), _mm_shuffle_epi8(in2, shufmask));
30245 __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), _mm_shuffle_epi8(in4, shufmask));
30246 __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
30247 _mm_storeu_si128((__m128i *)latin1_output, pack);
30260 const __m128i v_0000 = _mm_setzero_si128();//__m128 = 128 bits
30261 const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); //1111 1000 0000 0000
30262 const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); //1100 0000 1000 0000
30263 const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); //1111 1111 1000 0000
30264 const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); //1111 1111 1111 1111 0000 0000 0000 0000
30265 const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111
30266 __m128i running_max = _mm_setzero_si128();
30267 __m128i forbidden_bytemask = _mm_setzero_si128();
30272 __m128i in = _mm_loadu_si128((__m128i*)buf);
30273 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);//These two values can hold only 8 UTF32 chars
30279 __m128i in_16 = _mm_packus_epi32(
30296 __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
30297 __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
30299 __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));//pack into 1 vector, now you have two
30303 const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); //creates two copy of in_16 in 1 vector
30305 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); //put them into the output
30316 const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
30318 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
30327 const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares:
30335 const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
30343 const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
30344 const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
30347 const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
30349 const __m128i t1 = _mm_and_si128(t0, v_1f00); // potentital first utf8 byte
30351 const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte
30353 const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together
30355 const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
30358 const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
30367 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
30368 const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
30371 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
30381 const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30385 const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30388 const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
30417 const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
30419 const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
30421 const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
30424 const __m128i s0 = _mm_srli_epi16(in_16, 4);
30426 const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
30428 const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
30430 const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
30431 const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
30432 const __m128i s4 = _mm_xor_si128(s3, m0);
30436 const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
30437 const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
30444 const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
30445 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
30446 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
30447 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
30449 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
30457 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
30458 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
30463 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
30464 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
30466 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
30468 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
30505 const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
30521 const __m128i v_0000 = _mm_setzero_si128();
30522 const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
30523 const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
30524 const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
30525 const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
30526 const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
30527 const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
30533 __m128i in = _mm_loadu_si128((__m128i*)buf);
30534 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
30537 __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
30543 __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
30553 __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
30554 __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
30555 __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
30559 const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
30561 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
30567 __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
30576 const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
30578 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
30587 const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
30591 const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
30599 const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
30600 const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
30603 const __m128i t0 = _mm_slli_epi16(in_16, 2);
30605 const __m128i t1 = _mm_and_si128(t0, v_1f00);
30607 const __m128i t2 = _mm_and_si128(in_16, v_003f);
30609 const __m128i t3 = _mm_or_si128(t1, t2);
30611 const __m128i t4 = _mm_or_si128(t3, v_c080);
30614 const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
30623 const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
30624 const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
30627 _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
30637 const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30644 const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30645 const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
30650 const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
30679 const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
30681 const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
30683 const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
30686 const __m128i s0 = _mm_srli_epi16(in_16, 4);
30688 const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
30690 const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
30692 const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
30693 const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
30694 const __m128i s4 = _mm_xor_si128(s3, m0);
30698 const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
30699 const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
30706 const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
30707 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
30708 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
30709 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
30711 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
30719 const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
30720 const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
30725 const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
30726 const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
30728 _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
30730 _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
30775 const __m128i v_0000 = _mm_setzero_si128();
30776 const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
30777 __m128i forbidden_bytemask = _mm_setzero_si128();
30780 __m128i in = _mm_loadu_si128((__m128i*)buf);
30781 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
30782 const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30788 __m128i utf16_packed = _mm_packus_epi32(in, nextin);
30790 const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
30791 const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30795 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30799 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
30842 const __m128i v_0000 = _mm_setzero_si128();
30843 const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
30846 __m128i in = _mm_loadu_si128((__m128i*)buf);
30847 __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
30848 const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
30854 __m128i utf16_packed = _mm_packus_epi32(in, nextin);
30856 const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
30857 const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
30858 const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
30864 const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
30868 _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
33130 size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
33132 __m128i two_64bits = _mm_setzero_si128();
33133 while (i + sizeof(__m128i) <= len) {
33134 __m128i runner = _mm_setzero_si128();
33135 size_t iterations = (len - i) / sizeof(__m128i);
33139 size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
33140 for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) {
33141 __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
33142 __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
33143 __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i)));
33144 __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
33145 __m128i input12 = _mm_add_epi8(
33152 __m128i input34 = _mm_add_epi8(
33159 __m128i input1234 = _mm_add_epi8(input12, input34);
33162 for (; i <= max_i; i += sizeof(__m128i)) {
33163 __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
33188 const __m128i v_00000000 = _mm_setzero_si128();
33189 const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
33190 const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
33191 const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
33195 __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33196 const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
33197 const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
33198 const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
33199 const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
33200 const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
33214 const __m128i v_00000000 = _mm_setzero_si128();
33215 const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
33219 __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
33220 const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);