Lines Matching defs:__m256i

1678     __m256i value;
1681 simdutf_really_inline base() : value{__m256i()} {}
1684 simdutf_really_inline base(const __m256i _value) : value(_value) {}
1686 simdutf_really_inline operator const __m256i&() const { return this->value; }
1687 simdutf_really_inline operator __m256i&() { return this->value; }
1690 __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
1691 __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
1693 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
1698 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
1699 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
1702 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
1703 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
1704 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
1705 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
1727 simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
1746 simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
1762 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
1778 simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
1781 simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
1818 simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
1861 simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
2052 (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
2053 (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
2074 simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
2076 simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
2097 simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
2111 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
2115 simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
2118 simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
2134 simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
2151 simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
2196 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
2216 const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
2217 const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
18136 _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
18139 _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
18210 _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
18212 _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
18578 const __m256i h0 = _mm512_castsi512_si256(utf8); \
18579 const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \
19350 (__m256i *)latin1_output,
19402 (__m256i *)latin1_output,
19821 const __m256i v_0000 = _mm256_setzero_si256();
19822 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
19823 const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
19824 const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
19825 const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
19826 const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
19827 __m256i running_max = _mm256_setzero_si256();
19828 __m256i forbidden_bytemask = _mm256_setzero_si256();
19833 __m256i in = _mm256_loadu_si256((__m256i*)buf);
19834 __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
19838 __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
19854 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
19858 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
19864 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
19865 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
19868 const __m256i t0 = _mm256_slli_epi16(in_16, 2);
19870 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
19872 const __m256i t2 = _mm256_and_si256(in_16, v_003f);
19874 const __m256i t3 = _mm256_or_si256(t1, t2);
19876 const __m256i t4 = _mm256_or_si256(t3, v_c080);
19879 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
19893 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
19905 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
19909 const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
19912 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
19943 const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
19945 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
19947 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
19950 const __m256i s0 = _mm256_srli_epi16(in_16, 4);
19952 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
19954 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
19956 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
19957 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
19958 const __m256i s4 = _mm256_xor_si256(s3, m0);
19962 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
19963 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
19971 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
19972 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
19973 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20048 const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
20063 const __m256i v_0000 = _mm256_setzero_si256();
20064 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
20065 const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
20066 const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
20067 const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
20068 const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
20069 const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
20074 __m256i in = _mm256_loadu_si256((__m256i*)buf);
20075 __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
20077 const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
20083 __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
20099 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
20103 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
20109 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
20110 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
20113 const __m256i t0 = _mm256_slli_epi16(in_16, 2);
20115 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
20117 const __m256i t2 = _mm256_and_si256(in_16, v_003f);
20119 const __m256i t3 = _mm256_or_si256(t1, t2);
20121 const __m256i t4 = _mm256_or_si256(t3, v_c080);
20124 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
20138 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
20150 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
20156 const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
20157 const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
20162 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
20193 const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
20195 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
20197 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
20200 const __m256i s0 = _mm256_srli_epi16(in_16, 4);
20202 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
20204 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
20206 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
20207 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
20208 const __m256i s4 = _mm256_xor_si256(s3, m0);
20212 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
20213 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
20221 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
20222 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
20223 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
20309 __m256i forbidden_bytemask = _mm256_setzero_si256();
20313 __m256i in = _mm256_loadu_si256((__m256i*)buf);
20315 const __m256i v_00000000 = _mm256_setzero_si256();
20316 const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
20319 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
20323 const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
20324 const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
20378 __m256i in = _mm256_loadu_si256((__m256i*)buf);
20380 const __m256i v_00000000 = _mm256_setzero_si256();
20381 const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
20384 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
20388 const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
20389 const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
20390 const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
20603 __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
20614 __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);
21736 __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0);
21737 __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1);
21896 __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
21897 __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
22053 __m256i currentmax = _mm256_setzero_si256();
22058 __m256i in = _mm256_loadu_si256((__m256i*)buf);
22059 __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
22154 __m256i currentoffsetmax = _mm256_setzero_si256();
22155 const __m256i offset = _mm256_set1_epi32(0xffff2000);
22156 const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
22165 const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
22171 __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
22215 const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
22216 __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
22434 const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
22435 const __m256i offset = _mm256_set1_epi32(0xffff2000);
22436 const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
22437 __m256i currentmax = _mm256_setzero_si256();
22438 __m256i currentoffsetmax = _mm256_setzero_si256();
22441 const __m256i in = _mm256_loadu_si256((__m256i *)input);
22446 __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
22464 const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
22465 const __m256i offset = _mm256_set1_epi32(0xffff2000);
22466 const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
22467 __m256i currentmax = _mm256_setzero_si256();
22468 __m256i currentoffsetmax = _mm256_setzero_si256();
22471 const __m256i in = _mm256_loadu_si256((__m256i *)input);
22475 __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
22495 const __m256i v_0000 = _mm256_setzero_si256();
22496 const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
22497 const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
22513 const __m256i in = _mm256_cvtepu8_epi16((in8));
22518 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
22519 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
22522 const __m256i t0 = _mm256_slli_epi16(in, 2);
22524 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
22526 const __m256i t2 = _mm256_and_si256(in, v_003f);
22528 const __m256i t3 = _mm256_or_si256(t1, t2);
22530 const __m256i t4 = _mm256_or_si256(t3, v_c080);
22535 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
22538 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
22555 const __m256i utf8_packed = _mm256_shuffle_epi8(
22618 __m256i out = _mm256_cvtepu8_epi32(in);
22621 _mm256_storeu_si256((__m256i*)&utf32_output[i], out);
22658 __m256i ascii = _mm256_cvtepu8_epi16(in);
22660 const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
22664 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
22838 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
22839 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
22851 _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
22892 _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
22948 __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
22951 const __m256i swap = _mm256_setr_epi8(
22957 __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
22985 __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
22988 const __m256i swap = _mm256_setr_epi8(
22994 __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
23085 const __m256i v_0000 = _mm256_setzero_si256();
23086 const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23087 const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23088 const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
23092 __m256i in = _mm256_loadu_si256((__m256i*)buf);
23094 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23099 const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
23111 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
23115 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
23122 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23123 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23126 const __m256i t0 = _mm256_slli_epi16(in, 2);
23128 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23130 const __m256i t2 = _mm256_and_si256(in, v_003f);
23132 const __m256i t3 = _mm256_or_si256(t1, t2);
23134 const __m256i t4 = _mm256_or_si256(t3, v_c080);
23137 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
23151 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23165 const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23174 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23205 const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
23207 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23209 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23212 const __m256i s0 = _mm256_srli_epi16(in, 4);
23214 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23216 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23218 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23219 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23220 const __m256i s4 = _mm256_xor_si256(s3, m0);
23224 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23225 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23233 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23234 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23235 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
23328 const __m256i v_0000 = _mm256_setzero_si256();
23329 const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23330 const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23331 const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
23335 __m256i in = _mm256_loadu_si256((__m256i*)buf);
23337 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23342 const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
23354 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
23358 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
23365 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23366 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23369 const __m256i t0 = _mm256_slli_epi16(in, 2);
23371 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23373 const __m256i t2 = _mm256_and_si256(in, v_003f);
23375 const __m256i t3 = _mm256_or_si256(t1, t2);
23377 const __m256i t4 = _mm256_or_si256(t3, v_c080);
23380 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
23394 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23408 const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23417 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23448 const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
23450 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23452 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23455 const __m256i s0 = _mm256_srli_epi16(in, 4);
23457 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23459 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23461 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23462 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23463 const __m256i s4 = _mm256_xor_si256(s3, m0);
23467 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23468 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23476 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23477 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23478 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
23617 const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23618 const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23621 __m256i in = _mm256_loadu_si256((__m256i*)buf);
23623 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23631 const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23640 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
23641 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
23685 const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
23686 const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
23689 __m256i in = _mm256_loadu_si256((__m256i*)buf);
23691 const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
23699 const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
23708 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
23709 _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
23750 __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
23752 __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
23757 __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
23758 __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
23760 __m256i check_combined = _mm256_or_si256(in1, in2);
23767 __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
23768 __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
23771 __m256i idx1 = _mm256_set_epi32(-1, -1,-1,-1,-1,-1,4,0);
23772 __m256i idx2 = _mm256_set_epi32(-1, -1,-1,-1,4,0,-1,-1);
23773 __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
23774 __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
23776 __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
23792 __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
23793 __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
23800 __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
23801 __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
23803 __m256i check_combined = _mm256_or_si256(in1, in2);
23818 __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
23819 __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
23821 __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
23822 __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
23823 __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
23824 __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
23826 __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
23840 const __m256i v_0000 = _mm256_setzero_si256();
23841 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
23842 const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
23843 const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
23844 const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
23845 const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
23846 __m256i running_max = _mm256_setzero_si256();
23847 __m256i forbidden_bytemask = _mm256_setzero_si256();
23852 __m256i in = _mm256_loadu_si256((__m256i*)buf);
23853 __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
23857 __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
23873 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
23877 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
23883 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
23884 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
23887 const __m256i t0 = _mm256_slli_epi16(in_16, 2);
23889 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
23891 const __m256i t2 = _mm256_and_si256(in_16, v_003f);
23893 const __m256i t3 = _mm256_or_si256(t1, t2);
23895 const __m256i t4 = _mm256_or_si256(t3, v_c080);
23898 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
23912 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
23924 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
23928 const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
23931 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
23962 const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
23964 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
23966 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
23969 const __m256i s0 = _mm256_srli_epi16(in_16, 4);
23971 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
23973 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
23975 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
23976 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
23977 const __m256i s4 = _mm256_xor_si256(s3, m0);
23981 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
23982 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
23990 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
23991 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
23992 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
24067 const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
24082 const __m256i v_0000 = _mm256_setzero_si256();
24083 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
24084 const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
24085 const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
24086 const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
24087 const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
24088 const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
24093 __m256i in = _mm256_loadu_si256((__m256i*)buf);
24094 __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
24096 const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
24102 __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
24118 const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
24122 const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
24128 const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
24129 const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
24132 const __m256i t0 = _mm256_slli_epi16(in_16, 2);
24134 const __m256i t1 = _mm256_and_si256(t0, v_1f00);
24136 const __m256i t2 = _mm256_and_si256(in_16, v_003f);
24138 const __m256i t3 = _mm256_or_si256(t1, t2);
24140 const __m256i t4 = _mm256_or_si256(t3, v_c080);
24143 const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
24157 const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
24169 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
24175 const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
24176 const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
24181 const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
24212 const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
24214 const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
24216 const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
24219 const __m256i s0 = _mm256_srli_epi16(in_16, 4);
24221 const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
24223 const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
24225 const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
24226 const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
24227 const __m256i s4 = _mm256_xor_si256(s3, m0);
24231 const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
24232 const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
24240 const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
24241 const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
24242 const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
24325 __m256i forbidden_bytemask = _mm256_setzero_si256();
24329 __m256i in = _mm256_loadu_si256((__m256i*)buf);
24331 const __m256i v_00000000 = _mm256_setzero_si256();
24332 const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
24335 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
24339 const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
24340 const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
24394 __m256i in = _mm256_loadu_si256((__m256i*)buf);
24396 const __m256i v_00000000 = _mm256_setzero_si256();
24397 const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
24400 const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
24404 const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
24405 const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
24406 const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
24480 __m256i in_combined = _mm256_set_m128i(in_second_half, in);
24483 _mm256_storeu_si256(reinterpret_cast<__m256i *>(latin1_output), in_combined);
26755 size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
26757 __m256i four_64bits = _mm256_setzero_si256();
26758 while (i + sizeof(__m256i) <= len) {
26759 __m256i runner = _mm256_setzero_si256();
26761 size_t iterations = (len - i) / sizeof(__m256i);
26765 size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
26766 for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) {
26767 __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
26768 __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
26769 __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i)));
26770 __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i)));
26771 __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
26773 __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
26775 __m256i input1234 = _mm256_add_epi8(input12, input23);
26779 for (; i <= max_i; i += sizeof(__m256i)) {
26780 __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i));
26795 const __m256i v_00000000 = _mm256_setzero_si256();
26796 const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
26797 const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
26798 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
26802 __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26803 const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
26804 const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
26805 const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
26806 const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
26807 const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
26821 const __m256i v_00000000 = _mm256_setzero_si256();
26822 const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
26826 __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
26827 const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);