Lines Matching defs:word

4340 inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
4341 return uint16_t((word >> 8) | (word << 8));
4349 uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4350 if((word &0xF800) == 0xD800) {
4352 uint16_t diff = uint16_t(word - 0xD800);
4370 uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4371 if((word & 0xF800) == 0xD800) {
4373 uint16_t diff = uint16_t(word - 0xD800);
4392 uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4393 counter += ((word & 0xFC00) != 0xDC00);
4404 uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4406 counter += static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
4407 counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) || (word >= 0xE000)); // three-byte
4418 uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4419 counter += ((word & 0xFC00) != 0xDC00);
10657 uint32_t word = data[pos];
10658 if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
10669 uint32_t word = data[pos];
10670 if(word > 0x10FFFF) {
10673 if(word >= 0xD800 && word <= 0xDFFF) {
10699 counter++; // non-surrogate word
10781 uint32_t word = data[pos];
10782 if((word & 0xFFFFFF80)==0) {
10784 *utf8_output++ = char(word);
10786 } else if((word & 0xFFFFF800)==0) {
10789 *utf8_output++ = char((word>>6) | 0b11000000);
10790 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10792 } else if((word & 0xFFFF0000)==0) {
10795 *utf8_output++ = char((word>>12) | 0b11100000);
10796 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10797 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10802 *utf8_output++ = char((word>>18) | 0b11110000);
10803 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10804 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10805 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10845 uint32_t word = data[pos];
10846 if((word & 0xFFFFFF80)==0) {
10848 *utf8_output++ = char(word);
10850 } else if((word & 0xFFFFF800)==0) {
10853 *utf8_output++ = char((word>>6) | 0b11000000);
10854 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10856 } else if((word & 0xFFFF0000)==0) {
10859 if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10860 *utf8_output++ = char((word>>12) | 0b11100000);
10861 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10862 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10867 if (word > 0x10FFFF) { return 0; }
10868 *utf8_output++ = char((word>>18) | 0b11110000);
10869 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10870 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10871 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10894 uint32_t word = data[pos];
10895 if((word & 0xFFFFFF80)==0) {
10897 *utf8_output++ = char(word);
10899 } else if((word & 0xFFFFF800)==0) {
10902 *utf8_output++ = char((word>>6) | 0b11000000);
10903 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10905 } else if((word & 0xFFFF0000)==0) {
10908 if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
10909 *utf8_output++ = char((word>>12) | 0b11100000);
10910 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10911 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10916 if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
10917 *utf8_output++ = char((word>>18) | 0b11110000);
10918 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10919 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10920 *utf8_output++ = char((word & 0b111111) | 0b10000000);
10950 uint32_t word = data[pos];
10951 if((word & 0xFFFF0000)==0) {
10953 *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10957 word -= 0x10000;
10958 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10959 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10994 uint32_t word = data[pos];
10995 if((word & 0xFFFF0000)==0) {
10996 if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10998 *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
11001 if (word > 0x10FFFF) { return 0; }
11002 word -= 0x10000;
11003 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11004 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11023 uint32_t word = data[pos];
11024 if((word & 0xFFFF0000)==0) {
11025 if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
11027 *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
11030 if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
11031 word -= 0x10000;
11032 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11033 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11084 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11085 if((word & 0xFF80)==0) {
11087 *utf8_output++ = char(word);
11089 } else if((word & 0xF800)==0) {
11092 *utf8_output++ = char((word>>6) | 0b11000000);
11093 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11095 } else if((word &0xF800 ) != 0xD800) {
11098 *utf8_output++ = char((word>>12) | 0b11100000);
11099 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11100 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11104 uint16_t diff = uint16_t(word - 0xD800);
11157 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11158 if((word & 0xFF80)==0) {
11160 *utf8_output++ = char(word);
11162 } else if((word & 0xF800)==0) {
11165 *utf8_output++ = char((word>>6) | 0b11000000);
11166 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11168 } else if((word &0xF800 ) != 0xD800) {
11171 *utf8_output++ = char((word>>12) | 0b11100000);
11172 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11173 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11178 uint16_t diff = uint16_t(word - 0xD800);
11216 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11217 if((word & 0xFF80)==0) {
11219 *utf8_output++ = char(word);
11221 } else if((word & 0xF800)==0) {
11224 *utf8_output++ = char((word>>6) | 0b11000000);
11225 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11227 } else if((word &0xF800 ) != 0xD800) {
11230 *utf8_output++ = char((word>>12) | 0b11100000);
11231 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11232 *utf8_output++ = char((word & 0b111111) | 0b10000000);
11237 uint16_t diff = uint16_t(word - 0xD800);
11278 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11279 if((word &0xF800 ) != 0xD800) {
11280 // No surrogate pair, extend 16-bit word to 32-bit word
11281 *utf32_output++ = char32_t(word);
11285 uint16_t diff = uint16_t(word - 0xD800);
11319 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11320 if((word &0xF800 ) != 0xD800) {
11321 // No surrogate pair, extend 16-bit word to 32-bit word
11322 *utf32_output++ = char32_t(word);
11326 uint16_t diff = uint16_t(word - 0xD800);
11346 uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11347 if((word &0xF800 ) != 0xD800) {
11348 // No surrogate pair, extend 16-bit word to 32-bit word
11349 *utf32_output++ = char32_t(word);
11353 uint16_t diff = uint16_t(word - 0xD800);
11410 // a single UTF-16 word.
11420 // a single UTF-16 word.
11429 // we have a 4-byte UTF-8 word.
11498 // a single UTF-16 word.
11511 // a single UTF-16 word.
11530 // we have a 4-byte UTF-8 word.
11587 // a single UTF-16 word.
11600 // a single UTF-16 word.
11617 // we have a 4-byte UTF-8 word.
11751 // we have a 4-byte UTF-8 word.
11834 // we have a 4-byte UTF-8 word.
11904 // we have a 4-byte UTF-8 word.
12054 uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12055 *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12069 uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12070 *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12218 // we have a 4-byte UTF-8 word.
12297 uint16_t word = 0;
12301 word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12302 too_large |= word;
12303 *current_write++ = char(word & 0xFF);
12319 uint16_t word;
12343 word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12344 if((word & 0xFF00 ) == 0) {
12345 *latin_output++ = char(word & 0xFF);
12494 uint16_t word = 0;
12497 word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12498 *latin_output++ = char(word);
12634 // in the middle of a very long word).
12720 // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
12722 // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
12890 // byte of each word, we compress the two vectors into one which
12901 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12910 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12916 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12920 // L - word mask for low surrogates
12925 // (A low surrogate placed in the 7th register's word
12936 // The 15th word may be either a low or high surrogate. It the next
12960 // byte of each word, we compress the two vectors into one which
12972 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12981 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12987 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12991 // L - word mask for low surrogates
12996 // (A low surrogate placed in the 7th register's word
13007 // The 15th word may be either a low or high surrogate. It the next
13109 // input 8-bit word : [aabb|bbbb] x 8
13747 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
13748 if(word <= 0xff) {
13749 *latin1_output++ = char(word);
13789 a single word may produce one, two or three UTF8 bytes.
13853 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13916 We expand the input word (16-bit) into two code units (32-bit), thus
14026 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14027 if((word & 0xFF80)==0) {
14028 *utf8_output++ = char(word);
14029 } else if((word & 0xF800)==0) {
14030 *utf8_output++ = char((word>>6) | 0b11000000);
14031 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14032 } else if((word &0xF800 ) != 0xD800) {
14033 *utf8_output++ = char((word>>12) | 0b11100000);
14034 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14035 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14038 uint16_t diff = uint16_t(word - 0xD800);
14107 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14170 We expand the input word (16-bit) into two code units (32-bit), thus
14280 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14281 if((word & 0xFF80)==0) {
14282 *utf8_output++ = char(word);
14283 } else if((word & 0xF800)==0) {
14284 *utf8_output++ = char((word>>6) | 0b11000000);
14285 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14286 } else if((word &0xF800 ) != 0xD800) {
14287 *utf8_output++ = char((word>>12) | 0b11100000);
14288 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14289 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14292 uint16_t diff = uint16_t(word - 0xD800);
14341 a single word may produce one, two or three UTF8 bytes.
14394 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14395 if((word &0xF800 ) != 0xD800) {
14396 *utf32_output++ = char32_t(word);
14399 uint16_t diff = uint16_t(word - 0xD800);
14452 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14453 if((word &0xF800 ) != 0xD800) {
14454 *utf32_output++ = char32_t(word);
14457 uint16_t diff = uint16_t(word - 0xD800);
14518 uint32_t word = buf[k];
14519 if(word <= 0xff) {
14520 *latin1_output++ = char(word);
14562 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14624 We expand the input word (16-bit) into two code units (32-bit), thus
14725 // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14734 uint32_t word = buf[k];
14735 if((word & 0xFFFFFF80)==0) {
14736 *utf8_output++ = char(word);
14737 } else if((word & 0xFFFFF800)==0) {
14738 *utf8_output++ = char((word>>6) | 0b11000000);
14739 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14740 } else if((word & 0xFFFF0000)==0) {
14741 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14742 *utf8_output++ = char((word>>12) | 0b11100000);
14743 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14744 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14746 if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14747 *utf8_output++ = char((word>>18) | 0b11110000);
14748 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14749 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14750 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14795 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14862 We expand the input word (16-bit) into two code units (32-bit), thus
14964 // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14973 uint32_t word = buf[k];
14974 if((word & 0xFFFFFF80)==0) {
14975 *utf8_output++ = char(word);
14976 } else if((word & 0xFFFFF800)==0) {
14977 *utf8_output++ = char((word>>6) | 0b11000000);
14978 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14979 } else if((word & 0xFFFF0000)==0) {
14980 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14981 *utf8_output++ = char((word>>12) | 0b11100000);
14982 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14983 *utf8_output++ = char((word & 0b111111) | 0b10000000);
14985 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14986 *utf8_output++ = char((word>>18) | 0b11110000);
14987 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14988 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14989 *utf8_output++ = char((word & 0b111111) | 0b10000000);
15027 uint32_t word = buf[k];
15028 if((word & 0xFFFF0000)==0) {
15030 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15031 *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15034 if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15035 word -= 0x10000;
15036 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15037 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15088 uint32_t word = buf[k];
15089 if((word & 0xFFFF0000)==0) {
15091 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15092 *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15095 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15096 word -= 0x10000;
15097 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15098 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15889 } else { // In case of success, we want the number of word written
16234 } else { // In case of success, we want the number of word written
16279 // We count one word for anything that is not a continuation (so
16638 } else { // In case of success, we want the number of word written
18154 // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18159 // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18227 // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18232 // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18308 Bit layout of single word. We show 4 cases for each possible
18607 - pair.second - the first unprocessed output word
19392 uint16_t word;
19393 while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19395 *latin1_output++ = uint8_t(word);
19415 uint16_t word;
19416 while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19418 *latin1_output++ = uint8_t(word);
19437 * is written to 'outlen' and the function reports the number of input word
19682 /* 2. Shift by one 16-bit word to align low surrogates with high surrogates
19707 // 5. Store all valid UTF-32 code units (low surrogate positions and 32nd word are invalid)
19721 // Only process 31 code units, but keep track if the 31st word is a high surrogate as a carry
19862 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
19922 We expand the input word (16-bit) into two code units (32-bit), thus
20016 // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20024 uint32_t word = buf[k];
20025 if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
20026 *utf8_output++ = char(word);
20027 } else if((word & 0xFFFFF800)==0) { // 2-byte
20028 *utf8_output++ = char((word>>6) | 0b11000000);
20029 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20030 } else if((word & 0xFFFF0000 )==0) { // 3-byte
20031 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
20032 *utf8_output++ = char((word>>12) | 0b11100000);
20033 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20034 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20036 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
20037 *utf8_output++ = char((word>>18) | 0b11110000);
20038 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20039 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20040 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20107 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
20172 We expand the input word (16-bit) into two code units (32-bit), thus
20266 // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20274 uint32_t word = buf[k];
20275 if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
20276 *utf8_output++ = char(word);
20277 } else if((word & 0xFFFFF800)==0) { // 2-byte
20278 *utf8_output++ = char((word>>6) | 0b11000000);
20279 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20280 } else if((word & 0xFFFF0000 )==0) { // 3-byte
20281 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
20282 *utf8_output++ = char((word>>12) | 0b11100000);
20283 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20284 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20286 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
20287 *utf8_output++ = char((word>>18) | 0b11110000);
20288 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20289 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20290 *utf8_output++ = char((word & 0b111111) | 0b10000000);
20340 uint32_t word = buf[k];
20341 if((word & 0xFFFF0000)==0) {
20343 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
20344 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20347 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
20348 word -= 0x10000;
20349 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20350 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20408 uint32_t word = buf[k];
20409 if((word & 0xFFFF0000)==0) {
20411 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
20412 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20415 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
20416 word -= 0x10000;
20417 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20418 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20688 // two most significant bytes of any 32-bit word. On the other hand, to
20690 // significant bytes of a 32-bit word since they always come in pairs in
21916 // We count one word for anything that is not a continuation (so
22077 // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
22079 // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
22238 0xd000 .. 0xd7ff - a valid word
22248 - V = valid word,
22252 0 1 2 3 4 5 6 7 <--- word index
22265 and recheck this word in the next iteration
22283 // byte of each word, we compress the two vectors into one which
22298 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22308 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22314 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22318 // L - word mask for low surrogates
22323 // (A low surrogate placed in the 7th register's word
22335 // The 31 word may be either a low or high surrogate. It the next
22361 // byte of each word, we compress the two vectors into one which
22376 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22386 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22392 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22396 // L - word mask for low surrogates
22401 // (A low surrogate placed in the 7th register's word
22413 // The 31 word may be either a low or high surrogate. It the next
22502 // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
22516 // input 16-bit word : [0000|0000|aabb|bbbb] x 8
22583 // Zero extend each byte in xmm0 to word and put it in another xmm register
22589 // Zero extend each byte in the shifted xmm0 to word in xmm0
23009 uint16_t word = !match_system(big_endian)
23012 if (word <= 0xff) {
23013 *latin1_output++ = char(word);
23057 a single word may produce one, two or three UTF8 bytes.
23098 // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23120 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23162 // 1. Check if there are any surrogate word in the input chunk.
23163 // We have also deal with situation when there is a surrogate word
23168 // = 0xc000 if the last word is a surrogate
23184 We expand the input word (16-bit) into two code units (32-bit), thus
23286 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23287 if((word & 0xFF80)==0) {
23288 *utf8_output++ = char(word);
23289 } else if((word & 0xF800)==0) {
23290 *utf8_output++ = char((word>>6) | 0b11000000);
23291 *utf8_output++ = char((word & 0b111111) | 0b10000000);
23292 } else if((word &0xF800 ) != 0xD800) {
23293 *utf8_output++ = char((word>>12) | 0b11100000);
23294 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23295 *utf8_output++ = char((word & 0b111111) | 0b10000000);
23298 uint16_t diff = uint16_t(word - 0xD800);
23341 // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23363 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23405 // 1. Check if there are any surrogate word in the input chunk.
23406 // We have also deal with situation when there is a surrogate word
23411 // = 0xc000 if the last word is a surrogate
23427 We expand the input word (16-bit) into two code units (32-bit), thus
23529 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23530 if((word & 0xFF80)==0) {
23531 *utf8_output++ = char(word);
23532 } else if((word & 0xF800)==0) {
23533 *utf8_output++ = char((word>>6) | 0b11000000);
23534 *utf8_output++ = char((word & 0b111111) | 0b10000000);
23535 } else if((word &0xF800 ) != 0xD800) {
23536 *utf8_output++ = char((word>>12) | 0b11100000);
23537 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23538 *utf8_output++ = char((word & 0b111111) | 0b10000000);
23541 uint16_t diff = uint16_t(word - 0xD800);
23589 a single word may produce one, two or three UTF8 bytes.
23628 // 1. Check if there are any surrogate word in the input chunk.
23629 // We have also deal with situation when there is a surrogate word
23634 // = 0xc000 if the last word is a surrogate
23653 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23654 if((word &0xF800 ) != 0xD800) {
23656 *utf32_output++ = char32_t(word);
23659 uint16_t diff = uint16_t(word - 0xD800);
23696 // 1. Check if there are any surrogate word in the input chunk.
23697 // We have also deal with situation when there is a surrogate word
23702 // = 0xc000 if the last word is a surrogate
23721 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23722 if((word &0xF800 ) != 0xD800) {
23724 *utf32_output++ = char32_t(word);
23727 uint16_t diff = uint16_t(word - 0xD800);
23881 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23941 We expand the input word (16-bit) into two code units (32-bit), thus
24035 // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24043 uint32_t word = buf[k];
24044 if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
24045 *utf8_output++ = char(word);
24046 } else if((word & 0xFFFFF800)==0) { // 2-byte
24047 *utf8_output++ = char((word>>6) | 0b11000000);
24048 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24049 } else if((word & 0xFFFF0000 )==0) { // 3-byte
24050 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
24051 *utf8_output++ = char((word>>12) | 0b11100000);
24052 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24053 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24055 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
24056 *utf8_output++ = char((word>>18) | 0b11110000);
24057 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24058 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24059 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24126 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
24191 We expand the input word (16-bit) into two code units (32-bit), thus
24285 // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24293 uint32_t word = buf[k];
24294 if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII)
24295 *utf8_output++ = char(word);
24296 } else if((word & 0xFFFFF800)==0) { // 2-byte
24297 *utf8_output++ = char((word>>6) | 0b11000000);
24298 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24299 } else if((word & 0xFFFF0000 )==0) { // 3-byte
24300 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
24301 *utf8_output++ = char((word>>12) | 0b11100000);
24302 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24303 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24305 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
24306 *utf8_output++ = char((word>>18) | 0b11110000);
24307 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24308 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24309 *utf8_output++ = char((word & 0b111111) | 0b10000000);
24356 uint32_t word = buf[k];
24357 if((word & 0xFFFF0000)==0) {
24359 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
24360 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24363 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
24364 word -= 0x10000;
24365 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24366 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
24424 uint32_t word = buf[k];
24425 if((word & 0xFFFF0000)==0) {
24427 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
24428 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24431 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
24432 word -= 0x10000;
24433 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24434 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
25300 } else { // In case of success, we want the number of word written
25645 } else { // In case of success, we want the number of word written
25690 // We count one word for anything that is not a continuation (so
26051 } else { // In case of success, we want the number of word written
27672 } else { // In case of success, we want the number of word written
28017 } else { // In case of success, we want the number of word written
28062 // We count one word for anything that is not a continuation (so
28442 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
28543 // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
28545 // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
28715 0xd000 .. 0xd7ff - a valid word
28725 - V = valid word,
28729 0 1 2 3 4 5 6 7 <--- word index
28742 and recheck this word in the next iteration
28760 // byte of each word, we compress the two vectors into one which
28774 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28784 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28790 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28794 // L - word mask for low surrogates
28799 // (A low surrogate placed in the 7th register's word
28811 // The 15th word may be either a low or high surrogate. It the next
28837 // byte of each word, we compress the two vectors into one which
28852 // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28862 // Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28868 // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28872 // L - word mask for low surrogates
28877 // (A low surrogate placed in the 7th register's word
28889 // The 15th word may be either a low or high surrogate. It the next
29525 uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29526 if(word <= 0xff) {
29527 *latin1_output++ = char(word);
29568 a single word may produce one, two or three UTF8 bytes.
29608 // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29653 // 1. Check if there are any surrogate word in the input chunk.
29654 // We have also deal with situation when there is a surrogate word
29659 // = 0xc000 if the last word is a surrogate
29673 We expand the input word (16-bit) into two code units (32-bit), thus
29758 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29759 if((word & 0xFF80)==0) {
29760 *utf8_output++ = char(word);
29761 } else if((word & 0xF800)==0) {
29762 *utf8_output++ = char((word>>6) | 0b11000000);
29763 *utf8_output++ = char((word & 0b111111) | 0b10000000);
29764 } else if((word &0xF800 ) != 0xD800) {
29765 *utf8_output++ = char((word>>12) | 0b11100000);
29766 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29767 *utf8_output++ = char((word & 0b111111) | 0b10000000);
29770 uint16_t diff = uint16_t(word - 0xD800);
29812 // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29857 // 1. Check if there are any surrogate words in the input chunk.
29858 // We also have to deal with the situation when there is a surrogate word
29863 // = 0xc000 if the last word is a surrogate
29877 We expand the input word (16-bit) into two code units (32-bit), thus
29962 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29963 if((word & 0xFF80)==0) {
29964 *utf8_output++ = char(word);
29965 } else if((word & 0xF800)==0) {
29966 *utf8_output++ = char((word>>6) | 0b11000000);
29967 *utf8_output++ = char((word & 0b111111) | 0b10000000);
29968 } else if((word &0xF800 ) != 0xD800) {
29969 *utf8_output++ = char((word>>12) | 0b11100000);
29970 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29971 *utf8_output++ = char((word & 0b111111) | 0b10000000);
29974 uint16_t diff = uint16_t(word - 0xD800);
30023 a single word may produce one, two or three UTF8 bytes.
30062 // 1. Check if there are any surrogate words in the input chunk.
30063 // We also have to deal with the situation when there is a surrogate word
30068 // = 0xc000 if the last word is a surrogate
30087 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30088 if((word &0xF800 ) != 0xD800) {
30089 *utf32_output++ = char32_t(word);
30092 uint16_t diff = uint16_t(word - 0xD800);
30130 // 1. Check if there are any surrogate words in the input chunk.
30131 // We also have to deal with the situation when there is a surrogate word
30136 // = 0xc000 if the last word is a surrogate
30155 uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30156 if((word &0xF800 ) != 0xD800) {
30157 *utf32_output++ = char32_t(word);
30160 uint16_t diff = uint16_t(word - 0xD800);
30341 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30396 We expand the input word (16-bit) into two code units (32-bit), thus
30473 // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
30481 uint32_t word = buf[k];
30482 if((word & 0xFFFFFF80)==0) {
30483 *utf8_output++ = char(word);
30484 } else if((word & 0xFFFFF800)==0) {
30485 *utf8_output++ = char((word>>6) | 0b11000000);
30486 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30487 } else if((word &0xFFFF0000 )==0) {
30488 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
30489 *utf8_output++ = char((word>>12) | 0b11100000);
30490 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30491 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30493 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
30494 *utf8_output++ = char((word>>18) | 0b11110000);
30495 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
30496 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30497 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30597 // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30658 We expand the input word (16-bit) into two code units (32-bit), thus
30735 // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
30743 uint32_t word = buf[k];
30744 if((word & 0xFFFFFF80)==0) {
30745 *utf8_output++ = char(word);
30746 } else if((word & 0xFFFFF800)==0) {
30747 *utf8_output++ = char((word>>6) | 0b11000000);
30748 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30749 } else if((word &0xFFFF0000 )==0) {
30750 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
30751 *utf8_output++ = char((word>>12) | 0b11100000);
30752 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30753 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30755 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
30756 *utf8_output++ = char((word>>18) | 0b11110000);
30757 *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
30758 *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30759 *utf8_output++ = char((word & 0b111111) | 0b10000000);
30807 uint32_t word = buf[k];
30808 if((word & 0xFFFF0000)==0) {
30810 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
30811 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30814 if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
30815 word -= 0x10000;
30816 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30817 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
30876 uint32_t word = buf[k];
30877 if((word & 0xFFFF0000)==0) {
30879 if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
30880 *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30883 if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
30884 word -= 0x10000;
30885 uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30886 uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
31679 } else { // In case of success, we want the number of words written
32024 } else { // In case of success, we want the number of words written
32069 // We count one word for anything that is not a continuation (so
32428 } else { // In case of success, we want the number of words written