simdutf.cpp - OpenGrok cross reference for /third_party/node/deps/simdutf/simdutf.cpp

Lines Matching defs:word
4340 inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
4341   return uint16_t((word >> 8) | (word << 8));
4349     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4350     if((word &0xF800) == 0xD800) {
4352         uint16_t diff = uint16_t(word - 0xD800);
4370     uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
4371     if((word & 0xF800) == 0xD800) {
4373         uint16_t diff = uint16_t(word - 0xD800);
4392     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4393     counter += ((word & 0xFC00) != 0xDC00);
4404     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4406     counter += static_cast<size_t>(word > 0x7F);    // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
4407     counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) || (word >= 0xE000));   // three-byte
4418     uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
4419     counter += ((word & 0xFC00) != 0xDC00);
10657     uint32_t word = data[pos];
10658     if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
10669     uint32_t word = data[pos];
10670     if(word > 0x10FFFF) {
10673     if(word >= 0xD800 && word <= 0xDFFF) {
10699     counter++;                                      // non-surrogate word
10781     uint32_t word = data[pos];
10782     if((word & 0xFFFFFF80)==0) {
10784       *utf8_output++ = char(word);
10786     } else if((word & 0xFFFFF800)==0) {
10789       *utf8_output++ = char((word>>6) | 0b11000000);
10790       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10792     } else if((word & 0xFFFF0000)==0) {
10795       *utf8_output++ = char((word>>12) | 0b11100000);
10796       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10797       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10802       *utf8_output++ = char((word>>18) | 0b11110000);
10803       *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10804       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10805       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10845     uint32_t word = data[pos];
10846     if((word & 0xFFFFFF80)==0) {
10848       *utf8_output++ = char(word);
10850     } else if((word & 0xFFFFF800)==0) {
10853       *utf8_output++ = char((word>>6) | 0b11000000);
10854       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10856     } else if((word & 0xFFFF0000)==0) {
10859 			if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10860       *utf8_output++ = char((word>>12) | 0b11100000);
10861       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10862       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10867 			if (word > 0x10FFFF) { return 0; }
10868       *utf8_output++ = char((word>>18) | 0b11110000);
10869       *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10870       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10871       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10894     uint32_t word = data[pos];
10895     if((word & 0xFFFFFF80)==0) {
10897       *utf8_output++ = char(word);
10899     } else if((word & 0xFFFFF800)==0) {
10902       *utf8_output++ = char((word>>6) | 0b11000000);
10903       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10905     } else if((word & 0xFFFF0000)==0) {
10908 			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
10909       *utf8_output++ = char((word>>12) | 0b11100000);
10910       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10911       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10916 			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
10917       *utf8_output++ = char((word>>18) | 0b11110000);
10918       *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
10919       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
10920       *utf8_output++ = char((word & 0b111111) | 0b10000000);
10950     uint32_t word = data[pos];
10951     if((word & 0xFFFF0000)==0) {
10953       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
10957       word -= 0x10000;
10958       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
10959       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
10994     uint32_t word = data[pos];
10995     if((word & 0xFFFF0000)==0) {
10996       if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
10998       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
11001       if (word > 0x10FFFF) { return 0; }
11002       word -= 0x10000;
11003       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11004       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11023     uint32_t word = data[pos];
11024     if((word & 0xFFFF0000)==0) {
11025       if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
11027       *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
11030       if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
11031       word -= 0x10000;
11032       uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
11033       uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
11084     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11085     if((word & 0xFF80)==0) {
11087       *utf8_output++ = char(word);
11089     } else if((word & 0xF800)==0) {
11092       *utf8_output++ = char((word>>6) | 0b11000000);
11093       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11095     } else if((word &0xF800 ) != 0xD800) {
11098       *utf8_output++ = char((word>>12) | 0b11100000);
11099       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11100       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11104       uint16_t diff = uint16_t(word - 0xD800);
11157     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11158     if((word & 0xFF80)==0) {
11160       *utf8_output++ = char(word);
11162     } else if((word & 0xF800)==0) {
11165       *utf8_output++ = char((word>>6) | 0b11000000);
11166       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11168     } else if((word &0xF800 ) != 0xD800) {
11171       *utf8_output++ = char((word>>12) | 0b11100000);
11172       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11173       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11178       uint16_t diff = uint16_t(word - 0xD800);
11216     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11217     if((word & 0xFF80)==0) {
11219       *utf8_output++ = char(word);
11221     } else if((word & 0xF800)==0) {
11224       *utf8_output++ = char((word>>6) | 0b11000000);
11225       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11227     } else if((word &0xF800 ) != 0xD800) {
11230       *utf8_output++ = char((word>>12) | 0b11100000);
11231       *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
11232       *utf8_output++ = char((word & 0b111111) | 0b10000000);
11237       uint16_t diff = uint16_t(word - 0xD800);
11278     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11279     if((word &0xF800 ) != 0xD800) {
11280       // No surrogate pair, extend 16-bit word to 32-bit word
11281       *utf32_output++ = char32_t(word);
11285       uint16_t diff = uint16_t(word - 0xD800);
11319     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11320     if((word &0xF800 ) != 0xD800) {
11321       // No surrogate pair, extend 16-bit word to 32-bit word
11322       *utf32_output++ = char32_t(word);
11326       uint16_t diff = uint16_t(word - 0xD800);
11346     uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
11347     if((word &0xF800 ) != 0xD800) {
11348       // No surrogate pair, extend 16-bit word to 32-bit word
11349       *utf32_output++ = char32_t(word);
11353       uint16_t diff = uint16_t(word - 0xD800);
11410       // a single UTF-16 word.
11420       // a single UTF-16 word.
11429       // we have a 4-byte UTF-8 word.
11498       // a single UTF-16 word.
11511       // a single UTF-16 word.
11530       // we have a 4-byte UTF-8 word.
11587       // a single UTF-16 word.
11600       // a single UTF-16 word.
11617       // we have a 4-byte UTF-8 word.
11751       // we have a 4-byte UTF-8 word.
11834       // we have a 4-byte UTF-8 word.
11904       // we have a 4-byte UTF-8 word.
12054     uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12055     *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12069     uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
12070     *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
12218       // we have a 4-byte UTF-8 word.
12297   uint16_t word = 0;
12301     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12302     too_large |= word;
12303     *current_write++ = char(word & 0xFF);
12319   uint16_t word;
12343     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12344     if((word & 0xFF00 ) == 0) {
12345         *latin_output++ = char(word & 0xFF);
12494   uint16_t word = 0;
12497     word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
12498     *latin_output++ = char(word);
12634   // in the middle of a very long word).
12720             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
12722             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
12890         //    byte of each word, we compress the two vectors into one which
12901         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12910             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12916             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12920             // L - word mask for low surrogates
12925                               // (A low surrogate placed in the 7th register's word
12936                 // The 15th word may be either a low or high surrogate. It the next
12960         //    byte of each word, we compress the two vectors into one which
12972         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
12981             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
12987             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
12991             // L - word mask for low surrogates
12996                               // (A low surrogate placed in the 7th register's word
13007                 // The 15th word may be either a low or high surrogate. It the next
13109     // input 8-bit word : [aabb|bbbb] x 8
13747         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
13748         if(word <= 0xff) {
13749           *latin1_output++ = char(word);
13789     a single word may produce one, two or three UTF8 bytes.
13853           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
13916           We expand the input word (16-bit) into two code units (32-bit), thus
14026         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14027         if((word & 0xFF80)==0) {
14028           *utf8_output++ = char(word);
14029         } else if((word & 0xF800)==0) {
14030           *utf8_output++ = char((word>>6) | 0b11000000);
14031           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14032         } else if((word &0xF800 ) != 0xD800) {
14033           *utf8_output++ = char((word>>12) | 0b11100000);
14034           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14035           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14038           uint16_t diff = uint16_t(word - 0xD800);
14107           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14170           We expand the input word (16-bit) into two code units (32-bit), thus
14280         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14281         if((word & 0xFF80)==0) {
14282           *utf8_output++ = char(word);
14283         } else if((word & 0xF800)==0) {
14284           *utf8_output++ = char((word>>6) | 0b11000000);
14285           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14286         } else if((word &0xF800 ) != 0xD800) {
14287           *utf8_output++ = char((word>>12) | 0b11100000);
14288           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14289           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14292           uint16_t diff = uint16_t(word - 0xD800);
14341     a single word may produce one, two or three UTF8 bytes.
14394         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14395         if((word &0xF800 ) != 0xD800) {
14396           *utf32_output++ = char32_t(word);
14399           uint16_t diff = uint16_t(word - 0xD800);
14452         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
14453         if((word &0xF800 ) != 0xD800) {
14454           *utf32_output++ = char32_t(word);
14457           uint16_t diff = uint16_t(word - 0xD800);
14518         uint32_t word = buf[k];
14519         if(word <= 0xff) {
14520           *latin1_output++ = char(word);
14562         // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14624             We expand the input word (16-bit) into two code units (32-bit), thus
14725     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14734         uint32_t word = buf[k];
14735         if((word & 0xFFFFFF80)==0) {
14736           *utf8_output++ = char(word);
14737         } else if((word & 0xFFFFF800)==0) {
14738           *utf8_output++ = char((word>>6) | 0b11000000);
14739           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14740         } else if((word & 0xFFFF0000)==0) {
14741           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14742           *utf8_output++ = char((word>>12) | 0b11100000);
14743           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14744           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14746           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
14747           *utf8_output++ = char((word>>18) | 0b11110000);
14748           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14749           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14750           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14795         // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
14862             We expand the input word (16-bit) into two code units (32-bit), thus
14964     // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
14973         uint32_t word = buf[k];
14974         if((word & 0xFFFFFF80)==0) {
14975           *utf8_output++ = char(word);
14976         } else if((word & 0xFFFFF800)==0) {
14977           *utf8_output++ = char((word>>6) | 0b11000000);
14978           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14979         } else if((word & 0xFFFF0000)==0) {
14980           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14981           *utf8_output++ = char((word>>12) | 0b11100000);
14982           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14983           *utf8_output++ = char((word & 0b111111) | 0b10000000);
14985           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
14986           *utf8_output++ = char((word>>18) | 0b11110000);
14987           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
14988           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
14989           *utf8_output++ = char((word & 0b111111) | 0b10000000);
15027         uint32_t word = buf[k];
15028         if((word & 0xFFFF0000)==0) {
15030           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15031           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15034           if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
15035           word -= 0x10000;
15036           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15037           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15088         uint32_t word = buf[k];
15089         if((word & 0xFFFF0000)==0) {
15091           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15092           *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
15095           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
15096           word -= 0x10000;
15097           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
15098           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
15889         } else {    // In case of success, we want the number of word written
16234         } else {    // In case of success, we want the number of word written
16279       // We count one word for anything that is not a continuation (so
16638         } else {    // In case of success, we want the number of word written
18154         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18159         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18227         //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0
18232         //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0
18308         Bit layout of single word. We show 4 cases for each possible
18607     - pair.second   - the first unprocessed output word
19392       uint16_t word;
19393       while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19395         *latin1_output++ = uint8_t(word);
19415       uint16_t word;
19416       while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
19418         *latin1_output++ = uint8_t(word);
19437  * is written to 'outlen' and the function reports the number of input word
19682         /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
19707         //  5. Store all valid UTF-32 code units (low surrogate positions and 32nd word are invalid)
19721         // Only process 31 code units, but keep track if the 31st word is a high surrogate as a carry
19862       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
19922         We expand the input word (16-bit) into two code units (32-bit), thus
20016       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20024         uint32_t word = buf[k];
20025         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
20026           *utf8_output++ = char(word);
20027         } else if((word & 0xFFFFF800)==0) { // 2-byte
20028           *utf8_output++ = char((word>>6) | 0b11000000);
20029           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20030         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
20031           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
20032           *utf8_output++ = char((word>>12) | 0b11100000);
20033           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20034           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20036           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
20037           *utf8_output++ = char((word>>18) | 0b11110000);
20038           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20039           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20040           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20107       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
20172         We expand the input word (16-bit) into two code units (32-bit), thus
20266       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
20274         uint32_t word = buf[k];
20275         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
20276           *utf8_output++ = char(word);
20277         } else if((word & 0xFFFFF800)==0) { // 2-byte
20278           *utf8_output++ = char((word>>6) | 0b11000000);
20279           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20280         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
20281           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
20282           *utf8_output++ = char((word>>12) | 0b11100000);
20283           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20284           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20286           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
20287           *utf8_output++ = char((word>>18) | 0b11110000);
20288           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
20289           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
20290           *utf8_output++ = char((word & 0b111111) | 0b10000000);
20340         uint32_t word = buf[k];
20341         if((word & 0xFFFF0000)==0) {
20343           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
20344           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20347           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
20348           word -= 0x10000;
20349           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20350           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20408         uint32_t word = buf[k];
20409         if((word & 0xFFFF0000)==0) {
20411           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
20412           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
20415           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
20416           word -= 0x10000;
20417           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
20418           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
20688         // two most significant bytes of any 32-bit word. On the other hand, to
20690         // significant bytes of a 32-bit word since they always come in pairs in
21916       // We count one word for anything that is not a continuation (so
22077             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
22079             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
22238     0xd000 .. 0xd7ff - a valid word
22248     - V = valid word,
22252       0   1   2   3   4   5   6   7    <--- word index
22265                                   and recheck this word in the next iteration
22283         //    byte of each word, we compress the two vectors into one which
22298         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22308             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22314             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22318             // L - word mask for low surrogates
22323                                               // (A low surrogate placed in the 7th register's word
22335                 // The 31 word may be either a low or high surrogate. It the next
22361         //    byte of each word, we compress the two vectors into one which
22376         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
22386             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
22392             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
22396             // L - word mask for low surrogates
22401                                               // (A low surrogate placed in the 7th register's word
22413                 // The 31 word may be either a low or high surrogate. It the next
22502     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
22516     // input 16-bit word : [0000|0000|aabb|bbbb] x 8
22583         // Zero extend each byte in xmm0 to word and put it in another xmm register
22589         // Zero extend each byte in the shifted xmm0 to word in xmm0
23009         uint16_t word = !match_system(big_endian)
23012         if (word <= 0xff) {
23013           *latin1_output++ = char(word);
23057     a single word may produce one, two or three UTF8 bytes.
23098     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23120           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23162     // 1. Check if there are any surrogate word in the input chunk.
23163     //    We have also deal with situation when there is a surrogate word
23168     //         = 0xc000 if the last word is a surrogate
23184           We expand the input word (16-bit) into two code units (32-bit), thus
23286         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23287         if((word & 0xFF80)==0) {
23288           *utf8_output++ = char(word);
23289         } else if((word & 0xF800)==0) {
23290           *utf8_output++ = char((word>>6) | 0b11000000);
23291           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23292         } else if((word &0xF800 ) != 0xD800) {
23293           *utf8_output++ = char((word>>12) | 0b11100000);
23294           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23295           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23298           uint16_t diff = uint16_t(word - 0xD800);
23341     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
23363           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23405     // 1. Check if there are any surrogate word in the input chunk.
23406     //    We have also deal with situation when there is a surrogate word
23411     //         = 0xc000 if the last word is a surrogate
23427           We expand the input word (16-bit) into two code units (32-bit), thus
23529         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23530         if((word & 0xFF80)==0) {
23531           *utf8_output++ = char(word);
23532         } else if((word & 0xF800)==0) {
23533           *utf8_output++ = char((word>>6) | 0b11000000);
23534           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23535         } else if((word &0xF800 ) != 0xD800) {
23536           *utf8_output++ = char((word>>12) | 0b11100000);
23537           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
23538           *utf8_output++ = char((word & 0b111111) | 0b10000000);
23541           uint16_t diff = uint16_t(word - 0xD800);
23589     a single word may produce one, two or three UTF8 bytes.
23628     // 1. Check if there are any surrogate word in the input chunk.
23629     //    We have also deal with situation when there is a surrogate word
23634     //         = 0xc000 if the last word is a surrogate
23653         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23654         if((word &0xF800 ) != 0xD800) {
23656           *utf32_output++ = char32_t(word);
23659           uint16_t diff = uint16_t(word - 0xD800);
23696     // 1. Check if there are any surrogate word in the input chunk.
23697     //    We have also deal with situation when there is a surrogate word
23702     //         = 0xc000 if the last word is a surrogate
23721         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
23722         if((word &0xF800 ) != 0xD800) {
23724           *utf32_output++ = char32_t(word);
23727           uint16_t diff = uint16_t(word - 0xD800);
23881       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
23941         We expand the input word (16-bit) into two code units (32-bit), thus
24035       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24043         uint32_t word = buf[k];
24044         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
24045           *utf8_output++ = char(word);
24046         } else if((word & 0xFFFFF800)==0) { // 2-byte
24047           *utf8_output++ = char((word>>6) | 0b11000000);
24048           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24049         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
24050           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
24051           *utf8_output++ = char((word>>12) | 0b11100000);
24052           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24053           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24055           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
24056           *utf8_output++ = char((word>>18) | 0b11110000);
24057           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24058           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24059           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24126       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
24191         We expand the input word (16-bit) into two code units (32-bit), thus
24285       // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
24293         uint32_t word = buf[k];
24294         if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
24295           *utf8_output++ = char(word);
24296         } else if((word & 0xFFFFF800)==0) { // 2-byte
24297           *utf8_output++ = char((word>>6) | 0b11000000);
24298           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24299         } else if((word & 0xFFFF0000 )==0) {  // 3-byte
24300           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
24301           *utf8_output++ = char((word>>12) | 0b11100000);
24302           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24303           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24305           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
24306           *utf8_output++ = char((word>>18) | 0b11110000);
24307           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
24308           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
24309           *utf8_output++ = char((word & 0b111111) | 0b10000000);
24356         uint32_t word = buf[k];
24357         if((word & 0xFFFF0000)==0) {
24359           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
24360           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24363           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
24364           word -= 0x10000;
24365           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24366           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
24424         uint32_t word = buf[k];
24425         if((word & 0xFFFF0000)==0) {
24427           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
24428           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
24431           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
24432           word -= 0x10000;
24433           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
24434           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
25300         } else {    // In case of success, we want the number of word written
25645         } else {    // In case of success, we want the number of word written
25690       // We count one word for anything that is not a continuation (so
26051         } else {    // In case of success, we want the number of word written
27672         } else {    // In case of success, we want the number of word written
28017         } else {    // In case of success, we want the number of word written
28062       // We count one word for anything that is not a continuation (so
28442           // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
28543             // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
28545             // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
28715     0xd000 .. 0xd7ff - a valid word
28725     - V = valid word,
28729       0   1   2   3   4   5   6   7    <--- word index
28742                                   and recheck this word in the next iteration
28760         //    byte of each word, we compress the two vectors into one which
28774         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28784             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28790             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28794             // L - word mask for low surrogates
28799                                               // (A low surrogate placed in the 7th register's word
28811                 // The 15th word may be either a low or high surrogate. It the next
28837         //    byte of each word, we compress the two vectors into one which
28852         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
28862             //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
28868             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
28872             // L - word mask for low surrogates
28877                                               // (A low surrogate placed in the 7th register's word
28889                 // The 15th word may be either a low or high surrogate. It the next
29525         uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29526         if(word <= 0xff) {
29527           *latin1_output++ = char(word);
29568     a single word may produce one, two or three UTF8 bytes.
29608     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29653     // 1. Check if there are any surrogate word in the input chunk.
29654     //    We have also deal with situation when there is a surrogate word
29659     //         = 0xc000 if the last word is a surrogate
29673           We expand the input word (16-bit) into two code units (32-bit), thus
29758         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29759         if((word & 0xFF80)==0) {
29760           *utf8_output++ = char(word);
29761         } else if((word & 0xF800)==0) {
29762           *utf8_output++ = char((word>>6) | 0b11000000);
29763           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29764         } else if((word &0xF800 ) != 0xD800) {
29765           *utf8_output++ = char((word>>12) | 0b11100000);
29766           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29767           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29770           uint16_t diff = uint16_t(word - 0xD800);
29812     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
29857     // 1. Check if there are any surrogate word in the input chunk.
29858     //    We have also deal with situation when there is a surrogate word
29863     //         = 0xc000 if the last word is a surrogate
29877           We expand the input word (16-bit) into two code units (32-bit), thus
29962         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
29963         if((word & 0xFF80)==0) {
29964           *utf8_output++ = char(word);
29965         } else if((word & 0xF800)==0) {
29966           *utf8_output++ = char((word>>6) | 0b11000000);
29967           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29968         } else if((word &0xF800 ) != 0xD800) {
29969           *utf8_output++ = char((word>>12) | 0b11100000);
29970           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
29971           *utf8_output++ = char((word & 0b111111) | 0b10000000);
29974           uint16_t diff = uint16_t(word - 0xD800);
30023     a single word may produce one, two or three UTF8 bytes.
30062     // 1. Check if there are any surrogate word in the input chunk.
30063     //    We have also deal with situation when there is a surrogate word
30068     //         = 0xc000 if the last word is a surrogate
30087         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30088         if((word &0xF800 ) != 0xD800) {
30089           *utf32_output++ = char32_t(word);
30092           uint16_t diff = uint16_t(word - 0xD800);
30130     // 1. Check if there are any surrogate word in the input chunk.
30131     //    We have also deal with situation when there is a surrogate word
30136     //         = 0xc000 if the last word is a surrogate
30155         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
30156         if((word &0xF800 ) != 0xD800) {
30157           *utf32_output++ = char32_t(word);
30160           uint16_t diff = uint16_t(word - 0xD800);
30341       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30396         We expand the input word (16-bit) into two code units (32-bit), thus
30473       // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
30481         uint32_t word = buf[k];
30482         if((word & 0xFFFFFF80)==0) {
30483           *utf8_output++ = char(word);
30484         } else if((word & 0xFFFFF800)==0) {
30485           *utf8_output++ = char((word>>6) | 0b11000000);
30486           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30487         } else if((word &0xFFFF0000 )==0) {
30488           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
30489           *utf8_output++ = char((word>>12) | 0b11100000);
30490           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30491           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30493           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
30494           *utf8_output++ = char((word>>18) | 0b11110000);
30495           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
30496           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30497           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30597       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
30658         We expand the input word (16-bit) into two code units (32-bit), thus
30735       // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
30743         uint32_t word = buf[k];
30744         if((word & 0xFFFFFF80)==0) {
30745           *utf8_output++ = char(word);
30746         } else if((word & 0xFFFFF800)==0) {
30747           *utf8_output++ = char((word>>6) | 0b11000000);
30748           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30749         } else if((word &0xFFFF0000 )==0) {
30750           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
30751           *utf8_output++ = char((word>>12) | 0b11100000);
30752           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30753           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30755           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
30756           *utf8_output++ = char((word>>18) | 0b11110000);
30757           *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
30758           *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
30759           *utf8_output++ = char((word & 0b111111) | 0b10000000);
30807         uint32_t word = buf[k];
30808         if((word & 0xFFFF0000)==0) {
30810           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
30811           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30814           if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
30815           word -= 0x10000;
30816           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30817           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
30876         uint32_t word = buf[k];
30877         if((word & 0xFFFF0000)==0) {
30879           if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
30880           *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
30883           if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
30884           word -= 0x10000;
30885           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
30886           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
31679         } else {    // In case of success, we want the number of word written
32024         } else {    // In case of success, we want the number of word written
32069       // We count one word for anything that is not a continuation (so
32428         } else {    // In case of success, we want the number of word written