1//     __ _____ _____ _____
2//  __|  |   __|     |   | |  JSON for Modern C++ (supporting code)
3// |  |  |__   |  |  | | | |  version 3.11.2
4// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
5//
6// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
7// SPDX-License-Identifier: MIT
8
9#include "doctest_compatibility.h"
10
11// for some reason including this after the json header leads to linker errors with VS 2017...
12#include <locale>
13#include <nlohmann/json.hpp>
14using nlohmann::json;
15
16#include <fstream>
17#include <sstream>
18#include <iomanip>
19#include "make_test_data_available.hpp"
20
21TEST_CASE("Unicode (1/5)" * doctest::skip())
22{
23    SECTION("\\uxxxx sequences")
24    {
25        // create an escaped string from a code point
26        const auto codepoint_to_unicode = [](std::size_t cp)
27        {
28            // code points are represented as a six-character sequence: a
29            // reverse solidus, followed by the lowercase letter u, followed
30            // by four hexadecimal digits that encode the character's code
31            // point
32            std::stringstream ss;
33            ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
34            return ss.str();
35        };
36
37        SECTION("correct sequences")
38        {
39            // generate all UTF-8 code points; in total, 1112064 code points are
40            // generated: 0x1FFFFF code points - 2048 invalid values between
41            // 0xD800 and 0xDFFF.
42            for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
43            {
44                // string to store the code point as in \uxxxx format
45                std::string json_text = "\"";
46
47                // decide whether to use one or two \uxxxx sequences
48                if (cp < 0x10000u)
49                {
50                    // The Unicode standard permanently reserves these code point
51                    // values for UTF-16 encoding of the high and low surrogates, and
52                    // they will never be assigned a character, so there should be no
53                    // reason to encode them. The official Unicode standard says that
54                    // no UTF forms, including UTF-16, can encode these code points.
55                    if (cp >= 0xD800u && cp <= 0xDFFFu)
56                    {
57                        // if we would not skip these code points, we would get a
58                        // "missing low surrogate" exception
59                        continue;
60                    }
61
62                    // code points in the Basic Multilingual Plane can be
63                    // represented with one \uxxxx sequence
64                    json_text += codepoint_to_unicode(cp);
65                }
66                else
67                {
68                    // To escape an extended character that is not in the Basic
69                    // Multilingual Plane, the character is represented as a
70                    // 12-character sequence, encoding the UTF-16 surrogate pair
71                    const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
72                    const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
73                    json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
74                }
75
76                json_text += "\"";
77                CAPTURE(json_text)
78                json _;
79                CHECK_NOTHROW(_ = json::parse(json_text));
80            }
81        }
82
83        SECTION("incorrect sequences")
84        {
85            SECTION("incorrect surrogate values")
86            {
87                json _;
88
89                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uDC00\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'", json::parse_error&);
90
91                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'", json::parse_error&);
92
93                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800]\""), "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'", json::parse_error&);
94
95                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\v\""), "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'", json::parse_error&);
96
97                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\u123\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'", json::parse_error&);
98
99                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uDBFF\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'", json::parse_error&);
100
101                CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uE000\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'", json::parse_error&);
102            }
103        }
104
105#if 0
106        SECTION("incorrect sequences")
107        {
108            SECTION("high surrogate without low surrogate")
109            {
110                // D800..DBFF are high surrogates and must be followed by low
111                // surrogates DC00..DFFF; here, nothing follows
112                for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
113                {
114                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
115                    CAPTURE(json_text)
116                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
117                }
118            }
119
120            SECTION("high surrogate with wrong low surrogate")
121            {
122                // D800..DBFF are high surrogates and must be followed by low
123                // surrogates DC00..DFFF; here a different sequence follows
124                for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
125                {
126                    for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
127                    {
128                        if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
129                        {
130                            continue;
131                        }
132
133                        std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
134                        CAPTURE(json_text)
135                        CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
136                    }
137                }
138            }
139
140            SECTION("low surrogate without high surrogate")
141            {
142                // low surrogates DC00..DFFF must follow high surrogates; here,
143                // they occur alone
144                for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
145                {
146                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
147                    CAPTURE(json_text)
148                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
149                }
150            }
151
152        }
153#endif
154    }
155
156    SECTION("read all unicode characters")
157    {
158        // read a file with all unicode characters stored as single-character
159        // strings in a JSON array
160        std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
161        json j;
162        CHECK_NOTHROW(f >> j);
163
164        // the array has 1112064 + 1 elements (a terminating "null" value)
165        // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
166        // 0xD800 and 0xDFFF.
167        CHECK(j.size() == 1112065);
168
169        SECTION("check JSON Pointers")
170        {
171            for (const auto& s : j)
172            {
173                // skip non-string JSON values
174                if (!s.is_string())
175                {
176                    continue;
177                }
178
179                auto ptr = s.get<std::string>();
180
181                // tilde must be followed by 0 or 1
182                if (ptr == "~")
183                {
184                    ptr += "0";
185                }
186
187                // JSON Pointers must begin with "/"
188                ptr.insert(0, "/");
189
190                CHECK_NOTHROW(json::json_pointer("/" + ptr));
191
192                // check escape/unescape roundtrip
193                auto escaped = nlohmann::detail::escape(ptr);
194                nlohmann::detail::unescape(escaped);
195                CHECK(escaped == ptr);
196            }
197        }
198    }
199
200    SECTION("ignore byte-order-mark")
201    {
202        SECTION("in a stream")
203        {
204            // read a file with a UTF-8 BOM
205            std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
206            json j;
207            CHECK_NOTHROW(f >> j);
208        }
209
210        SECTION("with an iterator")
211        {
212            std::string i = "\xef\xbb\xbf{\n   \"foo\": true\n}";
213            json _;
214            CHECK_NOTHROW(_ = json::parse(i.begin(), i.end()));
215        }
216    }
217
218    SECTION("error for incomplete/wrong BOM")
219    {
220        json _;
221        CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
222        CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
223    }
224}
225
226namespace
227{
228void roundtrip(bool success_expected, const std::string& s);
229
230void roundtrip(bool success_expected, const std::string& s)
231{
232    CAPTURE(s)
233    json _;
234
235    // create JSON string value
236    json j = s;
237    // create JSON text
238    std::string ps = std::string("\"") + s + "\"";
239
240    if (success_expected)
241    {
242        // serialization succeeds
243        CHECK_NOTHROW(j.dump());
244
245        // exclude parse test for U+0000
246        if (s[0] != '\0')
247        {
248            // parsing JSON text succeeds
249            CHECK_NOTHROW(_ = json::parse(ps));
250        }
251
252        // roundtrip succeeds
253        CHECK_NOTHROW(_ = json::parse(j.dump()));
254
255        // after roundtrip, the same string is stored
256        json jr = json::parse(j.dump());
257        CHECK(jr.get<std::string>() == s);
258    }
259    else
260    {
261        // serialization fails
262        CHECK_THROWS_AS(j.dump(), json::type_error&);
263
264        // parsing JSON text fails
265        CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
266    }
267}
268} // namespace
269
270TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
271{
272    // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
273    // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
274
275    SECTION("1  Some correct UTF-8 text")
276    {
277        roundtrip(true, "κόσμε");
278    }
279
280    SECTION("2  Boundary condition test cases")
281    {
282        SECTION("2.1  First possible sequence of a certain length")
283        {
284            // 2.1.1  1 byte  (U-00000000)
285            roundtrip(true, std::string("\0", 1));
286            // 2.1.2  2 bytes (U-00000080)
287            roundtrip(true, "\xc2\x80");
288            // 2.1.3  3 bytes (U-00000800)
289            roundtrip(true, "\xe0\xa0\x80");
290            // 2.1.4  4 bytes (U-00010000)
291            roundtrip(true, "\xf0\x90\x80\x80");
292
293            // 2.1.5  5 bytes (U-00200000)
294            roundtrip(false, "\xF8\x88\x80\x80\x80");
295            // 2.1.6  6 bytes (U-04000000)
296            roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
297        }
298
299        SECTION("2.2  Last possible sequence of a certain length")
300        {
301            // 2.2.1  1 byte  (U-0000007F)
302            roundtrip(true, "\x7f");
303            // 2.2.2  2 bytes (U-000007FF)
304            roundtrip(true, "\xdf\xbf");
305            // 2.2.3  3 bytes (U-0000FFFF)
306            roundtrip(true, "\xef\xbf\xbf");
307
308            // 2.2.4  4 bytes (U-001FFFFF)
309            roundtrip(false, "\xF7\xBF\xBF\xBF");
310            // 2.2.5  5 bytes (U-03FFFFFF)
311            roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
312            // 2.2.6  6 bytes (U-7FFFFFFF)
313            roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
314        }
315
316        SECTION("2.3  Other boundary conditions")
317        {
318            // 2.3.1  U-0000D7FF = ed 9f bf
319            roundtrip(true, "\xed\x9f\xbf");
320            // 2.3.2  U-0000E000 = ee 80 80
321            roundtrip(true, "\xee\x80\x80");
322            // 2.3.3  U-0000FFFD = ef bf bd
323            roundtrip(true, "\xef\xbf\xbd");
324            // 2.3.4  U-0010FFFF = f4 8f bf bf
325            roundtrip(true, "\xf4\x8f\xbf\xbf");
326
327            // 2.3.5  U-00110000 = f4 90 80 80
328            roundtrip(false, "\xf4\x90\x80\x80");
329        }
330    }
331
332    SECTION("3  Malformed sequences")
333    {
334        SECTION("3.1  Unexpected continuation bytes")
335        {
336            // Each unexpected continuation byte should be separately signalled as a
337            // malformed sequence of its own.
338
339            // 3.1.1  First continuation byte 0x80
340            roundtrip(false, "\x80");
341            // 3.1.2  Last  continuation byte 0xbf
342            roundtrip(false, "\xbf");
343
344            // 3.1.3  2 continuation bytes
345            roundtrip(false, "\x80\xbf");
346            // 3.1.4  3 continuation bytes
347            roundtrip(false, "\x80\xbf\x80");
348            // 3.1.5  4 continuation bytes
349            roundtrip(false, "\x80\xbf\x80\xbf");
350            // 3.1.6  5 continuation bytes
351            roundtrip(false, "\x80\xbf\x80\xbf\x80");
352            // 3.1.7  6 continuation bytes
353            roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
354            // 3.1.8  7 continuation bytes
355            roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
356
357            // 3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf)
358            roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
359        }
360
361        SECTION("3.2  Lonely start characters")
362        {
363            // 3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf)
364            roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
365            // 3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef)
366            roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
367            // 3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7)
368            roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
369            // 3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb)
370            roundtrip(false, "\xf8 \xf9 \xfa \xfb");
371            // 3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd)
372            roundtrip(false, "\xfc \xfd");
373        }
374
375        SECTION("3.3  Sequences with last continuation byte missing")
376        {
377            // All bytes of an incomplete sequence should be signalled as a single
378            // malformed sequence, i.e., you should see only a single replacement
379            // character in each of the next 10 tests. (Characters as in section 2)
380
381            // 3.3.1  2-byte sequence with last byte missing (U+0000)
382            roundtrip(false, "\xc0");
383            // 3.3.2  3-byte sequence with last byte missing (U+0000)
384            roundtrip(false, "\xe0\x80");
385            // 3.3.3  4-byte sequence with last byte missing (U+0000)
386            roundtrip(false, "\xf0\x80\x80");
387            // 3.3.4  5-byte sequence with last byte missing (U+0000)
388            roundtrip(false, "\xf8\x80\x80\x80");
389            // 3.3.5  6-byte sequence with last byte missing (U+0000)
390            roundtrip(false, "\xfc\x80\x80\x80\x80");
391            // 3.3.6  2-byte sequence with last byte missing (U-000007FF)
392            roundtrip(false, "\xdf");
393            // 3.3.7  3-byte sequence with last byte missing (U-0000FFFF)
394            roundtrip(false, "\xef\xbf");
395            // 3.3.8  4-byte sequence with last byte missing (U-001FFFFF)
396            roundtrip(false, "\xf7\xbf\xbf");
397            // 3.3.9  5-byte sequence with last byte missing (U-03FFFFFF)
398            roundtrip(false, "\xfb\xbf\xbf\xbf");
399            // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
400            roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
401        }
402
403        SECTION("3.4  Concatenation of incomplete sequences")
404        {
405            // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
406            // sequences being signalled:
407            roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
408        }
409
410        SECTION("3.5  Impossible bytes")
411        {
412            // The following two bytes cannot appear in a correct UTF-8 string
413
414            // 3.5.1  fe
415            roundtrip(false, "\xfe");
416            // 3.5.2  ff
417            roundtrip(false, "\xff");
418            // 3.5.3  fe fe ff ff
419            roundtrip(false, "\xfe\xfe\xff\xff");
420        }
421    }
422
423    SECTION("4  Overlong sequences")
424    {
425        // The following sequences are not malformed according to the letter of
426        // the Unicode 2.0 standard. However, they are longer then necessary and
427        // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
428        // decoder" should reject them just like malformed sequences for two
429        // reasons: (1) It helps to debug applications if overlong sequences are
430        // not treated as valid representations of characters, because this helps
431        // to spot problems more quickly. (2) Overlong sequences provide
432        // alternative representations of characters, that could maliciously be
433        // used to bypass filters that check only for ASCII characters. For
434        // instance, a 2-byte encoded line feed (LF) would not be caught by a
435        // line counter that counts only 0x0a bytes, but it would still be
436        // processed as a line feed by an unsafe UTF-8 decoder later in the
437        // pipeline. From a security point of view, ASCII compatibility of UTF-8
438        // sequences means also, that ASCII characters are *only* allowed to be
439        // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
440        // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
441        // reject overlong UTF-8 sequences for which a shorter encoding exists.
442
443        SECTION("4.1  Examples of an overlong ASCII character")
444        {
445            // With a safe UTF-8 decoder, all of the following five overlong
446            // representations of the ASCII character slash ("/") should be rejected
447            // like a malformed UTF-8 sequence, for instance by substituting it with
448            // a replacement character. If you see a slash below, you do not have a
449            // safe UTF-8 decoder!
450
451            // 4.1.1 U+002F = c0 af
452            roundtrip(false, "\xc0\xaf");
453            // 4.1.2 U+002F = e0 80 af
454            roundtrip(false, "\xe0\x80\xaf");
455            // 4.1.3 U+002F = f0 80 80 af
456            roundtrip(false, "\xf0\x80\x80\xaf");
457            // 4.1.4 U+002F = f8 80 80 80 af
458            roundtrip(false, "\xf8\x80\x80\x80\xaf");
459            // 4.1.5 U+002F = fc 80 80 80 80 af
460            roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
461        }
462
463        SECTION("4.2  Maximum overlong sequences")
464        {
465            // Below you see the highest Unicode value that is still resulting in an
466            // overlong sequence if represented with the given number of bytes. This
467            // is a boundary test for safe UTF-8 decoders. All five characters should
468            // be rejected like malformed UTF-8 sequences.
469
470            // 4.2.1  U-0000007F = c1 bf
471            roundtrip(false, "\xc1\xbf");
472            // 4.2.2  U-000007FF = e0 9f bf
473            roundtrip(false, "\xe0\x9f\xbf");
474            // 4.2.3  U-0000FFFF = f0 8f bf bf
475            roundtrip(false, "\xf0\x8f\xbf\xbf");
476            // 4.2.4  U-001FFFFF = f8 87 bf bf bf
477            roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
478            // 4.2.5  U-03FFFFFF = fc 83 bf bf bf bf
479            roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
480        }
481
482        SECTION("4.3  Overlong representation of the NUL character")
483        {
484            // The following five sequences should also be rejected like malformed
485            // UTF-8 sequences and should not be treated like the ASCII NUL
486            // character.
487
488            // 4.3.1  U+0000 = c0 80
489            roundtrip(false, "\xc0\x80");
490            // 4.3.2  U+0000 = e0 80 80
491            roundtrip(false, "\xe0\x80\x80");
492            // 4.3.3  U+0000 = f0 80 80 80
493            roundtrip(false, "\xf0\x80\x80\x80");
494            // 4.3.4  U+0000 = f8 80 80 80 80
495            roundtrip(false, "\xf8\x80\x80\x80\x80");
496            // 4.3.5  U+0000 = fc 80 80 80 80 80
497            roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
498        }
499    }
500
501    SECTION("5  Illegal code positions")
502    {
503        // The following UTF-8 sequences should be rejected like malformed
504        // sequences, because they never represent valid ISO 10646 characters and
505        // a UTF-8 decoder that accepts them might introduce security problems
506        // comparable to overlong UTF-8 sequences.
507
508        SECTION("5.1 Single UTF-16 surrogates")
509        {
510            // 5.1.1  U+D800 = ed a0 80
511            roundtrip(false, "\xed\xa0\x80");
512            // 5.1.2  U+DB7F = ed ad bf
513            roundtrip(false, "\xed\xad\xbf");
514            // 5.1.3  U+DB80 = ed ae 80
515            roundtrip(false, "\xed\xae\x80");
516            // 5.1.4  U+DBFF = ed af bf
517            roundtrip(false, "\xed\xaf\xbf");
518            // 5.1.5  U+DC00 = ed b0 80
519            roundtrip(false, "\xed\xb0\x80");
520            // 5.1.6  U+DF80 = ed be 80
521            roundtrip(false, "\xed\xbe\x80");
522            // 5.1.7  U+DFFF = ed bf bf
523            roundtrip(false, "\xed\xbf\xbf");
524        }
525
526        SECTION("5.2 Paired UTF-16 surrogates")
527        {
528            // 5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80
529            roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
530            // 5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf
531            roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
532            // 5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80
533            roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
534            // 5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf
535            roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
536            // 5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80
537            roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
538            // 5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf
539            roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
540            // 5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80
541            roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
542            // 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf
543            roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
544        }
545
546        SECTION("5.3 Noncharacter code positions")
547        {
548            // The following "noncharacters" are "reserved for internal use" by
549            // applications, and according to older versions of the Unicode Standard
550            // "should never be interchanged". Unicode Corrigendum #9 dropped the
551            // latter restriction. Nevertheless, their presence in incoming UTF-8 data
552            // can remain a potential security risk, depending on what use is made of
553            // these codes subsequently. Examples of such internal use:
554            //
555            //  - Some file APIs with 16-bit characters may use the integer value -1
556            //    = U+FFFF to signal an end-of-file (EOF) or error condition.
557            //
558            //  - In some UTF-16 receivers, code point U+FFFE might trigger a
559            //    byte-swap operation (to convert between UTF-16LE and UTF-16BE).
560            //
561            // With such internal use of noncharacters, it may be desirable and safer
562            // to block those code points in UTF-8 decoders, as they should never
563            // occur legitimately in incoming UTF-8 data, and could trigger unsafe
564            // behaviour in subsequent processing.
565
566            // Particularly problematic noncharacters in 16-bit applications:
567
568            // 5.3.1  U+FFFE = ef bf be
569            roundtrip(true, "\xef\xbf\xbe");
570            // 5.3.2  U+FFFF = ef bf bf
571            roundtrip(true, "\xef\xbf\xbf");
572
573            // 5.3.3  U+FDD0 .. U+FDEF
574            roundtrip(true, "\xEF\xB7\x90");
575            roundtrip(true, "\xEF\xB7\x91");
576            roundtrip(true, "\xEF\xB7\x92");
577            roundtrip(true, "\xEF\xB7\x93");
578            roundtrip(true, "\xEF\xB7\x94");
579            roundtrip(true, "\xEF\xB7\x95");
580            roundtrip(true, "\xEF\xB7\x96");
581            roundtrip(true, "\xEF\xB7\x97");
582            roundtrip(true, "\xEF\xB7\x98");
583            roundtrip(true, "\xEF\xB7\x99");
584            roundtrip(true, "\xEF\xB7\x9A");
585            roundtrip(true, "\xEF\xB7\x9B");
586            roundtrip(true, "\xEF\xB7\x9C");
587            roundtrip(true, "\xEF\xB7\x9D");
588            roundtrip(true, "\xEF\xB7\x9E");
589            roundtrip(true, "\xEF\xB7\x9F");
590            roundtrip(true, "\xEF\xB7\xA0");
591            roundtrip(true, "\xEF\xB7\xA1");
592            roundtrip(true, "\xEF\xB7\xA2");
593            roundtrip(true, "\xEF\xB7\xA3");
594            roundtrip(true, "\xEF\xB7\xA4");
595            roundtrip(true, "\xEF\xB7\xA5");
596            roundtrip(true, "\xEF\xB7\xA6");
597            roundtrip(true, "\xEF\xB7\xA7");
598            roundtrip(true, "\xEF\xB7\xA8");
599            roundtrip(true, "\xEF\xB7\xA9");
600            roundtrip(true, "\xEF\xB7\xAA");
601            roundtrip(true, "\xEF\xB7\xAB");
602            roundtrip(true, "\xEF\xB7\xAC");
603            roundtrip(true, "\xEF\xB7\xAD");
604            roundtrip(true, "\xEF\xB7\xAE");
605            roundtrip(true, "\xEF\xB7\xAF");
606
607            // 5.3.4  U+nFFFE U+nFFFF (for n = 1..10)
608            roundtrip(true, "\xF0\x9F\xBF\xBF");
609            roundtrip(true, "\xF0\xAF\xBF\xBF");
610            roundtrip(true, "\xF0\xBF\xBF\xBF");
611            roundtrip(true, "\xF1\x8F\xBF\xBF");
612            roundtrip(true, "\xF1\x9F\xBF\xBF");
613            roundtrip(true, "\xF1\xAF\xBF\xBF");
614            roundtrip(true, "\xF1\xBF\xBF\xBF");
615            roundtrip(true, "\xF2\x8F\xBF\xBF");
616            roundtrip(true, "\xF2\x9F\xBF\xBF");
617            roundtrip(true, "\xF2\xAF\xBF\xBF");
618        }
619    }
620}
621