1// __ _____ _____ _____ 2// __| | __| | | | JSON for Modern C++ (supporting code) 3// | | |__ | | | | | | version 3.11.2 4// |_____|_____|_____|_|___| https://github.com/nlohmann/json 5// 6// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me> 7// SPDX-License-Identifier: MIT 8 9#include "doctest_compatibility.h" 10 11// for some reason including this after the json header leads to linker errors with VS 2017... 12#include <locale> 13#include <nlohmann/json.hpp> 14using nlohmann::json; 15 16#include <fstream> 17#include <sstream> 18#include <iomanip> 19#include "make_test_data_available.hpp" 20 21TEST_CASE("Unicode (1/5)" * doctest::skip()) 22{ 23 SECTION("\\uxxxx sequences") 24 { 25 // create an escaped string from a code point 26 const auto codepoint_to_unicode = [](std::size_t cp) 27 { 28 // code points are represented as a six-character sequence: a 29 // reverse solidus, followed by the lowercase letter u, followed 30 // by four hexadecimal digits that encode the character's code 31 // point 32 std::stringstream ss; 33 ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp; 34 return ss.str(); 35 }; 36 37 SECTION("correct sequences") 38 { 39 // generate all UTF-8 code points; in total, 1112064 code points are 40 // generated: 0x1FFFFF code points - 2048 invalid values between 41 // 0xD800 and 0xDFFF. 42 for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp) 43 { 44 // string to store the code point as in \uxxxx format 45 std::string json_text = "\""; 46 47 // decide whether to use one or two \uxxxx sequences 48 if (cp < 0x10000u) 49 { 50 // The Unicode standard permanently reserves these code point 51 // values for UTF-16 encoding of the high and low surrogates, and 52 // they will never be assigned a character, so there should be no 53 // reason to encode them. The official Unicode standard says that 54 // no UTF forms, including UTF-16, can encode these code points. 55 if (cp >= 0xD800u && cp <= 0xDFFFu) 56 { 57 // if we would not skip these code points, we would get a 58 // "missing low surrogate" exception 59 continue; 60 } 61 62 // code points in the Basic Multilingual Plane can be 63 // represented with one \uxxxx sequence 64 json_text += codepoint_to_unicode(cp); 65 } 66 else 67 { 68 // To escape an extended character that is not in the Basic 69 // Multilingual Plane, the character is represented as a 70 // 12-character sequence, encoding the UTF-16 surrogate pair 71 const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu); 72 const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu); 73 json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2); 74 } 75 76 json_text += "\""; 77 CAPTURE(json_text) 78 json _; 79 CHECK_NOTHROW(_ = json::parse(json_text)); 80 } 81 } 82 83 SECTION("incorrect sequences") 84 { 85 SECTION("incorrect surrogate values") 86 { 87 json _; 88 89 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uDC00\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'", json::parse_error&); 90 91 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'", json::parse_error&); 92 93 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800]\""), "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'", json::parse_error&); 94 95 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\v\""), "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'", json::parse_error&); 96 97 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\u123\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'", json::parse_error&); 98 99 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uDBFF\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'", json::parse_error&); 100 101 CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uE000\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'", json::parse_error&); 102 } 103 } 104 105#if 0 106 SECTION("incorrect sequences") 107 { 108 SECTION("high surrogate without low surrogate") 109 { 110 // D800..DBFF are high surrogates and must be followed by low 111 // surrogates DC00..DFFF; here, nothing follows 112 for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp) 113 { 114 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\""; 115 CAPTURE(json_text) 116 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&); 117 } 118 } 119 120 SECTION("high surrogate with wrong low surrogate") 121 { 122 // D800..DBFF are high surrogates and must be followed by low 123 // surrogates DC00..DFFF; here a different sequence follows 124 for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1) 125 { 126 for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2) 127 { 128 if (0xDC00u <= cp2 && cp2 <= 0xDFFFu) 129 { 130 continue; 131 } 132 133 std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\""; 134 CAPTURE(json_text) 135 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&); 136 } 137 } 138 } 139 140 SECTION("low surrogate without high surrogate") 141 { 142 // low surrogates DC00..DFFF must follow high surrogates; here, 143 // they occur alone 144 for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp) 145 { 146 std::string json_text = "\"" + codepoint_to_unicode(cp) + "\""; 147 CAPTURE(json_text) 148 CHECK_THROWS_AS(json::parse(json_text), json::parse_error&); 149 } 150 } 151 152 } 153#endif 154 } 155 156 SECTION("read all unicode characters") 157 { 158 // read a file with all unicode characters stored as single-character 159 // strings in a JSON array 160 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json"); 161 json j; 162 CHECK_NOTHROW(f >> j); 163 164 // the array has 1112064 + 1 elements (a terminating "null" value) 165 // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between 166 // 0xD800 and 0xDFFF. 167 CHECK(j.size() == 1112065); 168 169 SECTION("check JSON Pointers") 170 { 171 for (const auto& s : j) 172 { 173 // skip non-string JSON values 174 if (!s.is_string()) 175 { 176 continue; 177 } 178 179 auto ptr = s.get<std::string>(); 180 181 // tilde must be followed by 0 or 1 182 if (ptr == "~") 183 { 184 ptr += "0"; 185 } 186 187 // JSON Pointers must begin with "/" 188 ptr.insert(0, "/"); 189 190 CHECK_NOTHROW(json::json_pointer("/" + ptr)); 191 192 // check escape/unescape roundtrip 193 auto escaped = nlohmann::detail::escape(ptr); 194 nlohmann::detail::unescape(escaped); 195 CHECK(escaped == ptr); 196 } 197 } 198 } 199 200 SECTION("ignore byte-order-mark") 201 { 202 SECTION("in a stream") 203 { 204 // read a file with a UTF-8 BOM 205 std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json"); 206 json j; 207 CHECK_NOTHROW(f >> j); 208 } 209 210 SECTION("with an iterator") 211 { 212 std::string i = "\xef\xbb\xbf{\n \"foo\": true\n}"; 213 json _; 214 CHECK_NOTHROW(_ = json::parse(i.begin(), i.end())); 215 } 216 } 217 218 SECTION("error for incomplete/wrong BOM") 219 { 220 json _; 221 CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&); 222 CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&); 223 } 224} 225 226namespace 227{ 228void roundtrip(bool success_expected, const std::string& s); 229 230void roundtrip(bool success_expected, const std::string& s) 231{ 232 CAPTURE(s) 233 json _; 234 235 // create JSON string value 236 json j = s; 237 // create JSON text 238 std::string ps = std::string("\"") + s + "\""; 239 240 if (success_expected) 241 { 242 // serialization succeeds 243 CHECK_NOTHROW(j.dump()); 244 245 // exclude parse test for U+0000 246 if (s[0] != '\0') 247 { 248 // parsing JSON text succeeds 249 CHECK_NOTHROW(_ = json::parse(ps)); 250 } 251 252 // roundtrip succeeds 253 CHECK_NOTHROW(_ = json::parse(j.dump())); 254 255 // after roundtrip, the same string is stored 256 json jr = json::parse(j.dump()); 257 CHECK(jr.get<std::string>() == s); 258 } 259 else 260 { 261 // serialization fails 262 CHECK_THROWS_AS(j.dump(), json::type_error&); 263 264 // parsing JSON text fails 265 CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&); 266 } 267} 268} // namespace 269 270TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test") 271{ 272 // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0 273 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt 274 275 SECTION("1 Some correct UTF-8 text") 276 { 277 roundtrip(true, "κόσμε"); 278 } 279 280 SECTION("2 Boundary condition test cases") 281 { 282 SECTION("2.1 First possible sequence of a certain length") 283 { 284 // 2.1.1 1 byte (U-00000000) 285 roundtrip(true, std::string("\0", 1)); 286 // 2.1.2 2 bytes (U-00000080) 287 roundtrip(true, "\xc2\x80"); 288 // 2.1.3 3 bytes (U-00000800) 289 roundtrip(true, "\xe0\xa0\x80"); 290 // 2.1.4 4 bytes (U-00010000) 291 roundtrip(true, "\xf0\x90\x80\x80"); 292 293 // 2.1.5 5 bytes (U-00200000) 294 roundtrip(false, "\xF8\x88\x80\x80\x80"); 295 // 2.1.6 6 bytes (U-04000000) 296 roundtrip(false, "\xFC\x84\x80\x80\x80\x80"); 297 } 298 299 SECTION("2.2 Last possible sequence of a certain length") 300 { 301 // 2.2.1 1 byte (U-0000007F) 302 roundtrip(true, "\x7f"); 303 // 2.2.2 2 bytes (U-000007FF) 304 roundtrip(true, "\xdf\xbf"); 305 // 2.2.3 3 bytes (U-0000FFFF) 306 roundtrip(true, "\xef\xbf\xbf"); 307 308 // 2.2.4 4 bytes (U-001FFFFF) 309 roundtrip(false, "\xF7\xBF\xBF\xBF"); 310 // 2.2.5 5 bytes (U-03FFFFFF) 311 roundtrip(false, "\xFB\xBF\xBF\xBF\xBF"); 312 // 2.2.6 6 bytes (U-7FFFFFFF) 313 roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF"); 314 } 315 316 SECTION("2.3 Other boundary conditions") 317 { 318 // 2.3.1 U-0000D7FF = ed 9f bf 319 roundtrip(true, "\xed\x9f\xbf"); 320 // 2.3.2 U-0000E000 = ee 80 80 321 roundtrip(true, "\xee\x80\x80"); 322 // 2.3.3 U-0000FFFD = ef bf bd 323 roundtrip(true, "\xef\xbf\xbd"); 324 // 2.3.4 U-0010FFFF = f4 8f bf bf 325 roundtrip(true, "\xf4\x8f\xbf\xbf"); 326 327 // 2.3.5 U-00110000 = f4 90 80 80 328 roundtrip(false, "\xf4\x90\x80\x80"); 329 } 330 } 331 332 SECTION("3 Malformed sequences") 333 { 334 SECTION("3.1 Unexpected continuation bytes") 335 { 336 // Each unexpected continuation byte should be separately signalled as a 337 // malformed sequence of its own. 338 339 // 3.1.1 First continuation byte 0x80 340 roundtrip(false, "\x80"); 341 // 3.1.2 Last continuation byte 0xbf 342 roundtrip(false, "\xbf"); 343 344 // 3.1.3 2 continuation bytes 345 roundtrip(false, "\x80\xbf"); 346 // 3.1.4 3 continuation bytes 347 roundtrip(false, "\x80\xbf\x80"); 348 // 3.1.5 4 continuation bytes 349 roundtrip(false, "\x80\xbf\x80\xbf"); 350 // 3.1.6 5 continuation bytes 351 roundtrip(false, "\x80\xbf\x80\xbf\x80"); 352 // 3.1.7 6 continuation bytes 353 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf"); 354 // 3.1.8 7 continuation bytes 355 roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80"); 356 357 // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf) 358 roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"); 359 } 360 361 SECTION("3.2 Lonely start characters") 362 { 363 // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf) 364 roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf"); 365 // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef) 366 roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef"); 367 // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7) 368 roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7"); 369 // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb) 370 roundtrip(false, "\xf8 \xf9 \xfa \xfb"); 371 // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd) 372 roundtrip(false, "\xfc \xfd"); 373 } 374 375 SECTION("3.3 Sequences with last continuation byte missing") 376 { 377 // All bytes of an incomplete sequence should be signalled as a single 378 // malformed sequence, i.e., you should see only a single replacement 379 // character in each of the next 10 tests. (Characters as in section 2) 380 381 // 3.3.1 2-byte sequence with last byte missing (U+0000) 382 roundtrip(false, "\xc0"); 383 // 3.3.2 3-byte sequence with last byte missing (U+0000) 384 roundtrip(false, "\xe0\x80"); 385 // 3.3.3 4-byte sequence with last byte missing (U+0000) 386 roundtrip(false, "\xf0\x80\x80"); 387 // 3.3.4 5-byte sequence with last byte missing (U+0000) 388 roundtrip(false, "\xf8\x80\x80\x80"); 389 // 3.3.5 6-byte sequence with last byte missing (U+0000) 390 roundtrip(false, "\xfc\x80\x80\x80\x80"); 391 // 3.3.6 2-byte sequence with last byte missing (U-000007FF) 392 roundtrip(false, "\xdf"); 393 // 3.3.7 3-byte sequence with last byte missing (U-0000FFFF) 394 roundtrip(false, "\xef\xbf"); 395 // 3.3.8 4-byte sequence with last byte missing (U-001FFFFF) 396 roundtrip(false, "\xf7\xbf\xbf"); 397 // 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF) 398 roundtrip(false, "\xfb\xbf\xbf\xbf"); 399 // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF) 400 roundtrip(false, "\xfd\xbf\xbf\xbf\xbf"); 401 } 402 403 SECTION("3.4 Concatenation of incomplete sequences") 404 { 405 // All the 10 sequences of 3.3 concatenated, you should see 10 malformed 406 // sequences being signalled: 407 roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf"); 408 } 409 410 SECTION("3.5 Impossible bytes") 411 { 412 // The following two bytes cannot appear in a correct UTF-8 string 413 414 // 3.5.1 fe 415 roundtrip(false, "\xfe"); 416 // 3.5.2 ff 417 roundtrip(false, "\xff"); 418 // 3.5.3 fe fe ff ff 419 roundtrip(false, "\xfe\xfe\xff\xff"); 420 } 421 } 422 423 SECTION("4 Overlong sequences") 424 { 425 // The following sequences are not malformed according to the letter of 426 // the Unicode 2.0 standard. However, they are longer then necessary and 427 // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 428 // decoder" should reject them just like malformed sequences for two 429 // reasons: (1) It helps to debug applications if overlong sequences are 430 // not treated as valid representations of characters, because this helps 431 // to spot problems more quickly. (2) Overlong sequences provide 432 // alternative representations of characters, that could maliciously be 433 // used to bypass filters that check only for ASCII characters. For 434 // instance, a 2-byte encoded line feed (LF) would not be caught by a 435 // line counter that counts only 0x0a bytes, but it would still be 436 // processed as a line feed by an unsafe UTF-8 decoder later in the 437 // pipeline. From a security point of view, ASCII compatibility of UTF-8 438 // sequences means also, that ASCII characters are *only* allowed to be 439 // represented by ASCII bytes in the range 0x00-0x7f. To ensure this 440 // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that 441 // reject overlong UTF-8 sequences for which a shorter encoding exists. 442 443 SECTION("4.1 Examples of an overlong ASCII character") 444 { 445 // With a safe UTF-8 decoder, all of the following five overlong 446 // representations of the ASCII character slash ("/") should be rejected 447 // like a malformed UTF-8 sequence, for instance by substituting it with 448 // a replacement character. If you see a slash below, you do not have a 449 // safe UTF-8 decoder! 450 451 // 4.1.1 U+002F = c0 af 452 roundtrip(false, "\xc0\xaf"); 453 // 4.1.2 U+002F = e0 80 af 454 roundtrip(false, "\xe0\x80\xaf"); 455 // 4.1.3 U+002F = f0 80 80 af 456 roundtrip(false, "\xf0\x80\x80\xaf"); 457 // 4.1.4 U+002F = f8 80 80 80 af 458 roundtrip(false, "\xf8\x80\x80\x80\xaf"); 459 // 4.1.5 U+002F = fc 80 80 80 80 af 460 roundtrip(false, "\xfc\x80\x80\x80\x80\xaf"); 461 } 462 463 SECTION("4.2 Maximum overlong sequences") 464 { 465 // Below you see the highest Unicode value that is still resulting in an 466 // overlong sequence if represented with the given number of bytes. This 467 // is a boundary test for safe UTF-8 decoders. All five characters should 468 // be rejected like malformed UTF-8 sequences. 469 470 // 4.2.1 U-0000007F = c1 bf 471 roundtrip(false, "\xc1\xbf"); 472 // 4.2.2 U-000007FF = e0 9f bf 473 roundtrip(false, "\xe0\x9f\xbf"); 474 // 4.2.3 U-0000FFFF = f0 8f bf bf 475 roundtrip(false, "\xf0\x8f\xbf\xbf"); 476 // 4.2.4 U-001FFFFF = f8 87 bf bf bf 477 roundtrip(false, "\xf8\x87\xbf\xbf\xbf"); 478 // 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf 479 roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf"); 480 } 481 482 SECTION("4.3 Overlong representation of the NUL character") 483 { 484 // The following five sequences should also be rejected like malformed 485 // UTF-8 sequences and should not be treated like the ASCII NUL 486 // character. 487 488 // 4.3.1 U+0000 = c0 80 489 roundtrip(false, "\xc0\x80"); 490 // 4.3.2 U+0000 = e0 80 80 491 roundtrip(false, "\xe0\x80\x80"); 492 // 4.3.3 U+0000 = f0 80 80 80 493 roundtrip(false, "\xf0\x80\x80\x80"); 494 // 4.3.4 U+0000 = f8 80 80 80 80 495 roundtrip(false, "\xf8\x80\x80\x80\x80"); 496 // 4.3.5 U+0000 = fc 80 80 80 80 80 497 roundtrip(false, "\xfc\x80\x80\x80\x80\x80"); 498 } 499 } 500 501 SECTION("5 Illegal code positions") 502 { 503 // The following UTF-8 sequences should be rejected like malformed 504 // sequences, because they never represent valid ISO 10646 characters and 505 // a UTF-8 decoder that accepts them might introduce security problems 506 // comparable to overlong UTF-8 sequences. 507 508 SECTION("5.1 Single UTF-16 surrogates") 509 { 510 // 5.1.1 U+D800 = ed a0 80 511 roundtrip(false, "\xed\xa0\x80"); 512 // 5.1.2 U+DB7F = ed ad bf 513 roundtrip(false, "\xed\xad\xbf"); 514 // 5.1.3 U+DB80 = ed ae 80 515 roundtrip(false, "\xed\xae\x80"); 516 // 5.1.4 U+DBFF = ed af bf 517 roundtrip(false, "\xed\xaf\xbf"); 518 // 5.1.5 U+DC00 = ed b0 80 519 roundtrip(false, "\xed\xb0\x80"); 520 // 5.1.6 U+DF80 = ed be 80 521 roundtrip(false, "\xed\xbe\x80"); 522 // 5.1.7 U+DFFF = ed bf bf 523 roundtrip(false, "\xed\xbf\xbf"); 524 } 525 526 SECTION("5.2 Paired UTF-16 surrogates") 527 { 528 // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 529 roundtrip(false, "\xed\xa0\x80\xed\xb0\x80"); 530 // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf 531 roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf"); 532 // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 533 roundtrip(false, "\xed\xad\xbf\xed\xb0\x80"); 534 // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf 535 roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf"); 536 // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 537 roundtrip(false, "\xed\xae\x80\xed\xb0\x80"); 538 // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf 539 roundtrip(false, "\xed\xae\x80\xed\xbf\xbf"); 540 // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 541 roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80"); 542 // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf 543 roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf"); 544 } 545 546 SECTION("5.3 Noncharacter code positions") 547 { 548 // The following "noncharacters" are "reserved for internal use" by 549 // applications, and according to older versions of the Unicode Standard 550 // "should never be interchanged". Unicode Corrigendum #9 dropped the 551 // latter restriction. Nevertheless, their presence in incoming UTF-8 data 552 // can remain a potential security risk, depending on what use is made of 553 // these codes subsequently. Examples of such internal use: 554 // 555 // - Some file APIs with 16-bit characters may use the integer value -1 556 // = U+FFFF to signal an end-of-file (EOF) or error condition. 557 // 558 // - In some UTF-16 receivers, code point U+FFFE might trigger a 559 // byte-swap operation (to convert between UTF-16LE and UTF-16BE). 560 // 561 // With such internal use of noncharacters, it may be desirable and safer 562 // to block those code points in UTF-8 decoders, as they should never 563 // occur legitimately in incoming UTF-8 data, and could trigger unsafe 564 // behaviour in subsequent processing. 565 566 // Particularly problematic noncharacters in 16-bit applications: 567 568 // 5.3.1 U+FFFE = ef bf be 569 roundtrip(true, "\xef\xbf\xbe"); 570 // 5.3.2 U+FFFF = ef bf bf 571 roundtrip(true, "\xef\xbf\xbf"); 572 573 // 5.3.3 U+FDD0 .. U+FDEF 574 roundtrip(true, "\xEF\xB7\x90"); 575 roundtrip(true, "\xEF\xB7\x91"); 576 roundtrip(true, "\xEF\xB7\x92"); 577 roundtrip(true, "\xEF\xB7\x93"); 578 roundtrip(true, "\xEF\xB7\x94"); 579 roundtrip(true, "\xEF\xB7\x95"); 580 roundtrip(true, "\xEF\xB7\x96"); 581 roundtrip(true, "\xEF\xB7\x97"); 582 roundtrip(true, "\xEF\xB7\x98"); 583 roundtrip(true, "\xEF\xB7\x99"); 584 roundtrip(true, "\xEF\xB7\x9A"); 585 roundtrip(true, "\xEF\xB7\x9B"); 586 roundtrip(true, "\xEF\xB7\x9C"); 587 roundtrip(true, "\xEF\xB7\x9D"); 588 roundtrip(true, "\xEF\xB7\x9E"); 589 roundtrip(true, "\xEF\xB7\x9F"); 590 roundtrip(true, "\xEF\xB7\xA0"); 591 roundtrip(true, "\xEF\xB7\xA1"); 592 roundtrip(true, "\xEF\xB7\xA2"); 593 roundtrip(true, "\xEF\xB7\xA3"); 594 roundtrip(true, "\xEF\xB7\xA4"); 595 roundtrip(true, "\xEF\xB7\xA5"); 596 roundtrip(true, "\xEF\xB7\xA6"); 597 roundtrip(true, "\xEF\xB7\xA7"); 598 roundtrip(true, "\xEF\xB7\xA8"); 599 roundtrip(true, "\xEF\xB7\xA9"); 600 roundtrip(true, "\xEF\xB7\xAA"); 601 roundtrip(true, "\xEF\xB7\xAB"); 602 roundtrip(true, "\xEF\xB7\xAC"); 603 roundtrip(true, "\xEF\xB7\xAD"); 604 roundtrip(true, "\xEF\xB7\xAE"); 605 roundtrip(true, "\xEF\xB7\xAF"); 606 607 // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10) 608 roundtrip(true, "\xF0\x9F\xBF\xBF"); 609 roundtrip(true, "\xF0\xAF\xBF\xBF"); 610 roundtrip(true, "\xF0\xBF\xBF\xBF"); 611 roundtrip(true, "\xF1\x8F\xBF\xBF"); 612 roundtrip(true, "\xF1\x9F\xBF\xBF"); 613 roundtrip(true, "\xF1\xAF\xBF\xBF"); 614 roundtrip(true, "\xF1\xBF\xBF\xBF"); 615 roundtrip(true, "\xF2\x8F\xBF\xBF"); 616 roundtrip(true, "\xF2\x9F\xBF\xBF"); 617 roundtrip(true, "\xF2\xAF\xBF\xBF"); 618 } 619 } 620} 621