1/* Copyright JS Foundation and other contributors, http://js.foundation 2 * 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "lit-strings.h" 17 18#include "jrt-libc-includes.h" 19 20/** 21 * Validate utf-8 string 22 * 23 * NOTE: 24 * Isolated surrogates are allowed. 25 * Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character. 26 * 27 * @return true if utf-8 string is well-formed 28 * false otherwise 29 */ 30bool 31lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 32 lit_utf8_size_t buf_size) /**< string size */ 33{ 34 lit_utf8_size_t idx = 0; 35 36 bool is_prev_code_point_high_surrogate = false; 37 while (idx < buf_size) 38 { 39 lit_utf8_byte_t c = utf8_buf_p[idx++]; 40 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 41 { 42 is_prev_code_point_high_surrogate = false; 43 continue; 44 } 45 46 lit_code_point_t code_point = 0; 47 lit_code_point_t min_code_point = 0; 48 lit_utf8_size_t extra_bytes_count; 49 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 50 { 51 extra_bytes_count = 1; 52 min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; 53 code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 54 } 55 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 56 { 57 extra_bytes_count = 2; 58 min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; 59 code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 60 } 61 else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) 62 { 63 extra_bytes_count = 3; 64 min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN; 65 code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); 66 } 67 else 68 { 69 /* utf-8 string could not contain 5- and 6-byte sequences. */ 70 return false; 71 } 72 73 if (idx + extra_bytes_count > buf_size) 74 { 75 /* utf-8 string breaks in the middle */ 76 return false; 77 } 78 79 for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) 80 { 81 c = utf8_buf_p[idx + offset]; 82 if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) 83 { 84 /* invalid continuation byte */ 85 return false; 86 } 87 code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 88 code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); 89 } 90 91 if (code_point < min_code_point 92 || code_point > LIT_UNICODE_CODE_POINT_MAX) 93 { 94 /* utf-8 string doesn't encode valid unicode code point */ 95 return false; 96 } 97 98 if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN 99 && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) 100 { 101 is_prev_code_point_high_surrogate = true; 102 } 103 else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN 104 && code_point <= LIT_UTF16_LOW_SURROGATE_MAX 105 && is_prev_code_point_high_surrogate) 106 { 107 /* sequence of high and low surrogate is not allowed */ 108 return false; 109 } 110 else 111 { 112 is_prev_code_point_high_surrogate = false; 113 } 114 115 idx += extra_bytes_count; 116 } 117 118 return true; 119} /* lit_is_valid_utf8_string */ 120 121/** 122 * Validate cesu-8 string 123 * 124 * @return true if cesu-8 string is well-formed 125 * false otherwise 126 */ 127bool 128lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 129 lit_utf8_size_t buf_size) /**< string size */ 130{ 131 lit_utf8_size_t idx = 0; 132 133 while (idx < buf_size) 134 { 135 lit_utf8_byte_t c = cesu8_buf_p[idx++]; 136 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 137 { 138 continue; 139 } 140 141 lit_code_point_t code_point = 0; 142 lit_code_point_t min_code_point = 0; 143 lit_utf8_size_t extra_bytes_count; 144 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 145 { 146 extra_bytes_count = 1; 147 min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; 148 code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 149 } 150 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 151 { 152 extra_bytes_count = 2; 153 min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; 154 code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 155 } 156 else 157 { 158 return false; 159 } 160 161 if (idx + extra_bytes_count > buf_size) 162 { 163 /* cesu-8 string breaks in the middle */ 164 return false; 165 } 166 167 for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) 168 { 169 c = cesu8_buf_p[idx + offset]; 170 if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) 171 { 172 /* invalid continuation byte */ 173 return false; 174 } 175 code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 176 code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); 177 } 178 179 if (code_point < min_code_point) 180 { 181 /* cesu-8 string doesn't encode valid unicode code point */ 182 return false; 183 } 184 185 idx += extra_bytes_count; 186 } 187 188 return true; 189} /* lit_is_valid_cesu8_string */ 190 191/** 192 * Check if the code point is UTF-16 low surrogate 193 * 194 * @return true / false 195 */ 196bool 197lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */ 198{ 199 return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX; 200} /* lit_is_code_point_utf16_low_surrogate */ 201 202/** 203 * Check if the code point is UTF-16 high surrogate 204 * 205 * @return true / false 206 */ 207bool 208lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */ 209{ 210 return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX; 211} /* lit_is_code_point_utf16_high_surrogate */ 212 213/** 214 * Represents code point (>0xFFFF) as surrogate pair and returns its lower part 215 * 216 * @return lower code_unit of the surrogate pair 217 */ 218static ecma_char_t 219convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ 220{ 221 JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); 222 223 ecma_char_t code_unit_bits; 224 code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK); 225 226 return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits); 227} /* convert_code_point_to_low_surrogate */ 228 229/** 230 * Represents code point (>0xFFFF) as surrogate pair and returns its higher part 231 * 232 * @return higher code_unit of the surrogate pair 233 */ 234static ecma_char_t 235convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ 236{ 237 JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); 238 JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX); 239 240 ecma_char_t code_unit_bits; 241 code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE); 242 243 return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits); 244} /* convert_code_point_to_high_surrogate */ 245 246/** 247 * UTF16 Encoding method for a code point 248 * 249 * See also: 250 * ECMA-262 v6, 10.1.1 251 * 252 * @return uint8_t, the number of returning code points 253 */ 254uint8_t 255lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */ 256 ecma_char_t *cu_p) /**< result of the encoding */ 257{ 258 if (cp <= LIT_UTF16_CODE_UNIT_MAX) 259 { 260 cu_p[0] = (ecma_char_t) cp; 261 return 1; 262 } 263 264 cu_p[0] = convert_code_point_to_high_surrogate (cp); 265 cu_p[1] = convert_code_point_to_low_surrogate (cp); 266 return 2; 267} /* lit_utf16_encode_code_point */ 268 269/** 270 * Calculate size of a zero-terminated utf-8 string 271 * 272 * NOTE: 273 * - string cannot be NULL 274 * - string should not contain zero characters in the middle 275 * 276 * @return size of a string 277 */ 278lit_utf8_size_t 279lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */ 280{ 281 JERRY_ASSERT (utf8_str_p != NULL); 282 return (lit_utf8_size_t) strlen ((const char *) utf8_str_p); 283} /* lit_zt_utf8_string_size */ 284 285/** 286 * Calculate length of a cesu-8 encoded string 287 * 288 * @return UTF-16 code units count 289 */ 290ecma_length_t 291lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 292 lit_utf8_size_t utf8_buf_size) /**< string size */ 293{ 294 ecma_length_t length = 0; 295 lit_utf8_size_t size = 0; 296 297 while (size < utf8_buf_size) 298 { 299 size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size)); 300 length++; 301 } 302 303 JERRY_ASSERT (size == utf8_buf_size); 304 305 return length; 306} /* lit_utf8_string_length */ 307 308/** 309 * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string 310 * 311 * @return size of an utf-8 encoded string 312 */ 313lit_utf8_size_t 314lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 315 lit_utf8_size_t cesu8_buf_size) /**< string size */ 316{ 317 lit_utf8_size_t offset = 0; 318 lit_utf8_size_t utf8_buf_size = cesu8_buf_size; 319 ecma_char_t prev_ch = 0; 320 321 while (offset < cesu8_buf_size) 322 { 323 ecma_char_t ch; 324 offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); 325 326 if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) 327 { 328 utf8_buf_size -= 2; 329 } 330 331 prev_ch = ch; 332 } 333 334 JERRY_ASSERT (offset == cesu8_buf_size); 335 336 return utf8_buf_size; 337} /* lit_get_utf8_size_of_cesu8_string */ 338 339/** 340 * Calculate length of an utf-8 encoded string from cesu-8 encoded string 341 * 342 * @return length of an utf-8 encoded string 343 */ 344ecma_length_t 345lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 346 lit_utf8_size_t cesu8_buf_size) /**< string size */ 347{ 348 lit_utf8_size_t offset = 0; 349 ecma_length_t utf8_length = 0; 350 ecma_char_t prev_ch = 0; 351 352 while (offset < cesu8_buf_size) 353 { 354 ecma_char_t ch; 355 offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); 356 357 if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch)) 358 { 359 utf8_length++; 360 } 361 362 prev_ch = ch; 363 } 364 365 JERRY_ASSERT (offset == cesu8_buf_size); 366 367 return utf8_length; 368} /* lit_get_utf8_length_of_cesu8_string */ 369 370/** 371 * Decodes a unicode code point from non-empty utf-8-encoded buffer 372 * 373 * @return number of bytes occupied by code point in the string 374 */ 375lit_utf8_size_t 376lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 377 lit_utf8_size_t buf_size, /**< size of the buffer in bytes */ 378 lit_code_point_t *code_point) /**< [out] code point */ 379{ 380 JERRY_ASSERT (buf_p && buf_size); 381 382 lit_utf8_byte_t c = buf_p[0]; 383 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 384 { 385 *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK); 386 return 1; 387 } 388 389 lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; 390 ecma_length_t bytes_count = 0; 391 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 392 { 393 bytes_count = 2; 394 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 395 } 396 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 397 { 398 bytes_count = 3; 399 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 400 } 401 else 402 { 403 JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER); 404 bytes_count = 4; 405 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); 406 } 407 408 JERRY_ASSERT (buf_size >= bytes_count); 409 410 for (uint32_t i = 1; i < bytes_count; ++i) 411 { 412 ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 413 ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); 414 } 415 416 *code_point = ret; 417 return bytes_count; 418} /* lit_read_code_point_from_utf8 */ 419 420/** 421 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 422 * 423 * @return number of bytes occupied by code point in the string 424 */ 425lit_utf8_size_t 426lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 427 ecma_char_t *code_point) /**< [out] code point */ 428{ 429 JERRY_ASSERT (buf_p); 430 431 lit_utf8_byte_t c = buf_p[0]; 432 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 433 { 434 *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK); 435 return 1; 436 } 437 438 lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; 439 ecma_length_t bytes_count; 440 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 441 { 442 bytes_count = 2; 443 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 444 } 445 else 446 { 447 JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); 448 bytes_count = 3; 449 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 450 } 451 452 for (uint32_t i = 1; i < bytes_count; ++i) 453 { 454 ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 455 ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); 456 } 457 458 JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX); 459 *code_point = (ecma_char_t) ret; 460 return bytes_count; 461} /* lit_read_code_unit_from_utf8 */ 462 463/** 464 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 465 * 466 * @return number of bytes occupied by code point in the string 467 */ 468lit_utf8_size_t 469lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 470 ecma_char_t *code_point) /**< [out] code point */ 471{ 472 JERRY_ASSERT (buf_p); 473 474 lit_utf8_decr (&buf_p); 475 return lit_read_code_unit_from_utf8 (buf_p, code_point); 476} /* lit_read_prev_code_unit_from_utf8 */ 477 478/** 479 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 480 * 481 * @return next code unit 482 */ 483ecma_char_t 484lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 485{ 486 JERRY_ASSERT (*buf_p); 487 ecma_char_t ch; 488 489 *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch); 490 491 return ch; 492} /* lit_cesu8_read_next */ 493 494/** 495 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 496 * 497 * @return previous code unit 498 */ 499ecma_char_t 500lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 501{ 502 JERRY_ASSERT (*buf_p); 503 ecma_char_t ch; 504 505 lit_utf8_decr (buf_p); 506 lit_read_code_unit_from_utf8 (*buf_p, &ch); 507 508 return ch; 509} /* lit_cesu8_read_prev */ 510 511/** 512 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 513 * 514 * @return next code unit 515 */ 516ecma_char_t JERRY_ATTR_NOINLINE 517lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ 518{ 519 JERRY_ASSERT (buf_p != NULL); 520 ecma_char_t ch; 521 522 lit_read_code_unit_from_utf8 (buf_p, &ch); 523 524 return ch; 525} /* lit_cesu8_peek_next */ 526 527/** 528 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 529 * 530 * @return previous code unit 531 */ 532ecma_char_t JERRY_ATTR_NOINLINE 533lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ 534{ 535 JERRY_ASSERT (buf_p != NULL); 536 ecma_char_t ch; 537 538 lit_read_prev_code_unit_from_utf8 (buf_p, &ch); 539 540 return ch; 541} /* lit_cesu8_peek_prev */ 542 543/** 544 * Increase cesu-8 encoded string pointer by one code unit. 545 */ 546inline void JERRY_ATTR_ALWAYS_INLINE 547lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 548{ 549 JERRY_ASSERT (*buf_p); 550 551 *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p); 552} /* lit_utf8_incr */ 553 554/** 555 * Decrease cesu-8 encoded string pointer by one code unit. 556 */ 557void 558lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 559{ 560 JERRY_ASSERT (*buf_p); 561 const lit_utf8_byte_t *current_p = *buf_p; 562 563 do 564 { 565 current_p--; 566 } 567 while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER); 568 569 *buf_p = current_p; 570} /* lit_utf8_decr */ 571 572/** 573 * Calc hash using the specified hash_basis. 574 * 575 * NOTE: 576 * This is implementation of FNV-1a hash function, which is released into public domain. 577 * Constants used, are carefully picked primes by the authors. 578 * More info: http://www.isthe.com/chongo/tech/comp/fnv/ 579 * 580 * @return ecma-string's hash 581 */ 582inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE 583lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */ 584 const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ 585 lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ 586{ 587 JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); 588 589 uint32_t hash = hash_basis; 590 591 for (uint32_t i = 0; i < utf8_buf_size; i++) 592 { 593 /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */ 594 hash = (hash ^ utf8_buf_p[i]) * 16777619; 595 } 596 597 return (lit_string_hash_t) hash; 598} /* lit_utf8_string_hash_combine */ 599 600/** 601 * Calculate hash from the buffer. 602 * 603 * @return ecma-string's hash 604 */ 605inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE 606lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ 607 lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ 608{ 609 JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); 610 611 /* 32 bit offset_basis for FNV = 2166136261 */ 612 return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size); 613} /* lit_utf8_string_calc_hash */ 614 615/** 616 * Return code unit at the specified position in string 617 * 618 * NOTE: 619 * code_unit_offset should be less then string's length 620 * 621 * @return code unit value 622 */ 623ecma_char_t 624lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 625 lit_utf8_size_t utf8_buf_size, /**< string size in bytes */ 626 ecma_length_t code_unit_offset) /**< ofset of a code_unit */ 627{ 628 lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p; 629 ecma_char_t code_unit; 630 631 do 632 { 633 JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size); 634 current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit); 635 } 636 while (code_unit_offset--); 637 638 return code_unit; 639} /* lit_utf8_string_code_unit_at */ 640 641/** 642 * Get CESU-8 encoded size of character 643 * 644 * @return number of bytes occupied in CESU-8 645 */ 646inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE 647lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */ 648{ 649 if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 650 { 651 return 1; 652 } 653 else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 654 { 655 return 2; 656 } 657 else 658 { 659 JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); 660 return 3; 661 } 662} /* lit_get_unicode_char_size_by_utf8_first_byte */ 663 664/** 665 * Convert code unit to cesu-8 representation 666 * 667 * @return byte count required to represent the code unit 668 */ 669lit_utf8_size_t 670lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */ 671 lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size 672 * should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */ 673{ 674 if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) 675 { 676 buf_p[0] = (lit_utf8_byte_t) code_unit; 677 return 1; 678 } 679 else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) 680 { 681 uint32_t code_unit_bits = code_unit; 682 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 683 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 684 685 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK); 686 JERRY_ASSERT (first_byte_bits == code_unit_bits); 687 688 buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; 689 buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 690 return 2; 691 } 692 else 693 { 694 uint32_t code_unit_bits = code_unit; 695 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 696 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 697 698 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 699 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 700 701 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK); 702 JERRY_ASSERT (first_byte_bits == code_unit_bits); 703 704 buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; 705 buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 706 buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 707 return 3; 708 } 709} /* lit_code_unit_to_utf8 */ 710 711/** 712 * Convert code point to cesu-8 representation 713 * 714 * @return byte count required to represent the code point 715 */ 716lit_utf8_size_t 717lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */ 718 lit_utf8_byte_t *buf) /**< buffer where to store the result, 719 * its size should be at least 6 bytes */ 720{ 721 if (code_point <= LIT_UTF16_CODE_UNIT_MAX) 722 { 723 return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf); 724 } 725 else 726 { 727 lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf); 728 offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset); 729 return offset; 730 } 731} /* lit_code_point_to_cesu8 */ 732 733/** 734 * Convert code point to utf-8 representation 735 * 736 * @return byte count required to represent the code point 737 */ 738lit_utf8_size_t 739lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */ 740 lit_utf8_byte_t *buf) /**< buffer where to store the result, 741 * its size should be at least 4 bytes */ 742{ 743 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) 744 { 745 buf[0] = (lit_utf8_byte_t) code_point; 746 return 1; 747 } 748 else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) 749 { 750 uint32_t code_point_bits = code_point; 751 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 752 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 753 754 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK); 755 JERRY_ASSERT (first_byte_bits == code_point_bits); 756 757 buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; 758 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 759 return 2; 760 } 761 else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX) 762 { 763 uint32_t code_point_bits = code_point; 764 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 765 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 766 767 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 768 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 769 770 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK); 771 JERRY_ASSERT (first_byte_bits == code_point_bits); 772 773 buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; 774 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 775 buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 776 return 3; 777 } 778 else 779 { 780 JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX); 781 782 uint32_t code_point_bits = code_point; 783 lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 784 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 785 786 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 787 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 788 789 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 790 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 791 792 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK); 793 JERRY_ASSERT (first_byte_bits == code_point_bits); 794 795 buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits; 796 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 797 buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 798 buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits; 799 return 4; 800 } 801} /* lit_code_point_to_utf8 */ 802 803/** 804 * Convert cesu-8 string to an utf-8 string and put it into the buffer. 805 * It is the caller's responsibility to make sure that the string fits in the buffer. 806 * 807 * @return number of bytes copied to the buffer. 808 */ 809lit_utf8_size_t 810lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */ 811 lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */ 812 lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer 813 * (can be NULL if buffer_size == 0) */ 814 lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */ 815{ 816 const lit_utf8_byte_t *cesu8_pos = cesu8_string; 817 const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size; 818 819 lit_utf8_byte_t *utf8_pos = utf8_string; 820 lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size; 821 822 lit_utf8_size_t size = 0; 823 824 ecma_char_t prev_ch = 0; 825 lit_utf8_size_t prev_ch_size = 0; 826 827 while (cesu8_pos < cesu8_end_pos) 828 { 829 ecma_char_t ch; 830 lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch); 831 832 if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) 833 { 834 JERRY_ASSERT (code_unit_size == prev_ch_size); 835 utf8_pos -= prev_ch_size; 836 lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch); 837 lit_code_point_to_utf8 (code_point, utf8_pos); 838 size++; 839 } 840 else 841 { 842 memcpy (utf8_pos, cesu8_pos, code_unit_size); 843 size += code_unit_size; 844 } 845 846 utf8_pos = utf8_string + size; 847 cesu8_pos += code_unit_size; 848 prev_ch = ch; 849 prev_ch_size = code_unit_size; 850 } 851 852 JERRY_ASSERT (cesu8_pos == cesu8_end_pos); 853 JERRY_ASSERT (utf8_pos <= utf8_end_pos); 854 855 return size; 856} /* lit_convert_cesu8_string_to_utf8_string */ 857 858/** 859 * Convert surrogate pair to code point 860 * 861 * @return code point 862 */ 863lit_code_point_t 864lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */ 865 ecma_char_t low_surrogate) /**< low surrogate code point */ 866{ 867 JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate)); 868 JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate)); 869 870 lit_code_point_t code_point; 871 code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN); 872 code_point <<= LIT_UTF16_BITS_IN_SURROGATE; 873 874 code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT; 875 876 code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN); 877 return code_point; 878} /* lit_convert_surrogate_pair_to_code_point */ 879 880/** 881 * Relational compare of cesu-8 strings 882 * 883 * First string is less than second string if: 884 * - strings are not equal; 885 * - first string is prefix of second or is lexicographically less than second. 886 * 887 * @return true - if first string is less than second string, 888 * false - otherwise 889 */ 890bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */ 891 lit_utf8_size_t string1_size, /**< string size */ 892 const lit_utf8_byte_t *string2_p, /**< utf-8 string */ 893 lit_utf8_size_t string2_size) /**< string size */ 894{ 895 lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p; 896 lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p; 897 const lit_utf8_byte_t *string1_end_p = string1_p + string1_size; 898 const lit_utf8_byte_t *string2_end_p = string2_p + string2_size; 899 900 while (string1_pos < string1_end_p && string2_pos < string2_end_p) 901 { 902 ecma_char_t ch1, ch2; 903 string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1); 904 string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2); 905 906 if (ch1 < ch2) 907 { 908 return true; 909 } 910 else if (ch1 > ch2) 911 { 912 return false; 913 } 914 } 915 916 return (string1_pos >= string1_end_p && string2_pos < string2_end_p); 917} /* lit_compare_utf8_strings_relational */ 918