1425bb815Sopenharmony_ci/* Copyright JS Foundation and other contributors, http://js.foundation 2425bb815Sopenharmony_ci * 3425bb815Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License"); 4425bb815Sopenharmony_ci * you may not use this file except in compliance with the License. 5425bb815Sopenharmony_ci * You may obtain a copy of the License at 6425bb815Sopenharmony_ci * 7425bb815Sopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0 8425bb815Sopenharmony_ci * 9425bb815Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software 10425bb815Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS 11425bb815Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12425bb815Sopenharmony_ci * See the License for the specific language governing permissions and 13425bb815Sopenharmony_ci * limitations under the License. 14425bb815Sopenharmony_ci */ 15425bb815Sopenharmony_ci 16425bb815Sopenharmony_ci#include "lit-strings.h" 17425bb815Sopenharmony_ci 18425bb815Sopenharmony_ci#include "jrt-libc-includes.h" 19425bb815Sopenharmony_ci 20425bb815Sopenharmony_ci/** 21425bb815Sopenharmony_ci * Validate utf-8 string 22425bb815Sopenharmony_ci * 23425bb815Sopenharmony_ci * NOTE: 24425bb815Sopenharmony_ci * Isolated surrogates are allowed. 25425bb815Sopenharmony_ci * Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character. 26425bb815Sopenharmony_ci * 27425bb815Sopenharmony_ci * @return true if utf-8 string is well-formed 28425bb815Sopenharmony_ci * false otherwise 29425bb815Sopenharmony_ci */ 30425bb815Sopenharmony_cibool 31425bb815Sopenharmony_cilit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 32425bb815Sopenharmony_ci lit_utf8_size_t buf_size) /**< string size */ 33425bb815Sopenharmony_ci{ 34425bb815Sopenharmony_ci lit_utf8_size_t idx = 0; 35425bb815Sopenharmony_ci 36425bb815Sopenharmony_ci bool is_prev_code_point_high_surrogate = false; 37425bb815Sopenharmony_ci while (idx < buf_size) 38425bb815Sopenharmony_ci { 39425bb815Sopenharmony_ci lit_utf8_byte_t c = utf8_buf_p[idx++]; 40425bb815Sopenharmony_ci if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 41425bb815Sopenharmony_ci { 42425bb815Sopenharmony_ci is_prev_code_point_high_surrogate = false; 43425bb815Sopenharmony_ci continue; 44425bb815Sopenharmony_ci } 45425bb815Sopenharmony_ci 46425bb815Sopenharmony_ci lit_code_point_t code_point = 0; 47425bb815Sopenharmony_ci lit_code_point_t min_code_point = 0; 48425bb815Sopenharmony_ci lit_utf8_size_t extra_bytes_count; 49425bb815Sopenharmony_ci if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 50425bb815Sopenharmony_ci { 51425bb815Sopenharmony_ci extra_bytes_count = 1; 52425bb815Sopenharmony_ci min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; 53425bb815Sopenharmony_ci code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 54425bb815Sopenharmony_ci } 55425bb815Sopenharmony_ci else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 56425bb815Sopenharmony_ci { 57425bb815Sopenharmony_ci extra_bytes_count = 2; 58425bb815Sopenharmony_ci min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; 59425bb815Sopenharmony_ci code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 60425bb815Sopenharmony_ci } 61425bb815Sopenharmony_ci else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) 62425bb815Sopenharmony_ci { 63425bb815Sopenharmony_ci extra_bytes_count = 3; 64425bb815Sopenharmony_ci min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN; 65425bb815Sopenharmony_ci code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); 66425bb815Sopenharmony_ci } 67425bb815Sopenharmony_ci else 68425bb815Sopenharmony_ci { 69425bb815Sopenharmony_ci /* utf-8 string could not contain 5- and 6-byte sequences. */ 70425bb815Sopenharmony_ci return false; 71425bb815Sopenharmony_ci } 72425bb815Sopenharmony_ci 73425bb815Sopenharmony_ci if (idx + extra_bytes_count > buf_size) 74425bb815Sopenharmony_ci { 75425bb815Sopenharmony_ci /* utf-8 string breaks in the middle */ 76425bb815Sopenharmony_ci return false; 77425bb815Sopenharmony_ci } 78425bb815Sopenharmony_ci 79425bb815Sopenharmony_ci for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) 80425bb815Sopenharmony_ci { 81425bb815Sopenharmony_ci c = utf8_buf_p[idx + offset]; 82425bb815Sopenharmony_ci if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) 83425bb815Sopenharmony_ci { 84425bb815Sopenharmony_ci /* invalid continuation byte */ 85425bb815Sopenharmony_ci return false; 86425bb815Sopenharmony_ci } 87425bb815Sopenharmony_ci code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 88425bb815Sopenharmony_ci code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); 89425bb815Sopenharmony_ci } 90425bb815Sopenharmony_ci 91425bb815Sopenharmony_ci if (code_point < min_code_point 92425bb815Sopenharmony_ci || code_point > LIT_UNICODE_CODE_POINT_MAX) 93425bb815Sopenharmony_ci { 94425bb815Sopenharmony_ci /* utf-8 string doesn't encode valid unicode code point */ 95425bb815Sopenharmony_ci return false; 96425bb815Sopenharmony_ci } 97425bb815Sopenharmony_ci 98425bb815Sopenharmony_ci if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN 99425bb815Sopenharmony_ci && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) 100425bb815Sopenharmony_ci { 101425bb815Sopenharmony_ci is_prev_code_point_high_surrogate = true; 102425bb815Sopenharmony_ci } 103425bb815Sopenharmony_ci else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN 104425bb815Sopenharmony_ci && code_point <= LIT_UTF16_LOW_SURROGATE_MAX 105425bb815Sopenharmony_ci && is_prev_code_point_high_surrogate) 106425bb815Sopenharmony_ci { 107425bb815Sopenharmony_ci /* sequence of high and low surrogate is not allowed */ 108425bb815Sopenharmony_ci return false; 109425bb815Sopenharmony_ci } 110425bb815Sopenharmony_ci else 111425bb815Sopenharmony_ci { 112425bb815Sopenharmony_ci is_prev_code_point_high_surrogate = false; 113425bb815Sopenharmony_ci } 114425bb815Sopenharmony_ci 115425bb815Sopenharmony_ci idx += extra_bytes_count; 116425bb815Sopenharmony_ci } 117425bb815Sopenharmony_ci 118425bb815Sopenharmony_ci return true; 119425bb815Sopenharmony_ci} /* lit_is_valid_utf8_string */ 120425bb815Sopenharmony_ci 121425bb815Sopenharmony_ci/** 122425bb815Sopenharmony_ci * Validate cesu-8 string 123425bb815Sopenharmony_ci * 124425bb815Sopenharmony_ci * @return true if cesu-8 string is well-formed 125425bb815Sopenharmony_ci * false otherwise 126425bb815Sopenharmony_ci */ 127425bb815Sopenharmony_cibool 128425bb815Sopenharmony_cilit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 129425bb815Sopenharmony_ci lit_utf8_size_t buf_size) /**< string size */ 130425bb815Sopenharmony_ci{ 131425bb815Sopenharmony_ci lit_utf8_size_t idx = 0; 132425bb815Sopenharmony_ci 133425bb815Sopenharmony_ci while (idx < buf_size) 134425bb815Sopenharmony_ci { 135425bb815Sopenharmony_ci lit_utf8_byte_t c = cesu8_buf_p[idx++]; 136425bb815Sopenharmony_ci if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 137425bb815Sopenharmony_ci { 138425bb815Sopenharmony_ci continue; 139425bb815Sopenharmony_ci } 140425bb815Sopenharmony_ci 141425bb815Sopenharmony_ci lit_code_point_t code_point = 0; 142425bb815Sopenharmony_ci lit_code_point_t min_code_point = 0; 143425bb815Sopenharmony_ci lit_utf8_size_t extra_bytes_count; 144425bb815Sopenharmony_ci if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 145425bb815Sopenharmony_ci { 146425bb815Sopenharmony_ci extra_bytes_count = 1; 147425bb815Sopenharmony_ci min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; 148425bb815Sopenharmony_ci code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 149425bb815Sopenharmony_ci } 150425bb815Sopenharmony_ci else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 151425bb815Sopenharmony_ci { 152425bb815Sopenharmony_ci extra_bytes_count = 2; 153425bb815Sopenharmony_ci min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; 154425bb815Sopenharmony_ci code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 155425bb815Sopenharmony_ci } 156425bb815Sopenharmony_ci else 157425bb815Sopenharmony_ci { 158425bb815Sopenharmony_ci return false; 159425bb815Sopenharmony_ci } 160425bb815Sopenharmony_ci 161425bb815Sopenharmony_ci if (idx + extra_bytes_count > buf_size) 162425bb815Sopenharmony_ci { 163425bb815Sopenharmony_ci /* cesu-8 string breaks in the middle */ 164425bb815Sopenharmony_ci return false; 165425bb815Sopenharmony_ci } 166425bb815Sopenharmony_ci 167425bb815Sopenharmony_ci for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) 168425bb815Sopenharmony_ci { 169425bb815Sopenharmony_ci c = cesu8_buf_p[idx + offset]; 170425bb815Sopenharmony_ci if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) 171425bb815Sopenharmony_ci { 172425bb815Sopenharmony_ci /* invalid continuation byte */ 173425bb815Sopenharmony_ci return false; 174425bb815Sopenharmony_ci } 175425bb815Sopenharmony_ci code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 176425bb815Sopenharmony_ci code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); 177425bb815Sopenharmony_ci } 178425bb815Sopenharmony_ci 179425bb815Sopenharmony_ci if (code_point < min_code_point) 180425bb815Sopenharmony_ci { 181425bb815Sopenharmony_ci /* cesu-8 string doesn't encode valid unicode code point */ 182425bb815Sopenharmony_ci return false; 183425bb815Sopenharmony_ci } 184425bb815Sopenharmony_ci 185425bb815Sopenharmony_ci idx += extra_bytes_count; 186425bb815Sopenharmony_ci } 187425bb815Sopenharmony_ci 188425bb815Sopenharmony_ci return true; 189425bb815Sopenharmony_ci} /* lit_is_valid_cesu8_string */ 190425bb815Sopenharmony_ci 191425bb815Sopenharmony_ci/** 192425bb815Sopenharmony_ci * Check if the code point is UTF-16 low surrogate 193425bb815Sopenharmony_ci * 194425bb815Sopenharmony_ci * @return true / false 195425bb815Sopenharmony_ci */ 196425bb815Sopenharmony_cibool 197425bb815Sopenharmony_cilit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */ 198425bb815Sopenharmony_ci{ 199425bb815Sopenharmony_ci return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX; 200425bb815Sopenharmony_ci} /* lit_is_code_point_utf16_low_surrogate */ 201425bb815Sopenharmony_ci 202425bb815Sopenharmony_ci/** 203425bb815Sopenharmony_ci * Check if the code point is UTF-16 high surrogate 204425bb815Sopenharmony_ci * 205425bb815Sopenharmony_ci * @return true / false 206425bb815Sopenharmony_ci */ 207425bb815Sopenharmony_cibool 208425bb815Sopenharmony_cilit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */ 209425bb815Sopenharmony_ci{ 210425bb815Sopenharmony_ci return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX; 211425bb815Sopenharmony_ci} /* lit_is_code_point_utf16_high_surrogate */ 212425bb815Sopenharmony_ci 213425bb815Sopenharmony_ci/** 214425bb815Sopenharmony_ci * Represents code point (>0xFFFF) as surrogate pair and returns its lower part 215425bb815Sopenharmony_ci * 216425bb815Sopenharmony_ci * @return lower code_unit of the surrogate pair 217425bb815Sopenharmony_ci */ 218425bb815Sopenharmony_cistatic ecma_char_t 219425bb815Sopenharmony_ciconvert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ 220425bb815Sopenharmony_ci{ 221425bb815Sopenharmony_ci JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); 222425bb815Sopenharmony_ci 223425bb815Sopenharmony_ci ecma_char_t code_unit_bits; 224425bb815Sopenharmony_ci code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK); 225425bb815Sopenharmony_ci 226425bb815Sopenharmony_ci return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits); 227425bb815Sopenharmony_ci} /* convert_code_point_to_low_surrogate */ 228425bb815Sopenharmony_ci 229425bb815Sopenharmony_ci/** 230425bb815Sopenharmony_ci * Represents code point (>0xFFFF) as surrogate pair and returns its higher part 231425bb815Sopenharmony_ci * 232425bb815Sopenharmony_ci * @return higher code_unit of the surrogate pair 233425bb815Sopenharmony_ci */ 234425bb815Sopenharmony_cistatic ecma_char_t 235425bb815Sopenharmony_ciconvert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ 236425bb815Sopenharmony_ci{ 237425bb815Sopenharmony_ci JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); 238425bb815Sopenharmony_ci JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX); 239425bb815Sopenharmony_ci 240425bb815Sopenharmony_ci ecma_char_t code_unit_bits; 241425bb815Sopenharmony_ci code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE); 242425bb815Sopenharmony_ci 243425bb815Sopenharmony_ci return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits); 244425bb815Sopenharmony_ci} /* convert_code_point_to_high_surrogate */ 245425bb815Sopenharmony_ci 246425bb815Sopenharmony_ci/** 247425bb815Sopenharmony_ci * UTF16 Encoding method for a code point 248425bb815Sopenharmony_ci * 249425bb815Sopenharmony_ci * See also: 250425bb815Sopenharmony_ci * ECMA-262 v6, 10.1.1 251425bb815Sopenharmony_ci * 252425bb815Sopenharmony_ci * @return uint8_t, the number of returning code points 253425bb815Sopenharmony_ci */ 254425bb815Sopenharmony_ciuint8_t 255425bb815Sopenharmony_cilit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */ 256425bb815Sopenharmony_ci ecma_char_t *cu_p) /**< result of the encoding */ 257425bb815Sopenharmony_ci{ 258425bb815Sopenharmony_ci if (cp <= LIT_UTF16_CODE_UNIT_MAX) 259425bb815Sopenharmony_ci { 260425bb815Sopenharmony_ci cu_p[0] = (ecma_char_t) cp; 261425bb815Sopenharmony_ci return 1; 262425bb815Sopenharmony_ci } 263425bb815Sopenharmony_ci 264425bb815Sopenharmony_ci cu_p[0] = convert_code_point_to_high_surrogate (cp); 265425bb815Sopenharmony_ci cu_p[1] = convert_code_point_to_low_surrogate (cp); 266425bb815Sopenharmony_ci return 2; 267425bb815Sopenharmony_ci} /* lit_utf16_encode_code_point */ 268425bb815Sopenharmony_ci 269425bb815Sopenharmony_ci/** 270425bb815Sopenharmony_ci * Calculate size of a zero-terminated utf-8 string 271425bb815Sopenharmony_ci * 272425bb815Sopenharmony_ci * NOTE: 273425bb815Sopenharmony_ci * - string cannot be NULL 274425bb815Sopenharmony_ci * - string should not contain zero characters in the middle 275425bb815Sopenharmony_ci * 276425bb815Sopenharmony_ci * @return size of a string 277425bb815Sopenharmony_ci */ 278425bb815Sopenharmony_cilit_utf8_size_t 279425bb815Sopenharmony_cilit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */ 280425bb815Sopenharmony_ci{ 281425bb815Sopenharmony_ci JERRY_ASSERT (utf8_str_p != NULL); 282425bb815Sopenharmony_ci return (lit_utf8_size_t) strlen ((const char *) utf8_str_p); 283425bb815Sopenharmony_ci} /* lit_zt_utf8_string_size */ 284425bb815Sopenharmony_ci 285425bb815Sopenharmony_ci/** 286425bb815Sopenharmony_ci * Calculate length of a cesu-8 encoded string 287425bb815Sopenharmony_ci * 288425bb815Sopenharmony_ci * @return UTF-16 code units count 289425bb815Sopenharmony_ci */ 290425bb815Sopenharmony_ciecma_length_t 291425bb815Sopenharmony_cilit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 292425bb815Sopenharmony_ci lit_utf8_size_t utf8_buf_size) /**< string size */ 293425bb815Sopenharmony_ci{ 294425bb815Sopenharmony_ci ecma_length_t length = 0; 295425bb815Sopenharmony_ci lit_utf8_size_t size = 0; 296425bb815Sopenharmony_ci 297425bb815Sopenharmony_ci while (size < utf8_buf_size) 298425bb815Sopenharmony_ci { 299425bb815Sopenharmony_ci size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size)); 300425bb815Sopenharmony_ci length++; 301425bb815Sopenharmony_ci } 302425bb815Sopenharmony_ci 303425bb815Sopenharmony_ci JERRY_ASSERT (size == utf8_buf_size); 304425bb815Sopenharmony_ci 305425bb815Sopenharmony_ci return length; 306425bb815Sopenharmony_ci} /* lit_utf8_string_length */ 307425bb815Sopenharmony_ci 308425bb815Sopenharmony_ci/** 309425bb815Sopenharmony_ci * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string 310425bb815Sopenharmony_ci * 311425bb815Sopenharmony_ci * @return size of an utf-8 encoded string 312425bb815Sopenharmony_ci */ 313425bb815Sopenharmony_cilit_utf8_size_t 314425bb815Sopenharmony_cilit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 315425bb815Sopenharmony_ci lit_utf8_size_t cesu8_buf_size) /**< string size */ 316425bb815Sopenharmony_ci{ 317425bb815Sopenharmony_ci lit_utf8_size_t offset = 0; 318425bb815Sopenharmony_ci lit_utf8_size_t utf8_buf_size = cesu8_buf_size; 319425bb815Sopenharmony_ci ecma_char_t prev_ch = 0; 320425bb815Sopenharmony_ci 321425bb815Sopenharmony_ci while (offset < cesu8_buf_size) 322425bb815Sopenharmony_ci { 323425bb815Sopenharmony_ci ecma_char_t ch; 324425bb815Sopenharmony_ci offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); 325425bb815Sopenharmony_ci 326425bb815Sopenharmony_ci if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) 327425bb815Sopenharmony_ci { 328425bb815Sopenharmony_ci utf8_buf_size -= 2; 329425bb815Sopenharmony_ci } 330425bb815Sopenharmony_ci 331425bb815Sopenharmony_ci prev_ch = ch; 332425bb815Sopenharmony_ci } 333425bb815Sopenharmony_ci 334425bb815Sopenharmony_ci JERRY_ASSERT (offset == cesu8_buf_size); 335425bb815Sopenharmony_ci 336425bb815Sopenharmony_ci return utf8_buf_size; 337425bb815Sopenharmony_ci} /* lit_get_utf8_size_of_cesu8_string */ 338425bb815Sopenharmony_ci 339425bb815Sopenharmony_ci/** 340425bb815Sopenharmony_ci * Calculate length of an utf-8 encoded string from cesu-8 encoded string 341425bb815Sopenharmony_ci * 342425bb815Sopenharmony_ci * @return length of an utf-8 encoded string 343425bb815Sopenharmony_ci */ 344425bb815Sopenharmony_ciecma_length_t 345425bb815Sopenharmony_cilit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ 346425bb815Sopenharmony_ci lit_utf8_size_t cesu8_buf_size) /**< string size */ 347425bb815Sopenharmony_ci{ 348425bb815Sopenharmony_ci lit_utf8_size_t offset = 0; 349425bb815Sopenharmony_ci ecma_length_t utf8_length = 0; 350425bb815Sopenharmony_ci ecma_char_t prev_ch = 0; 351425bb815Sopenharmony_ci 352425bb815Sopenharmony_ci while (offset < cesu8_buf_size) 353425bb815Sopenharmony_ci { 354425bb815Sopenharmony_ci ecma_char_t ch; 355425bb815Sopenharmony_ci offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); 356425bb815Sopenharmony_ci 357425bb815Sopenharmony_ci if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch)) 358425bb815Sopenharmony_ci { 359425bb815Sopenharmony_ci utf8_length++; 360425bb815Sopenharmony_ci } 361425bb815Sopenharmony_ci 362425bb815Sopenharmony_ci prev_ch = ch; 363425bb815Sopenharmony_ci } 364425bb815Sopenharmony_ci 365425bb815Sopenharmony_ci JERRY_ASSERT (offset == cesu8_buf_size); 366425bb815Sopenharmony_ci 367425bb815Sopenharmony_ci return utf8_length; 368425bb815Sopenharmony_ci} /* lit_get_utf8_length_of_cesu8_string */ 369425bb815Sopenharmony_ci 370425bb815Sopenharmony_ci/** 371425bb815Sopenharmony_ci * Decodes a unicode code point from non-empty utf-8-encoded buffer 372425bb815Sopenharmony_ci * 373425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string 374425bb815Sopenharmony_ci */ 375425bb815Sopenharmony_cilit_utf8_size_t 376425bb815Sopenharmony_cilit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 377425bb815Sopenharmony_ci lit_utf8_size_t buf_size, /**< size of the buffer in bytes */ 378425bb815Sopenharmony_ci lit_code_point_t *code_point) /**< [out] code point */ 379425bb815Sopenharmony_ci{ 380425bb815Sopenharmony_ci JERRY_ASSERT (buf_p && buf_size); 381425bb815Sopenharmony_ci 382425bb815Sopenharmony_ci lit_utf8_byte_t c = buf_p[0]; 383425bb815Sopenharmony_ci if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 384425bb815Sopenharmony_ci { 385425bb815Sopenharmony_ci *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK); 386425bb815Sopenharmony_ci return 1; 387425bb815Sopenharmony_ci } 388425bb815Sopenharmony_ci 389425bb815Sopenharmony_ci lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; 390425bb815Sopenharmony_ci ecma_length_t bytes_count = 0; 391425bb815Sopenharmony_ci if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 392425bb815Sopenharmony_ci { 393425bb815Sopenharmony_ci bytes_count = 2; 394425bb815Sopenharmony_ci ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 395425bb815Sopenharmony_ci } 396425bb815Sopenharmony_ci else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) 397425bb815Sopenharmony_ci { 398425bb815Sopenharmony_ci bytes_count = 3; 399425bb815Sopenharmony_ci ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 400425bb815Sopenharmony_ci } 401425bb815Sopenharmony_ci else 402425bb815Sopenharmony_ci { 403425bb815Sopenharmony_ci JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER); 404425bb815Sopenharmony_ci bytes_count = 4; 405425bb815Sopenharmony_ci ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); 406425bb815Sopenharmony_ci } 407425bb815Sopenharmony_ci 408425bb815Sopenharmony_ci JERRY_ASSERT (buf_size >= bytes_count); 409425bb815Sopenharmony_ci 410425bb815Sopenharmony_ci for (uint32_t i = 1; i < bytes_count; ++i) 411425bb815Sopenharmony_ci { 412425bb815Sopenharmony_ci ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 413425bb815Sopenharmony_ci ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); 414425bb815Sopenharmony_ci } 415425bb815Sopenharmony_ci 416425bb815Sopenharmony_ci *code_point = ret; 417425bb815Sopenharmony_ci return bytes_count; 418425bb815Sopenharmony_ci} /* lit_read_code_point_from_utf8 */ 419425bb815Sopenharmony_ci 420425bb815Sopenharmony_ci/** 421425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 422425bb815Sopenharmony_ci * 423425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string 424425bb815Sopenharmony_ci */ 425425bb815Sopenharmony_cilit_utf8_size_t 426425bb815Sopenharmony_cilit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 427425bb815Sopenharmony_ci ecma_char_t *code_point) /**< [out] code point */ 428425bb815Sopenharmony_ci{ 429425bb815Sopenharmony_ci JERRY_ASSERT (buf_p); 430425bb815Sopenharmony_ci 431425bb815Sopenharmony_ci lit_utf8_byte_t c = buf_p[0]; 432425bb815Sopenharmony_ci if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 433425bb815Sopenharmony_ci { 434425bb815Sopenharmony_ci *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK); 435425bb815Sopenharmony_ci return 1; 436425bb815Sopenharmony_ci } 437425bb815Sopenharmony_ci 438425bb815Sopenharmony_ci lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; 439425bb815Sopenharmony_ci ecma_length_t bytes_count; 440425bb815Sopenharmony_ci if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 441425bb815Sopenharmony_ci { 442425bb815Sopenharmony_ci bytes_count = 2; 443425bb815Sopenharmony_ci ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); 444425bb815Sopenharmony_ci } 445425bb815Sopenharmony_ci else 446425bb815Sopenharmony_ci { 447425bb815Sopenharmony_ci JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); 448425bb815Sopenharmony_ci bytes_count = 3; 449425bb815Sopenharmony_ci ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); 450425bb815Sopenharmony_ci } 451425bb815Sopenharmony_ci 452425bb815Sopenharmony_ci for (uint32_t i = 1; i < bytes_count; ++i) 453425bb815Sopenharmony_ci { 454425bb815Sopenharmony_ci ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; 455425bb815Sopenharmony_ci ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); 456425bb815Sopenharmony_ci } 457425bb815Sopenharmony_ci 458425bb815Sopenharmony_ci JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX); 459425bb815Sopenharmony_ci *code_point = (ecma_char_t) ret; 460425bb815Sopenharmony_ci return bytes_count; 461425bb815Sopenharmony_ci} /* lit_read_code_unit_from_utf8 */ 462425bb815Sopenharmony_ci 463425bb815Sopenharmony_ci/** 464425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 465425bb815Sopenharmony_ci * 466425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string 467425bb815Sopenharmony_ci */ 468425bb815Sopenharmony_cilit_utf8_size_t 469425bb815Sopenharmony_cilit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ 470425bb815Sopenharmony_ci ecma_char_t *code_point) /**< [out] code point */ 471425bb815Sopenharmony_ci{ 472425bb815Sopenharmony_ci JERRY_ASSERT (buf_p); 473425bb815Sopenharmony_ci 474425bb815Sopenharmony_ci lit_utf8_decr (&buf_p); 475425bb815Sopenharmony_ci return lit_read_code_unit_from_utf8 (buf_p, code_point); 476425bb815Sopenharmony_ci} /* lit_read_prev_code_unit_from_utf8 */ 477425bb815Sopenharmony_ci 478425bb815Sopenharmony_ci/** 479425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 480425bb815Sopenharmony_ci * 481425bb815Sopenharmony_ci * @return next code unit 482425bb815Sopenharmony_ci */ 483425bb815Sopenharmony_ciecma_char_t 484425bb815Sopenharmony_cilit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 485425bb815Sopenharmony_ci{ 486425bb815Sopenharmony_ci JERRY_ASSERT (*buf_p); 487425bb815Sopenharmony_ci ecma_char_t ch; 488425bb815Sopenharmony_ci 489425bb815Sopenharmony_ci *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch); 490425bb815Sopenharmony_ci 491425bb815Sopenharmony_ci return ch; 492425bb815Sopenharmony_ci} /* lit_cesu8_read_next */ 493425bb815Sopenharmony_ci 494425bb815Sopenharmony_ci/** 495425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 496425bb815Sopenharmony_ci * 497425bb815Sopenharmony_ci * @return previous code unit 498425bb815Sopenharmony_ci */ 499425bb815Sopenharmony_ciecma_char_t 500425bb815Sopenharmony_cilit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 501425bb815Sopenharmony_ci{ 502425bb815Sopenharmony_ci JERRY_ASSERT (*buf_p); 503425bb815Sopenharmony_ci ecma_char_t ch; 504425bb815Sopenharmony_ci 505425bb815Sopenharmony_ci lit_utf8_decr (buf_p); 506425bb815Sopenharmony_ci lit_read_code_unit_from_utf8 (*buf_p, &ch); 507425bb815Sopenharmony_ci 508425bb815Sopenharmony_ci return ch; 509425bb815Sopenharmony_ci} /* lit_cesu8_read_prev */ 510425bb815Sopenharmony_ci 511425bb815Sopenharmony_ci/** 512425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 513425bb815Sopenharmony_ci * 514425bb815Sopenharmony_ci * @return next code unit 515425bb815Sopenharmony_ci */ 516425bb815Sopenharmony_ciecma_char_t JERRY_ATTR_NOINLINE 517425bb815Sopenharmony_cilit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ 518425bb815Sopenharmony_ci{ 519425bb815Sopenharmony_ci JERRY_ASSERT (buf_p != NULL); 520425bb815Sopenharmony_ci ecma_char_t ch; 521425bb815Sopenharmony_ci 522425bb815Sopenharmony_ci lit_read_code_unit_from_utf8 (buf_p, &ch); 523425bb815Sopenharmony_ci 524425bb815Sopenharmony_ci return ch; 525425bb815Sopenharmony_ci} /* lit_cesu8_peek_next */ 526425bb815Sopenharmony_ci 527425bb815Sopenharmony_ci/** 528425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer 529425bb815Sopenharmony_ci * 530425bb815Sopenharmony_ci * @return previous code unit 531425bb815Sopenharmony_ci */ 532425bb815Sopenharmony_ciecma_char_t JERRY_ATTR_NOINLINE 533425bb815Sopenharmony_cilit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ 534425bb815Sopenharmony_ci{ 535425bb815Sopenharmony_ci JERRY_ASSERT (buf_p != NULL); 536425bb815Sopenharmony_ci ecma_char_t ch; 537425bb815Sopenharmony_ci 538425bb815Sopenharmony_ci lit_read_prev_code_unit_from_utf8 (buf_p, &ch); 539425bb815Sopenharmony_ci 540425bb815Sopenharmony_ci return ch; 541425bb815Sopenharmony_ci} /* lit_cesu8_peek_prev */ 542425bb815Sopenharmony_ci 543425bb815Sopenharmony_ci/** 544425bb815Sopenharmony_ci * Increase cesu-8 encoded string pointer by one code unit. 545425bb815Sopenharmony_ci */ 546425bb815Sopenharmony_ciinline void JERRY_ATTR_ALWAYS_INLINE 547425bb815Sopenharmony_cilit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 548425bb815Sopenharmony_ci{ 549425bb815Sopenharmony_ci JERRY_ASSERT (*buf_p); 550425bb815Sopenharmony_ci 551425bb815Sopenharmony_ci *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p); 552425bb815Sopenharmony_ci} /* lit_utf8_incr */ 553425bb815Sopenharmony_ci 554425bb815Sopenharmony_ci/** 555425bb815Sopenharmony_ci * Decrease cesu-8 encoded string pointer by one code unit. 556425bb815Sopenharmony_ci */ 557425bb815Sopenharmony_civoid 558425bb815Sopenharmony_cilit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ 559425bb815Sopenharmony_ci{ 560425bb815Sopenharmony_ci JERRY_ASSERT (*buf_p); 561425bb815Sopenharmony_ci const lit_utf8_byte_t *current_p = *buf_p; 562425bb815Sopenharmony_ci 563425bb815Sopenharmony_ci do 564425bb815Sopenharmony_ci { 565425bb815Sopenharmony_ci current_p--; 566425bb815Sopenharmony_ci } 567425bb815Sopenharmony_ci while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER); 568425bb815Sopenharmony_ci 569425bb815Sopenharmony_ci *buf_p = current_p; 570425bb815Sopenharmony_ci} /* lit_utf8_decr */ 571425bb815Sopenharmony_ci 572425bb815Sopenharmony_ci/** 573425bb815Sopenharmony_ci * Calc hash using the specified hash_basis. 574425bb815Sopenharmony_ci * 575425bb815Sopenharmony_ci * NOTE: 576425bb815Sopenharmony_ci * This is implementation of FNV-1a hash function, which is released into public domain. 577425bb815Sopenharmony_ci * Constants used, are carefully picked primes by the authors. 578425bb815Sopenharmony_ci * More info: http://www.isthe.com/chongo/tech/comp/fnv/ 579425bb815Sopenharmony_ci * 580425bb815Sopenharmony_ci * @return ecma-string's hash 581425bb815Sopenharmony_ci */ 582425bb815Sopenharmony_ciinline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE 583425bb815Sopenharmony_cilit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */ 584425bb815Sopenharmony_ci const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ 585425bb815Sopenharmony_ci lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ 586425bb815Sopenharmony_ci{ 587425bb815Sopenharmony_ci JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); 588425bb815Sopenharmony_ci 589425bb815Sopenharmony_ci uint32_t hash = hash_basis; 590425bb815Sopenharmony_ci 591425bb815Sopenharmony_ci for (uint32_t i = 0; i < utf8_buf_size; i++) 592425bb815Sopenharmony_ci { 593425bb815Sopenharmony_ci /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */ 594425bb815Sopenharmony_ci hash = (hash ^ utf8_buf_p[i]) * 16777619; 595425bb815Sopenharmony_ci } 596425bb815Sopenharmony_ci 597425bb815Sopenharmony_ci return (lit_string_hash_t) hash; 598425bb815Sopenharmony_ci} /* lit_utf8_string_hash_combine */ 599425bb815Sopenharmony_ci 600425bb815Sopenharmony_ci/** 601425bb815Sopenharmony_ci * Calculate hash from the buffer. 602425bb815Sopenharmony_ci * 603425bb815Sopenharmony_ci * @return ecma-string's hash 604425bb815Sopenharmony_ci */ 605425bb815Sopenharmony_ciinline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE 606425bb815Sopenharmony_cilit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ 607425bb815Sopenharmony_ci lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ 608425bb815Sopenharmony_ci{ 609425bb815Sopenharmony_ci JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); 610425bb815Sopenharmony_ci 611425bb815Sopenharmony_ci /* 32 bit offset_basis for FNV = 2166136261 */ 612425bb815Sopenharmony_ci return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size); 613425bb815Sopenharmony_ci} /* lit_utf8_string_calc_hash */ 614425bb815Sopenharmony_ci 615425bb815Sopenharmony_ci/** 616425bb815Sopenharmony_ci * Return code unit at the specified position in string 617425bb815Sopenharmony_ci * 618425bb815Sopenharmony_ci * NOTE: 619425bb815Sopenharmony_ci * code_unit_offset should be less then string's length 620425bb815Sopenharmony_ci * 621425bb815Sopenharmony_ci * @return code unit value 622425bb815Sopenharmony_ci */ 623425bb815Sopenharmony_ciecma_char_t 624425bb815Sopenharmony_cilit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ 625425bb815Sopenharmony_ci lit_utf8_size_t utf8_buf_size, /**< string size in bytes */ 626425bb815Sopenharmony_ci ecma_length_t code_unit_offset) /**< ofset of a code_unit */ 627425bb815Sopenharmony_ci{ 628425bb815Sopenharmony_ci lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p; 629425bb815Sopenharmony_ci ecma_char_t code_unit; 630425bb815Sopenharmony_ci 631425bb815Sopenharmony_ci do 632425bb815Sopenharmony_ci { 633425bb815Sopenharmony_ci JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size); 634425bb815Sopenharmony_ci current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit); 635425bb815Sopenharmony_ci } 636425bb815Sopenharmony_ci while (code_unit_offset--); 637425bb815Sopenharmony_ci 638425bb815Sopenharmony_ci return code_unit; 639425bb815Sopenharmony_ci} /* lit_utf8_string_code_unit_at */ 640425bb815Sopenharmony_ci 641425bb815Sopenharmony_ci/** 642425bb815Sopenharmony_ci * Get CESU-8 encoded size of character 643425bb815Sopenharmony_ci * 644425bb815Sopenharmony_ci * @return number of bytes occupied in CESU-8 645425bb815Sopenharmony_ci */ 646425bb815Sopenharmony_ciinline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE 647425bb815Sopenharmony_cilit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */ 648425bb815Sopenharmony_ci{ 649425bb815Sopenharmony_ci if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) 650425bb815Sopenharmony_ci { 651425bb815Sopenharmony_ci return 1; 652425bb815Sopenharmony_ci } 653425bb815Sopenharmony_ci else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) 654425bb815Sopenharmony_ci { 655425bb815Sopenharmony_ci return 2; 656425bb815Sopenharmony_ci } 657425bb815Sopenharmony_ci else 658425bb815Sopenharmony_ci { 659425bb815Sopenharmony_ci JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); 660425bb815Sopenharmony_ci return 3; 661425bb815Sopenharmony_ci } 662425bb815Sopenharmony_ci} /* lit_get_unicode_char_size_by_utf8_first_byte */ 663425bb815Sopenharmony_ci 664425bb815Sopenharmony_ci/** 665425bb815Sopenharmony_ci * Convert code unit to cesu-8 representation 666425bb815Sopenharmony_ci * 667425bb815Sopenharmony_ci * @return byte count required to represent the code unit 668425bb815Sopenharmony_ci */ 669425bb815Sopenharmony_cilit_utf8_size_t 670425bb815Sopenharmony_cilit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */ 671425bb815Sopenharmony_ci lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size 672425bb815Sopenharmony_ci * should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */ 673425bb815Sopenharmony_ci{ 674425bb815Sopenharmony_ci if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) 675425bb815Sopenharmony_ci { 676425bb815Sopenharmony_ci buf_p[0] = (lit_utf8_byte_t) code_unit; 677425bb815Sopenharmony_ci return 1; 678425bb815Sopenharmony_ci } 679425bb815Sopenharmony_ci else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) 680425bb815Sopenharmony_ci { 681425bb815Sopenharmony_ci uint32_t code_unit_bits = code_unit; 682425bb815Sopenharmony_ci lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 683425bb815Sopenharmony_ci code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 684425bb815Sopenharmony_ci 685425bb815Sopenharmony_ci lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK); 686425bb815Sopenharmony_ci JERRY_ASSERT (first_byte_bits == code_unit_bits); 687425bb815Sopenharmony_ci 688425bb815Sopenharmony_ci buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; 689425bb815Sopenharmony_ci buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 690425bb815Sopenharmony_ci return 2; 691425bb815Sopenharmony_ci } 692425bb815Sopenharmony_ci else 693425bb815Sopenharmony_ci { 694425bb815Sopenharmony_ci uint32_t code_unit_bits = code_unit; 695425bb815Sopenharmony_ci lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 696425bb815Sopenharmony_ci code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 697425bb815Sopenharmony_ci 698425bb815Sopenharmony_ci lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); 699425bb815Sopenharmony_ci code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 700425bb815Sopenharmony_ci 701425bb815Sopenharmony_ci lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK); 702425bb815Sopenharmony_ci JERRY_ASSERT (first_byte_bits == code_unit_bits); 703425bb815Sopenharmony_ci 704425bb815Sopenharmony_ci buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; 705425bb815Sopenharmony_ci buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 706425bb815Sopenharmony_ci buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 707425bb815Sopenharmony_ci return 3; 708425bb815Sopenharmony_ci } 709425bb815Sopenharmony_ci} /* lit_code_unit_to_utf8 */ 710425bb815Sopenharmony_ci 711425bb815Sopenharmony_ci/** 712425bb815Sopenharmony_ci * Convert code point to cesu-8 representation 713425bb815Sopenharmony_ci * 714425bb815Sopenharmony_ci * @return byte count required to represent the code point 715425bb815Sopenharmony_ci */ 716425bb815Sopenharmony_cilit_utf8_size_t 717425bb815Sopenharmony_cilit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */ 718425bb815Sopenharmony_ci lit_utf8_byte_t *buf) /**< buffer where to store the result, 719425bb815Sopenharmony_ci * its size should be at least 6 bytes */ 720425bb815Sopenharmony_ci{ 721425bb815Sopenharmony_ci if (code_point <= LIT_UTF16_CODE_UNIT_MAX) 722425bb815Sopenharmony_ci { 723425bb815Sopenharmony_ci return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf); 724425bb815Sopenharmony_ci } 725425bb815Sopenharmony_ci else 726425bb815Sopenharmony_ci { 727425bb815Sopenharmony_ci lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf); 728425bb815Sopenharmony_ci offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset); 729425bb815Sopenharmony_ci return offset; 730425bb815Sopenharmony_ci } 731425bb815Sopenharmony_ci} /* lit_code_point_to_cesu8 */ 732425bb815Sopenharmony_ci 733425bb815Sopenharmony_ci/** 734425bb815Sopenharmony_ci * Convert code point to utf-8 representation 735425bb815Sopenharmony_ci * 736425bb815Sopenharmony_ci * @return byte count required to represent the code point 737425bb815Sopenharmony_ci */ 738425bb815Sopenharmony_cilit_utf8_size_t 739425bb815Sopenharmony_cilit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */ 740425bb815Sopenharmony_ci lit_utf8_byte_t *buf) /**< buffer where to store the result, 741425bb815Sopenharmony_ci * its size should be at least 4 bytes */ 742425bb815Sopenharmony_ci{ 743425bb815Sopenharmony_ci if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) 744425bb815Sopenharmony_ci { 745425bb815Sopenharmony_ci buf[0] = (lit_utf8_byte_t) code_point; 746425bb815Sopenharmony_ci return 1; 747425bb815Sopenharmony_ci } 748425bb815Sopenharmony_ci else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) 749425bb815Sopenharmony_ci { 750425bb815Sopenharmony_ci uint32_t code_point_bits = code_point; 751425bb815Sopenharmony_ci lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 752425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 753425bb815Sopenharmony_ci 754425bb815Sopenharmony_ci lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK); 755425bb815Sopenharmony_ci JERRY_ASSERT (first_byte_bits == code_point_bits); 756425bb815Sopenharmony_ci 757425bb815Sopenharmony_ci buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; 758425bb815Sopenharmony_ci buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 759425bb815Sopenharmony_ci return 2; 760425bb815Sopenharmony_ci } 761425bb815Sopenharmony_ci else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX) 762425bb815Sopenharmony_ci { 763425bb815Sopenharmony_ci uint32_t code_point_bits = code_point; 764425bb815Sopenharmony_ci lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 765425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 766425bb815Sopenharmony_ci 767425bb815Sopenharmony_ci lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 768425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 769425bb815Sopenharmony_ci 770425bb815Sopenharmony_ci lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK); 771425bb815Sopenharmony_ci JERRY_ASSERT (first_byte_bits == code_point_bits); 772425bb815Sopenharmony_ci 773425bb815Sopenharmony_ci buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; 774425bb815Sopenharmony_ci buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 775425bb815Sopenharmony_ci buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 776425bb815Sopenharmony_ci return 3; 777425bb815Sopenharmony_ci } 778425bb815Sopenharmony_ci else 779425bb815Sopenharmony_ci { 780425bb815Sopenharmony_ci JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX); 781425bb815Sopenharmony_ci 782425bb815Sopenharmony_ci uint32_t code_point_bits = code_point; 783425bb815Sopenharmony_ci lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 784425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 785425bb815Sopenharmony_ci 786425bb815Sopenharmony_ci lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 787425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 788425bb815Sopenharmony_ci 789425bb815Sopenharmony_ci lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); 790425bb815Sopenharmony_ci code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; 791425bb815Sopenharmony_ci 792425bb815Sopenharmony_ci lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK); 793425bb815Sopenharmony_ci JERRY_ASSERT (first_byte_bits == code_point_bits); 794425bb815Sopenharmony_ci 795425bb815Sopenharmony_ci buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits; 796425bb815Sopenharmony_ci buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; 797425bb815Sopenharmony_ci buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; 798425bb815Sopenharmony_ci buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits; 799425bb815Sopenharmony_ci return 4; 800425bb815Sopenharmony_ci } 801425bb815Sopenharmony_ci} /* lit_code_point_to_utf8 */ 802425bb815Sopenharmony_ci 803425bb815Sopenharmony_ci/** 804425bb815Sopenharmony_ci * Convert cesu-8 string to an utf-8 string and put it into the buffer. 805425bb815Sopenharmony_ci * It is the caller's responsibility to make sure that the string fits in the buffer. 806425bb815Sopenharmony_ci * 807425bb815Sopenharmony_ci * @return number of bytes copied to the buffer. 808425bb815Sopenharmony_ci */ 809425bb815Sopenharmony_cilit_utf8_size_t 810425bb815Sopenharmony_cilit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */ 811425bb815Sopenharmony_ci lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */ 812425bb815Sopenharmony_ci lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer 813425bb815Sopenharmony_ci * (can be NULL if buffer_size == 0) */ 814425bb815Sopenharmony_ci lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */ 815425bb815Sopenharmony_ci{ 816425bb815Sopenharmony_ci const lit_utf8_byte_t *cesu8_pos = cesu8_string; 817425bb815Sopenharmony_ci const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size; 818425bb815Sopenharmony_ci 819425bb815Sopenharmony_ci lit_utf8_byte_t *utf8_pos = utf8_string; 820425bb815Sopenharmony_ci lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size; 821425bb815Sopenharmony_ci 822425bb815Sopenharmony_ci lit_utf8_size_t size = 0; 823425bb815Sopenharmony_ci 824425bb815Sopenharmony_ci ecma_char_t prev_ch = 0; 825425bb815Sopenharmony_ci lit_utf8_size_t prev_ch_size = 0; 826425bb815Sopenharmony_ci 827425bb815Sopenharmony_ci while (cesu8_pos < cesu8_end_pos) 828425bb815Sopenharmony_ci { 829425bb815Sopenharmony_ci ecma_char_t ch; 830425bb815Sopenharmony_ci lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch); 831425bb815Sopenharmony_ci 832425bb815Sopenharmony_ci if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) 833425bb815Sopenharmony_ci { 834425bb815Sopenharmony_ci JERRY_ASSERT (code_unit_size == prev_ch_size); 835425bb815Sopenharmony_ci utf8_pos -= prev_ch_size; 836425bb815Sopenharmony_ci lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch); 837425bb815Sopenharmony_ci lit_code_point_to_utf8 (code_point, utf8_pos); 838425bb815Sopenharmony_ci size++; 839425bb815Sopenharmony_ci } 840425bb815Sopenharmony_ci else 841425bb815Sopenharmony_ci { 842425bb815Sopenharmony_ci memcpy (utf8_pos, cesu8_pos, code_unit_size); 843425bb815Sopenharmony_ci size += code_unit_size; 844425bb815Sopenharmony_ci } 845425bb815Sopenharmony_ci 846425bb815Sopenharmony_ci utf8_pos = utf8_string + size; 847425bb815Sopenharmony_ci cesu8_pos += code_unit_size; 848425bb815Sopenharmony_ci prev_ch = ch; 849425bb815Sopenharmony_ci prev_ch_size = code_unit_size; 850425bb815Sopenharmony_ci } 851425bb815Sopenharmony_ci 852425bb815Sopenharmony_ci JERRY_ASSERT (cesu8_pos == cesu8_end_pos); 853425bb815Sopenharmony_ci JERRY_ASSERT (utf8_pos <= utf8_end_pos); 854425bb815Sopenharmony_ci 855425bb815Sopenharmony_ci return size; 856425bb815Sopenharmony_ci} /* lit_convert_cesu8_string_to_utf8_string */ 857425bb815Sopenharmony_ci 858425bb815Sopenharmony_ci/** 859425bb815Sopenharmony_ci * Convert surrogate pair to code point 860425bb815Sopenharmony_ci * 861425bb815Sopenharmony_ci * @return code point 862425bb815Sopenharmony_ci */ 863425bb815Sopenharmony_cilit_code_point_t 864425bb815Sopenharmony_cilit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */ 865425bb815Sopenharmony_ci ecma_char_t low_surrogate) /**< low surrogate code point */ 866425bb815Sopenharmony_ci{ 867425bb815Sopenharmony_ci JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate)); 868425bb815Sopenharmony_ci JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate)); 869425bb815Sopenharmony_ci 870425bb815Sopenharmony_ci lit_code_point_t code_point; 871425bb815Sopenharmony_ci code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN); 872425bb815Sopenharmony_ci code_point <<= LIT_UTF16_BITS_IN_SURROGATE; 873425bb815Sopenharmony_ci 874425bb815Sopenharmony_ci code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT; 875425bb815Sopenharmony_ci 876425bb815Sopenharmony_ci code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN); 877425bb815Sopenharmony_ci return code_point; 878425bb815Sopenharmony_ci} /* lit_convert_surrogate_pair_to_code_point */ 879425bb815Sopenharmony_ci 880425bb815Sopenharmony_ci/** 881425bb815Sopenharmony_ci * Relational compare of cesu-8 strings 882425bb815Sopenharmony_ci * 883425bb815Sopenharmony_ci * First string is less than second string if: 884425bb815Sopenharmony_ci * - strings are not equal; 885425bb815Sopenharmony_ci * - first string is prefix of second or is lexicographically less than second. 886425bb815Sopenharmony_ci * 887425bb815Sopenharmony_ci * @return true - if first string is less than second string, 888425bb815Sopenharmony_ci * false - otherwise 889425bb815Sopenharmony_ci */ 890425bb815Sopenharmony_cibool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */ 891425bb815Sopenharmony_ci lit_utf8_size_t string1_size, /**< string size */ 892425bb815Sopenharmony_ci const lit_utf8_byte_t *string2_p, /**< utf-8 string */ 893425bb815Sopenharmony_ci lit_utf8_size_t string2_size) /**< string size */ 894425bb815Sopenharmony_ci{ 895425bb815Sopenharmony_ci lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p; 896425bb815Sopenharmony_ci lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p; 897425bb815Sopenharmony_ci const lit_utf8_byte_t *string1_end_p = string1_p + string1_size; 898425bb815Sopenharmony_ci const lit_utf8_byte_t *string2_end_p = string2_p + string2_size; 899425bb815Sopenharmony_ci 900425bb815Sopenharmony_ci while (string1_pos < string1_end_p && string2_pos < string2_end_p) 901425bb815Sopenharmony_ci { 902425bb815Sopenharmony_ci ecma_char_t ch1, ch2; 903425bb815Sopenharmony_ci string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1); 904425bb815Sopenharmony_ci string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2); 905425bb815Sopenharmony_ci 906425bb815Sopenharmony_ci if (ch1 < ch2) 907425bb815Sopenharmony_ci { 908425bb815Sopenharmony_ci return true; 909425bb815Sopenharmony_ci } 910425bb815Sopenharmony_ci else if (ch1 > ch2) 911425bb815Sopenharmony_ci { 912425bb815Sopenharmony_ci return false; 913425bb815Sopenharmony_ci } 914425bb815Sopenharmony_ci } 915425bb815Sopenharmony_ci 916425bb815Sopenharmony_ci return (string1_pos >= string1_end_p && string2_pos < string2_end_p); 917425bb815Sopenharmony_ci} /* lit_compare_utf8_strings_relational */ 918