1425bb815Sopenharmony_ci/* Copyright JS Foundation and other contributors, http://js.foundation
2425bb815Sopenharmony_ci *
3425bb815Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
4425bb815Sopenharmony_ci * you may not use this file except in compliance with the License.
5425bb815Sopenharmony_ci * You may obtain a copy of the License at
6425bb815Sopenharmony_ci *
7425bb815Sopenharmony_ci *     http://www.apache.org/licenses/LICENSE-2.0
8425bb815Sopenharmony_ci *
9425bb815Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software
10425bb815Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS
11425bb815Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12425bb815Sopenharmony_ci * See the License for the specific language governing permissions and
13425bb815Sopenharmony_ci * limitations under the License.
14425bb815Sopenharmony_ci */
15425bb815Sopenharmony_ci
16425bb815Sopenharmony_ci#include "lit-strings.h"
17425bb815Sopenharmony_ci
18425bb815Sopenharmony_ci#include "jrt-libc-includes.h"
19425bb815Sopenharmony_ci
20425bb815Sopenharmony_ci/**
21425bb815Sopenharmony_ci * Validate utf-8 string
22425bb815Sopenharmony_ci *
23425bb815Sopenharmony_ci * NOTE:
24425bb815Sopenharmony_ci *   Isolated surrogates are allowed.
25425bb815Sopenharmony_ci *   Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
26425bb815Sopenharmony_ci *
27425bb815Sopenharmony_ci * @return true if utf-8 string is well-formed
28425bb815Sopenharmony_ci *         false otherwise
29425bb815Sopenharmony_ci */
30425bb815Sopenharmony_cibool
31425bb815Sopenharmony_cilit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
32425bb815Sopenharmony_ci                          lit_utf8_size_t buf_size) /**< string size */
33425bb815Sopenharmony_ci{
34425bb815Sopenharmony_ci  lit_utf8_size_t idx = 0;
35425bb815Sopenharmony_ci
36425bb815Sopenharmony_ci  bool is_prev_code_point_high_surrogate = false;
37425bb815Sopenharmony_ci  while (idx < buf_size)
38425bb815Sopenharmony_ci  {
39425bb815Sopenharmony_ci    lit_utf8_byte_t c = utf8_buf_p[idx++];
40425bb815Sopenharmony_ci    if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
41425bb815Sopenharmony_ci    {
42425bb815Sopenharmony_ci      is_prev_code_point_high_surrogate = false;
43425bb815Sopenharmony_ci      continue;
44425bb815Sopenharmony_ci    }
45425bb815Sopenharmony_ci
46425bb815Sopenharmony_ci    lit_code_point_t code_point = 0;
47425bb815Sopenharmony_ci    lit_code_point_t min_code_point = 0;
48425bb815Sopenharmony_ci    lit_utf8_size_t extra_bytes_count;
49425bb815Sopenharmony_ci    if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50425bb815Sopenharmony_ci    {
51425bb815Sopenharmony_ci      extra_bytes_count = 1;
52425bb815Sopenharmony_ci      min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53425bb815Sopenharmony_ci      code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54425bb815Sopenharmony_ci    }
55425bb815Sopenharmony_ci    else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56425bb815Sopenharmony_ci    {
57425bb815Sopenharmony_ci      extra_bytes_count = 2;
58425bb815Sopenharmony_ci      min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59425bb815Sopenharmony_ci      code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60425bb815Sopenharmony_ci    }
61425bb815Sopenharmony_ci    else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62425bb815Sopenharmony_ci    {
63425bb815Sopenharmony_ci      extra_bytes_count = 3;
64425bb815Sopenharmony_ci      min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65425bb815Sopenharmony_ci      code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66425bb815Sopenharmony_ci    }
67425bb815Sopenharmony_ci    else
68425bb815Sopenharmony_ci    {
69425bb815Sopenharmony_ci      /* utf-8 string could not contain 5- and 6-byte sequences. */
70425bb815Sopenharmony_ci      return false;
71425bb815Sopenharmony_ci    }
72425bb815Sopenharmony_ci
73425bb815Sopenharmony_ci    if (idx + extra_bytes_count > buf_size)
74425bb815Sopenharmony_ci    {
75425bb815Sopenharmony_ci      /* utf-8 string breaks in the middle */
76425bb815Sopenharmony_ci      return false;
77425bb815Sopenharmony_ci    }
78425bb815Sopenharmony_ci
79425bb815Sopenharmony_ci    for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
80425bb815Sopenharmony_ci    {
81425bb815Sopenharmony_ci      c = utf8_buf_p[idx + offset];
82425bb815Sopenharmony_ci      if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83425bb815Sopenharmony_ci      {
84425bb815Sopenharmony_ci        /* invalid continuation byte */
85425bb815Sopenharmony_ci        return false;
86425bb815Sopenharmony_ci      }
87425bb815Sopenharmony_ci      code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88425bb815Sopenharmony_ci      code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
89425bb815Sopenharmony_ci    }
90425bb815Sopenharmony_ci
91425bb815Sopenharmony_ci    if (code_point < min_code_point
92425bb815Sopenharmony_ci        || code_point > LIT_UNICODE_CODE_POINT_MAX)
93425bb815Sopenharmony_ci    {
94425bb815Sopenharmony_ci      /* utf-8 string doesn't encode valid unicode code point */
95425bb815Sopenharmony_ci      return false;
96425bb815Sopenharmony_ci    }
97425bb815Sopenharmony_ci
98425bb815Sopenharmony_ci    if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
99425bb815Sopenharmony_ci        && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
100425bb815Sopenharmony_ci    {
101425bb815Sopenharmony_ci      is_prev_code_point_high_surrogate = true;
102425bb815Sopenharmony_ci    }
103425bb815Sopenharmony_ci    else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
104425bb815Sopenharmony_ci             && code_point <= LIT_UTF16_LOW_SURROGATE_MAX
105425bb815Sopenharmony_ci             && is_prev_code_point_high_surrogate)
106425bb815Sopenharmony_ci    {
107425bb815Sopenharmony_ci      /* sequence of high and low surrogate is not allowed */
108425bb815Sopenharmony_ci      return false;
109425bb815Sopenharmony_ci    }
110425bb815Sopenharmony_ci    else
111425bb815Sopenharmony_ci    {
112425bb815Sopenharmony_ci      is_prev_code_point_high_surrogate = false;
113425bb815Sopenharmony_ci    }
114425bb815Sopenharmony_ci
115425bb815Sopenharmony_ci    idx += extra_bytes_count;
116425bb815Sopenharmony_ci  }
117425bb815Sopenharmony_ci
118425bb815Sopenharmony_ci  return true;
119425bb815Sopenharmony_ci} /* lit_is_valid_utf8_string */
120425bb815Sopenharmony_ci
121425bb815Sopenharmony_ci/**
122425bb815Sopenharmony_ci * Validate cesu-8 string
123425bb815Sopenharmony_ci *
124425bb815Sopenharmony_ci * @return true if cesu-8 string is well-formed
125425bb815Sopenharmony_ci *         false otherwise
126425bb815Sopenharmony_ci */
127425bb815Sopenharmony_cibool
128425bb815Sopenharmony_cilit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
129425bb815Sopenharmony_ci                           lit_utf8_size_t buf_size) /**< string size */
130425bb815Sopenharmony_ci{
131425bb815Sopenharmony_ci  lit_utf8_size_t idx = 0;
132425bb815Sopenharmony_ci
133425bb815Sopenharmony_ci  while (idx < buf_size)
134425bb815Sopenharmony_ci  {
135425bb815Sopenharmony_ci    lit_utf8_byte_t c = cesu8_buf_p[idx++];
136425bb815Sopenharmony_ci    if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
137425bb815Sopenharmony_ci    {
138425bb815Sopenharmony_ci      continue;
139425bb815Sopenharmony_ci    }
140425bb815Sopenharmony_ci
141425bb815Sopenharmony_ci    lit_code_point_t code_point = 0;
142425bb815Sopenharmony_ci    lit_code_point_t min_code_point = 0;
143425bb815Sopenharmony_ci    lit_utf8_size_t extra_bytes_count;
144425bb815Sopenharmony_ci    if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
145425bb815Sopenharmony_ci    {
146425bb815Sopenharmony_ci      extra_bytes_count = 1;
147425bb815Sopenharmony_ci      min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
148425bb815Sopenharmony_ci      code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
149425bb815Sopenharmony_ci    }
150425bb815Sopenharmony_ci    else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
151425bb815Sopenharmony_ci    {
152425bb815Sopenharmony_ci      extra_bytes_count = 2;
153425bb815Sopenharmony_ci      min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
154425bb815Sopenharmony_ci      code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
155425bb815Sopenharmony_ci    }
156425bb815Sopenharmony_ci    else
157425bb815Sopenharmony_ci    {
158425bb815Sopenharmony_ci      return false;
159425bb815Sopenharmony_ci    }
160425bb815Sopenharmony_ci
161425bb815Sopenharmony_ci    if (idx + extra_bytes_count > buf_size)
162425bb815Sopenharmony_ci    {
163425bb815Sopenharmony_ci      /* cesu-8 string breaks in the middle */
164425bb815Sopenharmony_ci      return false;
165425bb815Sopenharmony_ci    }
166425bb815Sopenharmony_ci
167425bb815Sopenharmony_ci    for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
168425bb815Sopenharmony_ci    {
169425bb815Sopenharmony_ci      c = cesu8_buf_p[idx + offset];
170425bb815Sopenharmony_ci      if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
171425bb815Sopenharmony_ci      {
172425bb815Sopenharmony_ci        /* invalid continuation byte */
173425bb815Sopenharmony_ci        return false;
174425bb815Sopenharmony_ci      }
175425bb815Sopenharmony_ci      code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
176425bb815Sopenharmony_ci      code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
177425bb815Sopenharmony_ci    }
178425bb815Sopenharmony_ci
179425bb815Sopenharmony_ci    if (code_point < min_code_point)
180425bb815Sopenharmony_ci    {
181425bb815Sopenharmony_ci      /* cesu-8 string doesn't encode valid unicode code point */
182425bb815Sopenharmony_ci      return false;
183425bb815Sopenharmony_ci    }
184425bb815Sopenharmony_ci
185425bb815Sopenharmony_ci    idx += extra_bytes_count;
186425bb815Sopenharmony_ci  }
187425bb815Sopenharmony_ci
188425bb815Sopenharmony_ci  return true;
189425bb815Sopenharmony_ci} /* lit_is_valid_cesu8_string */
190425bb815Sopenharmony_ci
191425bb815Sopenharmony_ci/**
192425bb815Sopenharmony_ci * Check if the code point is UTF-16 low surrogate
193425bb815Sopenharmony_ci *
194425bb815Sopenharmony_ci * @return true / false
195425bb815Sopenharmony_ci */
196425bb815Sopenharmony_cibool
197425bb815Sopenharmony_cilit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */
198425bb815Sopenharmony_ci{
199425bb815Sopenharmony_ci  return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
200425bb815Sopenharmony_ci} /* lit_is_code_point_utf16_low_surrogate */
201425bb815Sopenharmony_ci
202425bb815Sopenharmony_ci/**
203425bb815Sopenharmony_ci * Check if the code point is UTF-16 high surrogate
204425bb815Sopenharmony_ci *
205425bb815Sopenharmony_ci * @return true / false
206425bb815Sopenharmony_ci */
207425bb815Sopenharmony_cibool
208425bb815Sopenharmony_cilit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */
209425bb815Sopenharmony_ci{
210425bb815Sopenharmony_ci  return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
211425bb815Sopenharmony_ci} /* lit_is_code_point_utf16_high_surrogate */
212425bb815Sopenharmony_ci
213425bb815Sopenharmony_ci/**
214425bb815Sopenharmony_ci * Represents code point (>0xFFFF) as surrogate pair and returns its lower part
215425bb815Sopenharmony_ci *
216425bb815Sopenharmony_ci * @return lower code_unit of the surrogate pair
217425bb815Sopenharmony_ci */
218425bb815Sopenharmony_cistatic ecma_char_t
219425bb815Sopenharmony_ciconvert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
220425bb815Sopenharmony_ci{
221425bb815Sopenharmony_ci  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
222425bb815Sopenharmony_ci
223425bb815Sopenharmony_ci  ecma_char_t code_unit_bits;
224425bb815Sopenharmony_ci  code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
225425bb815Sopenharmony_ci
226425bb815Sopenharmony_ci  return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits);
227425bb815Sopenharmony_ci} /* convert_code_point_to_low_surrogate */
228425bb815Sopenharmony_ci
229425bb815Sopenharmony_ci/**
230425bb815Sopenharmony_ci * Represents code point (>0xFFFF) as surrogate pair and returns its higher part
231425bb815Sopenharmony_ci *
232425bb815Sopenharmony_ci * @return higher code_unit of the surrogate pair
233425bb815Sopenharmony_ci */
234425bb815Sopenharmony_cistatic ecma_char_t
235425bb815Sopenharmony_ciconvert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
236425bb815Sopenharmony_ci{
237425bb815Sopenharmony_ci  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
238425bb815Sopenharmony_ci  JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
239425bb815Sopenharmony_ci
240425bb815Sopenharmony_ci  ecma_char_t code_unit_bits;
241425bb815Sopenharmony_ci  code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
242425bb815Sopenharmony_ci
243425bb815Sopenharmony_ci  return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
244425bb815Sopenharmony_ci} /* convert_code_point_to_high_surrogate */
245425bb815Sopenharmony_ci
246425bb815Sopenharmony_ci/**
247425bb815Sopenharmony_ci * UTF16 Encoding method for a code point
248425bb815Sopenharmony_ci *
249425bb815Sopenharmony_ci * See also:
250425bb815Sopenharmony_ci *          ECMA-262 v6, 10.1.1
251425bb815Sopenharmony_ci *
252425bb815Sopenharmony_ci * @return uint8_t, the number of returning code points
253425bb815Sopenharmony_ci */
254425bb815Sopenharmony_ciuint8_t
255425bb815Sopenharmony_cilit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */
256425bb815Sopenharmony_ci                             ecma_char_t *cu_p) /**< result of the encoding */
257425bb815Sopenharmony_ci{
258425bb815Sopenharmony_ci  if (cp <= LIT_UTF16_CODE_UNIT_MAX)
259425bb815Sopenharmony_ci  {
260425bb815Sopenharmony_ci    cu_p[0] = (ecma_char_t) cp;
261425bb815Sopenharmony_ci    return 1;
262425bb815Sopenharmony_ci  }
263425bb815Sopenharmony_ci
264425bb815Sopenharmony_ci  cu_p[0] = convert_code_point_to_high_surrogate (cp);
265425bb815Sopenharmony_ci  cu_p[1] = convert_code_point_to_low_surrogate (cp);
266425bb815Sopenharmony_ci  return 2;
267425bb815Sopenharmony_ci} /* lit_utf16_encode_code_point */
268425bb815Sopenharmony_ci
269425bb815Sopenharmony_ci/**
270425bb815Sopenharmony_ci * Calculate size of a zero-terminated utf-8 string
271425bb815Sopenharmony_ci *
272425bb815Sopenharmony_ci * NOTE:
273425bb815Sopenharmony_ci *   - string cannot be NULL
274425bb815Sopenharmony_ci *   - string should not contain zero characters in the middle
275425bb815Sopenharmony_ci *
276425bb815Sopenharmony_ci * @return size of a string
277425bb815Sopenharmony_ci */
278425bb815Sopenharmony_cilit_utf8_size_t
279425bb815Sopenharmony_cilit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
280425bb815Sopenharmony_ci{
281425bb815Sopenharmony_ci  JERRY_ASSERT (utf8_str_p != NULL);
282425bb815Sopenharmony_ci  return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
283425bb815Sopenharmony_ci} /* lit_zt_utf8_string_size */
284425bb815Sopenharmony_ci
285425bb815Sopenharmony_ci/**
286425bb815Sopenharmony_ci * Calculate length of a cesu-8 encoded string
287425bb815Sopenharmony_ci *
288425bb815Sopenharmony_ci * @return UTF-16 code units count
289425bb815Sopenharmony_ci */
290425bb815Sopenharmony_ciecma_length_t
291425bb815Sopenharmony_cilit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
292425bb815Sopenharmony_ci                        lit_utf8_size_t utf8_buf_size) /**< string size */
293425bb815Sopenharmony_ci{
294425bb815Sopenharmony_ci  ecma_length_t length = 0;
295425bb815Sopenharmony_ci  lit_utf8_size_t size = 0;
296425bb815Sopenharmony_ci
297425bb815Sopenharmony_ci  while (size < utf8_buf_size)
298425bb815Sopenharmony_ci  {
299425bb815Sopenharmony_ci    size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
300425bb815Sopenharmony_ci    length++;
301425bb815Sopenharmony_ci  }
302425bb815Sopenharmony_ci
303425bb815Sopenharmony_ci  JERRY_ASSERT (size == utf8_buf_size);
304425bb815Sopenharmony_ci
305425bb815Sopenharmony_ci  return length;
306425bb815Sopenharmony_ci} /* lit_utf8_string_length */
307425bb815Sopenharmony_ci
308425bb815Sopenharmony_ci/**
309425bb815Sopenharmony_ci * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
310425bb815Sopenharmony_ci *
311425bb815Sopenharmony_ci * @return size of an utf-8 encoded string
312425bb815Sopenharmony_ci */
313425bb815Sopenharmony_cilit_utf8_size_t
314425bb815Sopenharmony_cilit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
315425bb815Sopenharmony_ci                                   lit_utf8_size_t cesu8_buf_size) /**< string size */
316425bb815Sopenharmony_ci{
317425bb815Sopenharmony_ci  lit_utf8_size_t offset = 0;
318425bb815Sopenharmony_ci  lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
319425bb815Sopenharmony_ci  ecma_char_t prev_ch = 0;
320425bb815Sopenharmony_ci
321425bb815Sopenharmony_ci  while (offset < cesu8_buf_size)
322425bb815Sopenharmony_ci  {
323425bb815Sopenharmony_ci    ecma_char_t ch;
324425bb815Sopenharmony_ci    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
325425bb815Sopenharmony_ci
326425bb815Sopenharmony_ci    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
327425bb815Sopenharmony_ci    {
328425bb815Sopenharmony_ci      utf8_buf_size -= 2;
329425bb815Sopenharmony_ci    }
330425bb815Sopenharmony_ci
331425bb815Sopenharmony_ci    prev_ch = ch;
332425bb815Sopenharmony_ci  }
333425bb815Sopenharmony_ci
334425bb815Sopenharmony_ci  JERRY_ASSERT (offset == cesu8_buf_size);
335425bb815Sopenharmony_ci
336425bb815Sopenharmony_ci  return utf8_buf_size;
337425bb815Sopenharmony_ci} /* lit_get_utf8_size_of_cesu8_string */
338425bb815Sopenharmony_ci
339425bb815Sopenharmony_ci/**
340425bb815Sopenharmony_ci * Calculate length of an utf-8 encoded string from cesu-8 encoded string
341425bb815Sopenharmony_ci *
342425bb815Sopenharmony_ci * @return length of an utf-8 encoded string
343425bb815Sopenharmony_ci */
344425bb815Sopenharmony_ciecma_length_t
345425bb815Sopenharmony_cilit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
346425bb815Sopenharmony_ci                                     lit_utf8_size_t cesu8_buf_size) /**< string size */
347425bb815Sopenharmony_ci{
348425bb815Sopenharmony_ci  lit_utf8_size_t offset = 0;
349425bb815Sopenharmony_ci  ecma_length_t utf8_length = 0;
350425bb815Sopenharmony_ci  ecma_char_t prev_ch = 0;
351425bb815Sopenharmony_ci
352425bb815Sopenharmony_ci  while (offset < cesu8_buf_size)
353425bb815Sopenharmony_ci  {
354425bb815Sopenharmony_ci    ecma_char_t ch;
355425bb815Sopenharmony_ci    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
356425bb815Sopenharmony_ci
357425bb815Sopenharmony_ci    if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch))
358425bb815Sopenharmony_ci    {
359425bb815Sopenharmony_ci      utf8_length++;
360425bb815Sopenharmony_ci    }
361425bb815Sopenharmony_ci
362425bb815Sopenharmony_ci    prev_ch = ch;
363425bb815Sopenharmony_ci  }
364425bb815Sopenharmony_ci
365425bb815Sopenharmony_ci  JERRY_ASSERT (offset == cesu8_buf_size);
366425bb815Sopenharmony_ci
367425bb815Sopenharmony_ci  return utf8_length;
368425bb815Sopenharmony_ci} /* lit_get_utf8_length_of_cesu8_string */
369425bb815Sopenharmony_ci
370425bb815Sopenharmony_ci/**
371425bb815Sopenharmony_ci * Decodes a unicode code point from non-empty utf-8-encoded buffer
372425bb815Sopenharmony_ci *
373425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string
374425bb815Sopenharmony_ci */
375425bb815Sopenharmony_cilit_utf8_size_t
376425bb815Sopenharmony_cilit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
377425bb815Sopenharmony_ci                               lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
378425bb815Sopenharmony_ci                               lit_code_point_t *code_point) /**< [out] code point */
379425bb815Sopenharmony_ci{
380425bb815Sopenharmony_ci  JERRY_ASSERT (buf_p && buf_size);
381425bb815Sopenharmony_ci
382425bb815Sopenharmony_ci  lit_utf8_byte_t c = buf_p[0];
383425bb815Sopenharmony_ci  if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
384425bb815Sopenharmony_ci  {
385425bb815Sopenharmony_ci    *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
386425bb815Sopenharmony_ci    return 1;
387425bb815Sopenharmony_ci  }
388425bb815Sopenharmony_ci
389425bb815Sopenharmony_ci  lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
390425bb815Sopenharmony_ci  ecma_length_t bytes_count = 0;
391425bb815Sopenharmony_ci  if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
392425bb815Sopenharmony_ci  {
393425bb815Sopenharmony_ci    bytes_count = 2;
394425bb815Sopenharmony_ci    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
395425bb815Sopenharmony_ci  }
396425bb815Sopenharmony_ci  else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
397425bb815Sopenharmony_ci  {
398425bb815Sopenharmony_ci    bytes_count = 3;
399425bb815Sopenharmony_ci    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
400425bb815Sopenharmony_ci  }
401425bb815Sopenharmony_ci  else
402425bb815Sopenharmony_ci  {
403425bb815Sopenharmony_ci    JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
404425bb815Sopenharmony_ci    bytes_count = 4;
405425bb815Sopenharmony_ci    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
406425bb815Sopenharmony_ci  }
407425bb815Sopenharmony_ci
408425bb815Sopenharmony_ci  JERRY_ASSERT (buf_size >= bytes_count);
409425bb815Sopenharmony_ci
410425bb815Sopenharmony_ci  for (uint32_t i = 1; i < bytes_count; ++i)
411425bb815Sopenharmony_ci  {
412425bb815Sopenharmony_ci    ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
413425bb815Sopenharmony_ci    ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
414425bb815Sopenharmony_ci  }
415425bb815Sopenharmony_ci
416425bb815Sopenharmony_ci  *code_point = ret;
417425bb815Sopenharmony_ci  return bytes_count;
418425bb815Sopenharmony_ci} /* lit_read_code_point_from_utf8 */
419425bb815Sopenharmony_ci
420425bb815Sopenharmony_ci/**
421425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
422425bb815Sopenharmony_ci *
423425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string
424425bb815Sopenharmony_ci */
425425bb815Sopenharmony_cilit_utf8_size_t
426425bb815Sopenharmony_cilit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
427425bb815Sopenharmony_ci                              ecma_char_t *code_point) /**< [out] code point */
428425bb815Sopenharmony_ci{
429425bb815Sopenharmony_ci  JERRY_ASSERT (buf_p);
430425bb815Sopenharmony_ci
431425bb815Sopenharmony_ci  lit_utf8_byte_t c = buf_p[0];
432425bb815Sopenharmony_ci  if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
433425bb815Sopenharmony_ci  {
434425bb815Sopenharmony_ci    *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
435425bb815Sopenharmony_ci    return 1;
436425bb815Sopenharmony_ci  }
437425bb815Sopenharmony_ci
438425bb815Sopenharmony_ci  lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
439425bb815Sopenharmony_ci  ecma_length_t bytes_count;
440425bb815Sopenharmony_ci  if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
441425bb815Sopenharmony_ci  {
442425bb815Sopenharmony_ci    bytes_count = 2;
443425bb815Sopenharmony_ci    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
444425bb815Sopenharmony_ci  }
445425bb815Sopenharmony_ci  else
446425bb815Sopenharmony_ci  {
447425bb815Sopenharmony_ci    JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
448425bb815Sopenharmony_ci    bytes_count = 3;
449425bb815Sopenharmony_ci    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
450425bb815Sopenharmony_ci  }
451425bb815Sopenharmony_ci
452425bb815Sopenharmony_ci  for (uint32_t i = 1; i < bytes_count; ++i)
453425bb815Sopenharmony_ci  {
454425bb815Sopenharmony_ci    ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
455425bb815Sopenharmony_ci    ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
456425bb815Sopenharmony_ci  }
457425bb815Sopenharmony_ci
458425bb815Sopenharmony_ci  JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
459425bb815Sopenharmony_ci  *code_point = (ecma_char_t) ret;
460425bb815Sopenharmony_ci  return bytes_count;
461425bb815Sopenharmony_ci} /* lit_read_code_unit_from_utf8 */
462425bb815Sopenharmony_ci
463425bb815Sopenharmony_ci/**
464425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
465425bb815Sopenharmony_ci *
466425bb815Sopenharmony_ci * @return number of bytes occupied by code point in the string
467425bb815Sopenharmony_ci */
468425bb815Sopenharmony_cilit_utf8_size_t
469425bb815Sopenharmony_cilit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
470425bb815Sopenharmony_ci                                   ecma_char_t *code_point) /**< [out] code point */
471425bb815Sopenharmony_ci{
472425bb815Sopenharmony_ci  JERRY_ASSERT (buf_p);
473425bb815Sopenharmony_ci
474425bb815Sopenharmony_ci  lit_utf8_decr (&buf_p);
475425bb815Sopenharmony_ci  return lit_read_code_unit_from_utf8 (buf_p, code_point);
476425bb815Sopenharmony_ci} /* lit_read_prev_code_unit_from_utf8 */
477425bb815Sopenharmony_ci
478425bb815Sopenharmony_ci/**
479425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
480425bb815Sopenharmony_ci *
481425bb815Sopenharmony_ci * @return next code unit
482425bb815Sopenharmony_ci */
483425bb815Sopenharmony_ciecma_char_t
484425bb815Sopenharmony_cilit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
485425bb815Sopenharmony_ci{
486425bb815Sopenharmony_ci  JERRY_ASSERT (*buf_p);
487425bb815Sopenharmony_ci  ecma_char_t ch;
488425bb815Sopenharmony_ci
489425bb815Sopenharmony_ci  *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);
490425bb815Sopenharmony_ci
491425bb815Sopenharmony_ci  return ch;
492425bb815Sopenharmony_ci} /* lit_cesu8_read_next */
493425bb815Sopenharmony_ci
494425bb815Sopenharmony_ci/**
495425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
496425bb815Sopenharmony_ci *
497425bb815Sopenharmony_ci * @return previous code unit
498425bb815Sopenharmony_ci */
499425bb815Sopenharmony_ciecma_char_t
500425bb815Sopenharmony_cilit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
501425bb815Sopenharmony_ci{
502425bb815Sopenharmony_ci  JERRY_ASSERT (*buf_p);
503425bb815Sopenharmony_ci  ecma_char_t ch;
504425bb815Sopenharmony_ci
505425bb815Sopenharmony_ci  lit_utf8_decr (buf_p);
506425bb815Sopenharmony_ci  lit_read_code_unit_from_utf8 (*buf_p, &ch);
507425bb815Sopenharmony_ci
508425bb815Sopenharmony_ci  return ch;
509425bb815Sopenharmony_ci} /* lit_cesu8_read_prev */
510425bb815Sopenharmony_ci
511425bb815Sopenharmony_ci/**
512425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
513425bb815Sopenharmony_ci *
514425bb815Sopenharmony_ci * @return next code unit
515425bb815Sopenharmony_ci */
516425bb815Sopenharmony_ciecma_char_t JERRY_ATTR_NOINLINE
517425bb815Sopenharmony_cilit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
518425bb815Sopenharmony_ci{
519425bb815Sopenharmony_ci  JERRY_ASSERT (buf_p != NULL);
520425bb815Sopenharmony_ci  ecma_char_t ch;
521425bb815Sopenharmony_ci
522425bb815Sopenharmony_ci  lit_read_code_unit_from_utf8 (buf_p, &ch);
523425bb815Sopenharmony_ci
524425bb815Sopenharmony_ci  return ch;
525425bb815Sopenharmony_ci} /* lit_cesu8_peek_next */
526425bb815Sopenharmony_ci
527425bb815Sopenharmony_ci/**
528425bb815Sopenharmony_ci * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
529425bb815Sopenharmony_ci *
530425bb815Sopenharmony_ci * @return previous code unit
531425bb815Sopenharmony_ci */
532425bb815Sopenharmony_ciecma_char_t JERRY_ATTR_NOINLINE
533425bb815Sopenharmony_cilit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
534425bb815Sopenharmony_ci{
535425bb815Sopenharmony_ci  JERRY_ASSERT (buf_p != NULL);
536425bb815Sopenharmony_ci  ecma_char_t ch;
537425bb815Sopenharmony_ci
538425bb815Sopenharmony_ci  lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
539425bb815Sopenharmony_ci
540425bb815Sopenharmony_ci  return ch;
541425bb815Sopenharmony_ci} /* lit_cesu8_peek_prev */
542425bb815Sopenharmony_ci
543425bb815Sopenharmony_ci/**
544425bb815Sopenharmony_ci * Increase cesu-8 encoded string pointer by one code unit.
545425bb815Sopenharmony_ci */
546425bb815Sopenharmony_ciinline void JERRY_ATTR_ALWAYS_INLINE
547425bb815Sopenharmony_cilit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
548425bb815Sopenharmony_ci{
549425bb815Sopenharmony_ci  JERRY_ASSERT (*buf_p);
550425bb815Sopenharmony_ci
551425bb815Sopenharmony_ci  *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
552425bb815Sopenharmony_ci} /* lit_utf8_incr */
553425bb815Sopenharmony_ci
554425bb815Sopenharmony_ci/**
555425bb815Sopenharmony_ci * Decrease cesu-8 encoded string pointer by one code unit.
556425bb815Sopenharmony_ci */
557425bb815Sopenharmony_civoid
558425bb815Sopenharmony_cilit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
559425bb815Sopenharmony_ci{
560425bb815Sopenharmony_ci  JERRY_ASSERT (*buf_p);
561425bb815Sopenharmony_ci  const lit_utf8_byte_t *current_p = *buf_p;
562425bb815Sopenharmony_ci
563425bb815Sopenharmony_ci  do
564425bb815Sopenharmony_ci  {
565425bb815Sopenharmony_ci    current_p--;
566425bb815Sopenharmony_ci  }
567425bb815Sopenharmony_ci  while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
568425bb815Sopenharmony_ci
569425bb815Sopenharmony_ci  *buf_p = current_p;
570425bb815Sopenharmony_ci} /* lit_utf8_decr */
571425bb815Sopenharmony_ci
572425bb815Sopenharmony_ci/**
573425bb815Sopenharmony_ci * Calc hash using the specified hash_basis.
574425bb815Sopenharmony_ci *
575425bb815Sopenharmony_ci * NOTE:
576425bb815Sopenharmony_ci *   This is implementation of FNV-1a hash function, which is released into public domain.
577425bb815Sopenharmony_ci *   Constants used, are carefully picked primes by the authors.
578425bb815Sopenharmony_ci *   More info: http://www.isthe.com/chongo/tech/comp/fnv/
579425bb815Sopenharmony_ci *
580425bb815Sopenharmony_ci * @return ecma-string's hash
581425bb815Sopenharmony_ci */
582425bb815Sopenharmony_ciinline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
583425bb815Sopenharmony_cilit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */
584425bb815Sopenharmony_ci                              const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
585425bb815Sopenharmony_ci                              lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
586425bb815Sopenharmony_ci{
587425bb815Sopenharmony_ci  JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
588425bb815Sopenharmony_ci
589425bb815Sopenharmony_ci  uint32_t hash = hash_basis;
590425bb815Sopenharmony_ci
591425bb815Sopenharmony_ci  for (uint32_t i = 0; i < utf8_buf_size; i++)
592425bb815Sopenharmony_ci  {
593425bb815Sopenharmony_ci    /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */
594425bb815Sopenharmony_ci    hash = (hash ^ utf8_buf_p[i]) * 16777619;
595425bb815Sopenharmony_ci  }
596425bb815Sopenharmony_ci
597425bb815Sopenharmony_ci  return (lit_string_hash_t) hash;
598425bb815Sopenharmony_ci} /* lit_utf8_string_hash_combine */
599425bb815Sopenharmony_ci
600425bb815Sopenharmony_ci/**
601425bb815Sopenharmony_ci * Calculate hash from the buffer.
602425bb815Sopenharmony_ci *
603425bb815Sopenharmony_ci * @return ecma-string's hash
604425bb815Sopenharmony_ci */
605425bb815Sopenharmony_ciinline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
606425bb815Sopenharmony_cilit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
607425bb815Sopenharmony_ci                           lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
608425bb815Sopenharmony_ci{
609425bb815Sopenharmony_ci  JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
610425bb815Sopenharmony_ci
611425bb815Sopenharmony_ci  /* 32 bit offset_basis for FNV = 2166136261 */
612425bb815Sopenharmony_ci  return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size);
613425bb815Sopenharmony_ci} /* lit_utf8_string_calc_hash */
614425bb815Sopenharmony_ci
615425bb815Sopenharmony_ci/**
616425bb815Sopenharmony_ci * Return code unit at the specified position in string
617425bb815Sopenharmony_ci *
618425bb815Sopenharmony_ci * NOTE:
619425bb815Sopenharmony_ci *   code_unit_offset should be less then string's length
620425bb815Sopenharmony_ci *
621425bb815Sopenharmony_ci * @return code unit value
622425bb815Sopenharmony_ci */
623425bb815Sopenharmony_ciecma_char_t
624425bb815Sopenharmony_cilit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
625425bb815Sopenharmony_ci                              lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
626425bb815Sopenharmony_ci                              ecma_length_t code_unit_offset) /**< ofset of a code_unit */
627425bb815Sopenharmony_ci{
628425bb815Sopenharmony_ci  lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
629425bb815Sopenharmony_ci  ecma_char_t code_unit;
630425bb815Sopenharmony_ci
631425bb815Sopenharmony_ci  do
632425bb815Sopenharmony_ci  {
633425bb815Sopenharmony_ci    JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
634425bb815Sopenharmony_ci    current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
635425bb815Sopenharmony_ci  }
636425bb815Sopenharmony_ci  while (code_unit_offset--);
637425bb815Sopenharmony_ci
638425bb815Sopenharmony_ci  return code_unit;
639425bb815Sopenharmony_ci} /* lit_utf8_string_code_unit_at */
640425bb815Sopenharmony_ci
641425bb815Sopenharmony_ci/**
642425bb815Sopenharmony_ci * Get CESU-8 encoded size of character
643425bb815Sopenharmony_ci *
644425bb815Sopenharmony_ci * @return number of bytes occupied in CESU-8
645425bb815Sopenharmony_ci */
646425bb815Sopenharmony_ciinline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
647425bb815Sopenharmony_cilit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
648425bb815Sopenharmony_ci{
649425bb815Sopenharmony_ci  if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
650425bb815Sopenharmony_ci  {
651425bb815Sopenharmony_ci    return 1;
652425bb815Sopenharmony_ci  }
653425bb815Sopenharmony_ci  else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
654425bb815Sopenharmony_ci  {
655425bb815Sopenharmony_ci    return 2;
656425bb815Sopenharmony_ci  }
657425bb815Sopenharmony_ci  else
658425bb815Sopenharmony_ci  {
659425bb815Sopenharmony_ci    JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
660425bb815Sopenharmony_ci    return 3;
661425bb815Sopenharmony_ci  }
662425bb815Sopenharmony_ci} /* lit_get_unicode_char_size_by_utf8_first_byte */
663425bb815Sopenharmony_ci
664425bb815Sopenharmony_ci/**
665425bb815Sopenharmony_ci * Convert code unit to cesu-8 representation
666425bb815Sopenharmony_ci *
667425bb815Sopenharmony_ci * @return byte count required to represent the code unit
668425bb815Sopenharmony_ci */
669425bb815Sopenharmony_cilit_utf8_size_t
670425bb815Sopenharmony_cilit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
671425bb815Sopenharmony_ci                       lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size
672425bb815Sopenharmony_ci                                                *   should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
673425bb815Sopenharmony_ci{
674425bb815Sopenharmony_ci  if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
675425bb815Sopenharmony_ci  {
676425bb815Sopenharmony_ci    buf_p[0] = (lit_utf8_byte_t) code_unit;
677425bb815Sopenharmony_ci    return 1;
678425bb815Sopenharmony_ci  }
679425bb815Sopenharmony_ci  else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
680425bb815Sopenharmony_ci  {
681425bb815Sopenharmony_ci    uint32_t code_unit_bits = code_unit;
682425bb815Sopenharmony_ci    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
683425bb815Sopenharmony_ci    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
684425bb815Sopenharmony_ci
685425bb815Sopenharmony_ci    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
686425bb815Sopenharmony_ci    JERRY_ASSERT (first_byte_bits == code_unit_bits);
687425bb815Sopenharmony_ci
688425bb815Sopenharmony_ci    buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
689425bb815Sopenharmony_ci    buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
690425bb815Sopenharmony_ci    return 2;
691425bb815Sopenharmony_ci  }
692425bb815Sopenharmony_ci  else
693425bb815Sopenharmony_ci  {
694425bb815Sopenharmony_ci    uint32_t code_unit_bits = code_unit;
695425bb815Sopenharmony_ci    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
696425bb815Sopenharmony_ci    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
697425bb815Sopenharmony_ci
698425bb815Sopenharmony_ci    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
699425bb815Sopenharmony_ci    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
700425bb815Sopenharmony_ci
701425bb815Sopenharmony_ci    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
702425bb815Sopenharmony_ci    JERRY_ASSERT (first_byte_bits == code_unit_bits);
703425bb815Sopenharmony_ci
704425bb815Sopenharmony_ci    buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
705425bb815Sopenharmony_ci    buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
706425bb815Sopenharmony_ci    buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
707425bb815Sopenharmony_ci    return 3;
708425bb815Sopenharmony_ci  }
709425bb815Sopenharmony_ci} /* lit_code_unit_to_utf8 */
710425bb815Sopenharmony_ci
711425bb815Sopenharmony_ci/**
712425bb815Sopenharmony_ci * Convert code point to cesu-8 representation
713425bb815Sopenharmony_ci *
714425bb815Sopenharmony_ci * @return byte count required to represent the code point
715425bb815Sopenharmony_ci */
716425bb815Sopenharmony_cilit_utf8_size_t
717425bb815Sopenharmony_cilit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
718425bb815Sopenharmony_ci                         lit_utf8_byte_t *buf) /**< buffer where to store the result,
719425bb815Sopenharmony_ci                                                *   its size should be at least 6 bytes */
720425bb815Sopenharmony_ci{
721425bb815Sopenharmony_ci  if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
722425bb815Sopenharmony_ci  {
723425bb815Sopenharmony_ci    return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
724425bb815Sopenharmony_ci  }
725425bb815Sopenharmony_ci  else
726425bb815Sopenharmony_ci  {
727425bb815Sopenharmony_ci    lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
728425bb815Sopenharmony_ci    offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
729425bb815Sopenharmony_ci    return offset;
730425bb815Sopenharmony_ci  }
731425bb815Sopenharmony_ci} /* lit_code_point_to_cesu8 */
732425bb815Sopenharmony_ci
733425bb815Sopenharmony_ci/**
734425bb815Sopenharmony_ci * Convert code point to utf-8 representation
735425bb815Sopenharmony_ci *
736425bb815Sopenharmony_ci * @return byte count required to represent the code point
737425bb815Sopenharmony_ci */
738425bb815Sopenharmony_cilit_utf8_size_t
739425bb815Sopenharmony_cilit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
740425bb815Sopenharmony_ci                        lit_utf8_byte_t *buf) /**< buffer where to store the result,
741425bb815Sopenharmony_ci                                              *   its size should be at least 4 bytes */
742425bb815Sopenharmony_ci{
743425bb815Sopenharmony_ci  if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
744425bb815Sopenharmony_ci  {
745425bb815Sopenharmony_ci    buf[0] = (lit_utf8_byte_t) code_point;
746425bb815Sopenharmony_ci    return 1;
747425bb815Sopenharmony_ci  }
748425bb815Sopenharmony_ci  else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
749425bb815Sopenharmony_ci  {
750425bb815Sopenharmony_ci    uint32_t code_point_bits = code_point;
751425bb815Sopenharmony_ci    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
752425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
753425bb815Sopenharmony_ci
754425bb815Sopenharmony_ci    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
755425bb815Sopenharmony_ci    JERRY_ASSERT (first_byte_bits == code_point_bits);
756425bb815Sopenharmony_ci
757425bb815Sopenharmony_ci    buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
758425bb815Sopenharmony_ci    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
759425bb815Sopenharmony_ci    return 2;
760425bb815Sopenharmony_ci  }
761425bb815Sopenharmony_ci  else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
762425bb815Sopenharmony_ci  {
763425bb815Sopenharmony_ci    uint32_t code_point_bits = code_point;
764425bb815Sopenharmony_ci    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
765425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
766425bb815Sopenharmony_ci
767425bb815Sopenharmony_ci    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
768425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
769425bb815Sopenharmony_ci
770425bb815Sopenharmony_ci    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
771425bb815Sopenharmony_ci    JERRY_ASSERT (first_byte_bits == code_point_bits);
772425bb815Sopenharmony_ci
773425bb815Sopenharmony_ci    buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
774425bb815Sopenharmony_ci    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
775425bb815Sopenharmony_ci    buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
776425bb815Sopenharmony_ci    return 3;
777425bb815Sopenharmony_ci  }
778425bb815Sopenharmony_ci  else
779425bb815Sopenharmony_ci  {
780425bb815Sopenharmony_ci    JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
781425bb815Sopenharmony_ci
782425bb815Sopenharmony_ci    uint32_t code_point_bits = code_point;
783425bb815Sopenharmony_ci    lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
784425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
785425bb815Sopenharmony_ci
786425bb815Sopenharmony_ci    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
787425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
788425bb815Sopenharmony_ci
789425bb815Sopenharmony_ci    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
790425bb815Sopenharmony_ci    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
791425bb815Sopenharmony_ci
792425bb815Sopenharmony_ci    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
793425bb815Sopenharmony_ci    JERRY_ASSERT (first_byte_bits == code_point_bits);
794425bb815Sopenharmony_ci
795425bb815Sopenharmony_ci    buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits;
796425bb815Sopenharmony_ci    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
797425bb815Sopenharmony_ci    buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
798425bb815Sopenharmony_ci    buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
799425bb815Sopenharmony_ci    return 4;
800425bb815Sopenharmony_ci  }
801425bb815Sopenharmony_ci} /* lit_code_point_to_utf8 */
802425bb815Sopenharmony_ci
803425bb815Sopenharmony_ci/**
804425bb815Sopenharmony_ci * Convert cesu-8 string to an utf-8 string and put it into the buffer.
805425bb815Sopenharmony_ci * It is the caller's responsibility to make sure that the string fits in the buffer.
806425bb815Sopenharmony_ci *
807425bb815Sopenharmony_ci * @return number of bytes copied to the buffer.
808425bb815Sopenharmony_ci */
809425bb815Sopenharmony_cilit_utf8_size_t
810425bb815Sopenharmony_cilit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */
811425bb815Sopenharmony_ci                                         lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
812425bb815Sopenharmony_ci                                         lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer
813425bb815Sopenharmony_ci                                                                        * (can be NULL if buffer_size == 0) */
814425bb815Sopenharmony_ci                                         lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
815425bb815Sopenharmony_ci{
816425bb815Sopenharmony_ci  const lit_utf8_byte_t *cesu8_pos = cesu8_string;
817425bb815Sopenharmony_ci  const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
818425bb815Sopenharmony_ci
819425bb815Sopenharmony_ci  lit_utf8_byte_t *utf8_pos = utf8_string;
820425bb815Sopenharmony_ci  lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
821425bb815Sopenharmony_ci
822425bb815Sopenharmony_ci  lit_utf8_size_t size = 0;
823425bb815Sopenharmony_ci
824425bb815Sopenharmony_ci  ecma_char_t prev_ch = 0;
825425bb815Sopenharmony_ci  lit_utf8_size_t prev_ch_size = 0;
826425bb815Sopenharmony_ci
827425bb815Sopenharmony_ci  while (cesu8_pos < cesu8_end_pos)
828425bb815Sopenharmony_ci  {
829425bb815Sopenharmony_ci    ecma_char_t ch;
830425bb815Sopenharmony_ci    lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
831425bb815Sopenharmony_ci
832425bb815Sopenharmony_ci    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
833425bb815Sopenharmony_ci    {
834425bb815Sopenharmony_ci      JERRY_ASSERT (code_unit_size == prev_ch_size);
835425bb815Sopenharmony_ci      utf8_pos -= prev_ch_size;
836425bb815Sopenharmony_ci      lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
837425bb815Sopenharmony_ci      lit_code_point_to_utf8 (code_point, utf8_pos);
838425bb815Sopenharmony_ci      size++;
839425bb815Sopenharmony_ci    }
840425bb815Sopenharmony_ci    else
841425bb815Sopenharmony_ci    {
842425bb815Sopenharmony_ci      memcpy (utf8_pos, cesu8_pos, code_unit_size);
843425bb815Sopenharmony_ci      size += code_unit_size;
844425bb815Sopenharmony_ci    }
845425bb815Sopenharmony_ci
846425bb815Sopenharmony_ci    utf8_pos = utf8_string + size;
847425bb815Sopenharmony_ci    cesu8_pos += code_unit_size;
848425bb815Sopenharmony_ci    prev_ch = ch;
849425bb815Sopenharmony_ci    prev_ch_size = code_unit_size;
850425bb815Sopenharmony_ci  }
851425bb815Sopenharmony_ci
852425bb815Sopenharmony_ci  JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
853425bb815Sopenharmony_ci  JERRY_ASSERT (utf8_pos <= utf8_end_pos);
854425bb815Sopenharmony_ci
855425bb815Sopenharmony_ci  return size;
856425bb815Sopenharmony_ci} /* lit_convert_cesu8_string_to_utf8_string */
857425bb815Sopenharmony_ci
858425bb815Sopenharmony_ci/**
859425bb815Sopenharmony_ci * Convert surrogate pair to code point
860425bb815Sopenharmony_ci *
861425bb815Sopenharmony_ci * @return code point
862425bb815Sopenharmony_ci */
863425bb815Sopenharmony_cilit_code_point_t
864425bb815Sopenharmony_cilit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
865425bb815Sopenharmony_ci                                          ecma_char_t low_surrogate) /**< low surrogate code point */
866425bb815Sopenharmony_ci{
867425bb815Sopenharmony_ci  JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
868425bb815Sopenharmony_ci  JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
869425bb815Sopenharmony_ci
870425bb815Sopenharmony_ci  lit_code_point_t code_point;
871425bb815Sopenharmony_ci  code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
872425bb815Sopenharmony_ci  code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
873425bb815Sopenharmony_ci
874425bb815Sopenharmony_ci  code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
875425bb815Sopenharmony_ci
876425bb815Sopenharmony_ci  code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
877425bb815Sopenharmony_ci  return code_point;
878425bb815Sopenharmony_ci} /* lit_convert_surrogate_pair_to_code_point */
879425bb815Sopenharmony_ci
880425bb815Sopenharmony_ci/**
881425bb815Sopenharmony_ci * Relational compare of cesu-8 strings
882425bb815Sopenharmony_ci *
883425bb815Sopenharmony_ci * First string is less than second string if:
884425bb815Sopenharmony_ci *  - strings are not equal;
885425bb815Sopenharmony_ci *  - first string is prefix of second or is lexicographically less than second.
886425bb815Sopenharmony_ci *
887425bb815Sopenharmony_ci * @return true - if first string is less than second string,
888425bb815Sopenharmony_ci *         false - otherwise
889425bb815Sopenharmony_ci */
890425bb815Sopenharmony_cibool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
891425bb815Sopenharmony_ci                                          lit_utf8_size_t string1_size, /**< string size */
892425bb815Sopenharmony_ci                                          const lit_utf8_byte_t *string2_p, /**< utf-8 string */
893425bb815Sopenharmony_ci                                          lit_utf8_size_t string2_size) /**< string size */
894425bb815Sopenharmony_ci{
895425bb815Sopenharmony_ci  lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
896425bb815Sopenharmony_ci  lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
897425bb815Sopenharmony_ci  const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
898425bb815Sopenharmony_ci  const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
899425bb815Sopenharmony_ci
900425bb815Sopenharmony_ci  while (string1_pos < string1_end_p && string2_pos < string2_end_p)
901425bb815Sopenharmony_ci  {
902425bb815Sopenharmony_ci    ecma_char_t ch1, ch2;
903425bb815Sopenharmony_ci    string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
904425bb815Sopenharmony_ci    string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
905425bb815Sopenharmony_ci
906425bb815Sopenharmony_ci    if (ch1 < ch2)
907425bb815Sopenharmony_ci    {
908425bb815Sopenharmony_ci      return true;
909425bb815Sopenharmony_ci    }
910425bb815Sopenharmony_ci    else if (ch1 > ch2)
911425bb815Sopenharmony_ci    {
912425bb815Sopenharmony_ci      return false;
913425bb815Sopenharmony_ci    }
914425bb815Sopenharmony_ci  }
915425bb815Sopenharmony_ci
916425bb815Sopenharmony_ci  return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
917425bb815Sopenharmony_ci} /* lit_compare_utf8_strings_relational */
918