1/* Copyright JS Foundation and other contributors, http://js.foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "lit-strings.h"
17
18#include "jrt-libc-includes.h"
19
20/**
21 * Validate utf-8 string
22 *
23 * NOTE:
24 *   Isolated surrogates are allowed.
25 *   Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
26 *
27 * @return true if utf-8 string is well-formed
28 *         false otherwise
29 */
30bool
31lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
32                          lit_utf8_size_t buf_size) /**< string size */
33{
34  lit_utf8_size_t idx = 0;
35
36  bool is_prev_code_point_high_surrogate = false;
37  while (idx < buf_size)
38  {
39    lit_utf8_byte_t c = utf8_buf_p[idx++];
40    if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
41    {
42      is_prev_code_point_high_surrogate = false;
43      continue;
44    }
45
46    lit_code_point_t code_point = 0;
47    lit_code_point_t min_code_point = 0;
48    lit_utf8_size_t extra_bytes_count;
49    if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50    {
51      extra_bytes_count = 1;
52      min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53      code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54    }
55    else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56    {
57      extra_bytes_count = 2;
58      min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59      code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60    }
61    else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62    {
63      extra_bytes_count = 3;
64      min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65      code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66    }
67    else
68    {
69      /* utf-8 string could not contain 5- and 6-byte sequences. */
70      return false;
71    }
72
73    if (idx + extra_bytes_count > buf_size)
74    {
75      /* utf-8 string breaks in the middle */
76      return false;
77    }
78
79    for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
80    {
81      c = utf8_buf_p[idx + offset];
82      if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83      {
84        /* invalid continuation byte */
85        return false;
86      }
87      code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88      code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
89    }
90
91    if (code_point < min_code_point
92        || code_point > LIT_UNICODE_CODE_POINT_MAX)
93    {
94      /* utf-8 string doesn't encode valid unicode code point */
95      return false;
96    }
97
98    if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
99        && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
100    {
101      is_prev_code_point_high_surrogate = true;
102    }
103    else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
104             && code_point <= LIT_UTF16_LOW_SURROGATE_MAX
105             && is_prev_code_point_high_surrogate)
106    {
107      /* sequence of high and low surrogate is not allowed */
108      return false;
109    }
110    else
111    {
112      is_prev_code_point_high_surrogate = false;
113    }
114
115    idx += extra_bytes_count;
116  }
117
118  return true;
119} /* lit_is_valid_utf8_string */
120
121/**
122 * Validate cesu-8 string
123 *
124 * @return true if cesu-8 string is well-formed
125 *         false otherwise
126 */
127bool
128lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
129                           lit_utf8_size_t buf_size) /**< string size */
130{
131  lit_utf8_size_t idx = 0;
132
133  while (idx < buf_size)
134  {
135    lit_utf8_byte_t c = cesu8_buf_p[idx++];
136    if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
137    {
138      continue;
139    }
140
141    lit_code_point_t code_point = 0;
142    lit_code_point_t min_code_point = 0;
143    lit_utf8_size_t extra_bytes_count;
144    if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
145    {
146      extra_bytes_count = 1;
147      min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
148      code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
149    }
150    else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
151    {
152      extra_bytes_count = 2;
153      min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
154      code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
155    }
156    else
157    {
158      return false;
159    }
160
161    if (idx + extra_bytes_count > buf_size)
162    {
163      /* cesu-8 string breaks in the middle */
164      return false;
165    }
166
167    for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
168    {
169      c = cesu8_buf_p[idx + offset];
170      if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
171      {
172        /* invalid continuation byte */
173        return false;
174      }
175      code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
176      code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
177    }
178
179    if (code_point < min_code_point)
180    {
181      /* cesu-8 string doesn't encode valid unicode code point */
182      return false;
183    }
184
185    idx += extra_bytes_count;
186  }
187
188  return true;
189} /* lit_is_valid_cesu8_string */
190
191/**
192 * Check if the code point is UTF-16 low surrogate
193 *
194 * @return true / false
195 */
196bool
197lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */
198{
199  return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
200} /* lit_is_code_point_utf16_low_surrogate */
201
202/**
203 * Check if the code point is UTF-16 high surrogate
204 *
205 * @return true / false
206 */
207bool
208lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */
209{
210  return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
211} /* lit_is_code_point_utf16_high_surrogate */
212
213/**
214 * Represents code point (>0xFFFF) as surrogate pair and returns its lower part
215 *
216 * @return lower code_unit of the surrogate pair
217 */
218static ecma_char_t
219convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
220{
221  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
222
223  ecma_char_t code_unit_bits;
224  code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
225
226  return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits);
227} /* convert_code_point_to_low_surrogate */
228
229/**
230 * Represents code point (>0xFFFF) as surrogate pair and returns its higher part
231 *
232 * @return higher code_unit of the surrogate pair
233 */
234static ecma_char_t
235convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
236{
237  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
238  JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
239
240  ecma_char_t code_unit_bits;
241  code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
242
243  return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
244} /* convert_code_point_to_high_surrogate */
245
246/**
247 * UTF16 Encoding method for a code point
248 *
249 * See also:
250 *          ECMA-262 v6, 10.1.1
251 *
252 * @return uint8_t, the number of returning code points
253 */
254uint8_t
255lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */
256                             ecma_char_t *cu_p) /**< result of the encoding */
257{
258  if (cp <= LIT_UTF16_CODE_UNIT_MAX)
259  {
260    cu_p[0] = (ecma_char_t) cp;
261    return 1;
262  }
263
264  cu_p[0] = convert_code_point_to_high_surrogate (cp);
265  cu_p[1] = convert_code_point_to_low_surrogate (cp);
266  return 2;
267} /* lit_utf16_encode_code_point */
268
269/**
270 * Calculate size of a zero-terminated utf-8 string
271 *
272 * NOTE:
273 *   - string cannot be NULL
274 *   - string should not contain zero characters in the middle
275 *
276 * @return size of a string
277 */
278lit_utf8_size_t
279lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
280{
281  JERRY_ASSERT (utf8_str_p != NULL);
282  return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
283} /* lit_zt_utf8_string_size */
284
285/**
286 * Calculate length of a cesu-8 encoded string
287 *
288 * @return UTF-16 code units count
289 */
290ecma_length_t
291lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
292                        lit_utf8_size_t utf8_buf_size) /**< string size */
293{
294  ecma_length_t length = 0;
295  lit_utf8_size_t size = 0;
296
297  while (size < utf8_buf_size)
298  {
299    size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
300    length++;
301  }
302
303  JERRY_ASSERT (size == utf8_buf_size);
304
305  return length;
306} /* lit_utf8_string_length */
307
308/**
309 * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
310 *
311 * @return size of an utf-8 encoded string
312 */
313lit_utf8_size_t
314lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
315                                   lit_utf8_size_t cesu8_buf_size) /**< string size */
316{
317  lit_utf8_size_t offset = 0;
318  lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
319  ecma_char_t prev_ch = 0;
320
321  while (offset < cesu8_buf_size)
322  {
323    ecma_char_t ch;
324    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
325
326    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
327    {
328      utf8_buf_size -= 2;
329    }
330
331    prev_ch = ch;
332  }
333
334  JERRY_ASSERT (offset == cesu8_buf_size);
335
336  return utf8_buf_size;
337} /* lit_get_utf8_size_of_cesu8_string */
338
339/**
340 * Calculate length of an utf-8 encoded string from cesu-8 encoded string
341 *
342 * @return length of an utf-8 encoded string
343 */
344ecma_length_t
345lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
346                                     lit_utf8_size_t cesu8_buf_size) /**< string size */
347{
348  lit_utf8_size_t offset = 0;
349  ecma_length_t utf8_length = 0;
350  ecma_char_t prev_ch = 0;
351
352  while (offset < cesu8_buf_size)
353  {
354    ecma_char_t ch;
355    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
356
357    if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch))
358    {
359      utf8_length++;
360    }
361
362    prev_ch = ch;
363  }
364
365  JERRY_ASSERT (offset == cesu8_buf_size);
366
367  return utf8_length;
368} /* lit_get_utf8_length_of_cesu8_string */
369
370/**
371 * Decodes a unicode code point from non-empty utf-8-encoded buffer
372 *
373 * @return number of bytes occupied by code point in the string
374 */
375lit_utf8_size_t
376lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
377                               lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
378                               lit_code_point_t *code_point) /**< [out] code point */
379{
380  JERRY_ASSERT (buf_p && buf_size);
381
382  lit_utf8_byte_t c = buf_p[0];
383  if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
384  {
385    *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
386    return 1;
387  }
388
389  lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
390  ecma_length_t bytes_count = 0;
391  if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
392  {
393    bytes_count = 2;
394    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
395  }
396  else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
397  {
398    bytes_count = 3;
399    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
400  }
401  else
402  {
403    JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
404    bytes_count = 4;
405    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
406  }
407
408  JERRY_ASSERT (buf_size >= bytes_count);
409
410  for (uint32_t i = 1; i < bytes_count; ++i)
411  {
412    ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
413    ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
414  }
415
416  *code_point = ret;
417  return bytes_count;
418} /* lit_read_code_point_from_utf8 */
419
420/**
421 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
422 *
423 * @return number of bytes occupied by code point in the string
424 */
425lit_utf8_size_t
426lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
427                              ecma_char_t *code_point) /**< [out] code point */
428{
429  JERRY_ASSERT (buf_p);
430
431  lit_utf8_byte_t c = buf_p[0];
432  if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
433  {
434    *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
435    return 1;
436  }
437
438  lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
439  ecma_length_t bytes_count;
440  if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
441  {
442    bytes_count = 2;
443    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
444  }
445  else
446  {
447    JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
448    bytes_count = 3;
449    ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
450  }
451
452  for (uint32_t i = 1; i < bytes_count; ++i)
453  {
454    ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
455    ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
456  }
457
458  JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
459  *code_point = (ecma_char_t) ret;
460  return bytes_count;
461} /* lit_read_code_unit_from_utf8 */
462
463/**
464 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
465 *
466 * @return number of bytes occupied by code point in the string
467 */
468lit_utf8_size_t
469lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
470                                   ecma_char_t *code_point) /**< [out] code point */
471{
472  JERRY_ASSERT (buf_p);
473
474  lit_utf8_decr (&buf_p);
475  return lit_read_code_unit_from_utf8 (buf_p, code_point);
476} /* lit_read_prev_code_unit_from_utf8 */
477
478/**
479 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
480 *
481 * @return next code unit
482 */
483ecma_char_t
484lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
485{
486  JERRY_ASSERT (*buf_p);
487  ecma_char_t ch;
488
489  *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);
490
491  return ch;
492} /* lit_cesu8_read_next */
493
494/**
495 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
496 *
497 * @return previous code unit
498 */
499ecma_char_t
500lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
501{
502  JERRY_ASSERT (*buf_p);
503  ecma_char_t ch;
504
505  lit_utf8_decr (buf_p);
506  lit_read_code_unit_from_utf8 (*buf_p, &ch);
507
508  return ch;
509} /* lit_cesu8_read_prev */
510
511/**
512 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
513 *
514 * @return next code unit
515 */
516ecma_char_t JERRY_ATTR_NOINLINE
517lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
518{
519  JERRY_ASSERT (buf_p != NULL);
520  ecma_char_t ch;
521
522  lit_read_code_unit_from_utf8 (buf_p, &ch);
523
524  return ch;
525} /* lit_cesu8_peek_next */
526
527/**
528 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
529 *
530 * @return previous code unit
531 */
532ecma_char_t JERRY_ATTR_NOINLINE
533lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
534{
535  JERRY_ASSERT (buf_p != NULL);
536  ecma_char_t ch;
537
538  lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
539
540  return ch;
541} /* lit_cesu8_peek_prev */
542
543/**
544 * Increase cesu-8 encoded string pointer by one code unit.
545 */
546inline void JERRY_ATTR_ALWAYS_INLINE
547lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
548{
549  JERRY_ASSERT (*buf_p);
550
551  *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
552} /* lit_utf8_incr */
553
554/**
555 * Decrease cesu-8 encoded string pointer by one code unit.
556 */
557void
558lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
559{
560  JERRY_ASSERT (*buf_p);
561  const lit_utf8_byte_t *current_p = *buf_p;
562
563  do
564  {
565    current_p--;
566  }
567  while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
568
569  *buf_p = current_p;
570} /* lit_utf8_decr */
571
572/**
573 * Calc hash using the specified hash_basis.
574 *
575 * NOTE:
576 *   This is implementation of FNV-1a hash function, which is released into public domain.
577 *   Constants used, are carefully picked primes by the authors.
578 *   More info: http://www.isthe.com/chongo/tech/comp/fnv/
579 *
580 * @return ecma-string's hash
581 */
582inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
583lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */
584                              const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
585                              lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
586{
587  JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
588
589  uint32_t hash = hash_basis;
590
591  for (uint32_t i = 0; i < utf8_buf_size; i++)
592  {
593    /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */
594    hash = (hash ^ utf8_buf_p[i]) * 16777619;
595  }
596
597  return (lit_string_hash_t) hash;
598} /* lit_utf8_string_hash_combine */
599
600/**
601 * Calculate hash from the buffer.
602 *
603 * @return ecma-string's hash
604 */
605inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
606lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
607                           lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
608{
609  JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
610
611  /* 32 bit offset_basis for FNV = 2166136261 */
612  return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size);
613} /* lit_utf8_string_calc_hash */
614
615/**
616 * Return code unit at the specified position in string
617 *
618 * NOTE:
619 *   code_unit_offset should be less then string's length
620 *
621 * @return code unit value
622 */
623ecma_char_t
624lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
625                              lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
626                              ecma_length_t code_unit_offset) /**< ofset of a code_unit */
627{
628  lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
629  ecma_char_t code_unit;
630
631  do
632  {
633    JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
634    current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
635  }
636  while (code_unit_offset--);
637
638  return code_unit;
639} /* lit_utf8_string_code_unit_at */
640
641/**
642 * Get CESU-8 encoded size of character
643 *
644 * @return number of bytes occupied in CESU-8
645 */
646inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
647lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
648{
649  if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
650  {
651    return 1;
652  }
653  else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
654  {
655    return 2;
656  }
657  else
658  {
659    JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
660    return 3;
661  }
662} /* lit_get_unicode_char_size_by_utf8_first_byte */
663
664/**
665 * Convert code unit to cesu-8 representation
666 *
667 * @return byte count required to represent the code unit
668 */
669lit_utf8_size_t
670lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
671                       lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size
672                                                *   should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
673{
674  if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
675  {
676    buf_p[0] = (lit_utf8_byte_t) code_unit;
677    return 1;
678  }
679  else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
680  {
681    uint32_t code_unit_bits = code_unit;
682    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
683    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
684
685    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
686    JERRY_ASSERT (first_byte_bits == code_unit_bits);
687
688    buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
689    buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
690    return 2;
691  }
692  else
693  {
694    uint32_t code_unit_bits = code_unit;
695    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
696    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
697
698    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
699    code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
700
701    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
702    JERRY_ASSERT (first_byte_bits == code_unit_bits);
703
704    buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
705    buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
706    buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
707    return 3;
708  }
709} /* lit_code_unit_to_utf8 */
710
711/**
712 * Convert code point to cesu-8 representation
713 *
714 * @return byte count required to represent the code point
715 */
716lit_utf8_size_t
717lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
718                         lit_utf8_byte_t *buf) /**< buffer where to store the result,
719                                                *   its size should be at least 6 bytes */
720{
721  if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
722  {
723    return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
724  }
725  else
726  {
727    lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
728    offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
729    return offset;
730  }
731} /* lit_code_point_to_cesu8 */
732
733/**
734 * Convert code point to utf-8 representation
735 *
736 * @return byte count required to represent the code point
737 */
738lit_utf8_size_t
739lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
740                        lit_utf8_byte_t *buf) /**< buffer where to store the result,
741                                              *   its size should be at least 4 bytes */
742{
743  if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
744  {
745    buf[0] = (lit_utf8_byte_t) code_point;
746    return 1;
747  }
748  else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
749  {
750    uint32_t code_point_bits = code_point;
751    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
752    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
753
754    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
755    JERRY_ASSERT (first_byte_bits == code_point_bits);
756
757    buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
758    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
759    return 2;
760  }
761  else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
762  {
763    uint32_t code_point_bits = code_point;
764    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
765    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
766
767    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
768    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
769
770    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
771    JERRY_ASSERT (first_byte_bits == code_point_bits);
772
773    buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
774    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
775    buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
776    return 3;
777  }
778  else
779  {
780    JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
781
782    uint32_t code_point_bits = code_point;
783    lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
784    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
785
786    lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
787    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
788
789    lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
790    code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
791
792    lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
793    JERRY_ASSERT (first_byte_bits == code_point_bits);
794
795    buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits;
796    buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
797    buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
798    buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
799    return 4;
800  }
801} /* lit_code_point_to_utf8 */
802
803/**
804 * Convert cesu-8 string to an utf-8 string and put it into the buffer.
805 * It is the caller's responsibility to make sure that the string fits in the buffer.
806 *
807 * @return number of bytes copied to the buffer.
808 */
809lit_utf8_size_t
810lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */
811                                         lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
812                                         lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer
813                                                                        * (can be NULL if buffer_size == 0) */
814                                         lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
815{
816  const lit_utf8_byte_t *cesu8_pos = cesu8_string;
817  const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
818
819  lit_utf8_byte_t *utf8_pos = utf8_string;
820  lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
821
822  lit_utf8_size_t size = 0;
823
824  ecma_char_t prev_ch = 0;
825  lit_utf8_size_t prev_ch_size = 0;
826
827  while (cesu8_pos < cesu8_end_pos)
828  {
829    ecma_char_t ch;
830    lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
831
832    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
833    {
834      JERRY_ASSERT (code_unit_size == prev_ch_size);
835      utf8_pos -= prev_ch_size;
836      lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
837      lit_code_point_to_utf8 (code_point, utf8_pos);
838      size++;
839    }
840    else
841    {
842      memcpy (utf8_pos, cesu8_pos, code_unit_size);
843      size += code_unit_size;
844    }
845
846    utf8_pos = utf8_string + size;
847    cesu8_pos += code_unit_size;
848    prev_ch = ch;
849    prev_ch_size = code_unit_size;
850  }
851
852  JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
853  JERRY_ASSERT (utf8_pos <= utf8_end_pos);
854
855  return size;
856} /* lit_convert_cesu8_string_to_utf8_string */
857
858/**
859 * Convert surrogate pair to code point
860 *
861 * @return code point
862 */
863lit_code_point_t
864lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
865                                          ecma_char_t low_surrogate) /**< low surrogate code point */
866{
867  JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
868  JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
869
870  lit_code_point_t code_point;
871  code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
872  code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
873
874  code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
875
876  code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
877  return code_point;
878} /* lit_convert_surrogate_pair_to_code_point */
879
880/**
881 * Relational compare of cesu-8 strings
882 *
883 * First string is less than second string if:
884 *  - strings are not equal;
885 *  - first string is prefix of second or is lexicographically less than second.
886 *
887 * @return true - if first string is less than second string,
888 *         false - otherwise
889 */
890bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
891                                          lit_utf8_size_t string1_size, /**< string size */
892                                          const lit_utf8_byte_t *string2_p, /**< utf-8 string */
893                                          lit_utf8_size_t string2_size) /**< string size */
894{
895  lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
896  lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
897  const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
898  const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
899
900  while (string1_pos < string1_end_p && string2_pos < string2_end_p)
901  {
902    ecma_char_t ch1, ch2;
903    string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
904    string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
905
906    if (ch1 < ch2)
907    {
908      return true;
909    }
910    else if (ch1 > ch2)
911    {
912      return false;
913    }
914  }
915
916  return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
917} /* lit_compare_utf8_strings_relational */
918