1425bb815Sopenharmony_ci/* Copyright JS Foundation and other contributors, http://js.foundation
2425bb815Sopenharmony_ci *
3425bb815Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
4425bb815Sopenharmony_ci * you may not use this file except in compliance with the License.
5425bb815Sopenharmony_ci * You may obtain a copy of the License at
6425bb815Sopenharmony_ci *
7425bb815Sopenharmony_ci *     http://www.apache.org/licenses/LICENSE-2.0
8425bb815Sopenharmony_ci *
9425bb815Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software
10425bb815Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS
11425bb815Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12425bb815Sopenharmony_ci * See the License for the specific language governing permissions and
13425bb815Sopenharmony_ci * limitations under the License.
14425bb815Sopenharmony_ci */
15425bb815Sopenharmony_ci
16425bb815Sopenharmony_ci#ifndef LIT_STRINGS_H
17425bb815Sopenharmony_ci#define LIT_STRINGS_H
18425bb815Sopenharmony_ci
19425bb815Sopenharmony_ci#include "jrt.h"
20425bb815Sopenharmony_ci#include "lit-globals.h"
21425bb815Sopenharmony_ci
22425bb815Sopenharmony_ci/**
23425bb815Sopenharmony_ci * Null character (used in a few cases as a UTF-8 string end marker)
24425bb815Sopenharmony_ci */
25425bb815Sopenharmony_ci#define LIT_BYTE_NULL (0)
26425bb815Sopenharmony_ci
27425bb815Sopenharmony_ci/**
28425bb815Sopenharmony_ci * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
29425bb815Sopenharmony_ci * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
 *
 * The two macros below are the lowest (0x0) and the highest (0x10FFFF) code point values.
30425bb815Sopenharmony_ci */
31425bb815Sopenharmony_ci#define LIT_UNICODE_CODE_POINT_NULL (0x0)
32425bb815Sopenharmony_ci#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
33425bb815Sopenharmony_ci
/**
 * UTF-16 constants.
 *
 * LIT_UTF16_CODE_UNIT_MAX (0xFFFF) is the largest value that fits in one 16-bit code unit and
 * LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) is the first code point above that value.
 *
 * High surrogates span 0xD800..0xDBFF and low surrogates span 0xDC00..0xDFFF; the two MARKER
 * values are equal to the minimum of the respective range.
 *
 * LIT_UTF16_BITS_IN_SURROGATE (10) and LIT_UTF16_LAST_10_BITS_MASK (0x3FF, the ten low bits set)
 * describe the 10-bit payload carried by each surrogate.
 */
34425bb815Sopenharmony_ci#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
35425bb815Sopenharmony_ci#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)
36425bb815Sopenharmony_ci#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
37425bb815Sopenharmony_ci#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
38425bb815Sopenharmony_ci#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
39425bb815Sopenharmony_ci#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
40425bb815Sopenharmony_ci#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
41425bb815Sopenharmony_ci#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
42425bb815Sopenharmony_ci#define LIT_UTF16_BITS_IN_SURROGATE (10)
43425bb815Sopenharmony_ci#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)
44425bb815Sopenharmony_ci
/**
 * UTF-8 byte markers and the masks that cover the marker bits.
 *
 *   kind of byte            | bit pattern | MARKER | MASK
 *   first byte, 1-byte seq. | 0xxxxxxx    | 0x00   | 0x80
 *   first byte, 2-byte seq. | 110xxxxx    | 0xC0   | 0xE0
 *   first byte, 3-byte seq. | 1110xxxx    | 0xE0   | 0xF0
 *   first byte, 4-byte seq. | 11110xxx    | 0xF0   | 0xF8
 *   extra byte              | 10xxxxxx    | 0x80   | 0xC0
 *
 * NOTE(review): presumably a byte is classified with (byte & MASK) == MARKER; the users of
 * these macros are outside this header - confirm there.
 */
45425bb815Sopenharmony_ci#define LIT_UTF8_1_BYTE_MARKER (0x00)
46425bb815Sopenharmony_ci#define LIT_UTF8_2_BYTE_MARKER (0xC0)
47425bb815Sopenharmony_ci#define LIT_UTF8_3_BYTE_MARKER (0xE0)
48425bb815Sopenharmony_ci#define LIT_UTF8_4_BYTE_MARKER (0xF0)
49425bb815Sopenharmony_ci#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
50425bb815Sopenharmony_ci
51425bb815Sopenharmony_ci#define LIT_UTF8_1_BYTE_MASK (0x80)
52425bb815Sopenharmony_ci#define LIT_UTF8_2_BYTE_MASK (0xE0)
53425bb815Sopenharmony_ci#define LIT_UTF8_3_BYTE_MASK (0xF0)
54425bb815Sopenharmony_ci#define LIT_UTF8_4_BYTE_MASK (0xF8)
55425bb815Sopenharmony_ci#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
56425bb815Sopenharmony_ci
/**
 * Masks with the N least significant bits set (N = 7 down to 1).
 */
57425bb815Sopenharmony_ci#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
58425bb815Sopenharmony_ci#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
59425bb815Sopenharmony_ci#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
60425bb815Sopenharmony_ci#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
61425bb815Sopenharmony_ci#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
62425bb815Sopenharmony_ci#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
63425bb815Sopenharmony_ci#define LIT_UTF8_LAST_1_BIT_MASK  (0x01)
64425bb815Sopenharmony_ci
/**
 * Number of bits of an extra byte that are not covered by LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
 */
65425bb815Sopenharmony_ci#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)
66425bb815Sopenharmony_ci
/**
 * Code point limits for each UTF-8 sequence length:
 *   1 byte:  up to 0x7F
 *   2 bytes: 0x80 .. 0x7FF
 *   3 bytes: 0x800 .. LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
 *   4 bytes: 0x10000 .. LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
 */
67425bb815Sopenharmony_ci#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
68425bb815Sopenharmony_ci#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
69425bb815Sopenharmony_ci#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
70425bb815Sopenharmony_ci#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
71425bb815Sopenharmony_ci#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
72425bb815Sopenharmony_ci#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
73425bb815Sopenharmony_ci#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
74425bb815Sopenharmony_ci
75425bb815Sopenharmony_ci/**
76425bb815Sopenharmony_ci * Difference between the byte counts needed to represent a code point greater than 0xFFFF
77425bb815Sopenharmony_ci * in CESU-8 (6 bytes required) and in common UTF-8 (4 bytes required)
 *
 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT and LIT_UTF8_MAX_BYTES_IN_CODE_POINT are not defined in this
 * header. NOTE(review): presumably they come from lit-globals.h - confirm.
78425bb815Sopenharmony_ci */
79425bb815Sopenharmony_ci#define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
80425bb815Sopenharmony_ci
81425bb815Sopenharmony_ci/**
82425bb815Sopenharmony_ci * Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings
 *
 * Every byte >= 0xF8 has its five most significant bits set (11111xxx), so (byte & MASK) differs
 * from MARKER for each of the 1- to 4-byte first-byte MARKER / MASK pairs defined in this header.
83425bb815Sopenharmony_ci */
84425bb815Sopenharmony_ci#define LIT_UTF8_FIRST_BYTE_MAX (0xF8)
85425bb815Sopenharmony_ci
86425bb815Sopenharmony_ci/* validation */
/*
 * Both functions take a byte buffer together with its size and return a bool.
 * NOTE(review): going by the names, the result tells whether the buffer holds a valid UTF-8 or
 * CESU-8 string; the exact rules are in the implementation, which is not part of this header.
 */
87425bb815Sopenharmony_cibool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size);
88425bb815Sopenharmony_cibool lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t buf_size);
89425bb815Sopenharmony_ci
90425bb815Sopenharmony_ci/* checks */
/*
 * Predicates on a single code point.
 * NOTE(review): presumably they test membership in the LIT_UTF16_LOW_SURROGATE_MIN..MAX and
 * LIT_UTF16_HIGH_SURROGATE_MIN..MAX ranges defined in this header - confirm in the implementation.
 */
91425bb815Sopenharmony_cibool lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point);
92425bb815Sopenharmony_cibool lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point);
93425bb815Sopenharmony_ci
94425bb815Sopenharmony_ci/* size */
/*
 * The functions in this group return lit_utf8_size_t.
 * lit_zt_utf8_string_size receives only a pointer, without a size argument.
 * NOTE(review): "zt" presumably means zero-terminated, i.e. the string is expected to end with
 * LIT_BYTE_NULL - confirm in the implementation.
 */
95425bb815Sopenharmony_cilit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p);
96425bb815Sopenharmony_cilit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size);
97425bb815Sopenharmony_ci
98425bb815Sopenharmony_ci/* length */
/*
 * The functions in this group return ecma_length_t instead of lit_utf8_size_t.
 * NOTE(review): presumably "size" counts bytes while "length" counts characters or code units -
 * confirm in the implementation.
 */
99425bb815Sopenharmony_ciecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size);
100425bb815Sopenharmony_ciecma_length_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size);
101425bb815Sopenharmony_ci
102425bb815Sopenharmony_ci/* hash */
/*
 * Both functions take a byte buffer with its size and return a lit_string_hash_t;
 * lit_utf8_string_hash_combine additionally receives an existing hash value (hash_basis).
 * NOTE(review): presumably hash_basis is the hash of preceding data that the buffer is appended
 * to - confirm in the implementation.
 */
103425bb815Sopenharmony_cilit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size);
104425bb815Sopenharmony_cilit_string_hash_t lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, const lit_utf8_byte_t *utf8_buf_p,
105425bb815Sopenharmony_ci                                                lit_utf8_size_t utf8_buf_size);
106425bb815Sopenharmony_ci
107425bb815Sopenharmony_ci/* code unit access */
/*
 * lit_utf8_string_code_unit_at takes a buffer, its size and a code unit offset and returns an
 * ecma_char_t; lit_get_unicode_char_size_by_utf8_first_byte maps one byte to a lit_utf8_size_t.
 * NOTE(review): the behaviour for an offset past the end of the string, or for a byte that is
 * not a valid first byte, cannot be told from the prototypes - check the implementation.
 */
108425bb815Sopenharmony_ciecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size,
109425bb815Sopenharmony_ci                                          ecma_length_t code_unit_offset);
110425bb815Sopenharmony_cilit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte);
111425bb815Sopenharmony_ci
112425bb815Sopenharmony_ci/* conversion */
/*
 * lit_code_unit_to_utf8, lit_code_point_to_utf8 and lit_code_point_to_cesu8 receive a non-const
 * byte pointer but no buffer size; lit_convert_cesu8_string_to_utf8_string receives sizes for
 * both its input and its output buffer.
 * NOTE(review): presumably the caller must provide enough room for the longest encoding and the
 * lit_utf8_size_t result is the number of bytes written - confirm in the implementation.
 */
113425bb815Sopenharmony_cilit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t code_unit, lit_utf8_byte_t *buf_p);
114425bb815Sopenharmony_cilit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
115425bb815Sopenharmony_cilit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
116425bb815Sopenharmony_cilit_utf8_size_t lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string,
117425bb815Sopenharmony_ci                                                         lit_utf8_size_t cesu8_size,
118425bb815Sopenharmony_ci                                                         lit_utf8_byte_t *utf8_string,
119425bb815Sopenharmony_ci                                                         lit_utf8_size_t utf8_size);
120425bb815Sopenharmony_cilit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, ecma_char_t low_surrogate);
121425bb815Sopenharmony_ci
/*
 * Takes two strings, each given as a byte pointer plus size, and returns a bool.
 * NOTE(review): which relation (e.g. "less than") the result expresses is not visible here.
 */
122425bb815Sopenharmony_cibool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t string1_size,
123425bb815Sopenharmony_ci                                          const lit_utf8_byte_t *string2_p, lit_utf8_size_t string2_size);
124425bb815Sopenharmony_ci
/*
 * Takes a code point and a non-const ecma_char_t pointer and returns a uint8_t.
 * NOTE(review): presumably the result is the number of code units stored through cu_p - confirm.
 */
125425bb815Sopenharmony_ciuint8_t lit_utf16_encode_code_point (lit_code_point_t cp, ecma_char_t *cu_p);
126425bb815Sopenharmony_ci
127425bb815Sopenharmony_ci/* read code point from buffer */
/*
 * The three lit_read_* functions return a lit_utf8_size_t and deliver the decoded value through
 * their last (pointer) argument. Only lit_read_code_point_from_utf8 receives a buffer size.
 * NOTE(review): the out-parameter of lit_read_code_unit_from_utf8 and
 * lit_read_prev_code_unit_from_utf8 is named code_point although its type is ecma_char_t *, the
 * type that lit_code_unit_to_utf8 uses for a code unit; consider renaming it together with the
 * definitions.
 */
128425bb815Sopenharmony_cilit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, lit_utf8_size_t buf_size,
129425bb815Sopenharmony_ci                                               lit_code_point_t *code_point);
130425bb815Sopenharmony_ci
131425bb815Sopenharmony_cilit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
132425bb815Sopenharmony_ci                                              ecma_char_t *code_point);
133425bb815Sopenharmony_ci
134425bb815Sopenharmony_cilit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
135425bb815Sopenharmony_ci                                                   ecma_char_t *code_point);
136425bb815Sopenharmony_ci
/*
 * lit_cesu8_read_next, lit_cesu8_read_prev, lit_utf8_incr and lit_utf8_decr take a pointer to the
 * caller's buffer pointer, while the two lit_cesu8_peek_* functions take the buffer pointer by value.
 * NOTE(review): presumably the read / incr / decr functions move the caller's pointer and the
 * peek functions leave it untouched; none of them receives a buffer size, so bounds are
 * presumably the caller's responsibility - confirm in the implementation.
 */
137425bb815Sopenharmony_ciecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p);
138425bb815Sopenharmony_ciecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p);
139425bb815Sopenharmony_ciecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p);
140425bb815Sopenharmony_ciecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p);
141425bb815Sopenharmony_civoid lit_utf8_incr (const lit_utf8_byte_t **buf_p);
142425bb815Sopenharmony_civoid lit_utf8_decr (const lit_utf8_byte_t **buf_p);
143425bb815Sopenharmony_ci
144425bb815Sopenharmony_ci#endif /* !LIT_STRINGS_H */
145