1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ECMASCRIPT_STRING_H
17 #define ECMASCRIPT_STRING_H
18 
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 
23 #include "ecmascript/base/utf_helper.h"
24 #include "ecmascript/common.h"
25 #include "ecmascript/ecma_macros.h"
26 #include "ecmascript/js_hclass.h"
27 #include "ecmascript/js_tagged_value.h"
28 #include "ecmascript/mem/barriers.h"
29 #include "ecmascript/mem/space.h"
30 #include "ecmascript/mem/tagged_object.h"
31 #include "ecmascript/platform/ecma_string_hash_helper.h"
32 
33 #include "libpandabase/macros.h"
34 #include "securec.h"
35 #include "unicode/locid.h"
36 
37 namespace panda {
38 namespace test {
39     class EcmaStringEqualsTest;
40 }
41 namespace ecmascript {
42 template<typename T>
43 class JSHandle;
44 class JSPandaFile;
45 class EcmaVM;
46 class LineEcmaString;
47 class ConstantString;
48 class TreeEcmaString;
49 class SlicedString;
50 class FlatStringInfo;
51 
// Length guard: when `length` is greater than or equal to MAX_STRING_LENGTH, invokes
// THROW_RANGE_ERROR_AND_RETURN with `(vm)->GetJSThread()`, the message "Invalid string length"
// and nullptr.
// NOTE(review): presumably this raises a RangeError and makes the enclosing function return
// nullptr; THROW_RANGE_ERROR_AND_RETURN is defined elsewhere, so confirm against its definition.
// NOTE(review): MAX_STRING_LENGTH is referenced unqualified, so the expansion site must be able to
// resolve that name (EcmaString declares a constant of that name).
// NOTE(review): the macro expands to a bare `if { ... }` with no do/while wrapper, and "TRHOW" is a
// misspelling of "THROW" that is part of the macro's name.
52 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
53 #define ECMA_STRING_CHECK_LENGTH_AND_TRHOW(vm, length)                                        \
54     if ((length) >= MAX_STRING_LENGTH) {                                                      \
55         THROW_RANGE_ERROR_AND_RETURN((vm)->GetJSThread(), "Invalid string length", nullptr);  \
56     }
57 
58 class EcmaString : public TaggedObject {
59     /* Mix Hash Code: --   { 0 | [31 bits raw hash code] }     computed through string
60                       \    { 1 | [31 bits integer numbers] }   fastpath for string to number
61     */
62 public:
63     CAST_CHECK(EcmaString, IsString);
64 
65     static constexpr uint32_t IS_INTEGER_MASK = 1U << 31;
66     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
67     static constexpr uint32_t STRING_INTERN_BIT = 0x2;
68     static constexpr size_t MAX_STRING_LENGTH = 0x40000000U; // 30 bits for string length, 2 bits for special meaning
69     static constexpr uint32_t STRING_LENGTH_SHIFT_COUNT = 2U;
70     static constexpr uint32_t MAX_INTEGER_HASH_NUMBER = 0x3B9AC9FF;
71     static constexpr uint32_t MAX_CACHED_INTEGER_SIZE = 9;
72 
73     static constexpr size_t MIX_LENGTH_OFFSET = TaggedObjectSize();
74     // In last bit of mix_length we store if this string is compressed or not.
75     ACCESSORS_PRIMITIVE_FIELD(MixLength, uint32_t, MIX_LENGTH_OFFSET, MIX_HASHCODE_OFFSET)
76     // In last bit of mix_hash we store if this string is small-integer number or not.
77     ACCESSORS_PRIMITIVE_FIELD(MixHashcode, uint32_t, MIX_HASHCODE_OFFSET, SIZE)
78 
79     enum CompressedStatus {
80         STRING_COMPRESSED,
81         STRING_UNCOMPRESSED,
82     };
83 
84     enum IsIntegerStatus {
85         NOT_INTEGER = 0,
86         IS_INTEGER,
87     };
88 
89     enum TrimMode : uint8_t {
90         TRIM,
91         TRIM_START,
92         TRIM_END,
93     };
94 
95     enum ConcatOptStatus {
96         BEGIN_STRING_ADD = 1,
97         IN_STRING_ADD,
98         CONFIRMED_IN_STRING_ADD,
99         END_STRING_ADD,
100         INVALID_STRING_ADD,
101         HAS_BACKING_STORE,
102     };
103 
104 private:
105     friend class EcmaStringAccessor;
106     friend class LineEcmaString;
107     friend class ConstantString;
108     friend class TreeEcmaString;
109     friend class SlicedString;
110     friend class FlatStringInfo;
111     friend class NameDictionary;
112     friend class panda::test::EcmaStringEqualsTest;
113 
114     static EcmaString *CreateEmptyString(const EcmaVM *vm);
115     static EcmaString *CreateFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len,
116         bool canBeCompress, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, bool isConstantString = false,
117         uint32_t idOffset = 0);
118     static EcmaString *CreateFromUtf8CompressedSubString(const EcmaVM *vm, const JSHandle<EcmaString> &string,
119         uint32_t offset, uint32_t utf8Len, MemSpaceType type = MemSpaceType::SEMI_SPACE);
120     static EcmaString *CreateUtf16StringFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len,
121         MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);
122     static EcmaString *CreateFromUtf16(const EcmaVM *vm, const uint16_t *utf16Data, uint32_t utf16Len,
123         bool canBeCompress, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);
124     static SlicedString *CreateSlicedString(const EcmaVM *vm, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);
125     static EcmaString *CreateLineString(const EcmaVM *vm, size_t length, bool compressed);
126     static EcmaString *CreateLineStringNoGC(const EcmaVM *vm, size_t length, bool compressed);
127     static EcmaString *CreateLineStringWithSpaceType(const EcmaVM *vm,
128         size_t length, bool compressed, MemSpaceType type);
129     static EcmaString *CreateTreeString(const EcmaVM *vm,
130         const JSHandle<EcmaString> &left, const JSHandle<EcmaString> &right, uint32_t length, bool compressed);
131     static EcmaString *CreateConstantString(const EcmaVM *vm, const uint8_t *utf8Data,
132         size_t length, bool compressed, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, uint32_t idOffset = 0);
133     static EcmaString *Concat(const EcmaVM *vm, const JSHandle<EcmaString> &left,
134         const JSHandle<EcmaString> &right, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);
135     template<typename T1, typename T2>
136     static uint32_t CalculateDataConcatHashCode(const T1 *dataFirst, size_t sizeFirst,
137                                                 const T2 *dataSecond, size_t sizeSecond);
138     static uint32_t CalculateAllConcatHashCode(const JSHandle<EcmaString> &firstString,
139                                                const JSHandle<EcmaString> &secondString);
140     static uint32_t CalculateConcatHashCode(const JSHandle<EcmaString> &firstString,
141                                             const JSHandle<EcmaString> &secondString);
142     static EcmaString *CopyStringToOldSpace(const EcmaVM *vm, const JSHandle<EcmaString> &original,
143         uint32_t length, bool compressed);
144     static EcmaString *FastSubString(const EcmaVM *vm,
145         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length);
146     static EcmaString *GetSlicedString(const EcmaVM *vm,
147         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length);
148     static EcmaString *GetSubString(const EcmaVM *vm,
149         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length);
150     // require src is LineString
151     // not change src data structure
152     static inline EcmaString *FastSubUtf8String(const EcmaVM *vm,
153         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length);
154     // require src is LineString
155     // not change src data structure
156     static inline EcmaString *FastSubUtf16String(const EcmaVM *vm,
157         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length);
158     inline void TrimLineString(const JSThread *thread, uint32_t newLength);
IsUtf8() const159     inline bool IsUtf8() const
160     {
161         return (GetMixLength() & STRING_COMPRESSED_BIT) == STRING_COMPRESSED;
162     }
163 
IsUtf16() const164     inline bool IsUtf16() const
165     {
166         return (GetMixLength() & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED;
167     }
168 
IsInteger()169     inline bool IsInteger()
170     {
171         return (GetHashcode() & IS_INTEGER_MASK) == IS_INTEGER_MASK;
172     }
173 
174     // require is LineString
175     inline uint16_t *GetData() const;
176     inline const uint8_t *GetDataUtf8() const;
177     inline const uint16_t *GetDataUtf16() const;
178 
179     // require is LineString
180     inline uint8_t *GetDataUtf8Writable();
181     inline uint16_t *GetDataUtf16Writable();
182 
GetLength() const183     inline uint32_t GetLength() const
184     {
185         return GetMixLength() >> STRING_LENGTH_SHIFT_COUNT;
186     }
187 
SetLength(uint32_t length, bool compressed = false)188     inline void SetLength(uint32_t length, bool compressed = false)
189     {
190         ASSERT(length < MAX_STRING_LENGTH);
191         // Use 0u for compressed/utf8 expression
192         SetMixLength((length << STRING_LENGTH_SHIFT_COUNT) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED));
193     }
194 
195     inline uint32_t GetRawHashcode() const
196     {
197         return GetMixHashcode() & (~IS_INTEGER_MASK);
198     }
199 
200     static inline uint32_t MixHashcode(uint32_t hashcode, bool isInteger)
201     {
202         return isInteger ? (hashcode | IS_INTEGER_MASK) : (hashcode & (~IS_INTEGER_MASK));
203     }
204 
205     inline void SetRawHashcode(uint32_t hashcode, bool isInteger = false)
206     {
207         // Use 0u for not integer string's expression
208         SetMixHashcode(MixHashcode(hashcode, isInteger));
209     }
210 
211     inline size_t GetUtf8Length(bool modify = true, bool isGetBufferSize = false) const;
212 
213     inline void SetIsInternString()
214     {
215         SetMixLength(GetMixLength() | STRING_INTERN_BIT);
216     }
217 
218     inline bool IsInternString() const
219     {
220         return (GetMixLength() & STRING_INTERN_BIT) != 0;
221     }
222 
223     inline void ClearInternStringFlag()
224     {
225         SetMixLength(GetMixLength() & ~STRING_INTERN_BIT);
226     }
227 
228     inline bool TryGetHashCode(uint32_t *hash)
229     {
230         uint32_t hashcode = GetMixHashcode();
231         if (hashcode == 0 && GetLength() != 0) {
232             return false;
233         }
234         *hash = hashcode;
235         return true;
236     }
237 
238     inline uint32_t GetIntegerCode()
239     {
240         ASSERT(GetMixHashcode() & IS_INTEGER_MASK);
241         return GetRawHashcode();
242     }
243 
244     // not change this data structure.
245     // if string is not flat, this func has low efficiency.
246     uint32_t PUBLIC_API GetHashcode()
247     {
248         uint32_t hashcode = GetMixHashcode();
249         // GetLength() == 0 means it's an empty array.No need to computeHashCode again when hashseed is 0.
250         if (hashcode == 0 && GetLength() != 0) {
251             hashcode = ComputeHashcode();
252             SetMixHashcode(hashcode);
253         }
254         return hashcode;
255     }
256 
257     template<typename T>
258     inline static bool IsDecimalDigitChar(const T c)
259     {
260         return (c >= '0' && c <= '9');
261     }
262 
263     static uint32_t ComputeIntegerHash(uint32_t *num, uint8_t c)
264     {
265         if (!IsDecimalDigitChar(c)) {
266             return false;
267         }
268         int charDate = c - '0';
269         *num = (*num) * 10 + charDate; // 10: decimal factor
270         return true;
271     }
272 
273     bool HashIntegerString(uint32_t length, uint32_t *hash, uint32_t hashSeed) const;
274 
275     template<typename T>
276     static bool HashIntegerString(const T *data, size_t size, uint32_t *hash, uint32_t hashSeed)
277     {
278         ASSERT(size >= 0);
279         if (hashSeed == 0) {
280             if (IsDecimalDigitChar(data[0]) && data[0] != '0') {
281                 uint32_t num = data[0] - '0';
282                 uint32_t i = 1;
283                 do {
284                     if (i == size) {
285                         // compute mix hash
286                         if (num <= MAX_INTEGER_HASH_NUMBER) {
287                             *hash = MixHashcode(num, IS_INTEGER);
288                             return true;
289                         }
290                         return false;
291                     }
292                 } while (ComputeIntegerHash(&num, data[i++]));
293             }
294             if (size == 1 && (data[0] == '0')) {
295                 *hash = MixHashcode(0, IS_INTEGER);
296                 return true;
297             }
298         } else {
299             if (IsDecimalDigitChar(data[0])) {
300                 uint32_t num = hashSeed * 10 + (data[0] - '0'); // 10: decimal factor
301                 uint32_t i = 1;
302                 do {
303                     if (i == size) {
304                         // compute mix hash
305                         if (num <= MAX_INTEGER_HASH_NUMBER) {
306                             *hash = MixHashcode(num, IS_INTEGER);
307                             return true;
308                         }
309                         return false;
310                     }
311                 } while (ComputeIntegerHash(&num, data[i++]));
312             }
313         }
314         return false;
315     }
316 
317     // not change this data structure.
318     // if string is not flat, this func has low efficiency.
319     uint32_t PUBLIC_API ComputeHashcode() const;
320     std::pair<uint32_t, bool> PUBLIC_API ComputeRawHashcode() const;
321     uint32_t PUBLIC_API ComputeHashcode(uint32_t rawHashSeed, bool isInteger) const;
322 
323     static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress);
324     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
325 
326     template<bool verify = true>
327     uint16_t At(int32_t index) const;
328 
329     // require is LineString
330     void WriteData(uint32_t index, uint16_t src);
331 
332     // can change left and right data structure
333     static int32_t Compare(const EcmaVM *vm, const JSHandle<EcmaString> &left, const JSHandle<EcmaString> &right);
334 
335     static bool IsSubStringAt(const EcmaVM *vm, const JSHandle<EcmaString>& left,
336         const JSHandle<EcmaString>& right, uint32_t offset);
337 
338     // Check that two spans are equal. Should have the same length.
339     /* static */
340     template<typename T, typename T1>
341     static bool StringsAreEquals(Span<const T> &str1, Span<const T1> &str2)
342     {
343         ASSERT(str1.Size() <= str2.Size());
344         size_t size = str1.Size();
345         if (!std::is_same_v<T, T1>) {
346             for (size_t i = 0; i < size; i++) {
347                 auto left = static_cast<uint16_t>(str1[i]);
348                 auto right = static_cast<uint16_t>(str2[i]);
349                 if (left != right) {
350                     return false;
351                 }
352             }
353             return true;
354         }
355 
356         return !memcmp(str1.data(), str2.data(), size * sizeof(T));
357     }
358 
359     // Converts utf8Data to utf16 and compare it with given utf16_data.
360     static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
361                                   uint32_t utf16Len);
362     // Compares string1 + string2 by bytes, It doesn't check canonical unicode equivalence.
363     bool EqualToSplicedString(const EcmaString *str1, const EcmaString *str2);
364     // Compares strings by bytes, It doesn't check canonical unicode equivalence.
365     static PUBLIC_API bool StringsAreEqual(const EcmaVM *vm, const JSHandle<EcmaString> &str1,
366         const JSHandle<EcmaString> &str2);
367     // Compares strings by bytes, It doesn't check canonical unicode equivalence.
368     static PUBLIC_API bool StringsAreEqual(EcmaString *str1, EcmaString *str2);
369     // Two strings have the same type of utf encoding format.
370     static bool StringsAreEqualDiffUtfEncoding(EcmaString *str1, EcmaString *str2);
371     static bool StringsAreEqualDiffUtfEncoding(const FlatStringInfo &str1, const FlatStringInfo &str2);
372     // Compares strings by bytes, It doesn't check canonical unicode equivalence.
373     // not change str1 data structure.
374     // if str1 is not flat, this func has low efficiency.
375     static bool StringIsEqualUint8Data(const EcmaString *str1, const uint8_t *dataAddr, uint32_t dataLen,
376                                        bool canBeCompress);
377     // Compares strings by bytes, It doesn't check canonical unicode equivalence.
378     // not change str1 data structure.
379     // if str1 is not flat, this func has low efficiency.
380     static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len);
381 
382     // can change receiver and search data structure
383     static int32_t IndexOf(const EcmaVM *vm,
384         const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0);
385 
386     // can change receiver and search data structure
387     static int32_t LastIndexOf(const EcmaVM *vm,
388         const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0);
389 
390     inline size_t CopyDataUtf8(uint8_t *buf, size_t maxLength, bool modify = true) const
391     {
392         if (maxLength == 0) {
393             return 1; // maxLength was -1 at napi
394         }
395         size_t length = GetLength();
396         if (length > maxLength) {
397             return 0;
398         }
399         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
400         buf[maxLength - 1] = '\0';
401         // Put comparison here so that internal usage and napi can use the same CopyDataRegionUtf8
402         return CopyDataRegionUtf8(buf, 0, length, maxLength, modify) + 1;  // add place for zero in the end
403     }
404 
405     // It allows user to copy into buffer even if maxLength < length
406     inline size_t WriteUtf8(uint8_t *buf, size_t maxLength, bool isWriteBuffer = false) const
407     {
408         if (maxLength == 0) {
409             return 1; // maxLength was -1 at napi
410         }
411         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
412         buf[maxLength - 1] = '\0';
413         return CopyDataRegionUtf8(buf, 0, GetLength(), maxLength, true, isWriteBuffer) + 1;
414     }
415 
416     size_t CopyDataToUtf16(uint16_t *buf, uint32_t length, uint32_t bufLength) const
417     {
418         if (IsUtf16()) {
419             CVector<uint16_t> tmpBuf;
420             const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
421             if (length > bufLength) {
422                 if (memcpy_s(buf, bufLength * sizeof(uint16_t), data, bufLength * sizeof(uint16_t)) != EOK) {
423                     LOG_FULL(FATAL) << "memcpy_s failed when length > bufLength";
424                     UNREACHABLE();
425                 }
426                 return bufLength;
427             }
428             if (memcpy_s(buf, bufLength * sizeof(uint16_t), data, length * sizeof(uint16_t)) != EOK) {
429                 LOG_FULL(FATAL) << "memcpy_s failed";
430                 UNREACHABLE();
431             }
432             return length;
433         }
434         CVector<uint8_t> tmpBuf;
435         const uint8_t *data = EcmaString::GetUtf8DataFlat(this, tmpBuf);
436         if (length > bufLength) {
437             return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, bufLength, bufLength);
438         }
439         return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, bufLength);
440     }
441 
442     // It allows user to copy into buffer even if maxLength < length
443     inline size_t WriteUtf16(uint16_t *buf, uint32_t targetLength, uint32_t bufLength) const
444     {
445         if (bufLength == 0) {
446             return 0;
447         }
448         // Returns a number representing a valid backrest length.
449         return CopyDataToUtf16(buf, targetLength, bufLength);
450     }
451 
452     size_t WriteOneByte(uint8_t *buf, size_t maxLength) const
453     {
454         if (maxLength == 0) {
455             return 0;
456         }
457         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458         buf[maxLength - 1] = '\0';
459         uint32_t length = GetLength();
460         if (!IsUtf16()) {
461             CVector<uint8_t> tmpBuf;
462             const uint8_t *data = GetUtf8DataFlat(this, tmpBuf);
463             if (length > maxLength) {
464                 length = maxLength;
465             }
466             if (memcpy_s(buf, maxLength, data, length) != EOK) {
467                 LOG_FULL(FATAL) << "memcpy_s failed when write one byte";
468                 UNREACHABLE();
469             }
470             return length;
471         }
472 
473         CVector<uint16_t> tmpBuf;
474         const uint16_t *data = GetUtf16DataFlat(this, tmpBuf);
475         if (length > maxLength) {
476             return base::utf_helper::ConvertRegionUtf16ToLatin1(data, buf, maxLength, maxLength);
477         }
478         return base::utf_helper::ConvertRegionUtf16ToLatin1(data, buf, length, maxLength);
479     }
480 
481     size_t CopyDataRegionUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength,
482                               bool modify = true, bool isWriteBuffer = false) const
483     {
484         uint32_t len = GetLength();
485         if (start + length > len) {
486             return 0;
487         }
488         if (!IsUtf16()) {
489             if (length > std::numeric_limits<size_t>::max() / 2 - 1) {  // 2: half
490                 LOG_FULL(FATAL) << " length is higher than half of size_t::max";
491                 UNREACHABLE();
492             }
493             CVector<uint8_t> tmpBuf;
494             const uint8_t *data = GetUtf8DataFlat(this, tmpBuf) + start;
495             // Only copy maxLength number of chars into buffer if length > maxLength
496             auto dataLen = std::min(length, maxLength);
497             std::copy(data, data + dataLen, buf);
498             return dataLen;
499         }
500         CVector<uint16_t> tmpBuf;
501         const uint16_t *data = GetUtf16DataFlat(this, tmpBuf);
502         if (length > maxLength) {
503             return base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf, maxLength, maxLength, start,
504                                                               modify, isWriteBuffer);
505         }
506         return base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf, length, maxLength, start,
507                                                           modify, isWriteBuffer);
508     }
509 
510     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const
511     {
512         uint32_t length = GetLength();
513         if (length > maxLength) {
514             return 0;
515         }
516         if (IsUtf16()) {
517             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518             CVector<uint16_t> tmpBuf;
519             const uint16_t *data = GetUtf16DataFlat(this, tmpBuf);
520             if (memcpy_s(buf, maxLength * sizeof(uint16_t), data, length * sizeof(uint16_t)) != EOK) {
521                 LOG_FULL(FATAL) << "memcpy_s failed";
522                 UNREACHABLE();
523             }
524             return length;
525         }
526         CVector<uint8_t> tmpBuf;
527         const uint8_t *data = GetUtf8DataFlat(this, tmpBuf);
528         return base::utf_helper::ConvertRegionUtf8ToUtf16(data, buf, length, maxLength);
529     }
530 
531     std::u16string ToU16String(uint32_t len = 0);
532 
533     std::unique_ptr<uint8_t[]> ToOneByteDataForced()
534     {
535         uint8_t *buf = nullptr;
536         auto length = GetLength();
537         if (IsUtf16()) {
538             auto size = length * sizeof(uint16_t);
539             buf = new uint8_t[size]();
540             CopyDataUtf16(reinterpret_cast<uint16_t *>(buf), length);
541         } else {
542             buf = new uint8_t[length + 1]();
543             CopyDataUtf8(buf, length + 1);
544         }
545         return std::unique_ptr<uint8_t[]>(buf);
546     }
547 
548     Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf, bool modify = true, bool cesu8 = false)
549     {
550         Span<const uint8_t> str;
551         uint32_t strLen = GetLength();
552         if (UNLIKELY(IsUtf16())) {
553             CVector<uint16_t> tmpBuf;
554             const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
555             ASSERT(base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) > 0);
556             size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify, false, cesu8) - 1;
557             buf.reserve(len);
558             len = base::utf_helper::ConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify, false, cesu8);
559             str = Span<const uint8_t>(buf.data(), len);
560         } else {
561             const uint8_t *data = EcmaString::GetUtf8DataFlat(this, buf);
562             str = Span<const uint8_t>(data, strLen);
563         }
564         return str;
565     }
566 
567     Span<const uint8_t> DebuggerToUtf8Span(CVector<uint8_t> &buf, bool modify = true)
568     {
569         Span<const uint8_t> str;
570         uint32_t strLen = GetLength();
571         if (UNLIKELY(IsUtf16())) {
572             CVector<uint16_t> tmpBuf;
573             const uint16_t *data = EcmaString::GetUtf16DataFlat(this, tmpBuf);
574             size_t len = base::utf_helper::Utf16ToUtf8Size(data, strLen, modify) - 1;
575             buf.reserve(len);
576             len = base::utf_helper::DebuggerConvertRegionUtf16ToUtf8(data, buf.data(), strLen, len, 0, modify);
577             str = Span<const uint8_t>(buf.data(), len);
578         } else {
579             const uint8_t *data = EcmaString::GetUtf8DataFlat(this, buf);
580             str = Span<const uint8_t>(data, strLen);
581         }
582         return str;
583     }
584 
585     inline Span<const uint8_t> FastToUtf8Span() const;
586 
587     bool TryToGetInteger(uint32_t *result)
588     {
589         if (!IsInteger()) {
590             return false;
591         }
592         ASSERT(GetLength() <= MAX_CACHED_INTEGER_SIZE);
593         *result = GetIntegerCode();
594         return true;
595     }
596 
597     // using integer number set into hash
598     inline bool TryToSetIntegerHash(int32_t num)
599     {
600         uint32_t hashcode = GetMixHashcode();
601         if (hashcode == 0 && GetLength() != 0) {
602             SetRawHashcode(static_cast<uint32_t>(num), IS_INTEGER);
603             return true;
604         }
605         return false;
606     }
607 
608     void WriteData(EcmaString *src, uint32_t start, uint32_t destSize, uint32_t length);
609 
610     static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len);
611     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len);
612     static bool CanBeCompressed(const EcmaString *string);
613 
614     bool PUBLIC_API ToElementIndex(uint32_t *index);
615 
616     bool ToInt(int32_t *index, bool *negative);
617 
618     bool ToUInt64FromLoopStart(uint64_t *index, uint32_t loopStart, const uint8_t *data);
619 
620     bool PUBLIC_API ToTypedArrayIndex(uint32_t *index);
621 
622     template<bool isLower>
623     static EcmaString *ConvertCase(const EcmaVM *vm, const JSHandle<EcmaString> &src);
624 
625     template<bool isLower>
626     static EcmaString *LocaleConvertCase(const EcmaVM *vm, const JSHandle<EcmaString> &src, const icu::Locale &locale);
627 
628     template<typename T>
629     static EcmaString *TrimBody(const JSThread *thread, const JSHandle<EcmaString> &src, Span<T> &data, TrimMode mode);
630 
631     static EcmaString *Trim(const JSThread *thread, const JSHandle<EcmaString> &src, TrimMode mode = TrimMode::TRIM);
632 
633     // single char copy for loop
634     template<typename DstType, typename SrcType>
635     static void CopyChars(DstType *dst, SrcType *src, uint32_t count)
636     {
637         Span<SrcType> srcSp(src, count);
638         Span<DstType> dstSp(dst, count);
639         for (uint32_t i = 0; i < count; i++) {
640             dstSp[i] = srcSp[i];
641         }
642     }
643 
644     // memory block copy
645     template<typename T>
646     static bool MemCopyChars(Span<T> &dst, size_t dstMax, Span<const T> &src, size_t count);
647 
648     // To change the hash algorithm of EcmaString, please modify EcmaString::CalculateConcatHashCode
649     // and EcmaStringHashHelper::ComputeHashForDataPlatform simultaneously!!
650     template <typename T>
651     static uint32_t ComputeHashForData(const T *data, size_t size,
652                                        uint32_t hashSeed)
653     {
654         if (size <= static_cast<size_t>(EcmaStringHash::MIN_SIZE_FOR_UNROLLING)) {
655             uint32_t hash = hashSeed;
656             for (uint32_t i = 0; i < size ; i++) {
657                 hash = (hash << static_cast<uint32_t>(EcmaStringHash::HASH_SHIFT)) - hash + data[i];
658             }
659             return hash;
660         }
661         return EcmaStringHashHelper::ComputeHashForDataPlatform(data, size, hashSeed);
662     }
663 
664     static bool IsASCIICharacter(uint16_t data)
665     {
666         if (data == 0) {
667             return false;
668         }
669         // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000']
670         return data <= base::utf_helper::UTF8_1B_MAX;
671     }
672 
673     template<typename T1, typename T2>
674     static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max);
675 
676     template<typename T1, typename T2>
677     static int32_t LastIndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos);
678 
679     bool IsFlat() const;
680 
681     bool IsLineString() const
682     {
683         return GetClass()->IsLineString();
684     }
685     bool IsConstantString() const
686     {
687         return GetClass()->IsConstantString();
688     }
689     bool IsSlicedString() const
690     {
691         return GetClass()->IsSlicedString();
692     }
693     bool IsTreeString() const
694     {
695         return GetClass()->IsTreeString();
696     }
697     bool NotTreeString() const
698     {
699         return !IsTreeString();
700     }
701     bool IsLineOrConstantString() const
702     {
703         auto hclass = GetClass();
704         return hclass->IsLineString() || hclass->IsConstantString();
705     }
706 
707     JSType GetStringType() const
708     {
709         JSType type = GetClass()->GetObjectType();
710         ASSERT(type >= JSType::STRING_FIRST && type <= JSType::STRING_LAST);
711         return type;
712     }
713 
714     template <typename Char>
715     static void WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength);
716 
717     template <typename Char>
718     static void WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos);
719 
720     static const uint8_t *PUBLIC_API GetUtf8DataFlat(const EcmaString *src, CVector<uint8_t> &buf);
721 
722     static const uint8_t *PUBLIC_API GetNonTreeUtf8Data(const EcmaString *src);
723 
724     static const uint16_t *PUBLIC_API GetUtf16DataFlat(const EcmaString *src, CVector<uint16_t> &buf);
725 
726     static const uint16_t *PUBLIC_API GetNonTreeUtf16Data(const EcmaString *src);
727 
    // Precondition: string must not already be flat.
    // Declaration only; type selects the memory space passed through to the implementation.
    static EcmaString *SlowFlatten(const EcmaVM *vm, const JSHandle<EcmaString> &string, MemSpaceType type);

    // Declaration only. Public flatten entry point; type defaults to SHARED_OLD_SPACE.
    PUBLIC_API static EcmaString *Flatten(const EcmaVM *vm, const JSHandle<EcmaString> &string,
                               MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);

    // Declaration only. Like Flatten, but the result is returned as a FlatStringInfo
    // (string pointer, start index and length) instead of a bare pointer.
    static FlatStringInfo FlattenAllString(const EcmaVM *vm, const JSHandle<EcmaString> &string,
                                            MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE);

    // Declaration only. Takes a raw pointer rather than a handle.
    // NOTE(review): the name indicates this path must not trigger GC and is meant for snapshots — confirm.
    static EcmaString *FlattenNoGCForSnapshot(const EcmaVM *vm, EcmaString *string);
738 
    // Case-conversion entry points. All are declarations only; definitions live elsewhere.

    // Lower-case conversion of src.
    static EcmaString *ToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src);

    // Upper-case conversion of src.
    static EcmaString *ToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src);

    // Lower-case conversion of src using the given ICU locale.
    static EcmaString *ToLocaleLower(const EcmaVM *vm, const JSHandle<EcmaString> &src, const icu::Locale &locale);

    // Upper-case conversion of src using the given ICU locale.
    static EcmaString *ToLocaleUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src, const icu::Locale &locale);

    // NOTE(review): the "Try" variants presumably attempt a fast path first — confirm at the definition.
    static EcmaString *TryToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src);

    static EcmaString *TryToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src);

    // Shared helper for UTF-8 strings: toLower selects the direction, startIndex defaults to 0.
    // NOTE(review): startIndex presumably marks where conversion begins — confirm at the definition.
    static EcmaString *ConvertUtf8ToLowerOrUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src,
                                                 bool toLower, uint32_t startIndex = 0);
753 };
754 
755 // The LineEcmaString abstract class captures sequential string values, only LineEcmaString can store chars data
756 class LineEcmaString : public EcmaString {
757 public:
758     static constexpr uint32_t MAX_LENGTH = (1 << 28) - 16;
759     static constexpr uint32_t INIT_LENGTH_TIMES = 4;
760     // DATA_OFFSET: the string data stored after the string header.
761     // Data can be stored in utf8 or utf16 form according to compressed bit.
762     static constexpr size_t DATA_OFFSET = EcmaString::SIZE;  // DATA_OFFSET equal to Empty String size
763 
764     CAST_CHECK(LineEcmaString, IsLineString);
765 
766     DECL_VISIT_ARRAY(DATA_OFFSET, 0, GetPointerLength());
767 
768     static LineEcmaString *Cast(EcmaString *str)
769     {
770         return static_cast<LineEcmaString *>(str);
771     }
772 
773     static LineEcmaString *Cast(const EcmaString *str)
774     {
775         return LineEcmaString::Cast(const_cast<EcmaString *>(str));
776     }
777 
778     static size_t ComputeSizeUtf8(uint32_t utf8Len)
779     {
780         return DATA_OFFSET + utf8Len;
781     }
782 
783     static size_t ComputeSizeUtf16(uint32_t utf16Len)
784     {
785         return DATA_OFFSET + utf16Len * sizeof(uint16_t);
786     }
787 
788     static size_t ObjectSize(EcmaString *str)
789     {
790         uint32_t length = str->GetLength();
791         return str->IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeUtf8(length);
792     }
793 
794     static size_t DataSize(EcmaString *str)
795     {
796         uint32_t length = str->GetLength();
797         return str->IsUtf16() ? length * sizeof(uint16_t) : length;
798     }
799 
800     size_t GetPointerLength()
801     {
802         size_t byteSize = DataSize(this);
803         return AlignUp(byteSize, static_cast<size_t>(MemAlignment::MEM_ALIGN_OBJECT)) / sizeof(JSTaggedType);
804     }
805 
806     uint16_t *GetData() const
807     {
808         return reinterpret_cast<uint16_t *>(ToUintPtr(this) + DATA_OFFSET);
809     }
810 
811     template<bool verify = true>
812     uint16_t Get(int32_t index) const
813     {
814         int32_t length = static_cast<int32_t>(GetLength());
815         if (verify) {
816             if ((index < 0) || (index >= length)) {
817                 return 0;
818             }
819         }
820         if (!IsUtf16()) {
821             Span<const uint8_t> sp(GetDataUtf8(), length);
822             return sp[index];
823         }
824         Span<const uint16_t> sp(GetDataUtf16(), length);
825         return sp[index];
826     }
827 
828     void Set(uint32_t index, uint16_t src)
829     {
830         ASSERT(index < GetLength());
831         if (IsUtf8()) {
832             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
833             *(reinterpret_cast<uint8_t *>(GetData()) + index) = static_cast<uint8_t>(src);
834         } else {
835             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
836             *(GetData() + index) = src;
837         }
838     }
839 };
840 static_assert((LineEcmaString::DATA_OFFSET % static_cast<uint8_t>(MemAlignment::MEM_ALIGN_OBJECT)) == 0);
841 
842 class ConstantString : public EcmaString {
843 public:
844     static constexpr size_t RELOCTAED_DATA_OFFSET = EcmaString::SIZE;
845     // ConstantData is the pointer of const string in the pandafile.
846     // String in pandafile is encoded by the utf8 format.
847     // EntityId is normally the uint32_t index in the pandafile.
848     // When the pandafile is to be removed, EntityId will become -1.
849     // The real string data will be reloacted into bytearray and stored in RelocatedData.
850     // ConstantData will also point at data of bytearray data.
851     ACCESSORS(RelocatedData, RELOCTAED_DATA_OFFSET, ENTITY_ID_OFFSET);
852     ACCESSORS_PRIMITIVE_FIELD(EntityId, int64_t, ENTITY_ID_OFFSET, CONSTANT_DATA_OFFSET);
853     ACCESSORS_NATIVE_FIELD(ConstantData, uint8_t, CONSTANT_DATA_OFFSET, LAST_OFFSET);
854     DEFINE_ALIGN_SIZE(LAST_OFFSET);
855 
856     CAST_CHECK(ConstantString, IsConstantString);
857     DECL_VISIT_OBJECT(RELOCTAED_DATA_OFFSET, ENTITY_ID_OFFSET);
858 
859     static ConstantString *Cast(EcmaString *str)
860     {
861         return static_cast<ConstantString *>(str);
862     }
863 
864     static ConstantString *Cast(const EcmaString *str)
865     {
866         return ConstantString::Cast(const_cast<EcmaString *>(str));
867     }
868 
869     static size_t ObjectSize()
870     {
871         return ConstantString::SIZE;
872     }
873 
874     uint32_t GetEntityIdU32() const
875     {
876         ASSERT(GetEntityId() >= 0);
877         return static_cast<uint32_t>(GetEntityId());
878     }
879 
880     template<bool verify = true>
881     uint16_t Get(int32_t index) const
882     {
883         int32_t length = static_cast<int32_t>(GetLength());
884         if (verify) {
885             if ((index < 0) || (index >= length)) {
886                 return 0;
887             }
888         }
889         ASSERT(IsUtf8());
890         Span<const uint8_t> sp(GetConstantData(), length);
891         return sp[index];
892     }
893 };
894 
895 // The substrings of another string use SlicedString to describe.
896 class SlicedString : public EcmaString {
897 public:
898     static constexpr uint32_t MIN_SLICED_ECMASTRING_LENGTH = 13;
899     static constexpr size_t PARENT_OFFSET = EcmaString::SIZE;
900     ACCESSORS(Parent, PARENT_OFFSET, STARTINDEX_OFFSET);
901     ACCESSORS_PRIMITIVE_FIELD(StartIndex, uint32_t, STARTINDEX_OFFSET, BACKING_STORE_FLAG);
902     ACCESSORS_PRIMITIVE_FIELD(HasBackingStore, uint32_t, BACKING_STORE_FLAG, SIZE);
903 
904     DECL_VISIT_OBJECT(PARENT_OFFSET, STARTINDEX_OFFSET);
905 
906     CAST_CHECK(SlicedString, IsSlicedString);
907 private:
908     friend class EcmaString;
909     static SlicedString *Cast(EcmaString *str)
910     {
911         return static_cast<SlicedString *>(str);
912     }
913 
914     static SlicedString *Cast(const EcmaString *str)
915     {
916         return SlicedString::Cast(const_cast<EcmaString *>(str));
917     }
918 
919     static size_t ObjectSize()
920     {
921         return SlicedString::SIZE;
922     }
923 
924     // Minimum length for a sliced string
925     template<bool verify = true>
926     uint16_t Get(int32_t index) const
927     {
928         int32_t length = static_cast<int32_t>(GetLength());
929         if (verify) {
930             if ((index < 0) || (index >= length)) {
931                 return 0;
932             }
933         }
934         EcmaString *parent = EcmaString::Cast(GetParent());
935         if (parent->IsLineString()) {
936             if (parent->IsUtf8()) {
937                 Span<const uint8_t> sp(parent->GetDataUtf8() + GetStartIndex(), length);
938                 return sp[index];
939             }
940             Span<const uint16_t> sp(parent->GetDataUtf16() + GetStartIndex(), length);
941             return sp[index];
942         }
943         Span<const uint8_t> sp(ConstantString::Cast(parent)->GetConstantData() + GetStartIndex(), length);
944         return sp[index];
945     }
946 };
947 
948 class TreeEcmaString : public EcmaString {
949 public:
950     // Minimum length for a tree string
951     static constexpr uint32_t MIN_TREE_ECMASTRING_LENGTH = 13;
952 
953     static constexpr size_t FIRST_OFFSET = EcmaString::SIZE;
954     ACCESSORS(First, FIRST_OFFSET, SECOND_OFFSET);
955     ACCESSORS(Second, SECOND_OFFSET, SIZE);
956 
957     DECL_VISIT_OBJECT(FIRST_OFFSET, SIZE);
958 
959     CAST_CHECK(TreeEcmaString, IsTreeString);
960 
Cast(EcmaString *str)961     static TreeEcmaString *Cast(EcmaString *str)
962     {
963         return static_cast<TreeEcmaString *>(str);
964     }
965 
Cast(const EcmaString *str)966     static TreeEcmaString *Cast(const EcmaString *str)
967     {
968         return TreeEcmaString::Cast(const_cast<EcmaString *>(str));
969     }
970 
IsFlat() const971     bool IsFlat() const
972     {
973         auto strSecond = EcmaString::Cast(GetSecond());
974         return strSecond->GetLength() == 0;
975     }
976 
977     template<bool verify = true>
Get(int32_t index) const978     uint16_t Get(int32_t index) const
979     {
980         int32_t length = static_cast<int32_t>(GetLength());
981         if (verify) {
982             if ((index < 0) || (index >= length)) {
983                 return 0;
984             }
985         }
986 
987         if (IsFlat()) {
988             EcmaString *first = EcmaString::Cast(GetFirst());
989             return first->At<verify>(index);
990         }
991         EcmaString *string = const_cast<TreeEcmaString *>(this);
992         while (true) {
993             if (string->IsTreeString()) {
994                 EcmaString *first = EcmaString::Cast(TreeEcmaString::Cast(string)->GetFirst());
995                 if (static_cast<int32_t>(first->GetLength()) > index) {
996                     string = first;
997                 } else {
998                     index -= static_cast<int32_t>(first->GetLength());
999                     string = EcmaString::Cast(TreeEcmaString::Cast(string)->GetSecond());
1000                 }
1001             } else {
1002                 return string->At<verify>(index);
1003             }
1004         }
1005         UNREACHABLE();
1006     }
1007 };
1008 
1009 class FlatStringInfo {
1010 public:
FlatStringInfo(EcmaString *string, uint32_t startIndex, uint32_t length)1011     FlatStringInfo(EcmaString *string, uint32_t startIndex, uint32_t length) : string_(string),
1012                                                                                startIndex_(startIndex),
1013                                                                                length_(length) {}
IsUtf8() const1014     bool IsUtf8() const
1015     {
1016         return string_->IsUtf8();
1017     }
1018 
IsUtf16() const1019     bool IsUtf16() const
1020     {
1021         return string_->IsUtf16();
1022     }
1023 
GetString() const1024     EcmaString *GetString() const
1025     {
1026         return string_;
1027     }
1028 
SetString(EcmaString *string)1029     void SetString(EcmaString *string)
1030     {
1031         string_ = string;
1032     }
1033 
GetStartIndex() const1034     uint32_t GetStartIndex() const
1035     {
1036         return startIndex_;
1037     }
1038 
SetStartIndex(uint32_t index)1039     void SetStartIndex(uint32_t index)
1040     {
1041         startIndex_ = index;
1042     }
1043 
GetLength() const1044     uint32_t GetLength() const
1045     {
1046         return length_;
1047     }
1048 
1049     const uint8_t *GetDataUtf8() const;
1050     const uint16_t *GetDataUtf16() const;
1051     uint8_t *GetDataUtf8Writable() const;
1052     uint16_t *GetDataUtf16Writable() const;
1053     std::u16string ToU16String(uint32_t len = 0);
1054 private:
1055     EcmaString *string_ {nullptr};
1056     uint32_t startIndex_ {0};
1057     uint32_t length_ {0};
1058 };
1059 
// Do not call the member functions of EcmaString directly;
// use the corresponding functions of EcmaStringAccessor instead.
// e.g.: EcmaString *str = ***; str->GetLength() ----->  EcmaStringAccessor(str).GetLength()
1063 class PUBLIC_API EcmaStringAccessor {
1064 public:
EcmaStringAccessor(EcmaString *string)1065     explicit inline EcmaStringAccessor(EcmaString *string)
1066     {
1067         ASSERT(string != nullptr);
1068         string_ = string;
1069     }
1070 
    // Out-of-line constructor taking a TaggedObject pointer.
    // NOTE(review): obj is presumably required to be an EcmaString — confirm at the definition.
    explicit EcmaStringAccessor(TaggedObject *obj);

    // Out-of-line constructor taking a JSTaggedValue.
    // NOTE(review): value is presumably required to hold a string — confirm at the definition.
    explicit EcmaStringAccessor(JSTaggedValue value);

    // Out-of-line constructor taking a handle to an EcmaString.
    explicit EcmaStringAccessor(const JSHandle<EcmaString> &strHandle);
1076 
CalculateAllConcatHashCode(const JSHandle<EcmaString> &firstString, const JSHandle<EcmaString> &secondString)1077     static uint32_t CalculateAllConcatHashCode(const JSHandle<EcmaString> &firstString,
1078                                                const JSHandle<EcmaString> &secondString)
1079     {
1080         return EcmaString::CalculateAllConcatHashCode(firstString, secondString);
1081     }
1082 
    // Declaration only; defined elsewhere. Creates a line string for the given length,
    // with compressed selecting the encoding flag.
    static EcmaString *CreateLineString(const EcmaVM *vm, size_t length, bool compressed);
1084 
CreateEmptyString(const EcmaVM *vm)1085     static EcmaString *CreateEmptyString(const EcmaVM *vm)
1086     {
1087         return EcmaString::CreateEmptyString(vm);
1088     }
1089 
CreateFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, bool isConstantString = false, uint32_t idOffset = 0)1090     static EcmaString *CreateFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len, bool canBeCompress,
1091                                       MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, bool isConstantString = false,
1092                                       uint32_t idOffset = 0)
1093     {
1094         return EcmaString::CreateFromUtf8(vm, utf8Data, utf8Len, canBeCompress, type, isConstantString, idOffset);
1095     }
1096 
CreateFromUtf8CompressedSubString(const EcmaVM *vm, const JSHandle<EcmaString> &string, uint32_t offset, uint32_t utf8Len, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)1097     static EcmaString *CreateFromUtf8CompressedSubString(const EcmaVM *vm, const JSHandle<EcmaString> &string,
1098                                                          uint32_t offset, uint32_t utf8Len,
1099                                                          MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
1100     {
1101         return EcmaString::CreateFromUtf8CompressedSubString(vm, string, offset, utf8Len, type);
1102     }
1103 
CreateConstantString(const EcmaVM *vm, const uint8_t *utf8Data, size_t length, bool compressed, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, uint32_t idOffset = 0)1104     static EcmaString *CreateConstantString(const EcmaVM *vm, const uint8_t *utf8Data, size_t length,
1105         bool compressed, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE, uint32_t idOffset = 0)
1106     {
1107         return EcmaString::CreateConstantString(vm, utf8Data, length, compressed, type, idOffset);
1108     }
1109 
CreateUtf16StringFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)1110     static EcmaString *CreateUtf16StringFromUtf8(const EcmaVM *vm, const uint8_t *utf8Data, uint32_t utf8Len,
1111         MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
1112     {
1113         return EcmaString::CreateUtf16StringFromUtf8(vm, utf8Data, utf8Len, type);
1114     }
1115 
CreateFromUtf16(const EcmaVM *vm, const uint16_t *utf16Data, uint32_t utf16Len, bool canBeCompress, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)1116     static EcmaString *CreateFromUtf16(const EcmaVM *vm, const uint16_t *utf16Data, uint32_t utf16Len,
1117                                        bool canBeCompress, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
1118     {
1119         return EcmaString::CreateFromUtf16(vm, utf16Data, utf16Len, canBeCompress, type);
1120     }
1121 
Concat(const EcmaVM *vm, const JSHandle<EcmaString> &str1Handle, const JSHandle<EcmaString> &str2Handle, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)1122     static EcmaString *Concat(const EcmaVM *vm, const JSHandle<EcmaString> &str1Handle,
1123         const JSHandle<EcmaString> &str2Handle, MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
1124     {
1125         return EcmaString::Concat(vm, str1Handle, str2Handle, type);
1126     }
1127 
CopyStringToOldSpace(const EcmaVM *vm, const JSHandle<EcmaString> &original, uint32_t length, bool compressed)1128     static EcmaString *CopyStringToOldSpace(const EcmaVM *vm, const JSHandle<EcmaString> &original,
1129         uint32_t length, bool compressed)
1130     {
1131         return EcmaString::CopyStringToOldSpace(vm, original, length, compressed);
1132     }
1133 
1134     // can change src data structure
FastSubString(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start, uint32_t length)1135     static EcmaString *FastSubString(const EcmaVM *vm,
1136         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length)
1137     {
1138         return EcmaString::FastSubString(vm, src, start, length);
1139     }
1140 
1141     // get
GetSubString(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start, uint32_t length)1142     static EcmaString *GetSubString(const EcmaVM *vm,
1143         const JSHandle<EcmaString> &src, uint32_t start, uint32_t length)
1144     {
1145         return EcmaString::GetSubString(vm, src, start, length);
1146     }
1147 
IsUtf8() const1148     bool IsUtf8() const
1149     {
1150         return string_->IsUtf8();
1151     }
1152 
IsUtf16() const1153     bool IsUtf16() const
1154     {
1155         return string_->IsUtf16();
1156     }
1157 
GetLength() const1158     uint32_t GetLength() const
1159     {
1160         return string_->GetLength();
1161     }
1162 
    // Requires the wrapped string to be a LineString.
    // Declaration only; defined inline elsewhere.
    // NOTE(review): isGetBufferSize presumably switches between content length and required buffer size — confirm.
    inline size_t GetUtf8Length(bool isGetBufferSize = false) const;
1165 
ObjectSize() const1166     size_t ObjectSize() const
1167     {
1168         if (string_->IsLineString()) {
1169             return LineEcmaString::ObjectSize(string_);
1170         } if (string_->IsConstantString()) {
1171             return ConstantString::ObjectSize();
1172         } else {
1173             return TreeEcmaString::SIZE;
1174         }
1175     }
1176 
1177     // For TreeString, the calculation result is size of LineString correspondingly.
GetFlatStringSize() const1178     size_t GetFlatStringSize() const
1179     {
1180         if (string_->IsConstantString()) {
1181             return ConstantString::ObjectSize();
1182         }
1183         return LineEcmaString::ObjectSize(string_);
1184     }
1185 
IsInternString() const1186     bool IsInternString() const
1187     {
1188         return string_->IsInternString();
1189     }
1190 
SetInternString()1191     void SetInternString()
1192     {
1193         string_->SetIsInternString();
1194     }
1195 
ClearInternString()1196     void ClearInternString()
1197     {
1198         string_->ClearInternStringFlag();
1199     }
1200 
    // Requires the wrapped string to be a LineString.
    // The data is in utf8 format but carries no terminating 0.
    // Declaration only; defined inline elsewhere.
    inline const uint8_t *GetDataUtf8();

    // Requires the wrapped string to be a LineString.
    // Two-byte counterpart of GetDataUtf8. Declaration only; defined inline elsewhere.
    inline const uint16_t *GetDataUtf16();
1207 
1208     // not change string data structure.
1209     // if string is not flat, this func has low efficiency.
ToU16String(uint32_t len = 0)1210     std::u16string ToU16String(uint32_t len = 0)
1211     {
1212         return string_->ToU16String(len);
1213     }
1214 
1215     // not change string data structure.
1216     // if string is not flat, this func has low efficiency.
ToOneByteDataForced()1217     std::unique_ptr<uint8_t[]> ToOneByteDataForced()
1218     {
1219         return string_->ToOneByteDataForced();
1220     }
1221 
1222     // not change string data structure.
1223     // if string is not flat, this func has low efficiency.
ToUtf8Span(CVector<uint8_t> &buf)1224     Span<const uint8_t> ToUtf8Span(CVector<uint8_t> &buf)
1225     {
1226         return string_->ToUtf8Span(buf);
1227     }
1228 
    // Only valid when the wrapped string is flat and utf8 encoded.
    // Declaration only; defined inline elsewhere.
    inline Span<const uint8_t> FastToUtf8Span();
1231 
1232     // Using string's hash to figure out whether the string can be converted to integer
TryToGetInteger(uint32_t *result)1233     inline bool TryToGetInteger(uint32_t *result)
1234     {
1235         return string_->TryToGetInteger(result);
1236     }
1237 
TryToSetIntegerHash(int32_t num)1238     inline bool TryToSetIntegerHash(int32_t num)
1239     {
1240         return string_->TryToSetIntegerHash(num);
1241     }
1242 
    // String conversion helpers. All are declarations only; definitions live elsewhere.

    // Does not change the string's data structure.
    // Inefficient when the string is not flat.
    std::string ToStdString(StringConvertedUsage usage = StringConvertedUsage::PRINT);

    // Conversion intended for utf8 strings.
    CString Utf8ConvertToString();

    // NOTE(review): presumably a debugger-oriented variant of ToStdString — confirm at the definition.
    std::string DebuggerToStdString(StringConvertedUsage usage = StringConvertedUsage::PRINT);
    // Does not change the string's data structure.
    // Inefficient when the string is not flat.
    CString ToCString(StringConvertedUsage usage = StringConvertedUsage::LOGICOPERATION, bool cesu8 = false);
1254 
1255     // not change string data structure.
1256     // if string is not flat, this func has low efficiency.
WriteToFlatUtf8(uint8_t *buf, uint32_t maxLength, bool isWriteBuffer = false)1257     uint32_t WriteToFlatUtf8(uint8_t *buf, uint32_t maxLength, bool isWriteBuffer = false)
1258     {
1259         return string_->WriteUtf8(buf, maxLength, isWriteBuffer);
1260     }
1261 
WriteToUtf16(uint16_t *buf, uint32_t bufLength)1262     uint32_t WriteToUtf16(uint16_t *buf, uint32_t bufLength)
1263     {
1264         return string_->WriteUtf16(buf, GetLength(), bufLength);
1265     }
1266 
WriteToOneByte(uint8_t *buf, uint32_t maxLength)1267     uint32_t WriteToOneByte(uint8_t *buf, uint32_t maxLength)
1268     {
1269         return string_->WriteOneByte(buf, maxLength);
1270     }
1271 
1272     // not change string data structure.
1273     // if string is not flat, this func has low efficiency.
WriteToFlatUtf16(uint16_t *buf, uint32_t maxLength) const1274     uint32_t WriteToFlatUtf16(uint16_t *buf, uint32_t maxLength) const
1275     {
1276         return string_->CopyDataUtf16(buf, maxLength);
1277     }
1278 
1279     template <typename Char>
WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos)1280     static void WriteToFlatWithPos(EcmaString *src, Char *buf, uint32_t length, uint32_t pos)
1281     {
1282         src->WriteToFlatWithPos(src, buf, length, pos);
1283     }
1284 
1285     template <typename Char>
WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength)1286     static void WriteToFlat(EcmaString *src, Char *buf, uint32_t maxLength)
1287     {
1288         src->WriteToFlat(src, buf, maxLength);
1289     }
1290 
    // Requires dst to be a LineString.
    // Does not change the data structure of src.
    // Inefficient when src is not flat.
    // Declaration only; defined inline elsewhere.
    inline static void ReadData(EcmaString * dst, EcmaString *src, uint32_t start, uint32_t destSize, uint32_t length);
1295 
1296     // not change src data structure.
1297     // if src is not flat, this func has low efficiency.
1298     template<bool verify = true>
Get(uint32_t index) const1299     uint16_t Get(uint32_t index) const
1300     {
1301         return string_->At<verify>(index);
1302     }
1303 
1304     // require string is LineString.
Set(uint32_t index, uint16_t src)1305     void Set(uint32_t index, uint16_t src)
1306     {
1307         return string_->WriteData(index, src);
1308     }
1309 
1310     // not change src data structure.
1311     // if src is not flat, this func has low efficiency.
GetHashcode()1312     uint32_t GetHashcode()
1313     {
1314         return string_->GetHashcode();
1315     }
1316 
GetRawHashcode()1317     uint32_t GetRawHashcode()
1318     {
1319         return string_->GetRawHashcode();
1320     }
1321 
1322     // not change src data structure.
1323     // if src is not flat, this func has low efficiency.
ComputeRawHashcode()1324     std::pair<uint32_t, bool> ComputeRawHashcode()
1325     {
1326         return string_->ComputeRawHashcode();
1327     }
1328 
ComputeHashcode()1329     uint32_t ComputeHashcode()
1330     {
1331         return string_->ComputeHashcode();
1332     }
1333 
ComputeHashcode(uint32_t rawHashSeed, bool isInteger)1334     uint32_t ComputeHashcode(uint32_t rawHashSeed, bool isInteger)
1335     {
1336         return string_->ComputeHashcode(rawHashSeed, isInteger);
1337     }
1338 
ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress)1339     static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress)
1340     {
1341         return EcmaString::ComputeHashcodeUtf8(utf8Data, utf8Len, canBeCompress);
1342     }
1343 
ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length)1344     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length)
1345     {
1346         return EcmaString::ComputeHashcodeUtf16(utf16Data, length);
1347     }
1348 
1349     // can change receiver and search data structure
IndexOf(const EcmaVM *vm, const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0)1350     static int32_t IndexOf(const EcmaVM *vm,
1351         const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0)
1352     {
1353         return EcmaString::IndexOf(vm, receiver, search, pos);
1354     }
1355 
1356     // can change receiver and search data structure
LastIndexOf(const EcmaVM *vm, const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0)1357     static int32_t LastIndexOf(const EcmaVM *vm,
1358         const JSHandle<EcmaString> &receiver, const JSHandle<EcmaString> &search, int pos = 0)
1359     {
1360         return EcmaString::LastIndexOf(vm, receiver, search, pos);
1361     }
1362 
1363     // can change receiver and search data structure
Compare(const EcmaVM *vm, const JSHandle<EcmaString>& left, const JSHandle<EcmaString>& right)1364     static int32_t Compare(const EcmaVM *vm, const JSHandle<EcmaString>& left, const JSHandle<EcmaString>& right)
1365     {
1366         return EcmaString::Compare(vm, left, right);
1367     }
1368 
1369 
1370     // can change receiver and search data structure
IsSubStringAt(const EcmaVM *vm, const JSHandle<EcmaString>& left, const JSHandle<EcmaString>& right, uint32_t offset = 0)1371     static bool IsSubStringAt(const EcmaVM *vm, const JSHandle<EcmaString>& left,
1372         const JSHandle<EcmaString>& right, uint32_t offset = 0)
1373     {
1374         return EcmaString::IsSubStringAt(vm, left, right, offset);
1375     }
1376 
1377     // can change str1 and str2 data structure
StringsAreEqual(const EcmaVM *vm, const JSHandle<EcmaString> &str1, const JSHandle<EcmaString> &str2)1378     static bool StringsAreEqual(const EcmaVM *vm, const JSHandle<EcmaString> &str1, const JSHandle<EcmaString> &str2)
1379     {
1380         return EcmaString::StringsAreEqual(vm, str1, str2);
1381     }
1382 
1383     // not change str1 and str2 data structure.
1384     // if str1 or str2 is not flat, this func has low efficiency.
StringsAreEqual(EcmaString *str1, EcmaString *str2)1385     static bool StringsAreEqual(EcmaString *str1, EcmaString *str2)
1386     {
1387         return EcmaString::StringsAreEqual(str1, str2);
1388     }
1389 
1390     // not change str1 and str2 data structure.
1391     // if str1 or str2 is not flat, this func has low efficiency.
StringsAreEqualDiffUtfEncoding(EcmaString *str1, EcmaString *str2)1392     static bool StringsAreEqualDiffUtfEncoding(EcmaString *str1, EcmaString *str2)
1393     {
1394         return EcmaString::StringsAreEqualDiffUtfEncoding(str1, str2);
1395     }
1396 
1397     // not change str1 data structure.
1398     // if str1 is not flat, this func has low efficiency.
StringIsEqualUint8Data(const EcmaString *str1, const uint8_t *dataAddr, uint32_t dataLen, bool canBeCompress)1399     static bool StringIsEqualUint8Data(const EcmaString *str1, const uint8_t *dataAddr, uint32_t dataLen,
1400                                        bool canBeCompress)
1401     {
1402         return EcmaString::StringIsEqualUint8Data(str1, dataAddr, dataLen, canBeCompress);
1403     }
1404 
1405     // not change str1 data structure.
1406     // if str1 is not flat, this func has low efficiency.
StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len)1407     static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len)
1408     {
1409         return EcmaString::StringsAreEqualUtf16(str1, utf16Data, utf16Len);
1410     }
1411 
1412     // require str1 and str2 are LineString.
1413     // not change string data structure.
1414     // if string is not flat, this func has low efficiency.
EqualToSplicedString(const EcmaString *str1, const EcmaString *str2)1415     bool EqualToSplicedString(const EcmaString *str1, const EcmaString *str2)
1416     {
1417         return string_->EqualToSplicedString(str1, str2);
1418     }
1419 
CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len)1420     static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len)
1421     {
1422         return EcmaString::CanBeCompressed(utf8Data, utf8Len);
1423     }
1424 
CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len)1425     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len)
1426     {
1427         return EcmaString::CanBeCompressed(utf16Data, utf16Len);
1428     }
1429 
1430     // require string is LineString
CanBeCompressed(const EcmaString *string)1431     static bool CanBeCompressed(const EcmaString *string)
1432     {
1433         return EcmaString::CanBeCompressed(string);
1434     }
1435 
1436     // not change string data structure.
1437     // if string is not flat, this func has low efficiency.
ToElementIndex(uint32_t *index)1438     bool ToElementIndex(uint32_t *index)
1439     {
1440         return string_->ToElementIndex(index);
1441     }
1442 
1443     // not change string data structure.
1444     // if string is not flat, this func has low efficiency.
ToInt(int32_t *index, bool *negative)1445     bool ToInt(int32_t *index, bool *negative)
1446     {
1447         return string_->ToInt(index, negative);
1448     }
1449 
1450     // not change string data structure.
1451     // if string is not flat, this func has low efficiency.
ToTypedArrayIndex(uint32_t *index)1452     bool PUBLIC_API ToTypedArrayIndex(uint32_t *index)
1453     {
1454         return string_->ToTypedArrayIndex(index);
1455     }
1456 
ToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src)1457     static EcmaString *ToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src)
1458     {
1459         return EcmaString::ToLower(vm, src);
1460     }
1461 
TryToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src)1462     static EcmaString *TryToLower(const EcmaVM *vm, const JSHandle<EcmaString> &src)
1463     {
1464         return EcmaString::TryToLower(vm, src);
1465     }
1466 
TryToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src)1467     static EcmaString *TryToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src)
1468     {
1469         return EcmaString::TryToUpper(vm, src);
1470     }
1471 
ToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src)1472     static EcmaString *ToUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src)
1473     {
1474         return EcmaString::ToUpper(vm, src);
1475     }
1476 
    // Static pass-through: calls EcmaString::ToLocaleLower(vm, src, locale) and returns the
    // resulting EcmaString pointer unchanged. The ICU locale is forwarded by const reference.
    static EcmaString *ToLocaleLower(const EcmaVM *vm, const JSHandle<EcmaString> &src, const icu::Locale &locale)
    {
        return EcmaString::ToLocaleLower(vm, src, locale);
    }
1481 
    // Static pass-through: calls EcmaString::ToLocaleUpper(vm, src, locale) and returns the
    // resulting EcmaString pointer unchanged. The ICU locale is forwarded by const reference.
    static EcmaString *ToLocaleUpper(const EcmaVM *vm, const JSHandle<EcmaString> &src, const icu::Locale &locale)
    {
        return EcmaString::ToLocaleUpper(vm, src, locale);
    }
1486 
    // Static pass-through: calls EcmaString::Trim(thread, src, mode) and returns the resulting
    // EcmaString pointer unchanged.
    // `mode` defaults to EcmaString::TrimMode::TRIM when the caller omits it.
    static EcmaString *Trim(const JSThread *thread,
        const JSHandle<EcmaString> &src, EcmaString::TrimMode mode = EcmaString::TrimMode::TRIM)
    {
        return EcmaString::Trim(thread, src, mode);
    }
1492 
IsASCIICharacter(uint16_t data)1493     static bool IsASCIICharacter(uint16_t data)
1494     {
1495         if (data == 0) {
1496             return false;
1497         }
1498         // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000']
1499         return data <= base::utf_helper::UTF8_1B_MAX;
1500     }
1501 
    // Returns the result of EcmaString::IsFlat() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsFlat() const
    {
        return string_->IsFlat();
    }
1506 
    // Returns the result of EcmaString::IsLineString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsLineString() const
    {
        return string_->IsLineString();
    }
1511 
    // Returns the result of EcmaString::IsConstantString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsConstantString() const
    {
        return string_->IsConstantString();
    }
1516 
    // Returns the result of EcmaString::IsSlicedString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsSlicedString() const
    {
        return string_->IsSlicedString();
    }
1521 
    // Returns the result of EcmaString::IsLineOrConstantString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsLineOrConstantString() const
    {
        return string_->IsLineOrConstantString();
    }
1526 
    // Returns the JSType reported by EcmaString::GetStringType() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    JSType GetStringType() const
    {
        return string_->GetStringType();
    }
1531 
    // Returns the result of EcmaString::IsTreeString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool IsTreeString() const
    {
        return string_->IsTreeString();
    }
1536 
    // Returns the result of EcmaString::NotTreeString() on the wrapped string_.
    // NOTE: string_ is dereferenced here without a null check.
    bool NotTreeString() const
    {
        return string_->NotTreeString();
    }
1541 
    // The returned string may be a line string, a constant string, or a sliced string.
    // Static pass-through: calls EcmaString::Flatten(vm, string, type) and returns the resulting
    // EcmaString pointer unchanged.
    // `type` defaults to MemSpaceType::SHARED_OLD_SPACE when the caller omits it; presumably it
    // selects the space used for any newly allocated string — confirm against EcmaString::Flatten.
    PUBLIC_API static EcmaString *Flatten(const EcmaVM *vm, const JSHandle<EcmaString> &string,
        MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
    {
        return EcmaString::Flatten(vm, string, type);
    }
1548 
    // Static pass-through: calls EcmaString::FlattenAllString(vm, string, type) and returns the
    // resulting FlatStringInfo by value.
    // `type` defaults to MemSpaceType::SHARED_OLD_SPACE when the caller omits it.
    static FlatStringInfo FlattenAllString(const EcmaVM *vm, const JSHandle<EcmaString> &string,
        MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
    {
        return EcmaString::FlattenAllString(vm, string, type);
    }
1554 
    // Static pass-through: calls EcmaString::SlowFlatten(vm, string, type) and returns the
    // resulting EcmaString pointer unchanged.
    // `type` defaults to MemSpaceType::SHARED_OLD_SPACE when the caller omits it.
    static EcmaString *SlowFlatten(const EcmaVM *vm, const JSHandle<EcmaString> &string,
        MemSpaceType type = MemSpaceType::SHARED_OLD_SPACE)
    {
        return EcmaString::SlowFlatten(vm, string, type);
    }
1560 
    // Static pass-through: calls EcmaString::FlattenNoGCForSnapshot(vm, string) and returns the
    // resulting EcmaString pointer unchanged.
    // Unlike the other flatten wrappers here, this one takes a raw EcmaString pointer rather
    // than a JSHandle.
    static EcmaString *FlattenNoGCForSnapshot(const EcmaVM *vm, EcmaString *string)
    {
        return EcmaString::FlattenNoGCForSnapshot(vm, string);
    }
1565 
    // Static pass-through: calls EcmaString::GetUtf8DataFlat(src, buf) and returns the resulting
    // const uint8_t pointer unchanged.
    // `buf` is forwarded by non-const reference; presumably the callee may use it as backing
    // storage for the returned data — confirm against EcmaString::GetUtf8DataFlat before
    // assuming anything about the returned pointer's lifetime.
    static const uint8_t *GetUtf8DataFlat(const EcmaString *src, CVector<uint8_t> &buf)
    {
        return EcmaString::GetUtf8DataFlat(src, buf);
    }
1570 
    // Static pass-through: calls EcmaString::GetNonTreeUtf8Data(src) and returns the resulting
    // const uint8_t pointer unchanged.
    // NOTE(review): the name suggests src must not be a tree string — confirm the precondition
    // in EcmaString::GetNonTreeUtf8Data.
    static const uint8_t *GetNonTreeUtf8Data(const EcmaString *src)
    {
        return EcmaString::GetNonTreeUtf8Data(src);
    }
1575 
    // Static pass-through: calls EcmaString::GetUtf16DataFlat(src, buf) and returns the resulting
    // const uint16_t pointer unchanged.
    // `buf` is forwarded by non-const reference; presumably the callee may use it as backing
    // storage for the returned data — confirm against EcmaString::GetUtf16DataFlat before
    // assuming anything about the returned pointer's lifetime.
    static const uint16_t *GetUtf16DataFlat(const EcmaString *src, CVector<uint16_t> &buf)
    {
        return EcmaString::GetUtf16DataFlat(src, buf);
    }
1580 
    // Static pass-through: calls EcmaString::GetNonTreeUtf16Data(src) and returns the resulting
    // const uint16_t pointer unchanged.
    // NOTE(review): the name suggests src must not be a tree string — confirm the precondition
    // in EcmaString::GetNonTreeUtf16Data.
    static const uint16_t *GetNonTreeUtf16Data(const EcmaString *src)
    {
        return EcmaString::GetNonTreeUtf16Data(src);
    }
1585 
    // Declaration only; the definition is not in this header chunk.
    // Takes the current thread and a handle to a tagged value, and returns a JSTaggedValue.
    // NOTE(review): presumably builds a list from the string held by `str` — confirm in the definition.
    static JSTaggedValue StringToList(JSThread *thread, JSHandle<JSTaggedValue> &str);
1587 
1588 private:
    // The wrapped EcmaString that the non-static methods above delegate to. Defaults to nullptr,
    // and those methods dereference it without checking.
    EcmaString *string_ {nullptr};
1590 };
1591 }  // namespace ecmascript
1592 }  // namespace panda
1593 #endif  // ECMASCRIPT_STRING_H
1594