11cb0ef41Sopenharmony_ci// © 2020 and later: Unicode, Inc. and others.
21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
31cb0ef41Sopenharmony_ci
41cb0ef41Sopenharmony_ci// uniquecharstr.h
51cb0ef41Sopenharmony_ci// created: 2020sep01 Frank Yung-Fong Tang
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci#ifndef __UNIQUECHARSTR_H__
81cb0ef41Sopenharmony_ci#define __UNIQUECHARSTR_H__
91cb0ef41Sopenharmony_ci
101cb0ef41Sopenharmony_ci#include "charstr.h"
111cb0ef41Sopenharmony_ci#include "uassert.h"
121cb0ef41Sopenharmony_ci#include "uhash.h"
131cb0ef41Sopenharmony_ci#include "cmemory.h"
141cb0ef41Sopenharmony_ci
151cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN
161cb0ef41Sopenharmony_ci
171cb0ef41Sopenharmony_ci/**
181cb0ef41Sopenharmony_ci * Stores NUL-terminated strings with duplicate elimination.
191cb0ef41Sopenharmony_ci * Checks for unique UTF-16 string pointers and converts to invariant characters.
201cb0ef41Sopenharmony_ci *
211cb0ef41Sopenharmony_ci * Intended to be stack-allocated. Add strings, get a unique number for each,
221cb0ef41Sopenharmony_ci * freeze the object, get a char * pointer for each string,
231cb0ef41Sopenharmony_ci * call orphanCharStrings() to capture the string storage, and let this object go out of scope.
241cb0ef41Sopenharmony_ci */
251cb0ef41Sopenharmony_ciclass UniqueCharStrings {
261cb0ef41Sopenharmony_cipublic:
271cb0ef41Sopenharmony_ci    UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) {
281cb0ef41Sopenharmony_ci        // Note: We hash on string contents but store stable char16_t * pointers.
291cb0ef41Sopenharmony_ci        // If the strings are stored in resource bundles which should be built with
301cb0ef41Sopenharmony_ci        // duplicate elimination, then we should be able to hash on just the pointer values.
311cb0ef41Sopenharmony_ci        uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode);
321cb0ef41Sopenharmony_ci        if (U_FAILURE(errorCode)) { return; }
331cb0ef41Sopenharmony_ci        strings = new CharString();
341cb0ef41Sopenharmony_ci        if (strings == nullptr) {
351cb0ef41Sopenharmony_ci            errorCode = U_MEMORY_ALLOCATION_ERROR;
361cb0ef41Sopenharmony_ci        }
371cb0ef41Sopenharmony_ci    }
381cb0ef41Sopenharmony_ci    ~UniqueCharStrings() {
391cb0ef41Sopenharmony_ci        uhash_close(&map);
401cb0ef41Sopenharmony_ci        delete strings;
411cb0ef41Sopenharmony_ci    }
421cb0ef41Sopenharmony_ci
431cb0ef41Sopenharmony_ci    /** Returns/orphans the CharString that contains all strings. */
441cb0ef41Sopenharmony_ci    CharString *orphanCharStrings() {
451cb0ef41Sopenharmony_ci        CharString *result = strings;
461cb0ef41Sopenharmony_ci        strings = nullptr;
471cb0ef41Sopenharmony_ci        return result;
481cb0ef41Sopenharmony_ci    }
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_ci    /**
511cb0ef41Sopenharmony_ci     * Adds a NUL-terminated string and returns a unique number for it.
521cb0ef41Sopenharmony_ci     * The string must not change, nor move around in memory,
531cb0ef41Sopenharmony_ci     * while this UniqueCharStrings is in use.
541cb0ef41Sopenharmony_ci     *
551cb0ef41Sopenharmony_ci     * Best used with string data in a stable storage, such as strings returned
561cb0ef41Sopenharmony_ci     * by resource bundle functions.
571cb0ef41Sopenharmony_ci     */
581cb0ef41Sopenharmony_ci    int32_t add(const char16_t*p, UErrorCode &errorCode) {
591cb0ef41Sopenharmony_ci        if (U_FAILURE(errorCode)) { return -1; }
601cb0ef41Sopenharmony_ci        if (isFrozen) {
611cb0ef41Sopenharmony_ci            errorCode = U_NO_WRITE_PERMISSION;
621cb0ef41Sopenharmony_ci            return -1;
631cb0ef41Sopenharmony_ci        }
641cb0ef41Sopenharmony_ci        // The string points into the resource bundle.
651cb0ef41Sopenharmony_ci        int32_t oldIndex = uhash_geti(&map, p);
661cb0ef41Sopenharmony_ci        if (oldIndex != 0) {  // found duplicate
671cb0ef41Sopenharmony_ci            return oldIndex;
681cb0ef41Sopenharmony_ci        }
691cb0ef41Sopenharmony_ci        // Explicit NUL terminator for the previous string.
701cb0ef41Sopenharmony_ci        // The strings object is also terminated with one implicit NUL.
711cb0ef41Sopenharmony_ci        strings->append(0, errorCode);
721cb0ef41Sopenharmony_ci        int32_t newIndex = strings->length();
731cb0ef41Sopenharmony_ci        strings->appendInvariantChars(p, u_strlen(p), errorCode);
741cb0ef41Sopenharmony_ci        uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
751cb0ef41Sopenharmony_ci        return newIndex;
761cb0ef41Sopenharmony_ci    }
771cb0ef41Sopenharmony_ci
781cb0ef41Sopenharmony_ci    /**
791cb0ef41Sopenharmony_ci     * Adds a unicode string by value and returns a unique number for it.
801cb0ef41Sopenharmony_ci     */
811cb0ef41Sopenharmony_ci    int32_t addByValue(UnicodeString s, UErrorCode &errorCode) {
821cb0ef41Sopenharmony_ci        if (U_FAILURE(errorCode)) { return -1; }
831cb0ef41Sopenharmony_ci        if (isFrozen) {
841cb0ef41Sopenharmony_ci            errorCode = U_NO_WRITE_PERMISSION;
851cb0ef41Sopenharmony_ci            return -1;
861cb0ef41Sopenharmony_ci        }
871cb0ef41Sopenharmony_ci        int32_t oldIndex = uhash_geti(&map, s.getTerminatedBuffer());
881cb0ef41Sopenharmony_ci        if (oldIndex != 0) {  // found duplicate
891cb0ef41Sopenharmony_ci            return oldIndex;
901cb0ef41Sopenharmony_ci        }
911cb0ef41Sopenharmony_ci        // We need to store the string content of the UnicodeString.
921cb0ef41Sopenharmony_ci        UnicodeString *key = keyStore.create(s);
931cb0ef41Sopenharmony_ci        if (key == nullptr) {
941cb0ef41Sopenharmony_ci            errorCode = U_MEMORY_ALLOCATION_ERROR;
951cb0ef41Sopenharmony_ci            return -1;
961cb0ef41Sopenharmony_ci        }
971cb0ef41Sopenharmony_ci        return add(key->getTerminatedBuffer(), errorCode);
981cb0ef41Sopenharmony_ci    }
991cb0ef41Sopenharmony_ci
1001cb0ef41Sopenharmony_ci    void freeze() { isFrozen = true; }
1011cb0ef41Sopenharmony_ci
1021cb0ef41Sopenharmony_ci    /**
1031cb0ef41Sopenharmony_ci     * Returns a string pointer for its unique number, if this object is frozen.
1041cb0ef41Sopenharmony_ci     * Otherwise nullptr.
1051cb0ef41Sopenharmony_ci     */
1061cb0ef41Sopenharmony_ci    const char *get(int32_t i) const {
1071cb0ef41Sopenharmony_ci        U_ASSERT(isFrozen);
1081cb0ef41Sopenharmony_ci        return isFrozen && i > 0 ? strings->data() + i : nullptr;
1091cb0ef41Sopenharmony_ci    }
1101cb0ef41Sopenharmony_ci
1111cb0ef41Sopenharmony_ciprivate:
1121cb0ef41Sopenharmony_ci    UHashtable map;
1131cb0ef41Sopenharmony_ci    CharString *strings;
1141cb0ef41Sopenharmony_ci    MemoryPool<UnicodeString> keyStore;
1151cb0ef41Sopenharmony_ci    bool isFrozen = false;
1161cb0ef41Sopenharmony_ci};
1171cb0ef41Sopenharmony_ci
1181cb0ef41Sopenharmony_ciU_NAMESPACE_END
1191cb0ef41Sopenharmony_ci
1201cb0ef41Sopenharmony_ci#endif  // __UNIQUECHARSTR_H__
121