xref: /third_party/node/src/node_i18n.cc (revision 1cb0ef41)
1// Copyright Joyent, Inc. and other Node contributors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a
4// copy of this software and associated documentation files (the
5// "Software"), to deal in the Software without restriction, including
6// without limitation the rights to use, copy, modify, merge, publish,
7// distribute, sublicense, and/or sell copies of the Software, and to permit
8// persons to whom the Software is furnished to do so, subject to the
9// following conditions:
10//
11// The above copyright notice and this permission notice shall be included
12// in all copies or substantial portions of the Software.
13//
14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20// USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22/*
23 * notes: by srl295
24 *  - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25 *     ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26 *    linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27 *    macro names. That's the "english+root" data.
28 *
29 *    If icu_data_path is non-null, the user has provided a path and we assume
30 *    it goes somewhere useful. We set that path in ICU, and exit.
31 *    If icu_data_path is null, they haven't set a path and we want the
32 *    "english+root" data.  We call
33 *       udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34 *    to load up the english+root data.
35 *
36 *  - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37 *    data. All of the variables and command line options for changing data at
38 *    runtime are disabled, as they wouldn't fully override the internal data.
39 *    See:  http://bugs.icu-project.org/trac/ticket/10924
40 */
41
42
43#include "node_i18n.h"
44#include "node_external_reference.h"
45
46#if defined(NODE_HAVE_I18N_SUPPORT)
47
48#include "base_object-inl.h"
49#include "node.h"
50#include "node_buffer.h"
51#include "node_errors.h"
52#include "node_internals.h"
53#include "string_bytes.h"
54#include "util-inl.h"
55#include "v8.h"
56
57#include <unicode/utypes.h>
58#include <unicode/putil.h>
59#include <unicode/uchar.h>
60#include <unicode/uclean.h>
61#include <unicode/udata.h>
62#include <unicode/uidna.h>
63#include <unicode/ucnv.h>
64#include <unicode/utf8.h>
65#include <unicode/utf16.h>
66#include <unicode/timezone.h>
67#include <unicode/ulocdata.h>
68#include <unicode/uvernum.h>
69#include <unicode/uversion.h>
70#include <unicode/ustring.h>
71
#ifdef NODE_HAVE_SMALL_ICU
/* if this is defined, we have a 'secondary' entry point.
   compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
// SMALL_ICUDATA_ENTRY_POINT expands to the symbol name of the "small" data
// blob: icusmdt<major>_dat, or icusmdt<suffix><major>_dat when
// U_LIB_SUFFIX_C_NAME is defined.
#define SMALL_ICUDATA_ENTRY_POINT \
  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)
// Extra level of indirection so that the arguments are macro-expanded before
// SMALL_DEF pastes them together with ##.
#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
#ifndef U_LIB_SUFFIX_C_NAME
#define SMALL_DEF(major, suff) icusmdt##major##_dat
#else
#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
#endif

// Declaration of the small data blob; it is handed to udata_setCommonData()
// in InitializeICUDirectory() when no data path was supplied.
extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
#endif
86
87namespace node {
88
89using v8::Context;
90using v8::FunctionCallbackInfo;
91using v8::FunctionTemplate;
92using v8::Int32;
93using v8::Isolate;
94using v8::Local;
95using v8::MaybeLocal;
96using v8::NewStringType;
97using v8::Object;
98using v8::ObjectTemplate;
99using v8::String;
100using v8::Value;
101
102namespace i18n {
103namespace {
104
105template <typename T>
106MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
107  MaybeLocal<Object> ret = Buffer::New(env, buf);
108  if (ret.IsEmpty())
109    return ret;
110
111  static_assert(sizeof(T) == 1 || sizeof(T) == 2,
112                "Currently only one- or two-byte buffers are supported");
113  if (sizeof(T) > 1 && IsBigEndian()) {
114    SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
115    SwapBytes16(retbuf_data, retbuf_length);
116  }
117
118  return ret;
119}
120
121// One-Shot Converters
122
123void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
124                      const char* data,
125                      const size_t length,
126                      const size_t length_in_chars) {
127  dest->AllocateSufficientStorage(length_in_chars);
128  char* dst = reinterpret_cast<char*>(**dest);
129  memcpy(dst, data, length);
130  if (IsBigEndian()) {
131    SwapBytes16(dst, length);
132  }
133}
134
135typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
136                                            const char* fromEncoding,
137                                            const char* toEncoding,
138                                            const char* source,
139                                            const size_t source_length,
140                                            UErrorCode* status);
141
142MaybeLocal<Object> Transcode(Environment* env,
143                             const char* fromEncoding,
144                             const char* toEncoding,
145                             const char* source,
146                             const size_t source_length,
147                             UErrorCode* status) {
148  *status = U_ZERO_ERROR;
149  MaybeLocal<Object> ret;
150  MaybeStackBuffer<char> result;
151  Converter to(toEncoding);
152  Converter from(fromEncoding);
153
154  size_t sublen = ucnv_getMinCharSize(to.conv());
155  std::string sub(sublen, '?');
156  to.set_subst_chars(sub.c_str());
157
158  const uint32_t limit = source_length * to.max_char_size();
159  result.AllocateSufficientStorage(limit);
160  char* target = *result;
161  ucnv_convertEx(to.conv(), from.conv(), &target, target + limit,
162                 &source, source + source_length, nullptr, nullptr,
163                 nullptr, nullptr, true, true, status);
164  if (U_SUCCESS(*status)) {
165    result.SetLength(target - &result[0]);
166    ret = ToBufferEndian(env, &result);
167  }
168  return ret;
169}
170
171MaybeLocal<Object> TranscodeToUcs2(Environment* env,
172                                   const char* fromEncoding,
173                                   const char* toEncoding,
174                                   const char* source,
175                                   const size_t source_length,
176                                   UErrorCode* status) {
177  *status = U_ZERO_ERROR;
178  MaybeLocal<Object> ret;
179  MaybeStackBuffer<UChar> destbuf(source_length);
180  Converter from(fromEncoding);
181  const size_t length_in_chars = source_length * sizeof(UChar);
182  ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
183                source, source_length, status);
184  if (U_SUCCESS(*status))
185    ret = ToBufferEndian(env, &destbuf);
186  return ret;
187}
188
189MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
190                                     const char* fromEncoding,
191                                     const char* toEncoding,
192                                     const char* source,
193                                     const size_t source_length,
194                                     UErrorCode* status) {
195  *status = U_ZERO_ERROR;
196  MaybeStackBuffer<UChar> sourcebuf;
197  MaybeLocal<Object> ret;
198  Converter to(toEncoding);
199
200  size_t sublen = ucnv_getMinCharSize(to.conv());
201  std::string sub(sublen, '?');
202  to.set_subst_chars(sub.c_str());
203
204  const size_t length_in_chars = source_length / sizeof(UChar);
205  CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
206  MaybeStackBuffer<char> destbuf(length_in_chars);
207  const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars,
208                                       *sourcebuf, length_in_chars, status);
209  if (U_SUCCESS(*status)) {
210    destbuf.SetLength(len);
211    ret = ToBufferEndian(env, &destbuf);
212  }
213  return ret;
214}
215
216MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
217                                         const char* fromEncoding,
218                                         const char* toEncoding,
219                                         const char* source,
220                                         const size_t source_length,
221                                         UErrorCode* status) {
222  *status = U_ZERO_ERROR;
223  MaybeStackBuffer<UChar> destbuf;
224  int32_t result_length;
225  u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
226                source, source_length, status);
227  MaybeLocal<Object> ret;
228  if (U_SUCCESS(*status)) {
229    destbuf.SetLength(result_length);
230    ret = ToBufferEndian(env, &destbuf);
231  } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
232    *status = U_ZERO_ERROR;
233    destbuf.AllocateSufficientStorage(result_length);
234    u_strFromUTF8(*destbuf, result_length, &result_length,
235                  source, source_length, status);
236    if (U_SUCCESS(*status)) {
237      destbuf.SetLength(result_length);
238      ret = ToBufferEndian(env, &destbuf);
239    }
240  }
241  return ret;
242}
243
244MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
245                                         const char* fromEncoding,
246                                         const char* toEncoding,
247                                         const char* source,
248                                         const size_t source_length,
249                                         UErrorCode* status) {
250  *status = U_ZERO_ERROR;
251  MaybeLocal<Object> ret;
252  const size_t length_in_chars = source_length / sizeof(UChar);
253  int32_t result_length;
254  MaybeStackBuffer<UChar> sourcebuf;
255  MaybeStackBuffer<char> destbuf;
256  CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
257  u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
258              *sourcebuf, length_in_chars, status);
259  if (U_SUCCESS(*status)) {
260    destbuf.SetLength(result_length);
261    ret = ToBufferEndian(env, &destbuf);
262  } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
263    *status = U_ZERO_ERROR;
264    destbuf.AllocateSufficientStorage(result_length);
265    u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
266                length_in_chars, status);
267    if (U_SUCCESS(*status)) {
268      destbuf.SetLength(result_length);
269      ret = ToBufferEndian(env, &destbuf);
270    }
271  }
272  return ret;
273}
274
275const char* EncodingName(const enum encoding encoding) {
276  switch (encoding) {
277    case ASCII: return "us-ascii";
278    case LATIN1: return "iso8859-1";
279    case UCS2: return "utf16le";
280    case UTF8: return "utf-8";
281    default: return nullptr;
282  }
283}
284
285bool SupportedEncoding(const enum encoding encoding) {
286  switch (encoding) {
287    case ASCII:
288    case LATIN1:
289    case UCS2:
290    case UTF8: return true;
291    default: return false;
292  }
293}
294
295void Transcode(const FunctionCallbackInfo<Value>&args) {
296  Environment* env = Environment::GetCurrent(args);
297  Isolate* isolate = env->isolate();
298  UErrorCode status = U_ZERO_ERROR;
299  MaybeLocal<Object> result;
300
301  ArrayBufferViewContents<char> input(args[0]);
302  const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
303  const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
304
305  if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
306    TranscodeFunc tfn = &Transcode;
307    switch (fromEncoding) {
308      case ASCII:
309      case LATIN1:
310        if (toEncoding == UCS2)
311          tfn = &TranscodeToUcs2;
312        break;
313      case UTF8:
314        if (toEncoding == UCS2)
315          tfn = &TranscodeUcs2FromUtf8;
316        break;
317      case UCS2:
318        switch (toEncoding) {
319          case UCS2:
320            tfn = &Transcode;
321            break;
322          case UTF8:
323            tfn = &TranscodeUtf8FromUcs2;
324            break;
325          default:
326            tfn = &TranscodeFromUcs2;
327        }
328        break;
329      default:
330        // This should not happen because of the SupportedEncoding checks
331        ABORT();
332    }
333
334    result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
335                 input.data(), input.length(), &status);
336  } else {
337    status = U_ILLEGAL_ARGUMENT_ERROR;
338  }
339
340  if (result.IsEmpty())
341    return args.GetReturnValue().Set(status);
342
343  return args.GetReturnValue().Set(result.ToLocalChecked());
344}
345
346void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
347  Environment* env = Environment::GetCurrent(args);
348  CHECK(args[0]->IsInt32());
349  UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value());
350  args.GetReturnValue().Set(
351      String::NewFromUtf8(env->isolate(),
352                          u_errorName(status)).ToLocalChecked());
353}
354
355}  // anonymous namespace
356
357Converter::Converter(const char* name, const char* sub) {
358  UErrorCode status = U_ZERO_ERROR;
359  UConverter* conv = ucnv_open(name, &status);
360  CHECK(U_SUCCESS(status));
361  conv_.reset(conv);
362  set_subst_chars(sub);
363}
364
// Takes ownership of an already opened ICU converter. If `sub` is non-null it
// is installed as the substitution string; set_subst_chars() CHECKs that the
// converter is non-null.
Converter::Converter(UConverter* converter, const char* sub)
    : conv_(converter) {
  set_subst_chars(sub);
}
369
370void Converter::set_subst_chars(const char* sub) {
371  CHECK(conv_);
372  UErrorCode status = U_ZERO_ERROR;
373  if (sub != nullptr) {
374    ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status);
375    CHECK(U_SUCCESS(status));
376  }
377}
378
// Resets the underlying ICU converter with ucnv_reset().
// NOTE(review): unlike the other accessors this does not CHECK(conv_) first;
// confirm ucnv_reset() tolerates a null converter.
void Converter::reset() {
  ucnv_reset(conv_.get());
}
382
// Returns the value of ucnv_getMinCharSize() for the underlying converter.
// Aborts via CHECK if there is no converter.
size_t Converter::min_char_size() const {
  CHECK(conv_);
  return ucnv_getMinCharSize(conv_.get());
}
387
// Returns the value of ucnv_getMaxCharSize() for the underlying converter.
// Aborts via CHECK if there is no converter.
size_t Converter::max_char_size() const {
  CHECK(conv_);
  return ucnv_getMaxCharSize(conv_.get());
}
392
393void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) {
394  Environment* env = Environment::GetCurrent(args);
395
396  CHECK_GE(args.Length(), 1);
397  Utf8Value label(env->isolate(), args[0]);
398
399  UErrorCode status = U_ZERO_ERROR;
400  ConverterPointer conv(ucnv_open(*label, &status));
401  args.GetReturnValue().Set(!!U_SUCCESS(status));
402}
403
404void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
405  Environment* env = Environment::GetCurrent(args);
406
407  Local<ObjectTemplate> t = env->i18n_converter_template();
408  Local<Object> obj;
409  if (!t->NewInstance(env->context()).ToLocal(&obj)) return;
410
411  CHECK_GE(args.Length(), 2);
412  Utf8Value label(env->isolate(), args[0]);
413  int flags = args[1]->Uint32Value(env->context()).ToChecked();
414  bool fatal =
415      (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
416
417  UErrorCode status = U_ZERO_ERROR;
418  UConverter* conv = ucnv_open(*label, &status);
419  if (U_FAILURE(status))
420    return;
421
422  if (fatal) {
423    status = U_ZERO_ERROR;
424    ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
425                        nullptr, nullptr, nullptr, &status);
426  }
427
428  auto converter = new ConverterObject(env, obj, conv, flags);
429  size_t sublen = ucnv_getMinCharSize(conv);
430  std::string sub(sublen, '?');
431  converter->set_subst_chars(sub.c_str());
432
433  args.GetReturnValue().Set(obj);
434}
435
// Binding entry point for decode(converter, input, flags, encoding): feeds
// one chunk of bytes through the converter and returns the decoded text as a
// JS string.
//
//   args[0] - the ConverterObject wrapper.
//   args[1] - input bytes: ArrayBuffer, SharedArrayBuffer or ArrayBufferView.
//   args[2] - flags; CONVERTER_FLAGS_FLUSH is the bit inspected here.
//   args[3] - encoding name (string), used only in the error message.
//
// Throws ERR_INVALID_ARG_TYPE when the input has the wrong type, and
// ERR_ENCODING_INVALID_ENCODED_DATA when the conversion fails or the result
// string cannot be created.
void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);

  CHECK_GE(args.Length(), 4);  // Converter, Buffer, Flags, Encoding

  ConverterObject* converter;
  ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());

  if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
        args[1]->IsArrayBufferView())) {
    return node::THROW_ERR_INVALID_ARG_TYPE(
        env->isolate(),
        "The \"input\" argument must be an instance of SharedArrayBuffer, "
        "ArrayBuffer or ArrayBufferView.");
  }

  ArrayBufferViewContents<char> input(args[1]);
  int flags = args[2]->Uint32Value(env->context()).ToChecked();

  CHECK(args[3]->IsString());
  Local<String> from_encoding = args[3].As<String>();

  UErrorCode status = U_ZERO_ERROR;
  MaybeStackBuffer<UChar> result;

  UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;

  // When flushing the final chunk, the limit is the maximum
  // of either the input buffer length or the number of pending
  // characters times the min char size, multiplied by 2 as unicode may
  // take up to 2 UChars to encode a character
  // NOTE(review): `limit` is the output capacity in UChars; confirm that this
  // bound is sufficient for every converter that can reach this code.
  size_t limit = 2 * converter->min_char_size() *
      (!flush ?
          input.length() :
          std::max(
              input.length(),
              static_cast<size_t>(
                  ucnv_toUCountPending(converter->conv(), &status))));
  // Whatever ucnv_toUCountPending() reported is discarded before converting.
  status = U_ZERO_ERROR;

  if (limit > 0)
    result.AllocateSufficientStorage(limit);

  // Runs on every exit path below, including the error path.
  auto cleanup = OnScopeLeave([&]() {
    if (flush) {
      // Reset the converter state.
      converter->set_bom_seen(false);
      converter->reset();
    }
  });

  const char* source = input.data();
  size_t source_length = input.length();

  // ucnv_toUnicode() advances `target` past the UChars it wrote.
  UChar* target = *result;
  ucnv_toUnicode(converter->conv(),
                 &target,
                 target + limit,
                 &source,
                 source + source_length,
                 nullptr,
                 flush,
                 &status);

  if (U_SUCCESS(status)) {
    bool omit_initial_bom = false;
    if (limit > 0) {
      result.SetLength(target - &result[0]);
      if (result.length() > 0 &&
          converter->unicode() &&
          !converter->ignore_bom() &&
          !converter->bom_seen()) {
        // If the very first result in the stream is a BOM, and we are not
        // explicitly told to ignore it, then we mark it for discarding.
        if (result[0] == 0xFEFF)
          omit_initial_bom = true;
        // Only the first non-empty output of a stream is inspected.
        converter->set_bom_seen(true);
      }
    }

    Local<Value> error;
    UChar* output = result.out();
    // `beginning` and `length` are byte offsets/counts into `output`.
    size_t beginning = 0;
    size_t length = result.length() * sizeof(UChar);

    if (omit_initial_bom) {
      // Perform `ret = ret.slice(2)`.
      beginning += 2;
      length -= 2;
    }

    char* value = reinterpret_cast<char*>(output) + beginning;

    // The bytes are handed to StringBytes::Encode() as UCS2 below, so they
    // are byte-swapped first on big-endian hosts.
    if (IsBigEndian()) {
      SwapBytes16(value, length);
    }

    MaybeLocal<Value> encoded =
        StringBytes::Encode(env->isolate(), value, length, UCS2, &error);

    Local<Value> ret;
    if (encoded.ToLocal(&ret)) {
      args.GetReturnValue().Set(ret);
      return;
    }
  }

  // Reached when the conversion failed or no string could be produced.
  node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
      env->isolate(),
      "The encoded data was not valid for encoding %s",
      *node::Utf8Value(env->isolate(), from_encoding));
}
548
549ConverterObject::ConverterObject(
550    Environment* env,
551    Local<Object> wrap,
552    UConverter* converter,
553    int flags,
554    const char* sub)
555    : BaseObject(env, wrap),
556      Converter(converter, sub),
557      flags_(flags) {
558  MakeWeak();
559
560  switch (ucnv_getType(converter)) {
561    case UCNV_UTF8:
562    case UCNV_UTF16_BigEndian:
563    case UCNV_UTF16_LittleEndian:
564      flags_ |= CONVERTER_FLAGS_UNICODE;
565      break;
566    default: {
567      // Fall through
568    }
569  }
570}
571
572
573bool InitializeICUDirectory(const std::string& path) {
574  UErrorCode status = U_ZERO_ERROR;
575  if (path.empty()) {
576#ifdef NODE_HAVE_SMALL_ICU
577    // install the 'small' data.
578    udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
579#else  // !NODE_HAVE_SMALL_ICU
580    // no small data, so nothing to do.
581#endif  // !NODE_HAVE_SMALL_ICU
582  } else {
583    u_setDataDirectory(path.c_str());
584    u_init(&status);
585  }
586  return status == U_ZERO_ERROR;
587}
588
589void SetDefaultTimeZone(const char* tzid) {
590  size_t tzidlen = strlen(tzid) + 1;
591  UErrorCode status = U_ZERO_ERROR;
592  MaybeStackBuffer<UChar, 256> id(tzidlen);
593  u_charsToUChars(tzid, id.out(), tzidlen);
594  // This is threadsafe:
595  ucal_setDefaultTimeZone(id.out(), &status);
596  CHECK(U_SUCCESS(status));
597}
598
599int32_t ToUnicode(MaybeStackBuffer<char>* buf,
600                  const char* input,
601                  size_t length) {
602  UErrorCode status = U_ZERO_ERROR;
603  uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
604  UIDNA* uidna = uidna_openUTS46(options, &status);
605  if (U_FAILURE(status))
606    return -1;
607  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
608
609  int32_t len = uidna_nameToUnicodeUTF8(uidna,
610                                        input, length,
611                                        **buf, buf->capacity(),
612                                        &info,
613                                        &status);
614
615  // Do not check info.errors like we do with ToASCII since ToUnicode always
616  // returns a string, despite any possible errors that may have occurred.
617
618  if (status == U_BUFFER_OVERFLOW_ERROR) {
619    status = U_ZERO_ERROR;
620    buf->AllocateSufficientStorage(len);
621    len = uidna_nameToUnicodeUTF8(uidna,
622                                  input, length,
623                                  **buf, buf->capacity(),
624                                  &info,
625                                  &status);
626  }
627
628  // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
629  // string, regardless of whether an error occurred.
630
631  if (U_FAILURE(status)) {
632    len = -1;
633    buf->SetLength(0);
634  } else {
635    buf->SetLength(len);
636  }
637
638  uidna_close(uidna);
639  return len;
640}
641
642int32_t ToASCII(MaybeStackBuffer<char>* buf,
643                const char* input,
644                size_t length,
645                idna_mode mode) {
646  UErrorCode status = U_ZERO_ERROR;
647  uint32_t options =                  // CheckHyphens = false; handled later
648    UIDNA_CHECK_BIDI |                // CheckBidi = true
649    UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
650    UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
651  if (mode == idna_mode::kStrict) {
652    options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
653                                      // VerifyDnsLength = beStrict;
654                                      //   handled later
655  }
656
657  UIDNA* uidna = uidna_openUTS46(options, &status);
658  if (U_FAILURE(status))
659    return -1;
660  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
661
662  int32_t len = uidna_nameToASCII_UTF8(uidna,
663                                       input, length,
664                                       **buf, buf->capacity(),
665                                       &info,
666                                       &status);
667
668  if (status == U_BUFFER_OVERFLOW_ERROR) {
669    status = U_ZERO_ERROR;
670    buf->AllocateSufficientStorage(len);
671    len = uidna_nameToASCII_UTF8(uidna,
672                                 input, length,
673                                 **buf, buf->capacity(),
674                                 &info,
675                                 &status);
676  }
677
678  // In UTS #46 which specifies ToASCII, certain error conditions are
679  // configurable through options, and the WHATWG URL Standard promptly elects
680  // to disable some of them to accommodate for real-world use cases.
681  // Unfortunately, ICU4C's IDNA module does not support disabling some of
682  // these options through `options` above, and thus continues throwing
683  // unnecessary errors. To counter this situation, we just filter out the
684  // errors that may have happened afterwards, before deciding whether to
685  // return an error from this function.
686
687  // CheckHyphens = false
688  // (Specified in the current UTS #46 draft rev. 18.)
689  // Refs:
690  // - https://github.com/whatwg/url/issues/53
691  // - https://github.com/whatwg/url/pull/309
692  // - http://www.unicode.org/review/pri317/
693  // - http://www.unicode.org/reports/tr46/tr46-18.html
694  // - https://www.icann.org/news/announcement-2000-01-07-en
695  info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
696  info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
697  info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
698
699  if (mode != idna_mode::kStrict) {
700    // VerifyDnsLength = beStrict
701    info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
702    info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
703    info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
704  }
705
706  if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
707    len = -1;
708    buf->SetLength(0);
709  } else {
710    buf->SetLength(len);
711  }
712
713  uidna_close(uidna);
714  return len;
715}
716
717static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
718  Environment* env = Environment::GetCurrent(args);
719  CHECK_GE(args.Length(), 1);
720  CHECK(args[0]->IsString());
721  Utf8Value val(env->isolate(), args[0]);
722
723  MaybeStackBuffer<char> buf;
724  int32_t len = ToUnicode(&buf, *val, val.length());
725
726  if (len < 0) {
727    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
728  }
729
730  args.GetReturnValue().Set(
731      String::NewFromUtf8(env->isolate(),
732                          *buf,
733                          NewStringType::kNormal,
734                          len).ToLocalChecked());
735}
736
737static void ToASCII(const FunctionCallbackInfo<Value>& args) {
738  Environment* env = Environment::GetCurrent(args);
739  CHECK_GE(args.Length(), 1);
740  CHECK(args[0]->IsString());
741  Utf8Value val(env->isolate(), args[0]);
742  // optional arg
743  bool lenient = args[1]->BooleanValue(env->isolate());
744  idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
745
746  MaybeStackBuffer<char> buf;
747  int32_t len = ToASCII(&buf, *val, val.length(), mode);
748
749  if (len < 0) {
750    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
751  }
752
753  args.GetReturnValue().Set(
754      String::NewFromUtf8(env->isolate(),
755                          *buf,
756                          NewStringType::kNormal,
757                          len).ToLocalChecked());
758}
759
760// This is similar to wcwidth except that it takes the current unicode
761// character properties database into consideration, allowing it to
// correctly calculate the column widths of things like emoji and
763// newer wide characters. wcwidth, on the other hand, uses a fixed
764// algorithm that does not take things like emoji into proper
765// consideration.
766//
767// TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
768// GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
769// below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
770// allow it to be input. Linux's PTY terminal prints control characters as
771// Narrow rhombi.
772//
773// TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
774// consonants are 0-width when combined with initial consonants; otherwise they
775// are technically Wide. But many terminals (including Konsole and
776// VTE/GLib-based) implement all medials and finals as 0-width.
777//
778// Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
779// Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
780// Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
781static int GetColumnWidth(UChar32 codepoint,
782                          bool ambiguous_as_full_width = false) {
783  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
784  // codepoint as being full width, wide, ambiguous, neutral, narrow,
785  // or halfwidth.
786  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
787  switch (eaw) {
788    case U_EA_FULLWIDTH:
789    case U_EA_WIDE:
790      return 2;
791    case U_EA_AMBIGUOUS:
792      // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
793      if (ambiguous_as_full_width) {
794        return 2;
795      }
796      // If ambiguous_as_full_width is false:
797      [[fallthrough]];
798    case U_EA_NEUTRAL:
799      if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
800        return 2;
801      }
802      [[fallthrough]];
803    case U_EA_HALFWIDTH:
804    case U_EA_NARROW:
805    default:
806      const auto zero_width_mask = U_GC_CC_MASK |  // C0/C1 control code
807                                  U_GC_CF_MASK |  // Format control character
808                                  U_GC_ME_MASK |  // Enclosing mark
809                                  U_GC_MN_MASK;   // Nonspacing mark
810      if (codepoint != 0x00AD &&  // SOFT HYPHEN is Cf but not zero-width
811          ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
812          u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
813        return 0;
814      }
815      return 1;
816  }
817}
818
819// Returns the column width for the given String.
820static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
821  Environment* env = Environment::GetCurrent(args);
822  CHECK(args[0]->IsString());
823
824  bool ambiguous_as_full_width = args[1]->IsTrue();
825  bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
826
827  TwoByteValue value(env->isolate(), args[0]);
828  // reinterpret_cast is required by windows to compile
829  UChar* str = reinterpret_cast<UChar*>(*value);
830  static_assert(sizeof(*str) == sizeof(**value),
831                "sizeof(*str) == sizeof(**value)");
832  UChar32 c = 0;
833  UChar32 p;
834  size_t n = 0;
835  uint32_t width = 0;
836
837  while (n < value.length()) {
838    p = c;
839    U16_NEXT(str, n, value.length(), c);
840    // Don't count individual emoji codepoints that occur within an
841    // emoji sequence. This is not necessarily foolproof. Some
842    // environments display emoji sequences in the appropriate
843    // condensed form (as a single emoji glyph), other environments
844    // may not understand an emoji sequence and will display each
845    // individual emoji separately. When this happens, the width
846    // calculated will be off, and there's no reliable way of knowing
847    // in advance if a particular sequence is going to be supported.
848    // The expand_emoji_sequence option allows the caller to skip this
849    // check and count each code within an emoji sequence separately.
850    // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
851    if (!expand_emoji_sequence &&
852        n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
853        (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
854         u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
855      continue;
856    }
857    width += GetColumnWidth(c, ambiguous_as_full_width);
858  }
859  args.GetReturnValue().Set(width);
860}
861
// Binding initialiser: installs the i18n functions on `target` and stores the
// instance template used for ConverterObject wrappers on the Environment.
// `unused` and `priv` are not used.
void Initialize(Local<Object> target,
                Local<Value> unused,
                Local<Context> context,
                void* priv) {
  Environment* env = Environment::GetCurrent(context);
  SetMethod(context, target, "toUnicode", ToUnicode);
  SetMethod(context, target, "toASCII", ToASCII);
  SetMethod(context, target, "getStringWidth", GetStringWidth);

  // One-shot converters
  SetMethod(context, target, "icuErrName", ICUErrorName);
  SetMethod(context, target, "transcode", Transcode);

  // ConverterObject
  {
    // No constructor callback is supplied; instances are created from the
    // stored instance template in ConverterObject::Create().
    Local<FunctionTemplate> t = NewFunctionTemplate(env->isolate(), nullptr);
    t->Inherit(BaseObject::GetConstructorTemplate(env));
    t->InstanceTemplate()->SetInternalFieldCount(
        ConverterObject::kInternalFieldCount);
    Local<String> converter_string =
        FIXED_ONE_BYTE_STRING(env->isolate(), "Converter");
    t->SetClassName(converter_string);
    env->set_i18n_converter_template(t->InstanceTemplate());
  }

  SetMethod(context, target, "getConverter", ConverterObject::Create);
  SetMethod(context, target, "decode", ConverterObject::Decode);
  SetMethod(context, target, "hasConverter", ConverterObject::Has);
}
891
// Registers every native callback that Initialize() installs with the
// external reference registry. Keep this list in sync with the SetMethod()
// calls in Initialize().
void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
  registry->Register(ToUnicode);
  registry->Register(ToASCII);
  registry->Register(GetStringWidth);
  registry->Register(ICUErrorName);
  registry->Register(Transcode);
  registry->Register(ConverterObject::Create);
  registry->Register(ConverterObject::Decode);
  registry->Register(ConverterObject::Has);
}
902
903}  // namespace i18n
904}  // namespace node
905
906NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize)
907NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences)
908
909#endif  // NODE_HAVE_I18N_SUPPORT
910