1// Copyright Joyent, Inc. and other Node contributors. 2// 3// Permission is hereby granted, free of charge, to any person obtaining a 4// copy of this software and associated documentation files (the 5// "Software"), to deal in the Software without restriction, including 6// without limitation the rights to use, copy, modify, merge, publish, 7// distribute, sublicense, and/or sell copies of the Software, and to permit 8// persons to whom the Software is furnished to do so, subject to the 9// following conditions: 10// 11// The above copyright notice and this permission notice shall be included 12// in all copies or substantial portions of the Software. 13// 14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 17// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 18// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20// USE OR OTHER DEALINGS IN THE SOFTWARE. 21 22/* 23 * notes: by srl295 24 * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data 25 * ( stubdata/libicudata.a ) containing nothing, no data, and it's also 26 * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT 27 * macro names. That's the "english+root" data. 28 * 29 * If icu_data_path is non-null, the user has provided a path and we assume 30 * it goes somewhere useful. We set that path in ICU, and exit. 31 * If icu_data_path is null, they haven't set a path and we want the 32 * "english+root" data. We call 33 * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...) 34 * to load up the english+root data. 35 * 36 * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full 37 * data. All of the variables and command line options for changing data at 38 * runtime are disabled, as they wouldn't fully override the internal data. 39 * See: http://bugs.icu-project.org/trac/ticket/10924 40 */ 41 42 43#include "node_i18n.h" 44#include "node_external_reference.h" 45 46#if defined(NODE_HAVE_I18N_SUPPORT) 47 48#include "base_object-inl.h" 49#include "node.h" 50#include "node_buffer.h" 51#include "node_errors.h" 52#include "node_internals.h" 53#include "string_bytes.h" 54#include "util-inl.h" 55#include "v8.h" 56 57#include <unicode/utypes.h> 58#include <unicode/putil.h> 59#include <unicode/uchar.h> 60#include <unicode/uclean.h> 61#include <unicode/udata.h> 62#include <unicode/uidna.h> 63#include <unicode/ucnv.h> 64#include <unicode/utf8.h> 65#include <unicode/utf16.h> 66#include <unicode/timezone.h> 67#include <unicode/ulocdata.h> 68#include <unicode/uvernum.h> 69#include <unicode/uversion.h> 70#include <unicode/ustring.h> 71 72#ifdef NODE_HAVE_SMALL_ICU 73/* if this is defined, we have a 'secondary' entry point. 74 compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */ 75#define SMALL_ICUDATA_ENTRY_POINT \ 76 SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME) 77#define SMALL_DEF2(major, suff) SMALL_DEF(major, suff) 78#ifndef U_LIB_SUFFIX_C_NAME 79#define SMALL_DEF(major, suff) icusmdt##major##_dat 80#else 81#define SMALL_DEF(major, suff) icusmdt##suff##major##_dat 82#endif 83 84extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[]; 85#endif 86 87namespace node { 88 89using v8::Context; 90using v8::FunctionCallbackInfo; 91using v8::FunctionTemplate; 92using v8::Int32; 93using v8::Isolate; 94using v8::Local; 95using v8::MaybeLocal; 96using v8::NewStringType; 97using v8::Object; 98using v8::ObjectTemplate; 99using v8::String; 100using v8::Value; 101 102namespace i18n { 103namespace { 104 105template <typename T> 106MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) { 107 MaybeLocal<Object> ret = Buffer::New(env, buf); 108 if (ret.IsEmpty()) 109 return ret; 110 111 static_assert(sizeof(T) == 1 || sizeof(T) == 2, 112 "Currently only one- or two-byte buffers are supported"); 113 if (sizeof(T) > 1 && IsBigEndian()) { 114 SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf); 115 SwapBytes16(retbuf_data, retbuf_length); 116 } 117 118 return ret; 119} 120 121// One-Shot Converters 122 123void CopySourceBuffer(MaybeStackBuffer<UChar>* dest, 124 const char* data, 125 const size_t length, 126 const size_t length_in_chars) { 127 dest->AllocateSufficientStorage(length_in_chars); 128 char* dst = reinterpret_cast<char*>(**dest); 129 memcpy(dst, data, length); 130 if (IsBigEndian()) { 131 SwapBytes16(dst, length); 132 } 133} 134 135typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env, 136 const char* fromEncoding, 137 const char* toEncoding, 138 const char* source, 139 const size_t source_length, 140 UErrorCode* status); 141 142MaybeLocal<Object> Transcode(Environment* env, 143 const char* fromEncoding, 144 const char* toEncoding, 145 const char* source, 146 const size_t source_length, 147 UErrorCode* status) { 148 *status = U_ZERO_ERROR; 149 MaybeLocal<Object> ret; 150 MaybeStackBuffer<char> result; 151 Converter to(toEncoding); 152 Converter from(fromEncoding); 153 154 size_t sublen = ucnv_getMinCharSize(to.conv()); 155 std::string sub(sublen, '?'); 156 to.set_subst_chars(sub.c_str()); 157 158 const uint32_t limit = source_length * to.max_char_size(); 159 result.AllocateSufficientStorage(limit); 160 char* target = *result; 161 ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, 162 &source, source + source_length, nullptr, nullptr, 163 nullptr, nullptr, true, true, status); 164 if (U_SUCCESS(*status)) { 165 result.SetLength(target - &result[0]); 166 ret = ToBufferEndian(env, &result); 167 } 168 return ret; 169} 170 171MaybeLocal<Object> TranscodeToUcs2(Environment* env, 172 const char* fromEncoding, 173 const char* toEncoding, 174 const char* source, 175 const size_t source_length, 176 UErrorCode* status) { 177 *status = U_ZERO_ERROR; 178 MaybeLocal<Object> ret; 179 MaybeStackBuffer<UChar> destbuf(source_length); 180 Converter from(fromEncoding); 181 const size_t length_in_chars = source_length * sizeof(UChar); 182 ucnv_toUChars(from.conv(), *destbuf, length_in_chars, 183 source, source_length, status); 184 if (U_SUCCESS(*status)) 185 ret = ToBufferEndian(env, &destbuf); 186 return ret; 187} 188 189MaybeLocal<Object> TranscodeFromUcs2(Environment* env, 190 const char* fromEncoding, 191 const char* toEncoding, 192 const char* source, 193 const size_t source_length, 194 UErrorCode* status) { 195 *status = U_ZERO_ERROR; 196 MaybeStackBuffer<UChar> sourcebuf; 197 MaybeLocal<Object> ret; 198 Converter to(toEncoding); 199 200 size_t sublen = ucnv_getMinCharSize(to.conv()); 201 std::string sub(sublen, '?'); 202 to.set_subst_chars(sub.c_str()); 203 204 const size_t length_in_chars = source_length / sizeof(UChar); 205 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); 206 MaybeStackBuffer<char> destbuf(length_in_chars); 207 const uint32_t len = ucnv_fromUChars(to.conv(), *destbuf, length_in_chars, 208 *sourcebuf, length_in_chars, status); 209 if (U_SUCCESS(*status)) { 210 destbuf.SetLength(len); 211 ret = ToBufferEndian(env, &destbuf); 212 } 213 return ret; 214} 215 216MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env, 217 const char* fromEncoding, 218 const char* toEncoding, 219 const char* source, 220 const size_t source_length, 221 UErrorCode* status) { 222 *status = U_ZERO_ERROR; 223 MaybeStackBuffer<UChar> destbuf; 224 int32_t result_length; 225 u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, 226 source, source_length, status); 227 MaybeLocal<Object> ret; 228 if (U_SUCCESS(*status)) { 229 destbuf.SetLength(result_length); 230 ret = ToBufferEndian(env, &destbuf); 231 } else if (*status == U_BUFFER_OVERFLOW_ERROR) { 232 *status = U_ZERO_ERROR; 233 destbuf.AllocateSufficientStorage(result_length); 234 u_strFromUTF8(*destbuf, result_length, &result_length, 235 source, source_length, status); 236 if (U_SUCCESS(*status)) { 237 destbuf.SetLength(result_length); 238 ret = ToBufferEndian(env, &destbuf); 239 } 240 } 241 return ret; 242} 243 244MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env, 245 const char* fromEncoding, 246 const char* toEncoding, 247 const char* source, 248 const size_t source_length, 249 UErrorCode* status) { 250 *status = U_ZERO_ERROR; 251 MaybeLocal<Object> ret; 252 const size_t length_in_chars = source_length / sizeof(UChar); 253 int32_t result_length; 254 MaybeStackBuffer<UChar> sourcebuf; 255 MaybeStackBuffer<char> destbuf; 256 CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); 257 u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, 258 *sourcebuf, length_in_chars, status); 259 if (U_SUCCESS(*status)) { 260 destbuf.SetLength(result_length); 261 ret = ToBufferEndian(env, &destbuf); 262 } else if (*status == U_BUFFER_OVERFLOW_ERROR) { 263 *status = U_ZERO_ERROR; 264 destbuf.AllocateSufficientStorage(result_length); 265 u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, 266 length_in_chars, status); 267 if (U_SUCCESS(*status)) { 268 destbuf.SetLength(result_length); 269 ret = ToBufferEndian(env, &destbuf); 270 } 271 } 272 return ret; 273} 274 275const char* EncodingName(const enum encoding encoding) { 276 switch (encoding) { 277 case ASCII: return "us-ascii"; 278 case LATIN1: return "iso8859-1"; 279 case UCS2: return "utf16le"; 280 case UTF8: return "utf-8"; 281 default: return nullptr; 282 } 283} 284 285bool SupportedEncoding(const enum encoding encoding) { 286 switch (encoding) { 287 case ASCII: 288 case LATIN1: 289 case UCS2: 290 case UTF8: return true; 291 default: return false; 292 } 293} 294 295void Transcode(const FunctionCallbackInfo<Value>&args) { 296 Environment* env = Environment::GetCurrent(args); 297 Isolate* isolate = env->isolate(); 298 UErrorCode status = U_ZERO_ERROR; 299 MaybeLocal<Object> result; 300 301 ArrayBufferViewContents<char> input(args[0]); 302 const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER); 303 const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER); 304 305 if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) { 306 TranscodeFunc tfn = &Transcode; 307 switch (fromEncoding) { 308 case ASCII: 309 case LATIN1: 310 if (toEncoding == UCS2) 311 tfn = &TranscodeToUcs2; 312 break; 313 case UTF8: 314 if (toEncoding == UCS2) 315 tfn = &TranscodeUcs2FromUtf8; 316 break; 317 case UCS2: 318 switch (toEncoding) { 319 case UCS2: 320 tfn = &Transcode; 321 break; 322 case UTF8: 323 tfn = &TranscodeUtf8FromUcs2; 324 break; 325 default: 326 tfn = &TranscodeFromUcs2; 327 } 328 break; 329 default: 330 // This should not happen because of the SupportedEncoding checks 331 ABORT(); 332 } 333 334 result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding), 335 input.data(), input.length(), &status); 336 } else { 337 status = U_ILLEGAL_ARGUMENT_ERROR; 338 } 339 340 if (result.IsEmpty()) 341 return args.GetReturnValue().Set(status); 342 343 return args.GetReturnValue().Set(result.ToLocalChecked()); 344} 345 346void ICUErrorName(const FunctionCallbackInfo<Value>& args) { 347 Environment* env = Environment::GetCurrent(args); 348 CHECK(args[0]->IsInt32()); 349 UErrorCode status = static_cast<UErrorCode>(args[0].As<Int32>()->Value()); 350 args.GetReturnValue().Set( 351 String::NewFromUtf8(env->isolate(), 352 u_errorName(status)).ToLocalChecked()); 353} 354 355} // anonymous namespace 356 357Converter::Converter(const char* name, const char* sub) { 358 UErrorCode status = U_ZERO_ERROR; 359 UConverter* conv = ucnv_open(name, &status); 360 CHECK(U_SUCCESS(status)); 361 conv_.reset(conv); 362 set_subst_chars(sub); 363} 364 365Converter::Converter(UConverter* converter, const char* sub) 366 : conv_(converter) { 367 set_subst_chars(sub); 368} 369 370void Converter::set_subst_chars(const char* sub) { 371 CHECK(conv_); 372 UErrorCode status = U_ZERO_ERROR; 373 if (sub != nullptr) { 374 ucnv_setSubstChars(conv_.get(), sub, strlen(sub), &status); 375 CHECK(U_SUCCESS(status)); 376 } 377} 378 379void Converter::reset() { 380 ucnv_reset(conv_.get()); 381} 382 383size_t Converter::min_char_size() const { 384 CHECK(conv_); 385 return ucnv_getMinCharSize(conv_.get()); 386} 387 388size_t Converter::max_char_size() const { 389 CHECK(conv_); 390 return ucnv_getMaxCharSize(conv_.get()); 391} 392 393void ConverterObject::Has(const FunctionCallbackInfo<Value>& args) { 394 Environment* env = Environment::GetCurrent(args); 395 396 CHECK_GE(args.Length(), 1); 397 Utf8Value label(env->isolate(), args[0]); 398 399 UErrorCode status = U_ZERO_ERROR; 400 ConverterPointer conv(ucnv_open(*label, &status)); 401 args.GetReturnValue().Set(!!U_SUCCESS(status)); 402} 403 404void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) { 405 Environment* env = Environment::GetCurrent(args); 406 407 Local<ObjectTemplate> t = env->i18n_converter_template(); 408 Local<Object> obj; 409 if (!t->NewInstance(env->context()).ToLocal(&obj)) return; 410 411 CHECK_GE(args.Length(), 2); 412 Utf8Value label(env->isolate(), args[0]); 413 int flags = args[1]->Uint32Value(env->context()).ToChecked(); 414 bool fatal = 415 (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL; 416 417 UErrorCode status = U_ZERO_ERROR; 418 UConverter* conv = ucnv_open(*label, &status); 419 if (U_FAILURE(status)) 420 return; 421 422 if (fatal) { 423 status = U_ZERO_ERROR; 424 ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 425 nullptr, nullptr, nullptr, &status); 426 } 427 428 auto converter = new ConverterObject(env, obj, conv, flags); 429 size_t sublen = ucnv_getMinCharSize(conv); 430 std::string sub(sublen, '?'); 431 converter->set_subst_chars(sub.c_str()); 432 433 args.GetReturnValue().Set(obj); 434} 435 436void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) { 437 Environment* env = Environment::GetCurrent(args); 438 439 CHECK_GE(args.Length(), 4); // Converter, Buffer, Flags, Encoding 440 441 ConverterObject* converter; 442 ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>()); 443 444 if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() || 445 args[1]->IsArrayBufferView())) { 446 return node::THROW_ERR_INVALID_ARG_TYPE( 447 env->isolate(), 448 "The \"input\" argument must be an instance of SharedArrayBuffer, " 449 "ArrayBuffer or ArrayBufferView."); 450 } 451 452 ArrayBufferViewContents<char> input(args[1]); 453 int flags = args[2]->Uint32Value(env->context()).ToChecked(); 454 455 CHECK(args[3]->IsString()); 456 Local<String> from_encoding = args[3].As<String>(); 457 458 UErrorCode status = U_ZERO_ERROR; 459 MaybeStackBuffer<UChar> result; 460 461 UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH; 462 463 // When flushing the final chunk, the limit is the maximum 464 // of either the input buffer length or the number of pending 465 // characters times the min char size, multiplied by 2 as unicode may 466 // take up to 2 UChars to encode a character 467 size_t limit = 2 * converter->min_char_size() * 468 (!flush ? 469 input.length() : 470 std::max( 471 input.length(), 472 static_cast<size_t>( 473 ucnv_toUCountPending(converter->conv(), &status)))); 474 status = U_ZERO_ERROR; 475 476 if (limit > 0) 477 result.AllocateSufficientStorage(limit); 478 479 auto cleanup = OnScopeLeave([&]() { 480 if (flush) { 481 // Reset the converter state. 482 converter->set_bom_seen(false); 483 converter->reset(); 484 } 485 }); 486 487 const char* source = input.data(); 488 size_t source_length = input.length(); 489 490 UChar* target = *result; 491 ucnv_toUnicode(converter->conv(), 492 &target, 493 target + limit, 494 &source, 495 source + source_length, 496 nullptr, 497 flush, 498 &status); 499 500 if (U_SUCCESS(status)) { 501 bool omit_initial_bom = false; 502 if (limit > 0) { 503 result.SetLength(target - &result[0]); 504 if (result.length() > 0 && 505 converter->unicode() && 506 !converter->ignore_bom() && 507 !converter->bom_seen()) { 508 // If the very first result in the stream is a BOM, and we are not 509 // explicitly told to ignore it, then we mark it for discarding. 510 if (result[0] == 0xFEFF) 511 omit_initial_bom = true; 512 converter->set_bom_seen(true); 513 } 514 } 515 516 Local<Value> error; 517 UChar* output = result.out(); 518 size_t beginning = 0; 519 size_t length = result.length() * sizeof(UChar); 520 521 if (omit_initial_bom) { 522 // Perform `ret = ret.slice(2)`. 523 beginning += 2; 524 length -= 2; 525 } 526 527 char* value = reinterpret_cast<char*>(output) + beginning; 528 529 if (IsBigEndian()) { 530 SwapBytes16(value, length); 531 } 532 533 MaybeLocal<Value> encoded = 534 StringBytes::Encode(env->isolate(), value, length, UCS2, &error); 535 536 Local<Value> ret; 537 if (encoded.ToLocal(&ret)) { 538 args.GetReturnValue().Set(ret); 539 return; 540 } 541 } 542 543 node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA( 544 env->isolate(), 545 "The encoded data was not valid for encoding %s", 546 *node::Utf8Value(env->isolate(), from_encoding)); 547} 548 549ConverterObject::ConverterObject( 550 Environment* env, 551 Local<Object> wrap, 552 UConverter* converter, 553 int flags, 554 const char* sub) 555 : BaseObject(env, wrap), 556 Converter(converter, sub), 557 flags_(flags) { 558 MakeWeak(); 559 560 switch (ucnv_getType(converter)) { 561 case UCNV_UTF8: 562 case UCNV_UTF16_BigEndian: 563 case UCNV_UTF16_LittleEndian: 564 flags_ |= CONVERTER_FLAGS_UNICODE; 565 break; 566 default: { 567 // Fall through 568 } 569 } 570} 571 572 573bool InitializeICUDirectory(const std::string& path) { 574 UErrorCode status = U_ZERO_ERROR; 575 if (path.empty()) { 576#ifdef NODE_HAVE_SMALL_ICU 577 // install the 'small' data. 578 udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status); 579#else // !NODE_HAVE_SMALL_ICU 580 // no small data, so nothing to do. 581#endif // !NODE_HAVE_SMALL_ICU 582 } else { 583 u_setDataDirectory(path.c_str()); 584 u_init(&status); 585 } 586 return status == U_ZERO_ERROR; 587} 588 589void SetDefaultTimeZone(const char* tzid) { 590 size_t tzidlen = strlen(tzid) + 1; 591 UErrorCode status = U_ZERO_ERROR; 592 MaybeStackBuffer<UChar, 256> id(tzidlen); 593 u_charsToUChars(tzid, id.out(), tzidlen); 594 // This is threadsafe: 595 ucal_setDefaultTimeZone(id.out(), &status); 596 CHECK(U_SUCCESS(status)); 597} 598 599int32_t ToUnicode(MaybeStackBuffer<char>* buf, 600 const char* input, 601 size_t length) { 602 UErrorCode status = U_ZERO_ERROR; 603 uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE; 604 UIDNA* uidna = uidna_openUTS46(options, &status); 605 if (U_FAILURE(status)) 606 return -1; 607 UIDNAInfo info = UIDNA_INFO_INITIALIZER; 608 609 int32_t len = uidna_nameToUnicodeUTF8(uidna, 610 input, length, 611 **buf, buf->capacity(), 612 &info, 613 &status); 614 615 // Do not check info.errors like we do with ToASCII since ToUnicode always 616 // returns a string, despite any possible errors that may have occurred. 617 618 if (status == U_BUFFER_OVERFLOW_ERROR) { 619 status = U_ZERO_ERROR; 620 buf->AllocateSufficientStorage(len); 621 len = uidna_nameToUnicodeUTF8(uidna, 622 input, length, 623 **buf, buf->capacity(), 624 &info, 625 &status); 626 } 627 628 // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode 629 // string, regardless of whether an error occurred. 630 631 if (U_FAILURE(status)) { 632 len = -1; 633 buf->SetLength(0); 634 } else { 635 buf->SetLength(len); 636 } 637 638 uidna_close(uidna); 639 return len; 640} 641 642int32_t ToASCII(MaybeStackBuffer<char>* buf, 643 const char* input, 644 size_t length, 645 idna_mode mode) { 646 UErrorCode status = U_ZERO_ERROR; 647 uint32_t options = // CheckHyphens = false; handled later 648 UIDNA_CHECK_BIDI | // CheckBidi = true 649 UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true 650 UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing 651 if (mode == idna_mode::kStrict) { 652 options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict 653 // VerifyDnsLength = beStrict; 654 // handled later 655 } 656 657 UIDNA* uidna = uidna_openUTS46(options, &status); 658 if (U_FAILURE(status)) 659 return -1; 660 UIDNAInfo info = UIDNA_INFO_INITIALIZER; 661 662 int32_t len = uidna_nameToASCII_UTF8(uidna, 663 input, length, 664 **buf, buf->capacity(), 665 &info, 666 &status); 667 668 if (status == U_BUFFER_OVERFLOW_ERROR) { 669 status = U_ZERO_ERROR; 670 buf->AllocateSufficientStorage(len); 671 len = uidna_nameToASCII_UTF8(uidna, 672 input, length, 673 **buf, buf->capacity(), 674 &info, 675 &status); 676 } 677 678 // In UTS #46 which specifies ToASCII, certain error conditions are 679 // configurable through options, and the WHATWG URL Standard promptly elects 680 // to disable some of them to accommodate for real-world use cases. 681 // Unfortunately, ICU4C's IDNA module does not support disabling some of 682 // these options through `options` above, and thus continues throwing 683 // unnecessary errors. To counter this situation, we just filter out the 684 // errors that may have happened afterwards, before deciding whether to 685 // return an error from this function. 686 687 // CheckHyphens = false 688 // (Specified in the current UTS #46 draft rev. 18.) 689 // Refs: 690 // - https://github.com/whatwg/url/issues/53 691 // - https://github.com/whatwg/url/pull/309 692 // - http://www.unicode.org/review/pri317/ 693 // - http://www.unicode.org/reports/tr46/tr46-18.html 694 // - https://www.icann.org/news/announcement-2000-01-07-en 695 info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; 696 info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; 697 info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; 698 699 if (mode != idna_mode::kStrict) { 700 // VerifyDnsLength = beStrict 701 info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; 702 info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; 703 info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 704 } 705 706 if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) { 707 len = -1; 708 buf->SetLength(0); 709 } else { 710 buf->SetLength(len); 711 } 712 713 uidna_close(uidna); 714 return len; 715} 716 717static void ToUnicode(const FunctionCallbackInfo<Value>& args) { 718 Environment* env = Environment::GetCurrent(args); 719 CHECK_GE(args.Length(), 1); 720 CHECK(args[0]->IsString()); 721 Utf8Value val(env->isolate(), args[0]); 722 723 MaybeStackBuffer<char> buf; 724 int32_t len = ToUnicode(&buf, *val, val.length()); 725 726 if (len < 0) { 727 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode"); 728 } 729 730 args.GetReturnValue().Set( 731 String::NewFromUtf8(env->isolate(), 732 *buf, 733 NewStringType::kNormal, 734 len).ToLocalChecked()); 735} 736 737static void ToASCII(const FunctionCallbackInfo<Value>& args) { 738 Environment* env = Environment::GetCurrent(args); 739 CHECK_GE(args.Length(), 1); 740 CHECK(args[0]->IsString()); 741 Utf8Value val(env->isolate(), args[0]); 742 // optional arg 743 bool lenient = args[1]->BooleanValue(env->isolate()); 744 idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault; 745 746 MaybeStackBuffer<char> buf; 747 int32_t len = ToASCII(&buf, *val, val.length(), mode); 748 749 if (len < 0) { 750 return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII"); 751 } 752 753 args.GetReturnValue().Set( 754 String::NewFromUtf8(env->isolate(), 755 *buf, 756 NewStringType::kNormal, 757 len).ToLocalChecked()); 758} 759 760// This is similar to wcwidth except that it takes the current unicode 761// character properties database into consideration, allowing it to 762// correctly calculate the column widths of things like emoji's and 763// newer wide characters. wcwidth, on the other hand, uses a fixed 764// algorithm that does not take things like emoji into proper 765// consideration. 766// 767// TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by 768// GNOME Terminal) and Konsole don't consider them to be zero-width (see refs 769// below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't 770// allow it to be input. Linux's PTY terminal prints control characters as 771// Narrow rhombi. 772// 773// TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final 774// consonants are 0-width when combined with initial consonants; otherwise they 775// are technically Wide. But many terminals (including Konsole and 776// VTE/GLib-based) implement all medials and finals as 0-width. 777// 778// Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width 779// Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420 780// Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223 781static int GetColumnWidth(UChar32 codepoint, 782 bool ambiguous_as_full_width = false) { 783 // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a 784 // codepoint as being full width, wide, ambiguous, neutral, narrow, 785 // or halfwidth. 786 const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); 787 switch (eaw) { 788 case U_EA_FULLWIDTH: 789 case U_EA_WIDE: 790 return 2; 791 case U_EA_AMBIGUOUS: 792 // See: http://www.unicode.org/reports/tr11/#Ambiguous for details 793 if (ambiguous_as_full_width) { 794 return 2; 795 } 796 // If ambiguous_as_full_width is false: 797 [[fallthrough]]; 798 case U_EA_NEUTRAL: 799 if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { 800 return 2; 801 } 802 [[fallthrough]]; 803 case U_EA_HALFWIDTH: 804 case U_EA_NARROW: 805 default: 806 const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code 807 U_GC_CF_MASK | // Format control character 808 U_GC_ME_MASK | // Enclosing mark 809 U_GC_MN_MASK; // Nonspacing mark 810 if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width 811 ((U_MASK(u_charType(codepoint)) & zero_width_mask) || 812 u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) { 813 return 0; 814 } 815 return 1; 816 } 817} 818 819// Returns the column width for the given String. 820static void GetStringWidth(const FunctionCallbackInfo<Value>& args) { 821 Environment* env = Environment::GetCurrent(args); 822 CHECK(args[0]->IsString()); 823 824 bool ambiguous_as_full_width = args[1]->IsTrue(); 825 bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue(); 826 827 TwoByteValue value(env->isolate(), args[0]); 828 // reinterpret_cast is required by windows to compile 829 UChar* str = reinterpret_cast<UChar*>(*value); 830 static_assert(sizeof(*str) == sizeof(**value), 831 "sizeof(*str) == sizeof(**value)"); 832 UChar32 c = 0; 833 UChar32 p; 834 size_t n = 0; 835 uint32_t width = 0; 836 837 while (n < value.length()) { 838 p = c; 839 U16_NEXT(str, n, value.length(), c); 840 // Don't count individual emoji codepoints that occur within an 841 // emoji sequence. This is not necessarily foolproof. Some 842 // environments display emoji sequences in the appropriate 843 // condensed form (as a single emoji glyph), other environments 844 // may not understand an emoji sequence and will display each 845 // individual emoji separately. When this happens, the width 846 // calculated will be off, and there's no reliable way of knowing 847 // in advance if a particular sequence is going to be supported. 848 // The expand_emoji_sequence option allows the caller to skip this 849 // check and count each code within an emoji sequence separately. 850 // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences 851 if (!expand_emoji_sequence && 852 n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) 853 (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || 854 u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { 855 continue; 856 } 857 width += GetColumnWidth(c, ambiguous_as_full_width); 858 } 859 args.GetReturnValue().Set(width); 860} 861 862void Initialize(Local<Object> target, 863 Local<Value> unused, 864 Local<Context> context, 865 void* priv) { 866 Environment* env = Environment::GetCurrent(context); 867 SetMethod(context, target, "toUnicode", ToUnicode); 868 SetMethod(context, target, "toASCII", ToASCII); 869 SetMethod(context, target, "getStringWidth", GetStringWidth); 870 871 // One-shot converters 872 SetMethod(context, target, "icuErrName", ICUErrorName); 873 SetMethod(context, target, "transcode", Transcode); 874 875 // ConverterObject 876 { 877 Local<FunctionTemplate> t = NewFunctionTemplate(env->isolate(), nullptr); 878 t->Inherit(BaseObject::GetConstructorTemplate(env)); 879 t->InstanceTemplate()->SetInternalFieldCount( 880 ConverterObject::kInternalFieldCount); 881 Local<String> converter_string = 882 FIXED_ONE_BYTE_STRING(env->isolate(), "Converter"); 883 t->SetClassName(converter_string); 884 env->set_i18n_converter_template(t->InstanceTemplate()); 885 } 886 887 SetMethod(context, target, "getConverter", ConverterObject::Create); 888 SetMethod(context, target, "decode", ConverterObject::Decode); 889 SetMethod(context, target, "hasConverter", ConverterObject::Has); 890} 891 892void RegisterExternalReferences(ExternalReferenceRegistry* registry) { 893 registry->Register(ToUnicode); 894 registry->Register(ToASCII); 895 registry->Register(GetStringWidth); 896 registry->Register(ICUErrorName); 897 registry->Register(Transcode); 898 registry->Register(ConverterObject::Create); 899 registry->Register(ConverterObject::Decode); 900 registry->Register(ConverterObject::Has); 901} 902 903} // namespace i18n 904} // namespace node 905 906NODE_BINDING_CONTEXT_AWARE_INTERNAL(icu, node::i18n::Initialize) 907NODE_BINDING_EXTERNAL_REFERENCE(icu, node::i18n::RegisterExternalReferences) 908 909#endif // NODE_HAVE_I18N_SUPPORT 910