'use strict';

// Ensure that our implementation of `icuGetStringWidth` (exposed as the `getStringWidth` global
// function) produces correct results.
//
// Emoji inputs are written with \u escapes rather than literal glyphs so that the strings under
// test do not depend on how this file happens to be decoded.
function testGetStringWidth() {
    // Plain narrow and wide characters.
    assert.eq(getStringWidth('a'), 1);
    assert.eq(getStringWidth(String.fromCharCode(0x0061)), 1);
    assert.eq(getStringWidth('丁'), 2);
    assert.eq(getStringWidth(String.fromCharCode(0x4E01)), 2);

    // U+1F478 PRINCESS followed by U+1F3FF EMOJI MODIFIER FITZPATRICK TYPE-6.
    assert.eq(getStringWidth('\ud83d\udc78\ud83c\udfff'), 4);
    // U+1F445 TONGUE.
    assert.eq(getStringWidth('\ud83d\udc45'), 2);

    // Unpaired surrogates.
    assert.eq(getStringWidth('\ud83d'), 1);
    assert.eq(getStringWidth('\udc78'), 1);

    // Control characters occupy no columns.
    assert.eq(getStringWidth('\u0000'), 0);
    assert.eq(getStringWidth(String.fromCharCode(0x0007)), 0);
    assert.eq(getStringWidth('\n'), 0);

    // SOFT HYPHEN is a format character but still occupies a column.
    assert.eq(getStringWidth(String.fromCharCode(0x00AD)), 1);
    // LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK around a narrow letter.
    assert.eq(getStringWidth('\u200Ef\u200F'), 1);

    // NOTE: String.fromCharCode() truncates its argument to 16 bits, so both calls below
    // produce U+FFEF rather than a supplementary-plane code point.
    assert.eq(getStringWidth(String.fromCharCode(0x10FFEF)), 1);
    assert.eq(getStringWidth(String.fromCharCode(0x3FFEF)), 1);

    // Combining, spacing and enclosing marks.
    assert.eq(getStringWidth(String.fromCharCode(0x0301)), 0);
    assert.eq(getStringWidth(String.fromCharCode(0x1B44)), 1);
    assert.eq(getStringWidth(String.fromCharCode(0x20DD)), 0);

    // WOMAN, ZWJ, WOMAN, ZWJ, GIRL, ZWJ, GIRL: each emoji in the sequence is counted on its own.
    assert.eq(
        getStringWidth('\ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc67'), 8);
    // U+2764 HEAVY BLACK HEART followed by U+FE0F VARIATION SELECTOR-16.
    assert.eq(getStringWidth('\u2764\ufe0f'), 1);
    // WOMAN, ZWJ, HEAVY BLACK HEART + VARIATION SELECTOR-16, ZWJ, WOMAN.
    assert.eq(getStringWidth('\ud83d\udc69\u200d\u2764\ufe0f\u200d\ud83d\udc69'), 5);
    // U+2764 HEAVY BLACK HEART on its own.
    assert.eq(getStringWidth('\u2764'), 1);

    assert.eq(getStringWidth('\u01d4'), 1);
    assert.eq(getStringWidth('\u200E\n\u220A\u20D2'), 1);
}

testGetStringWidth();
b/src/mongo/shell/SConscript index a503bc9d0d4..74ae535e5f6 100644 --- a/src/mongo/shell/SConscript +++ b/src/mongo/shell/SConscript @@ -176,6 +176,7 @@ env.Library( '$BUILD_DIR/mongo/bson/util/bson_column', '$BUILD_DIR/mongo/db/auth/security_token_auth', '$BUILD_DIR/mongo/db/storage/record_store_base', + '$BUILD_DIR/mongo/util/icu', 'program_runner', ], ) diff --git a/src/mongo/shell/shell_utils_extended.cpp b/src/mongo/shell/shell_utils_extended.cpp index b3fa90eae72..ab0b835573c 100644 --- a/src/mongo/shell/shell_utils_extended.cpp +++ b/src/mongo/shell/shell_utils_extended.cpp @@ -27,7 +27,6 @@ * it in the license file. */ - #include #include #include @@ -52,7 +51,6 @@ #include "mongo/base/data_range_cursor.h" #include "mongo/base/error_codes.h" -#include "mongo/base/status_with.h" #include "mongo/base/string_data.h" #include "mongo/bson/bson_bin_util.h" #include "mongo/bson/bson_validate.h" @@ -67,6 +65,7 @@ #include "mongo/shell/shell_utils.h" #include "mongo/util/assert_util.h" #include "mongo/util/errno_util.h" +#include "mongo/util/icu.h" #include "mongo/util/md5.h" #include "mongo/util/md5.hpp" #include "mongo/util/net/socket_utils.h" @@ -635,6 +634,14 @@ BSONObj shellGetEnv(const BSONObj& a, void*) { return BSON("" << result.c_str()); } +BSONObj getStringWidth(const BSONObj& a, void* data) { + uassert(8730901, + "getStringWidth takes a single string argument", + a.nFields() == 1 && a.firstElementType() == String); + const auto str = a.firstElement().valueStringData(); + int width = icuGetStringWidth(str, false, true); + return BSON("" << width); +} } // namespace @@ -661,6 +668,7 @@ void installShellUtilsExtended(Scope& scope) { scope.injectNative("_readDumpFile", readDumpFile); scope.injectNative("_getEnv", shellGetEnv); scope.injectNative("writeBsonArrayToFile", writeBsonArrayToFile); + scope.injectNative("getStringWidth", getStringWidth); } } // namespace shell_utils diff --git a/src/mongo/util/SConscript b/src/mongo/util/SConscript index 
36bf381a138..ca57aca13d0 100644 --- a/src/mongo/util/SConscript +++ b/src/mongo/util/SConscript @@ -806,9 +806,7 @@ icuEnv.Library( icuEnv.Library( target='icu', - source=[ - 'icu.cpp', - ], + source=['icu.cpp', '../../third_party/node/icu_get_string_width.cpp'], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/base', '$BUILD_DIR/third_party/shim_icu', diff --git a/src/mongo/util/icu.cpp b/src/mongo/util/icu.cpp index 2e2a9ecaf8e..e32f6979d61 100644 --- a/src/mongo/util/icu.cpp +++ b/src/mongo/util/icu.cpp @@ -30,13 +30,16 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include #include -#include #include "mongo/base/error_codes.h" #include "mongo/base/init.h" // IWYU pragma: keep diff --git a/src/mongo/util/icu.h b/src/mongo/util/icu.h index 2f10aecf4e7..a5a2c8f2370 100644 --- a/src/mongo/util/icu.h +++ b/src/mongo/util/icu.h @@ -59,4 +59,7 @@ StatusWith icuSaslPrep(StringData str, UStringPrepOptions = kUStrin */ StatusWith icuX509DNPrep(StringData str); +// Similar to mk_wcswidth, but use the larger unicode database for character lookup. +int icuGetStringWidth(StringData str, bool ambiguousAsFullWidth, bool expandEmojiSequence); + } // namespace mongo diff --git a/src/third_party/node/LICENSE b/src/third_party/node/LICENSE new file mode 100644 index 00000000000..122747269bb --- /dev/null +++ b/src/third_party/node/LICENSE @@ -0,0 +1,24 @@ +Node.js is licensed for use as follows: + +""" +Copyright Node.js contributors. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. 
+""" + diff --git a/src/third_party/node/icu_get_string_width.cpp b/src/third_party/node/icu_get_string_width.cpp new file mode 100644 index 00000000000..57f42983d67 --- /dev/null +++ b/src/third_party/node/icu_get_string_width.cpp @@ -0,0 +1,69 @@ +#include "mongo/util/icu.h" + +#include +#include + +namespace mongo { +namespace { + +int getColumnWidth(UChar32 codepoint, bool ambiguousAsFullWidth) { + const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); + switch (eaw) { + case U_EA_FULLWIDTH: + case U_EA_WIDE: + return 2; + case U_EA_AMBIGUOUS: + if (ambiguousAsFullWidth) { + return 2; + } + [[fallthrough]]; + case U_EA_NEUTRAL: + if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { + return 2; + } + [[fallthrough]]; + case U_EA_HALFWIDTH: + case U_EA_NARROW: + default: + const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code + U_GC_CF_MASK | // Format control character + U_GC_ME_MASK | // Enclosing mark + U_GC_MN_MASK; // Nonspacing mark + if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width + ((U_MASK(u_charType(codepoint)) & zero_width_mask) || + u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) { + return 0; + } + return 1; + } +} +} // namespace + +// This is a modified version of GetStringWidth from node. We don't currently support wide strings +// in the test runner, so it has been converted to use the U8 cursor API. 
For more details on +// string width calculation see: https://www.unicode.org/reports/tr11/ +int icuGetStringWidth(StringData value, bool ambiguousAsFullWidth, bool expandEmojiSequence) { + const uint8_t* str = reinterpret_cast(value.data()); + UChar32 output = 0; + UChar32 previous; + int offset = 0; + int strLength = static_cast(value.length()); + int width = 0; + + while (offset < strLength) { + previous = output; + U8_NEXT(str, offset, strLength, output); + + if (!expandEmojiSequence && offset > 0 && previous == 0x200d && + (u_hasBinaryProperty(output, UCHAR_EMOJI_PRESENTATION) || + u_hasBinaryProperty(output, UCHAR_EMOJI_MODIFIER))) { + continue; + } + + width += getColumnWidth(output, ambiguousAsFullWidth); + } + + return width; +} + +} // namespace mongo