0
0
mirror of https://github.com/nodejs/node.git synced 2024-11-21 21:19:50 +01:00

src: use simdutf for converting externalized builtins to UTF-16

Remove the dependency on ICU for this part, as well as the
hacky way of converting embedder main sources to UTF-8 via
V8 APIs. Allow `UnionBytes` to own the memory its pointing
to in order to simplify the code on the `BuiltinLoader` side.

PR-URL: https://github.com/nodejs/node/pull/46119
Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
Reviewed-By: Darshan Sen <raisinten@gmail.com>
This commit is contained in:
Anna Henningsen 2023-01-10 12:25:19 +01:00 committed by GitHub
parent 757c104147
commit 21fb98e2bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 106 additions and 105 deletions

View File

@ -2024,11 +2024,7 @@ output['variables']['node_builtin_shareable_builtins'] = []
for builtin in shareable_builtins:
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
if getattr(options, builtin_id):
if options.with_intl == 'none':
option_name = '--shared-builtin-' + builtin + '-path'
error(option_name + ' is incompatible with --with-intl=none' )
else:
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
else:
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]

View File

@ -973,6 +973,7 @@
'deps/googletest/googletest.gyp:gtest_main',
'deps/histogram/histogram.gyp:histogram',
'deps/uvwasi/uvwasi.gyp:uvwasi',
'deps/simdutf/simdutf.gyp:simdutf',
],
'includes': [

View File

@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
Environment* env,
const char* main_script_source_utf8) {
CHECK_NOT_NULL(main_script_source_utf8);
Isolate* isolate = env->isolate();
return LoadEnvironment(
env,
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
// This is a slightly hacky way to convert UTF-8 to UTF-16.
Local<String> str =
String::NewFromUtf8(isolate,
main_script_source_utf8).ToLocalChecked();
auto main_utf16 = std::make_unique<String::Value>(isolate, str);
// TODO(addaleax): Avoid having a global table for all scripts.
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
std::string name = "embedder_main_" + std::to_string(env->thread_id());
builtins::BuiltinLoader::Add(
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
env->set_main_utf16(std::move(main_utf16));
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
Realm* realm = env->principal_realm();
return realm->ExecuteBootstrapper(name.c_str());

View File

@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
cleanup_queue_.Remove(fn, arg);
}
void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
CHECK(!main_utf16_);
main_utf16_ = std::move(str);
}
void Environment::set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler) {
process_exit_handler_ = std::move(handler);

View File

@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {
#endif // HAVE_INSPECTOR
inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
inline void set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler);
@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {
std::unique_ptr<Realm> principal_realm_ = nullptr;
// Keeps the main script source alive is one was passed to LoadEnvironment().
// We should probably find a way to just use plain `v8::String`s created from
// the source passed to LoadEnvironment() directly instead.
std::unique_ptr<v8::String::Value> main_utf16_;
// Used by allocate_managed_buffer() and release_managed_buffer() to keep
// track of the BackingStore for a given pointer.
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>

View File

@ -3,6 +3,7 @@
#include "env-inl.h"
#include "node_external_reference.h"
#include "node_internals.h"
#include "simdutf.h"
#include "util-inl.h"
namespace node {
@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;
BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
LoadJavaScriptSource();
#if defined(NODE_HAVE_I18N_SUPPORT)
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
AddExternalizedBuiltin(
"internal/deps/cjs-module-lexer/lexer",
@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
AddExternalizedBuiltin("internal/deps/undici/undici",
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
#endif // NODE_HAVE_I18N_SUPPORT
}
BuiltinLoader* BuiltinLoader::GetInstance() {
@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
#endif // NODE_BUILTIN_MODULES_PATH
}
#if defined(NODE_HAVE_I18N_SUPPORT)
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
const char* filename) {
std::string source;
@ -252,16 +250,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
return;
}
icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
icu::StringPiece(source.data(), source.length()));
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
Add(id,
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
utf16.length()));
// keep source bytes for builtin alive while BuiltinLoader exists
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
Add(id, source);
}
bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
size_t expected_u16_length =
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
auto out = std::make_shared<std::vector<uint16_t>>(expected_u16_length);
size_t u16_length = simdutf::convert_utf8_to_utf16le(
utf8source.data(),
utf8source.length(),
reinterpret_cast<char16_t*>(out->data()));
out->resize(u16_length);
return Add(id, UnionBytes(out));
}
#endif // NODE_HAVE_I18N_SUPPORT
// Returns Local<Function> of the compiled module if return_code_cache
// is false (we are only compiling the function).

View File

@ -8,9 +8,6 @@
#include <memory>
#include <set>
#include <string>
#if defined(NODE_HAVE_I18N_SUPPORT)
#include <unicode/unistr.h>
#endif // NODE_HAVE_I18N_SUPPORT
#include <vector>
#include "node_mutex.h"
#include "node_union_bytes.h"
@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
static bool Exists(const char* id);
static bool Add(const char* id, const UnionBytes& source);
static bool Add(const char* id, std::string_view utf8source);
static bool CompileAllBuiltins(v8::Local<v8::Context> context);
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static void HasCachedBuiltins(
const v8::FunctionCallbackInfo<v8::Value>& args);
#if defined(NODE_HAVE_I18N_SUPPORT)
static void AddExternalizedBuiltin(const char* id, const char* filename);
#endif // NODE_HAVE_I18N_SUPPORT
static BuiltinLoader instance_;
BuiltinCategories builtin_categories_;
BuiltinSourceMap source_;
BuiltinCodeCacheMap code_cache_;
UnionBytes config_;
#if defined(NODE_HAVE_I18N_SUPPORT)
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
#endif // NODE_HAVE_I18N_SUPPORT
// Used to synchronize access to the code cache map
Mutex code_cache_mutex_;

View File

@ -11,57 +11,20 @@
namespace node {
class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalOneByteResource() override = default;
const char* data() const override {
return reinterpret_cast<const char*>(data_);
}
size_t length() const override { return length_; }
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;
private:
const uint8_t* data_;
size_t length_;
};
class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length)
: data_(data), length_(length) {}
~NonOwningExternalTwoByteResource() override = default;
const uint16_t* data() const override { return data_; }
size_t length() const override { return length_; }
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;
private:
const uint16_t* data_;
size_t length_;
};
// Similar to a v8::String, but it's independent from Isolates
// and can be materialized in Isolates as external Strings
// via ToStringChecked. The data pointers are owned by the caller.
// via ToStringChecked.
class UnionBytes {
public:
UnionBytes(const uint16_t* data, size_t length)
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
UnionBytes(const uint8_t* data, size_t length)
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
template <typename T> // T = uint8_t or uint16_t
explicit UnionBytes(std::shared_ptr<std::vector</*const*/ T>> data)
: UnionBytes(data->data(), data->size()) {
owning_ptr_ = data;
}
UnionBytes(const UnionBytes&) = default;
UnionBytes& operator=(const UnionBytes&) = default;
@ -77,23 +40,14 @@ class UnionBytes {
CHECK_NOT_NULL(one_bytes_);
return one_bytes_;
}
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const {
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(one_bytes_data(), length_);
return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(two_bytes_data(), length_);
return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}
size_t length() { return length_; }
v8::Local<v8::String> ToStringChecked(v8::Isolate* isolate) const;
size_t length() const { return length_; }
private:
const uint8_t* one_bytes_;
const uint16_t* two_bytes_;
size_t length_;
std::shared_ptr<void> owning_ptr_;
};
} // namespace node

View File

@ -515,4 +515,66 @@ void SetConstructorFunction(Isolate* isolate,
that->Set(name, tmpl);
}
namespace {
class NonOwningExternalOneByteResource
: public v8::String::ExternalOneByteStringResource {
public:
explicit NonOwningExternalOneByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalOneByteResource() override = default;
const char* data() const override {
return reinterpret_cast<const char*>(source_.one_bytes_data());
}
size_t length() const override { return source_.length(); }
NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) =
delete;
NonOwningExternalOneByteResource& operator=(
const NonOwningExternalOneByteResource&) = delete;
private:
const UnionBytes source_;
};
class NonOwningExternalTwoByteResource
: public v8::String::ExternalStringResource {
public:
explicit NonOwningExternalTwoByteResource(const UnionBytes& source)
: source_(source) {}
~NonOwningExternalTwoByteResource() override = default;
const uint16_t* data() const override { return source_.two_bytes_data(); }
size_t length() const override { return source_.length(); }
NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) =
delete;
NonOwningExternalTwoByteResource& operator=(
const NonOwningExternalTwoByteResource&) = delete;
private:
const UnionBytes source_;
};
} // anonymous namespace
Local<String> UnionBytes::ToStringChecked(Isolate* isolate) const {
if (UNLIKELY(length() == 0)) {
// V8 requires non-null data pointers for empty external strings,
// but we don't guarantee that. Solve this by not creating an
// external string at all in that case.
return String::Empty(isolate);
}
if (is_one_byte()) {
NonOwningExternalOneByteResource* source =
new NonOwningExternalOneByteResource(*this);
return String::NewExternalOneByte(isolate, source).ToLocalChecked();
} else {
NonOwningExternalTwoByteResource* source =
new NonOwningExternalTwoByteResource(*this);
return String::NewExternalTwoByte(isolate, source).ToLocalChecked();
}
}
} // namespace node

View File

@ -1,7 +1,8 @@
#include "util-inl.h"
#include "debug_utils-inl.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "simdutf.h"
#include "util-inl.h"
using node::Calloc;
using node::Malloc;
@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
const std::string with_zero = std::string("a") + '\0' + 'b';
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
}
TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
// In simdutf, "LE" does *not* refer to Little Endian, it refers
// to 16-byte code units that are stored using *host* endianness.
// This is weird and confusing naming, and so we add this assertion
// here to verify that this is actually the case (so that CI tells
// us if it changed, because for most people Little Endian is
// host endianness, so locally everything would work fine).
const char utf8source[] = "\xe7\x8c\xab";
char16_t u16output;
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
EXPECT_EQ(u16len, 1u);
EXPECT_EQ(u16output, 0x732B);
}