diff --git a/deps/base64/base64.gyp b/deps/base64/base64.gyp index 06b20a142b1..5d0a0c05dc2 100644 --- a/deps/base64/base64.gyp +++ b/deps/base64/base64.gyp @@ -49,6 +49,7 @@ 'HAVE_SSE42=1', 'HAVE_AVX=1', 'HAVE_AVX2=1', + 'HAVE_AVX512=1', ], 'dependencies': [ 'base64_ssse3', @@ -56,6 +57,7 @@ 'base64_sse42', 'base64_avx', 'base64_avx2', + 'base64_avx512', ], }, { 'sources': [ @@ -64,6 +66,7 @@ 'base64/lib/arch/sse42/codec.c', 'base64/lib/arch/avx/codec.c', 'base64/lib/arch/avx2/codec.c', + 'base64/lib/arch/avx512/codec.c', ], }], ], @@ -165,6 +168,30 @@ ], }, + { + 'target_name': 'base64_avx512', + 'type': 'static_library', + 'include_dirs': [ 'base64/include', 'base64/lib' ], + 'sources': [ 'base64/lib/arch/avx512/codec.c' ], + 'defines': [ 'BASE64_STATIC_DEFINE', 'HAVE_AVX512=1' ], + 'conditions': [ + [ 'OS!="win"', { + 'cflags': [ '-mavx512vl', '-mavx512vbmi' ], + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-mavx512vl', '-mavx512vbmi' ] + }, + }, { + 'msvs_settings': { + 'VCCLCompilerTool': { + 'AdditionalOptions': [ + '/arch:AVX512' + ], + }, + }, + }], + ], + }, + { 'target_name': 'base64_neon32', 'type': 'static_library', diff --git a/deps/base64/base64/.gitignore b/deps/base64/base64/.gitignore index 837a2306a62..bb7b160deb3 100644 --- a/deps/base64/base64/.gitignore +++ b/deps/base64/base64/.gitignore @@ -1,12 +1 @@ -*.o -bin/base64 -lib/config.h -test/benchmark -test/test_base64 - -# visual studio symbol db, etc. -.vs/ -# build directory used by CMakePresets -out/ -# private cmake presets -CMakeUserPresets.json +# Intentionally empty diff --git a/deps/base64/base64/CMakeLists.txt b/deps/base64/base64/CMakeLists.txt index 56076e47a6a..be1de665a2c 100644 --- a/deps/base64/base64/CMakeLists.txt +++ b/deps/base64/base64/CMakeLists.txt @@ -17,7 +17,7 @@ if (POLICY CMP0127) cmake_policy(SET CMP0127 NEW) endif() -project(base64 LANGUAGES C VERSION 0.5.0) +project(base64 LANGUAGES C VERSION 0.5.1) include(GNUInstallDirs) include(CMakeDependentOption) @@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF) add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath") cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF) add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath") +cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF) +add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath") cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF) add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath") @@ -118,6 +120,7 @@ add_library(base64 lib/arch/sse42/codec.c lib/arch/avx/codec.c lib/arch/avx2/codec.c + lib/arch/avx512/codec.c lib/arch/neon32/codec.c lib/arch/neon64/codec.c @@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64") configure_codec(SSE42 __SSSE4_2__) configure_codec(AVX) configure_codec(AVX2) + configure_codec(AVX512) elseif (_TARGET_ARCH STREQUAL "arm") set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')") diff --git a/deps/base64/base64/LICENSE b/deps/base64/base64/LICENSE index 9446393a82a..109d6521b12 100644 --- a/deps/base64/base64/LICENSE +++ b/deps/base64/base64/LICENSE @@ -1,7 +1,7 @@ Copyright (c) 2005-2007, Nick Galbreath -Copyright (c) 2013-2019, Alfred Klomp -Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2015-2018, Wojciech Muła Copyright (c) 2016-2017, Matthieu Darbois +Copyright (c) 2013-2022, Alfred Klomp All rights 
reserved. Redistribution and use in source and binary forms, with or without diff --git a/deps/base64/base64/Makefile b/deps/base64/base64/Makefile index 2bb01e204fc..bcb944551ae 100644 --- a/deps/base64/base64/Makefile +++ b/deps/base64/base64/Makefile @@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic OBJCOPY ?= objcopy OBJS = \ + lib/arch/avx512/codec.o \ lib/arch/avx2/codec.o \ lib/arch/generic/codec.o \ lib/arch/neon32/codec.o \ @@ -16,6 +17,7 @@ OBJS = \ lib/codec_choose.o \ lib/tables/tables.o +HAVE_AVX512 = 0 HAVE_AVX2 = 0 HAVE_NEON32 = 0 HAVE_NEON64 = 0 @@ -26,6 +28,9 @@ HAVE_AVX = 0 # The user should supply compiler flags for the codecs they want to build. # Check which codecs we're going to include: +ifdef AVX512_CFLAGS + HAVE_AVX512 = 1 +endif ifdef AVX2_CFLAGS HAVE_AVX2 = 1 endif @@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS) $(OBJCOPY) --keep-global-symbols=lib/exports.txt $@ lib/config.h: - @echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@ + @echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@ + @echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@ @echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@ @echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@ @echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@ @@ -75,6 +81,7 @@ lib/config.h: $(OBJS): lib/config.h $(OBJS): CFLAGS += -Ilib +lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS) lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS) lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS) lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS) diff --git a/deps/base64/base64/README.md b/deps/base64/base64/README.md index b953c324c9d..ae0a914965e 100644 --- a/deps/base64/base64/README.md +++ b/deps/base64/base64/README.md @@ -3,7 +3,7 @@ [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml) This is an implementation of a base64 stream encoding/decoding library in C99 -with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and +with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions to encode/decode simple length-delimited strings. This library aims to be: @@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a time, which gives a speedup of four or more times compared to the "plain" bytewise codec. +AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI +instructions. Decoding part reused AVX2 implementations. For CPUs later than +Cannonlake (manufactured in 2018) supports these instructions. + NEON support is hardcoded to on or off at compile time, because portable runtime feature detection is unavailable on ARM. @@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html). His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64). +The AVX512 encoder is based on code from Wojciech Muła's +[base64simd](https://github.com/WojciechMula/base64simd) library. + The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl). ## Building @@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type: make lib/libbase64.o ``` -Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`, -`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. 
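For reference, the `lib/config.h` recipe in the Makefile hunk above expands into a small generated header. A sketch of what a build with only the AVX512 codec enabled would produce (illustrative only; the trailing `HAVE_SSE41`/`HAVE_SSE42`/`HAVE_AVX` lines follow the same pattern, and every value simply tracks whether the corresponding `*_CFLAGS` variable was set):

```c
/* lib/config.h as written by `AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make`
 * (illustrative output; codecs whose *_CFLAGS were not given stay 0). */
#define HAVE_AVX512 1
#define HAVE_AVX2 0
#define HAVE_NEON32 0
#define HAVE_NEON64 0
#define HAVE_SSSE3 0
#define HAVE_SSE41 0
#define HAVE_SSE42 0
#define HAVE_AVX 0
```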
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, +`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables. A typical build invocation on x86 looks like this: ```sh @@ -93,6 +100,15 @@ Example: AVX2_CFLAGS=-mavx2 make ``` +### AVX512 + +To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`. +Example: + +```sh +AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make +``` + The codec will only be used if runtime feature detection shows that the target machine supports AVX2. ### SSSE3 @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way The following constants can be used: - `BASE64_FORCE_AVX2` +- `BASE64_FORCE_AVX512` - `BASE64_FORCE_NEON32` - `BASE64_FORCE_NEON64` - `BASE64_FORCE_PLAIN` @@ -434,7 +451,7 @@ x86 processors | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* | | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* | | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* | -| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 | +| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243\* | 6233 | 4160\* | 6344 | | Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - | | Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - | | Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - | diff --git a/deps/base64/base64/bin/base64.c b/deps/base64/base64/bin/base64.c index e4384fe885d..98d6b3cbab5 100644 --- a/deps/base64/base64/bin/base64.c +++ b/deps/base64/base64/bin/base64.c @@ -1,128 +1,477 @@ -#include // size_t -#include // fopen() -#include // strlen() +#define _XOPEN_SOURCE // IOV_MAX + +#include +#include +#include +#include +#include #include +#include +#include +#include #include "../include/libbase64.h" -#define BUFSIZE 1024 * 1024 +// Size of the buffer for the "raw" (not base64-encoded) data in bytes. +#define BUFFER_RAW_SIZE (1024 * 1024) -static char buf[BUFSIZE]; -static char out[(BUFSIZE * 5) / 3]; // Technically 4/3 of input, but take some margin -size_t nread; -size_t nout; +// Size of the buffer for the base64-encoded data in bytes. The base64-encoded +// data is 4/3 the size of the input, with some margin to be sure. +#define BUFFER_ENC_SIZE (BUFFER_RAW_SIZE * 4 / 3 + 16) -static int -enc (FILE *fp) +// Global config structure. +struct config { + + // Name by which the program was called on the command line. + const char *name; + + // Name of the input file for logging purposes. + const char *file; + + // Input file handle. + FILE *fp; + + // Wrap width in characters, for encoding only. + size_t wrap; + + // Whether to run in decode mode. + bool decode; + + // Whether to just print the help text and exit. + bool print_help; +}; + +// Input/output buffer structure. +struct buffer { + + // Runtime-allocated buffer for raw (unencoded) data. + char *raw; + + // Runtime-allocated buffer for base64-encoded data. 
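The `BASE64_FORCE_AVX512` constant added to the list above is passed through the `flags` argument of the encode/decode entry points, like the other force flags. A minimal sketch of using it with the library's one-shot `base64_encode()` helper (not part of this patch; on a CPU without AVX512VL/VBMI the forced codec is a no-op stub, so treat this purely as a testing aid):

```c
// Sketch: force the AVX512 encoder via the flags argument.
#include <stdio.h>
#include <string.h>
#include "libbase64.h"   // adjust the include path to include/libbase64.h

int main (void)
{
    const char *src = "base64 via AVX512";
    char out[64];
    size_t outlen = 0;

    base64_encode(src, strlen(src), out, &outlen, BASE64_FORCE_AVX512);
    fwrite(out, 1, outlen, stdout);
    putchar('\n');
    return 0;
}
```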
+ char *enc; +}; + +static bool +buffer_alloc (const struct config *config, struct buffer *buf) { - int ret = 1; + if ((buf->raw = malloc(BUFFER_RAW_SIZE)) == NULL || + (buf->enc = malloc(BUFFER_ENC_SIZE)) == NULL) { + free(buf->raw); + fprintf(stderr, "%s: malloc: %s\n", + config->name, strerror(errno)); + return false; + } + + return true; +} + +static void +buffer_free (struct buffer *buf) +{ + free(buf->raw); + free(buf->enc); +} + +static bool +writev_retry (const struct config *config, struct iovec *iov, size_t nvec) +{ + // Writing nothing always succeeds. + if (nvec == 0) { + return true; + } + + while (true) { + ssize_t nwrite; + + // Try to write the vectors to stdout. + if ((nwrite = writev(1, iov, nvec)) < 0) { + + // Retry on EINTR. + if (errno == EINTR) { + continue; + } + + // Quit on other errors. + fprintf(stderr, "%s: writev: %s\n", + config->name, strerror(errno)); + return false; + } + + // The return value of `writev' is the number of bytes written. + // To check for success, we traverse the list and remove all + // written vectors. The call succeeded if the list is empty. + while (true) { + + // Retry if this vector is not or partially written. + if (iov->iov_len > (size_t) nwrite) { + char *base = iov->iov_base; + + iov->iov_base = (size_t) nwrite + base; + iov->iov_len -= (size_t) nwrite; + break; + } + + // Move to the next vector. + nwrite -= iov->iov_len; + iov++; + + // Return successfully if all vectors were written. + if (--nvec == 0) { + return true; + } + } + } +} + +static inline bool +iov_append (const struct config *config, struct iovec *iov, + size_t *nvec, char *base, const size_t len) +{ + // Add the buffer to the IO vector array. + iov[*nvec].iov_base = base; + iov[*nvec].iov_len = len; + + // Increment the array index. Flush the array if it is full. + if (++(*nvec) == IOV_MAX) { + if (writev_retry(config, iov, IOV_MAX) == false) { + return false; + } + *nvec = 0; + } + + return true; +} + +static bool +write_stdout (const struct config *config, const char *buf, size_t len) +{ + while (len > 0) { + ssize_t nwrite; + + // Try to write the buffer to stdout. + if ((nwrite = write(1, buf, len)) < 0) { + + // Retry on EINTR. + if (errno == EINTR) { + continue; + } + + // Quit on other errors. + fprintf(stderr, "%s: write: %s\n", + config->name, strerror(errno)); + return false; + } + + // Update the buffer position. + buf += (size_t) nwrite; + len -= (size_t) nwrite; + } + + return true; +} + +static bool +write_wrapped (const struct config *config, char *buf, size_t len) +{ + static size_t col = 0; + + // Special case: if buf is NULL, print final trailing newline. + if (buf == NULL) { + if (config->wrap > 0 && col > 0) { + return write_stdout(config, "\n", 1); + } + return true; + } + + // If no wrap width is given, write the entire buffer. + if (config->wrap == 0) { + return write_stdout(config, buf, len); + } + + // Statically allocated IO vector buffer. + static struct iovec iov[IOV_MAX]; + size_t nvec = 0; + + while (len > 0) { + + // Number of characters to fill the current line. + size_t nwrite = config->wrap - col; + + // Do not write more data than is available. + if (nwrite > len) { + nwrite = len; + } + + // Append the data to the IO vector array. + if (iov_append(config, iov, &nvec, buf, nwrite) == false) { + return false; + } + + // Advance the buffer. + len -= nwrite; + buf += nwrite; + col += nwrite; + + // If the line is full, append a newline. 
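A standalone sketch of the scatter-write idea behind `iov_append()` and `writev_retry()` above: several small buffers (here, an encoded chunk and its wrap newline) go to stdout in a single system call. POSIX only; unlike `writev_retry()`, this sketch assumes the one call completes with no `EINTR` and no short write.

```c
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main (void)
{
    static char line[] = "Zm9vYmFy";
    static char nl[]   = "\n";

    struct iovec iov[2] = {
        { .iov_base = line, .iov_len = sizeof (line) - 1 },
        { .iov_base = nl,   .iov_len = 1 },
    };

    // writev_retry() additionally loops on EINTR and removes
    // fully-written vectors after a short write.
    if (writev(1, iov, 2) < 0) {
        perror("writev");
        return 1;
    }
    return 0;
}
```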
+ if (col == config->wrap) { + if (iov_append(config, iov, &nvec, "\n", 1) == false) { + return false; + } + col = 0; + } + } + + // Write the remaining vectors. + if (writev_retry(config, iov, nvec) == false) { + return false; + } + + return true; +} + +static bool +encode (const struct config *config, struct buffer *buf) +{ + size_t nread, nout; struct base64_state state; + // Initialize the encoder's state structure. base64_stream_encode_init(&state, 0); - while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) { - base64_stream_encode(&state, buf, nread, out, &nout); - if (nout) { - fwrite(out, nout, 1, stdout); - } - if (feof(fp)) { - break; - } - } - if (ferror(fp)) { - fprintf(stderr, "read error\n"); - ret = 0; - goto out; - } - base64_stream_encode_final(&state, out, &nout); + // Read raw data into the buffer. + while ((nread = fread(buf->raw, 1, BUFFER_RAW_SIZE, config->fp)) > 0) { - if (nout) { - fwrite(out, nout, 1, stdout); + // Encode the raw input into the encoded buffer. + base64_stream_encode(&state, buf->raw, nread, buf->enc, &nout); + + // Append the encoded data to the output stream. + if (write_wrapped(config, buf->enc, nout) == false) { + return false; + } } -out: fclose(fp); - fclose(stdout); - return ret; + + // Check for stream errors. + if (ferror(config->fp)) { + fprintf(stderr, "%s: %s: read error\n", + config->name, config->file); + return false; + } + + // Finalize the encoding by adding proper stream terminators. + base64_stream_encode_final(&state, buf->enc, &nout); + + // Append this tail to the output stream. + if (write_wrapped(config, buf->enc, nout) == false) { + return false; + } + + // Print optional trailing newline. + if (write_wrapped(config, NULL, 0) == false) { + return false; + } + + return true; } static int -dec (FILE *fp) +decode (const struct config *config, struct buffer *buf) { - int ret = 1; + size_t nread, nout; struct base64_state state; + // Initialize the decoder's state structure. base64_stream_decode_init(&state, 0); - while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) { - if (!base64_stream_decode(&state, buf, nread, out, &nout)) { - fprintf(stderr, "decoding error\n"); - ret = 0; - goto out; + // Read encoded data into the buffer. Use the smallest buffer size to + // be on the safe side: the decoded output will fit the raw buffer. + while ((nread = fread(buf->enc, 1, BUFFER_RAW_SIZE, config->fp)) > 0) { + + // Decode the input into the raw buffer. + if (base64_stream_decode(&state, buf->enc, nread, + buf->raw, &nout) == 0) { + fprintf(stderr, "%s: %s: decoding error\n", + config->name, config->file); + return false; } - if (nout) { - fwrite(out, nout, 1, stdout); + + // Append the raw data to the output stream. + if (write_stdout(config, buf->raw, nout) == false) { + return false; } - if (feof(fp)) { + } + + // Check for stream errors. + if (ferror(config->fp)) { + fprintf(stderr, "%s: %s: read error\n", + config->name, config->file); + return false; + } + + return true; +} + +static void +usage (FILE *fp, const struct config *config) +{ + const char *usage = + "Usage: %s [OPTION]... [FILE]\n" + "If no FILE is given or is specified as '-', " + "read from standard input.\n" + "Options:\n" + " -d, --decode Decode a base64 stream.\n" + " -h, --help Print this help text.\n" + " -w, --wrap=COLS Wrap encoded lines at this column. " + "Default 76, 0 to disable.\n"; + + fprintf(fp, usage, config->name); +} + +static bool +get_wrap (struct config *config, const char *str) +{ + char *eptr; + + // Reject empty strings. 
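A standalone sketch of the streaming calls used by `encode()` above: the state object carries leftover input bytes between calls, so `fread()` chunk boundaries do not have to fall on 3-byte groups. It uses the default codec (`flags = 0`), exactly as the tool does.

```c
#include <stdio.h>
#include "libbase64.h"   // adjust the include path to include/libbase64.h

int main (void)
{
    struct base64_state state;
    char out[32];
    size_t outlen;

    base64_stream_encode_init(&state, 0);

    // Feed "foobar" in two odd-sized chunks to exercise the carry-over.
    base64_stream_encode(&state, "foob", 4, out, &outlen);
    fwrite(out, 1, outlen, stdout);

    base64_stream_encode(&state, "ar", 2, out, &outlen);
    fwrite(out, 1, outlen, stdout);

    // Flush any held bytes and emit padding.
    base64_stream_encode_final(&state, out, &outlen);
    fwrite(out, 1, outlen, stdout);

    putchar('\n');   // prints "Zm9vYmFy"
    return 0;
}
```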
+ if (*str == '\0') { + return false; + } + + // Convert the input string to a signed long. + const long wrap = strtol(str, &eptr, 10); + + // Reject negative numbers. + if (wrap < 0) { + return false; + } + + // Reject strings containing non-digits. + if (*eptr != '\0') { + return false; + } + + config->wrap = (size_t) wrap; + return true; +} + +static bool +parse_opts (int argc, char **argv, struct config *config) +{ + int c; + static const struct option opts[] = { + { "decode", no_argument, NULL, 'd' }, + { "help", no_argument, NULL, 'h' }, + { "wrap", required_argument, NULL, 'w' }, + { NULL } + }; + + // Remember the program's name. + config->name = *argv; + + // Parse command line options. + while ((c = getopt_long(argc, argv, ":dhw:", opts, NULL)) != -1) { + switch (c) { + case 'd': + config->decode = true; break; + + case 'h': + config->print_help = true; + return true; + + case 'w': + if (get_wrap(config, optarg) == false) { + fprintf(stderr, + "%s: invalid wrap value '%s'\n", + config->name, optarg); + return false; + } + break; + + case ':': + fprintf(stderr, "%s: missing argument for '%c'\n", + config->name, optopt); + return false; + + default: + fprintf(stderr, "%s: unknown option '%c'\n", + config->name, optopt); + return false; } } - if (ferror(fp)) { - fprintf(stderr, "read error\n"); - ret = 0; + + // Return successfully if no filename was given. + if (optind >= argc) { + return true; } -out: fclose(fp); - fclose(stdout); - return ret; + + // Return unsuccessfully if more than one filename was given. + if (optind + 1 < argc) { + fprintf(stderr, "%s: too many files\n", config->name); + return false; + } + + // For compatibility with GNU Coreutils base64, treat a filename of '-' + // as standard input. + if (strcmp(argv[optind], "-") == 0) { + return true; + } + + // Save the name of the file. + config->file = argv[optind]; + + // Open the file. + if ((config->fp = fopen(config->file, "rb")) == NULL) { + fprintf(stderr, "%s: %s: %s\n", + config->name, config->file, strerror(errno)); + return false; + } + + return true; } int main (int argc, char **argv) { - char *file; - FILE *fp; - int decode = 0; + // Default program config. + struct config config = { + .file = "stdin", + .fp = stdin, + .wrap = 76, + .decode = false, + .print_help = false, + }; + struct buffer buf; - // Parse options: - for (;;) - { - int c; - int opt_index = 0; - static struct option opt_long[] = { - { "decode", 0, 0, 'd' }, - { 0, 0, 0, 0 } - }; - if ((c = getopt_long(argc, argv, "d", opt_long, &opt_index)) == -1) { - break; - } - switch (c) - { - case 'd': - decode = 1; - break; - } - } - - // No options left on command line? Read from stdin: - if (optind >= argc) { - fp = stdin; - } - - // One option left on command line? Treat it as a file: - else if (optind + 1 == argc) { - file = argv[optind]; - if (strcmp(file, "-") == 0) { - fp = stdin; - } - else if ((fp = fopen(file, "rb")) == NULL) { - printf("cannot open %s\n", file); - return 1; - } - } - - // More than one option left on command line? Syntax error: - else { - printf("Usage: %s \n", argv[0]); + // Parse options from the command line. + if (parse_opts(argc, argv, &config) == false) { + usage(stderr, &config); return 1; } - // Invert return codes to create shell return code: - return (decode) ? !dec(fp) : !enc(fp); + // Return early if the user just wanted the help text. + if (config.print_help) { + usage(stdout, &config); + return 0; + } + + // Allocate buffers. 
+ if (buffer_alloc(&config, &buf) == false) { + return 1; + } + + // Encode or decode the input based on the user's choice. + const bool ret = config.decode + ? decode(&config, &buf) + : encode(&config, &buf); + + // Free the buffers. + buffer_free(&buf); + + // Close the input file. + fclose(config.fp); + + // Close the output stream. + fclose(stdout); + + // That's all, folks. + return ret ? 0 : 1; } diff --git a/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake b/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake index ba1f6e51815..48508090531 100644 --- a/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake +++ b/deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 "-msse4.2") set(COMPILE_FLAGS_AVX "-mavx") set(COMPILE_FLAGS_AVX2 "-mavx2") + set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi") #arm set(COMPILE_FLAGS_NEON32 "-mfpu=neon") @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags) set(COMPILE_FLAGS_SSE42 " ") set(COMPILE_FLAGS_AVX "/arch:AVX") set(COMPILE_FLAGS_AVX2 "/arch:AVX2") + set(COMPILE_FLAGS_AVX512 "/arch:AVX512") endif() endmacro(define_SIMD_compile_flags) diff --git a/deps/base64/base64/cmake/config.h.in b/deps/base64/base64/cmake/config.h.in index 8530d1e13d4..c7faa94bc09 100644 --- a/deps/base64/base64/cmake/config.h.in +++ b/deps/base64/base64/cmake/config.h.in @@ -16,6 +16,9 @@ #cmakedefine01 BASE64_WITH_AVX2 #define HAVE_AVX2 BASE64_WITH_AVX2 +#cmakedefine01 BASE64_WITH_AVX512 +#define HAVE_AVX512 BASE64_WITH_AVX512 + #cmakedefine01 BASE64_WITH_NEON32 #define HAVE_NEON32 BASE64_WITH_NEON32 diff --git a/deps/base64/base64/include/libbase64.h b/deps/base64/base64/include/libbase64.h index d470a82f102..c5908973c5e 100644 --- a/deps/base64/base64/include/libbase64.h +++ b/deps/base64/base64/include/libbase64.h @@ -53,6 +53,7 @@ extern "C" { #define BASE64_FORCE_SSE41 (1 << 5) #define BASE64_FORCE_SSE42 (1 << 6) #define BASE64_FORCE_AVX (1 << 7) +#define BASE64_FORCE_AVX512 (1 << 8) struct base64_state { int eof; diff --git a/deps/base64/base64/lib/arch/avx/codec.c b/deps/base64/base64/lib/arch/avx/codec.c index a7a963d8358..b069618e294 100644 --- a/deps/base64/base64/lib/arch/avx/codec.c +++ b/deps/base64/base64/lib/arch/avx/codec.c @@ -11,11 +11,25 @@ #if HAVE_AVX #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +#ifndef BASE64_AVX_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_AVX_USE_ASM 1 +# else +# define BASE64_AVX_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_AVX_USE_ASM +# include "enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_AVX @@ -23,7 +37,17 @@ BASE64_ENC_FUNCTION(avx) { #if HAVE_AVX #include "../generic/enc_head.c" + + // For supported compilers, use a hand-optimized inline assembly + // encoder. Otherwise fall back on the SSSE3 encoder, but compiled with + // AVX flags to generate better optimized AVX code. 
+ +#if BASE64_AVX_USE_ASM + enc_loop_avx(&s, &slen, &o, &olen); +#else enc_loop_ssse3(&s, &slen, &o, &olen); +#endif + #include "../generic/enc_tail.c" #else BASE64_ENC_STUB diff --git a/deps/base64/base64/lib/arch/avx/enc_loop_asm.c b/deps/base64/base64/lib/arch/avx/enc_loop_asm.c new file mode 100644 index 00000000000..979269af577 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx/enc_loop_asm.c @@ -0,0 +1,264 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round. +#define LOAD(R0, ROUND) \ + "vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. +#define SHUF(R0, R1, R2) \ + "vpshufb %[lut0], %["R0"], %["R1"] \n\t" \ + "vpand %["R1"], %[msk0], %["R2"] \n\t" \ + "vpand %["R1"], %[msk2], %["R1"] \n\t" \ + "vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \ + "vpmullw %["R1"], %[msk3], %["R1"] \n\t" \ + "vpor %["R1"], %["R2"], %["R1"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "vpsubusb %[n51], %["R1"], %["R0"] \n\t" \ + "vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \ + "vpsubb %["R2"], %["R0"], %["R0"] \n\t" \ + "vpshufb %["R0"], %[lut1], %["R2"] \n\t" \ + "vpaddb %["R1"], %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0) \ + SHUF("a", "b", "c") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $12, %[src] \n\t" \ + "add $16, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. +// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. +// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0) /* + */ \ + SHUF("a", "d", "e") /* | + x */ \ + LOAD("b", 1) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. 
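The LOAD/SHUF/TRAN/STOR templates above are the inline-assembly form of the intrinsics path they replace (`../ssse3/enc_reshuffle.c` plus `enc_translate.c`). As an aid for auditing the assembly, here is a reference sketch of one 12-byte round written with the equivalent intrinsics and the same constants; it is not part of the patch, and `enc_round_reference` is a name invented for this example. Build with SSSE3 enabled (for example `-mssse3`).

```c
#include <assert.h>
#include <string.h>
#include <immintrin.h>

// One 12-byte -> 16-byte encoding round, intrinsics form of SHUF + TRAN.
static __m128i
enc_round_reference (__m128i in)
{
    const __m128i lut0 = _mm_set_epi8(
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    const __m128i lut1 = _mm_setr_epi8(
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

    // SHUF: gather each 3-byte group and split it into four 6-bit values.
    __m128i t  = _mm_shuffle_epi8(in, lut0);
    __m128i hi = _mm_and_si128(t, _mm_set1_epi32(0x0FC0FC00));
    __m128i lo = _mm_and_si128(t, _mm_set1_epi32(0x003F03F0));
    hi = _mm_mulhi_epu16(hi, _mm_set1_epi32(0x04000040));
    lo = _mm_mullo_epi16(lo, _mm_set1_epi32(0x01000010));
    __m128i indices = _mm_or_si128(hi, lo);

    // TRAN: add a per-range offset to map each 6-bit value onto ASCII.
    __m128i off = _mm_subs_epu8(indices, _mm_set1_epi8(51));
    __m128i gt  = _mm_cmpgt_epi8(indices, _mm_set1_epi8(25));
    off = _mm_sub_epi8(off, gt);
    return _mm_add_epi8(indices, _mm_shuffle_epi8(lut1, off));
}

int main (void)
{
    const char in[17] = "Hello, worldXXXX";   // 12 payload bytes + 4 slack
    char out[17] = { 0 };

    __m128i v = _mm_loadu_si128((const __m128i *) in);
    _mm_storeu_si128((__m128i *) out, enc_round_reference(v));

    assert(memcmp(out, "SGVsbG8sIHdvcmxk", 16) == 0);
    return 0;
}
```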
The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3)) /* V V V + */ \ + SHUF(B, E, F) /* | | | | + x */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4)) /* | | | + */ \ + SHUF(C, A, F) /* + | | | | x */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5)) /* | | | + */ \ + SHUF(D, A, B) /* + x | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A, B) /* + x V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C, D) /* + x | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. +#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. Input is read in blocks of 16 + // bytes, so "reserve" four bytes from the input buffer to ensure that + // we never read beyond the end of the input buffer. + size_t rounds = (*slen - 4) / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + // Number of times to go through the 36x loop. + size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. + rounds %= 36; + + // Lookup tables. + const __m128i lut0 = _mm_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + + const __m128i lut1 = _mm_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m128i a, b, c, d, e, f; + + __asm__ volatile ( + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. 
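A quick standalone check of the "reserve four bytes" bound computed above (the SSSE3 encoder later in this patch uses the same arithmetic): for every input length of at least 16 bytes, the final 16-byte load stays inside the buffer and at least 4 bytes are left over for the scalar tail.

```c
#include <assert.h>
#include <stddef.h>

int main (void)
{
    for (size_t slen = 16; slen < 100000; slen++) {
        size_t rounds = (slen - 4) / 12;        // as computed above
        assert(rounds >= 1);
        assert((rounds - 1) * 12 + 16 <= slen); // last 16-byte read in bounds
        assert(slen - rounds * 12 >= 4);        // bytes left for the tail
    }
    return 0;
}
```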
+ " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(12 * 36), %[src] \n\t" + " add $(16 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(12 * 18), %[src] \n\t" + " add $(16 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(12 * 9), %[src] \n\t" + " add $(16 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(12 * 6), %[src] \n\t" + " add $(16 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. + "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "=&x" (f) + + // Inputs (not modified). + : [lut0] "x" (lut0), + [lut1] "x" (lut1), + [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)), + [msk1] "x" (_mm_set1_epi32(0x04000040)), + [msk2] "x" (_mm_set1_epi32(0x003F03F0)), + [msk3] "x" (_mm_set1_epi32(0x01000010)), + [n51] "x" (_mm_set1_epi8(51)), + [n25] "x" (_mm_set1_epi8(25)) + + // Clobbers. + : "cc", "memory" + ); +} + +#pragma GCC diagnostic pop diff --git a/deps/base64/base64/lib/arch/avx2/codec.c b/deps/base64/base64/lib/arch/avx2/codec.c index 0498548b80d..8a2aa4a6071 100644 --- a/deps/base64/base64/lib/arch/avx2/codec.c +++ b/deps/base64/base64/lib/arch/avx2/codec.c @@ -11,11 +11,25 @@ #if HAVE_AVX2 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. 
+#ifndef BASE64_AVX2_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_AVX2_USE_ASM 1 +# else +# define BASE64_AVX2_USE_ASM 0 +# endif +#endif + #include "dec_reshuffle.c" #include "dec_loop.c" -#include "enc_translate.c" -#include "enc_reshuffle.c" -#include "enc_loop.c" + +#if BASE64_AVX2_USE_ASM +# include "enc_loop_asm.c" +#else +# include "enc_translate.c" +# include "enc_reshuffle.c" +# include "enc_loop.c" +#endif #endif // HAVE_AVX2 diff --git a/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c b/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c new file mode 100644 index 00000000000..eb775a1d1f0 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx2/enc_loop_asm.c @@ -0,0 +1,291 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round and a +// constant offset. +#define LOAD(R0, ROUND, OFFSET) \ + "vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. +#define SHUF(R0, R1, R2) \ + "vpshufb %[lut0], %["R0"], %["R1"] \n\t" \ + "vpand %["R1"], %[msk0], %["R2"] \n\t" \ + "vpand %["R1"], %[msk2], %["R1"] \n\t" \ + "vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \ + "vpmullw %["R1"], %[msk3], %["R1"] \n\t" \ + "vpor %["R1"], %["R2"], %["R1"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "vpsubusb %[n51], %["R1"], %["R0"] \n\t" \ + "vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \ + "vpsubb %["R2"], %["R0"], %["R0"] \n\t" \ + "vpshufb %["R0"], %[lut1], %["R2"] \n\t" \ + "vpaddb %["R1"], %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0, -4) \ + SHUF("a", "b", "c") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $24, %[src] \n\t" \ + "add $32, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. +// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. 
+// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0, -4) /* + */ \ + SHUF("a", "d", "e") /* | + x */ \ + LOAD("b", 1, -4) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2, -4) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3), -4) /* V V V + */ \ + SHUF(B, E, F) /* | | | | + x */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4), -4) /* | | | + */ \ + SHUF(C, A, F) /* + | | | | x */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5), -4) /* | | | + */ \ + SHUF(D, A, B) /* + x | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A, B) /* + x V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C, D) /* + x | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. +#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 32) { + return; + } + + // Process blocks of 24 bytes at a time. Because blocks are loaded 32 + // bytes at a time an offset of -4, ensure that there will be at least + // 4 remaining bytes after the last round, so that the final read will + // not pass beyond the bounds of the input buffer. + size_t rounds = (*slen - 4) / 24; + + *slen -= rounds * 24; // 24 bytes consumed per round + *olen += rounds * 32; // 32 bytes produced per round + + // Pre-decrement the number of rounds to get the number of rounds + // *after* the first round, which is handled as a special case. + rounds--; + + // Number of times to go through the 36x loop. + size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. 
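The special-cased first round mentioned above is handled a few lines further down: it loads at offset 0 and applies `vpermd` with the index vector `{0, 0, 1, 2, 3, 4, 5, 6}`, which reproduces what a (buffer-underflowing) load at offset -4 would have put in the byte positions the shuffle actually reads. A standalone sketch checking that equivalence (illustrative, compile with `-mavx2`):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <immintrin.h>

int main (void)
{
    uint8_t buf[64];
    for (int i = 0; i < 64; i++) buf[i] = (uint8_t) i;

    // Stand-in for the source pointer; on the real first round, src - 4
    // would point before the start of the input buffer.
    const uint8_t *src = buf + 4;

    // Regular rounds: load from src - 4, use register bytes 4 and up.
    __m256i minus4 = _mm256_loadu_si256((const __m256i *) (src - 4));

    // First round: load from src itself and permute dwords {0,0,1,2,3,4,5,6}.
    __m256i first = _mm256_loadu_si256((const __m256i *) src);
    first = _mm256_permutevar8x32_epi32(first,
        _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

    uint8_t a[32], b[32];
    _mm256_storeu_si256((__m256i *) a, minus4);
    _mm256_storeu_si256((__m256i *) b, first);

    // Everything from byte 4 up, including the 24 payload bytes the
    // shuffle consumes at positions 4..27, is identical.
    assert(memcmp(a + 4, b + 4, 28) == 0);
    return 0;
}
```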
+ rounds %= 36; + + // Lookup tables. + const __m256i lut0 = _mm256_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + 14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5); + + const __m256i lut1 = _mm256_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m256i a, b, c, d, e; + + // Temporary register f doubles as the shift mask for the first round. + __m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6); + + __asm__ volatile ( + + // The first loop iteration requires special handling to ensure + // that the read, which is normally done at an offset of -4, + // does not underflow the buffer. Load the buffer at an offset + // of 0 and permute the input to achieve the same effect. + LOAD("a", 0, 0) + "vpermd %[a], %[f], %[a] \n\t" + + // Perform the standard shuffling and translation steps. + SHUF("a", "b", "c") + TRAN("a", "b", "c") + + // Store the result and increment the source and dest pointers. + "vmovdqu %[a], (%[dst]) \n\t" + "add $24, %[src] \n\t" + "add $32, %[dst] \n\t" + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. + " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(24 * 36), %[src] \n\t" + " add $(32 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(24 * 18), %[src] \n\t" + " add $(32 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(24 * 9), %[src] \n\t" + " add $(32 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(24 * 6), %[src] \n\t" + " add $(32 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. + "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "+x" (f) + + // Inputs (not modified). 
+ : [lut0] "x" (lut0), + [lut1] "x" (lut1), + [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)), + [msk1] "x" (_mm256_set1_epi32(0x04000040)), + [msk2] "x" (_mm256_set1_epi32(0x003F03F0)), + [msk3] "x" (_mm256_set1_epi32(0x01000010)), + [n51] "x" (_mm256_set1_epi8(51)), + [n25] "x" (_mm256_set1_epi8(25)) + + // Clobbers. + : "cc", "memory" + ); +} + +#pragma GCC diagnostic pop diff --git a/deps/base64/base64/lib/arch/avx512/codec.c b/deps/base64/base64/lib/arch/avx512/codec.c new file mode 100644 index 00000000000..664120853d4 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx512/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_AVX512 +#include + +#include "../avx2/dec_reshuffle.c" +#include "../avx2/dec_loop.c" +#include "enc_reshuffle_translate.c" +#include "enc_loop.c" + +#endif // HAVE_AVX512 + +BASE64_ENC_FUNCTION(avx512) +{ +#if HAVE_AVX512 + #include "../generic/enc_head.c" + enc_loop_avx512(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +// Reuse AVX2 decoding. Not supporting AVX512 at present +BASE64_DEC_FUNCTION(avx512) +{ +#if HAVE_AVX512 + #include "../generic/dec_head.c" + dec_loop_avx2(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/deps/base64/base64/lib/arch/avx512/enc_loop.c b/deps/base64/base64/lib/arch/avx512/enc_loop.c new file mode 100644 index 00000000000..4c71e160ae8 --- /dev/null +++ b/deps/base64/base64/lib/arch/avx512/enc_loop.c @@ -0,0 +1,61 @@ +static inline void +enc_loop_avx512_inner (const uint8_t **s, uint8_t **o) +{ + // Load input. + __m512i src = _mm512_loadu_si512((__m512i *) *s); + + // Reshuffle, translate, store. + src = enc_reshuffle_translate(src); + _mm512_storeu_si512((__m512i *) *o, src); + + *s += 48; + *o += 64; +} + +static inline void +enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 48 bytes at a time. Because blocks are loaded 64 + // bytes at a time, ensure that there will be at least 24 remaining + // bytes after the last round, so that the final read will not pass + // beyond the bounds of the input buffer. + size_t rounds = (*slen - 24) / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_avx512_inner(s, o); + enc_loop_avx512_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_avx512_inner(s, o); + break; + } +} diff --git a/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c b/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c new file mode 100644 index 00000000000..5c332bb24ca --- /dev/null +++ b/deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c @@ -0,0 +1,50 @@ +// AVX512 algorithm is based on permutevar and multishift. 
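Before the vector details that follow, a scalar refresher of what every encoder in this patch (the AVX512 reshuffle included) computes per 3-byte group may help when reading the bit-layout comments below. This sketch is not part of the patch; it is the classic "Man" example.

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

static const char tbl[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

int main (void)
{
    const uint8_t in[3] = { 'M', 'a', 'n' };
    char out[5] = { 0 };

    // Three 8-bit bytes become four 6-bit indices into the alphabet.
    out[0] = tbl[in[0] >> 2];
    out[1] = tbl[((in[0] & 0x03) << 4) | (in[1] >> 4)];
    out[2] = tbl[((in[1] & 0x0F) << 2) | (in[2] >> 6)];
    out[3] = tbl[in[2] & 0x3F];

    assert(strcmp(out, "TWFu") == 0);
    return 0;
}
```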
The code is based on +// https://github.com/WojciechMula/base64simd which is under BSD-2 license. + +static inline __m512i +enc_reshuffle_translate (const __m512i input) +{ + // 32-bit input + // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0| + // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4] + // output order [1, 2, 0, 1] + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + + const __m512i shuffle_input = _mm512_setr_epi32(0x01020001, + 0x04050304, + 0x07080607, + 0x0a0b090a, + 0x0d0e0c0d, + 0x10110f10, + 0x13141213, + 0x16171516, + 0x191a1819, + 0x1c1d1b1c, + 0x1f201e1f, + 0x22232122, + 0x25262425, + 0x28292728, + 0x2b2c2a2b, + 0x2e2f2d2e); + + // Reorder bytes + // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0| + // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0] + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input); + + // After multishift a single 32-bit lane has following layout + // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0| + // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0] + // (a = [10:17], b = [4:11], c = [22:27], d = [16:21]) + + // 48, 54, 36, 42, 16, 22, 4, 10 + const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); + __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in); + + // Translate immediatedly after reshuffled. + const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit); + + // Translation 6-bit values to ASCII. + return _mm512_permutexvar_epi8(shuffled_in, lookup); +} diff --git a/deps/base64/base64/lib/arch/neon32/enc_loop.c b/deps/base64/base64/lib/arch/neon32/enc_loop.c index e9e8e285256..d694b33733c 100644 --- a/deps/base64/base64/lib/arch/neon32/enc_loop.c +++ b/deps/base64/base64/lib/arch/neon32/enc_loop.c @@ -100,7 +100,8 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o) [n63] "w" (n63) // Clobbers. - : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" + : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", + "cc", "memory" ); } #endif diff --git a/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c b/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c index cf2fd27e80d..182e9cdf4a1 100644 --- a/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c +++ b/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c @@ -160,7 +160,8 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) // Clobbers. : "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15" + "v12", "v13", "v14", "v15", + "cc", "memory" ); } diff --git a/deps/base64/base64/lib/arch/sse41/codec.c b/deps/base64/base64/lib/arch/sse41/codec.c index 00645feda83..6e5afe30011 100644 --- a/deps/base64/base64/lib/arch/sse41/codec.c +++ b/deps/base64/base64/lib/arch/sse41/codec.c @@ -11,11 +11,25 @@ #if HAVE_SSE41 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. 
+#ifndef BASE64_SSE41_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSE41_USE_ASM 1 +# else +# define BASE64_SSE41_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_SSE41_USE_ASM +# include "../ssse3/enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_SSE41 diff --git a/deps/base64/base64/lib/arch/sse42/codec.c b/deps/base64/base64/lib/arch/sse42/codec.c index cf5d97cfb29..dde823b7aa0 100644 --- a/deps/base64/base64/lib/arch/sse42/codec.c +++ b/deps/base64/base64/lib/arch/sse42/codec.c @@ -11,11 +11,25 @@ #if HAVE_SSE42 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +#ifndef BASE64_SSE42_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSE42_USE_ASM 1 +# else +# define BASE64_SSE42_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_SSE42_USE_ASM +# include "../ssse3/enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_SSE42 diff --git a/deps/base64/base64/lib/arch/ssse3/codec.c b/deps/base64/base64/lib/arch/ssse3/codec.c index ad14a4589de..a812a2901f4 100644 --- a/deps/base64/base64/lib/arch/ssse3/codec.c +++ b/deps/base64/base64/lib/arch/ssse3/codec.c @@ -11,11 +11,27 @@ #if HAVE_SSSE3 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM +// registers, which is not enough to run the inline assembly. +#ifndef BASE64_SSSE3_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSSE3_USE_ASM 1 +# else +# define BASE64_SSSE3_USE_ASM 0 +# endif +#endif + #include "dec_reshuffle.c" #include "dec_loop.c" -#include "enc_reshuffle.c" -#include "enc_translate.c" -#include "enc_loop.c" + +#if BASE64_SSSE3_USE_ASM +# include "enc_loop_asm.c" +#else +# include "enc_reshuffle.c" +# include "enc_translate.c" +# include "enc_loop.c" +#endif #endif // HAVE_SSSE3 diff --git a/deps/base64/base64/lib/arch/ssse3/enc_loop_asm.c b/deps/base64/base64/lib/arch/ssse3/enc_loop_asm.c new file mode 100644 index 00000000000..0cdb340a63b --- /dev/null +++ b/deps/base64/base64/lib/arch/ssse3/enc_loop_asm.c @@ -0,0 +1,268 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round. +#define LOAD(R0, ROUND) \ + "lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. 
+#define SHUF(R0, R1) \ + "pshufb %[lut0], %["R0"] \n\t" \ + "movdqa %["R0"], %["R1"] \n\t" \ + "pand %[msk0], %["R0"] \n\t" \ + "pand %[msk2], %["R1"] \n\t" \ + "pmulhuw %[msk1], %["R0"] \n\t" \ + "pmullw %[msk3], %["R1"] \n\t" \ + "por %["R1"], %["R0"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "movdqa %["R0"], %["R1"] \n\t" \ + "movdqa %["R0"], %["R2"] \n\t" \ + "psubusb %[n51], %["R1"] \n\t" \ + "pcmpgtb %[n25], %["R2"] \n\t" \ + "psubb %["R2"], %["R1"] \n\t" \ + "movdqa %[lut1], %["R2"] \n\t" \ + "pshufb %["R1"], %["R2"] \n\t" \ + "paddb %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0) \ + SHUF("a", "b") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $12, %[src] \n\t" \ + "add $16, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. +// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. +// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0) /* + */ \ + SHUF("a", "d") /* | + */ \ + LOAD("b", 1) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3)) /* V V V + */ \ + SHUF(B, E) /* | | | | + */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4)) /* | | | + */ \ + SHUF(C, A) /* + | | | | */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5)) /* | | | + */ \ + SHUF(D, A) /* + | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A) /* + V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C) /* + | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. 
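The TRAN block above turns each 6-bit value into ASCII without a 64-entry lookup: it selects one of a handful of additive offsets depending on which alphabet range the value falls in. A scalar model of that index and offset computation, using the same `lut1` contents as the code (standalone sketch, not part of the patch):

```c
#include <assert.h>
#include <stdint.h>

static const char ref[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

int main (void)
{
    // Same contents as lut1: the offset to add for each range selector.
    static const int8_t lut1[16] = {
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0
    };

    for (int v = 0; v < 64; v++) {
        // psubusb clamps (v - 51) at zero; subtracting the all-ones result
        // of pcmpgtb(v, 25) then adds one for every value above 25.
        int idx = (v > 51 ? v - 51 : 0) + (v > 25 ? 1 : 0);
        assert((char) (v + lut1[idx]) == ref[v]);
    }
    return 0;
}
```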
+#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. Input is read in blocks of 16 + // bytes, so "reserve" four bytes from the input buffer to ensure that + // we never read beyond the end of the input buffer. + size_t rounds = (*slen - 4) / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + // Number of times to go through the 36x loop. + size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. + rounds %= 36; + + // Lookup tables. + const __m128i lut0 = _mm_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + + const __m128i lut1 = _mm_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m128i a, b, c, d, e, f; + + __asm__ volatile ( + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. + " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(12 * 36), %[src] \n\t" + " add $(16 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(12 * 18), %[src] \n\t" + " add $(16 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(12 * 9), %[src] \n\t" + " add $(16 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(12 * 6), %[src] \n\t" + " add $(16 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. 
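+	// (Two comparisons suffice to select one of the six tails: the first
+	// compare against 3 splits 0..2 from 3..5, after which a compare against
+	// 1 or against 4 lands on the exact label. The labels fall through, so
+	// jumping to "5:" runs five single rounds, "4:" runs four, and so on.)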
+ "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "=&x" (f) + + // Inputs (not modified). + : [lut0] "x" (lut0), + [lut1] "x" (lut1), + [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)), + [msk1] "x" (_mm_set1_epi32(0x04000040)), + [msk2] "x" (_mm_set1_epi32(0x003F03F0)), + [msk3] "x" (_mm_set1_epi32(0x01000010)), + [n51] "x" (_mm_set1_epi8(51)), + [n25] "x" (_mm_set1_epi8(25)) + + // Clobbers. + : "cc", "memory" + ); +} + +#pragma GCC diagnostic pop diff --git a/deps/base64/base64/lib/codec_choose.c b/deps/base64/base64/lib/codec_choose.c index 6a07d6a74cc..abef3f2ae9f 100644 --- a/deps/base64/base64/lib/codec_choose.c +++ b/deps/base64/base64/lib/codec_choose.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "../include/libbase64.h" #include "codecs.h" @@ -10,7 +11,7 @@ #if (__x86_64__ || __i386__ || _M_X86 || _M_X64) #define BASE64_X86 - #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2) + #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512) #define BASE64_X86_SIMD #endif #endif @@ -31,7 +32,7 @@ __cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx) #else #include - #if HAVE_AVX2 || HAVE_AVX + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) static inline uint64_t _xgetbv (uint32_t index) { @@ -45,6 +46,12 @@ #endif #endif +#ifndef bit_AVX512vl +#define bit_AVX512vl (1 << 31) +#endif +#ifndef bit_AVX512vbmi +#define bit_AVX512vbmi (1 << 1) +#endif #ifndef bit_AVX2 #define bit_AVX2 (1 << 5) #endif @@ -75,6 +82,7 @@ BASE64_ENC_FUNCTION(arch); \ BASE64_DEC_FUNCTION(arch); \ +BASE64_CODEC_FUNCS(avx512) BASE64_CODEC_FUNCS(avx2) BASE64_CODEC_FUNCS(neon32) BASE64_CODEC_FUNCS(neon64) @@ -91,9 +99,10 @@ codec_choose_forced (struct codec *codec, int flags) // always allow it, even if the codec is a no-op. // For testing purposes. 
- if (!(flags & 0xFF)) { + if (!(flags & 0xFFFF)) { return false; } + if (flags & BASE64_FORCE_AVX2) { codec->enc = base64_stream_encode_avx2; codec->dec = base64_stream_decode_avx2; @@ -134,6 +143,11 @@ codec_choose_forced (struct codec *codec, int flags) codec->dec = base64_stream_decode_avx; return true; } + if (flags & BASE64_FORCE_AVX512) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } return false; } @@ -178,8 +192,8 @@ codec_choose_x86 (struct codec *codec) max_level = __get_cpuid_max(0, NULL); #endif - #if HAVE_AVX2 || HAVE_AVX - // Check for AVX/AVX2 support: + #if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX + // Check for AVX/AVX2/AVX512 support: // Checking for AVX requires 3 things: // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions // (allowing saving YMM registers on context switch) @@ -194,7 +208,17 @@ codec_choose_x86 (struct codec *codec) if (ecx & bit_XSAVE_XRSTORE) { uint64_t xcr_mask; xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { + if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once + #if HAVE_AVX512 + if (max_level >= 7) { + __cpuid_count(7, 0, eax, ebx, ecx, edx); + if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) { + codec->enc = base64_stream_encode_avx512; + codec->dec = base64_stream_decode_avx512; + return true; + } + } + #endif #if HAVE_AVX2 if (max_level >= 7) { __cpuid_count(7, 0, eax, ebx, ecx, edx); diff --git a/deps/base64/base64/lib/lib.c b/deps/base64/base64/lib/lib.c index 4703512b87a..053931a9918 100644 --- a/deps/base64/base64/lib/lib.c +++ b/deps/base64/base64/lib/lib.c @@ -68,7 +68,7 @@ void base64_stream_decode_init (struct base64_state *state, int flags) { // If any of the codec flags are set, redo choice: - if (codec.dec == NULL || flags & 0xFF) { + if (codec.dec == NULL || flags & 0xFFFF) { codec_choose(&codec, flags); } state->eof = 0; diff --git a/deps/base64/base64/test/Makefile b/deps/base64/base64/test/Makefile index d1045824195..c896627e0bd 100644 --- a/deps/base64/base64/test/Makefile +++ b/deps/base64/base64/test/Makefile @@ -11,12 +11,15 @@ else BENCH_LDFLAGS=-lrt endif -.PHONY: clean test +.PHONY: clean test valgrind test: clean test_base64 benchmark ./test_base64 ./benchmark +valgrind: clean test_base64 + valgrind --error-exitcode=2 ./test_base64 + test_base64: test_base64.c codec_supported.o ../lib/libbase64.o $(CC) $(CFLAGS) -o $@ $^ diff --git a/deps/base64/base64/test/ci/analysis.sh b/deps/base64/base64/test/ci/analysis.sh new file mode 100755 index 00000000000..f7da1857fe0 --- /dev/null +++ b/deps/base64/base64/test/ci/analysis.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -ve + +MACHINE=$(uname -m) +export CC=gcc + +uname -a +clang --version # make analyse +${CC} --version # make -C test valgrind + +for USE_ASSEMBLY in 0 1; do + if [ "${MACHINE}" == "x86_64" ]; then + export SSSE3_CFLAGS="-mssse3 -DBASE64_SSSE3_USE_ASM=${USE_ASSEMBLY}" + export SSE41_CFLAGS="-msse4.1 -DBASE64_SSE41_USE_ASM=${USE_ASSEMBLY}" + export SSE42_CFLAGS="-msse4.2 -DBASE64_SSE42_USE_ASM=${USE_ASSEMBLY}" + export AVX_CFLAGS="-mavx -DBASE64_AVX_USE_ASM=${USE_ASSEMBLY}" + export AVX2_CFLAGS="-mavx2 -DBASE64_AVX2_USE_ASM=${USE_ASSEMBLY}" + # Temporarily disable AVX512; it is not available in CI yet. 
+ # export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" + elif [ "${MACHINE}" == "aarch64" ]; then + export NEON64_CFLAGS="-march=armv8-a" + elif [ "${MACHINE}" == "armv7l" ]; then + export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon" + fi + + if [ ${USE_ASSEMBLY} -eq 0 ]; then + echo "::group::analyze" + make analyze + echo "::endgroup::" + fi + + echo "::group::valgrind (USE_ASSEMBLY=${USE_ASSEMBLY})" + make clean + make + make -C test valgrind + echo "::endgroup::" +done diff --git a/deps/base64/base64/test/ci/test.sh b/deps/base64/base64/test/ci/test.sh index 066a49f400b..fb188418cca 100755 --- a/deps/base64/base64/test/ci/test.sh +++ b/deps/base64/base64/test/ci/test.sh @@ -7,9 +7,11 @@ if [ "${MACHINE}" == "x86_64" ]; then export SSE41_CFLAGS=-msse4.1 export SSE42_CFLAGS=-msse4.2 export AVX_CFLAGS=-mavx - # no AVX2 on GHA macOS + # no AVX2 or AVX512 on GHA macOS if [ "$(uname -s)" != "Darwin" ]; then export AVX2_CFLAGS=-mavx2 + # Temporarily disable AVX512; it is not available in CI yet. + # export AVX512_CFLAGS="-mavx512vl -mavx512vbmi" fi elif [ "${MACHINE}" == "aarch64" ]; then export NEON64_CFLAGS="-march=armv8-a" diff --git a/deps/base64/base64/test/codec_supported.c b/deps/base64/base64/test/codec_supported.c index a027b9943bf..f68c766875a 100644 --- a/deps/base64/base64/test/codec_supported.c +++ b/deps/base64/base64/test/codec_supported.c @@ -11,6 +11,7 @@ static char *_codecs[] = , "SSE41" , "SSE42" , "AVX" +, "AVX512" , NULL } ; diff --git a/deps/base64/base64/test/test_base64.c b/deps/base64/base64/test/test_base64.c index bec52d146c8..94aad2d489b 100644 --- a/deps/base64/base64/test/test_base64.c +++ b/deps/base64/base64/test/test_base64.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "../include/libbase64.h" #include "codec_supported.h" #include "moby_dick.h" @@ -92,7 +93,7 @@ assert_roundtrip (int flags, const char *src) } static int -test_char_table (int flags) +test_char_table (int flags, bool use_malloc) { bool fail = false; char chr[256]; @@ -107,8 +108,24 @@ test_char_table (int flags) for (int i = 0; i < 256; i++) { size_t chrlen = 256 - i; + char* src = &chr[i]; + if (use_malloc) { + src = malloc(chrlen); /* malloc/copy this so valgrind can find out-of-bound access */ + if (src == NULL) { + printf( + "FAIL: encoding @ %d: allocation of %lu bytes failed\n", + i, (unsigned long)chrlen + ); + fail = true; + continue; + } + memcpy(src, &chr[i], chrlen); + } - base64_encode(&chr[i], chrlen, enc, &enclen, BASE64_FORCE_PLAIN); + base64_encode(src, chrlen, enc, &enclen, flags); + if (use_malloc) { + free(src); + } if (!base64_decode(enc, enclen, dec, &declen, flags)) { printf("FAIL: decoding @ %d: decoding error\n", i); @@ -198,6 +215,11 @@ test_streaming (int flags) while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) { enclen += partlen; inpos += bs; + + // Has the entire buffer been consumed? 
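+			// (The reference encoding of the 256-byte test input is 344
+			// characters, so an offset of 400 means we are well past the end
+			// of the data; without this guard the length expression above
+			// could wrap around once inpos exceeds reflen.)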
+ if (inpos >= 400) { + break; + } } if (enclen != 256) { printf("FAIL: stream decoding gave incorrect size: " @@ -336,7 +358,8 @@ test_one_codec (const char *codec, int flags) fail |= assert_roundtrip(flags, vec[i].out); } - fail |= test_char_table(flags); + fail |= test_char_table(flags, false); /* test with unaligned input buffer */ + fail |= test_char_table(flags, true); /* test for out-of-bound input read */ fail |= test_streaming(flags); fail |= test_invalid_dec_input(flags); diff --git a/doc/contributing/maintaining/maintaining-dependencies.md b/doc/contributing/maintaining/maintaining-dependencies.md index 9c2e3485271..dc78a9dd2b5 100644 --- a/doc/contributing/maintaining/maintaining-dependencies.md +++ b/doc/contributing/maintaining/maintaining-dependencies.md @@ -10,7 +10,7 @@ This a list of all the dependencies: * [acorn 8.11.2][] * [ada 2.7.2][] -* [base64 0.5.0][] +* [base64 0.5.1][] * [brotli 1.0.9][] * [c-ares 1.20.1][] * [cjs-module-lexer 1.2.2][] @@ -155,7 +155,7 @@ an abstract syntax tree walker for the ESTree format. The [ada](https://github.com/ada-url/ada) dependency is a fast and spec-compliant URL parser written in C++. -### base64 0.5.0 +### base64 0.5.1 The [base64](https://github.com/aklomp/base64) dependency is a base64 stream encoding/decoding library in C99 with SIMD and OpenMP acceleration. @@ -320,7 +320,7 @@ performance improvements not currently available in standard zlib. [acorn 8.11.2]: #acorn-8112 [ada 2.7.2]: #ada-272 -[base64 0.5.0]: #base64-050 +[base64 0.5.1]: #base64-051 [brotli 1.0.9]: #brotli-109 [c-ares 1.20.1]: #c-ares-1201 [cjs-module-lexer 1.2.2]: #cjs-module-lexer-122 diff --git a/src/base64_version.h b/src/base64_version.h index fa492a293b4..c3737f4beeb 100644 --- a/src/base64_version.h +++ b/src/base64_version.h @@ -2,5 +2,5 @@ // Refer to tools/dep_updaters/update-base64.sh #ifndef SRC_BASE64_VERSION_H_ #define SRC_BASE64_VERSION_H_ -#define BASE64_VERSION "0.5.0" +#define BASE64_VERSION "0.5.1" #endif // SRC_BASE64_VERSION_H_