deps: update base64 to 0.5.1
PR-URL: https://github.com/nodejs/node/pull/50629
Fixes: https://github.com/nodejs/node/issues/50561
Fixes: https://github.com/nodejs/node/pull/45091
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Mohammed Keyvanzadeh <mohammadkeyvanzade94@gmail.com>
Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com>
Reviewed-By: Richard Lau <rlau@redhat.com>
parent 3cce03a03f
commit f45bb801b6
deps/base64/base64.gyp (vendored, 27 lines changed)
@@ -49,6 +49,7 @@
 'HAVE_SSE42=1',
 'HAVE_AVX=1',
 'HAVE_AVX2=1',
+'HAVE_AVX512=1',
 ],
 'dependencies': [
 'base64_ssse3',
@@ -56,6 +57,7 @@
 'base64_sse42',
 'base64_avx',
 'base64_avx2',
+'base64_avx512',
 ],
 }, {
 'sources': [
@@ -64,6 +66,7 @@
 'base64/lib/arch/sse42/codec.c',
 'base64/lib/arch/avx/codec.c',
 'base64/lib/arch/avx2/codec.c',
+'base64/lib/arch/avx512/codec.c',
 ],
 }],
 ],
@@ -165,6 +168,30 @@
 ],
 },
 
+{
+'target_name': 'base64_avx512',
+'type': 'static_library',
+'include_dirs': [ 'base64/include', 'base64/lib' ],
+'sources': [ 'base64/lib/arch/avx512/codec.c' ],
+'defines': [ 'BASE64_STATIC_DEFINE', 'HAVE_AVX512=1' ],
+'conditions': [
+[ 'OS!="win"', {
+'cflags': [ '-mavx512vl', '-mavx512vbmi' ],
+'xcode_settings': {
+'OTHER_CFLAGS': [ '-mavx512vl', '-mavx512vbmi' ]
+},
+}, {
+'msvs_settings': {
+'VCCLCompilerTool': {
+'AdditionalOptions': [
+'/arch:AVX512'
+],
+},
+},
+}],
+],
+},
+
 {
 'target_name': 'base64_neon32',
 'type': 'static_library',
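The new `base64_avx512` gyp target above compiles its codec with AVX512 VL/VBMI flags (`-mavx512vl -mavx512vbmi`, or `/arch:AVX512` on MSVC), but those instructions are only executed when the host CPU actually reports the features at run time. A minimal sketch of that kind of check, using the GCC/Clang `__builtin_cpu_supports` builtin; this is an illustration only, not the library's own `codec_choose` logic:

```c
#include <stdio.h>

// Illustrative only: the vendored library does its own runtime codec
// selection; this merely shows the kind of CPU feature check that gates
// use of objects built with -mavx512vl / -mavx512vbmi.
int main(void)
{
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("avx512vl") && __builtin_cpu_supports("avx512vbmi")) {
        puts("AVX512 VL+VBMI available: the avx512 encoder can be selected");
    } else {
        puts("AVX512 VL+VBMI not available: fall back to AVX2/SSE/plain codecs");
    }
#else
    puts("feature check not implemented for this compiler");
#endif
    return 0;
}
```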
deps/base64/base64/.gitignore (vendored, 13 lines changed)
@@ -1,12 +1 @@
-*.o
-bin/base64
-lib/config.h
-test/benchmark
-test/test_base64
-
-# visual studio symbol db, etc.
-.vs/
-# build directory used by CMakePresets
-out/
-# private cmake presets
-CMakeUserPresets.json
+# Intentionally empty
deps/base64/base64/CMakeLists.txt (vendored, 6 lines changed)
@@ -17,7 +17,7 @@ if (POLICY CMP0127)
 cmake_policy(SET CMP0127 NEW)
 endif()
 
-project(base64 LANGUAGES C VERSION 0.5.0)
+project(base64 LANGUAGES C VERSION 0.5.1)
 
 include(GNUInstallDirs)
 include(CMakeDependentOption)
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
+cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
+add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 
 cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
 lib/arch/sse42/codec.c
 lib/arch/avx/codec.c
 lib/arch/avx2/codec.c
+lib/arch/avx512/codec.c
 
 lib/arch/neon32/codec.c
 lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
 configure_codec(SSE42 __SSSE4_2__)
 configure_codec(AVX)
 configure_codec(AVX2)
+configure_codec(AVX512)
 
 elseif (_TARGET_ARCH STREQUAL "arm")
 set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
deps/base64/base64/LICENSE (vendored, 4 lines changed)
@@ -1,7 +1,7 @@
 Copyright (c) 2005-2007, Nick Galbreath
-Copyright (c) 2013-2019, Alfred Klomp
-Copyright (c) 2015-2017, Wojciech Mula
+Copyright (c) 2015-2018, Wojciech Muła
 Copyright (c) 2016-2017, Matthieu Darbois
+Copyright (c) 2013-2022, Alfred Klomp
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
deps/base64/base64/Makefile (vendored, 9 lines changed)
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
 OBJCOPY ?= objcopy
 
 OBJS = \
+  lib/arch/avx512/codec.o \
   lib/arch/avx2/codec.o \
   lib/arch/generic/codec.o \
   lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
   lib/codec_choose.o \
   lib/tables/tables.o
 
+HAVE_AVX512 = 0
 HAVE_AVX2 = 0
 HAVE_NEON32 = 0
 HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX = 0
 
 # The user should supply compiler flags for the codecs they want to build.
 # Check which codecs we're going to include:
+ifdef AVX512_CFLAGS
+  HAVE_AVX512 = 1
+endif
 ifdef AVX2_CFLAGS
   HAVE_AVX2 = 1
 endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
 	$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
 
 lib/config.h:
-	@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
+	@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
+	@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 	@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@@ -75,6 +81,7 @@ lib/config.h:
 $(OBJS): lib/config.h
 $(OBJS): CFLAGS += -Ilib
 
+lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
 lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
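For a concrete picture of what the `lib/config.h` recipe above emits, this is roughly the header a build invoked as `AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make` would generate; only the defines visible in this hunk are listed, and the 0/1 values are assumptions for the example:

```c
/* lib/config.h as generated by the Makefile rule above (illustrative values). */
#define HAVE_AVX512 1
#define HAVE_AVX2 0
#define HAVE_NEON32 0
#define HAVE_NEON64 0
#define HAVE_SSSE3 0
```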
deps/base64/base64/README.md (vendored, 25 lines changed)
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support is currently limited to encoding, using the AVX512 VL and VBMI
+instructions; decoding falls back to the AVX2 implementation. CPUs newer than
+Cannon Lake (2018) support these instructions.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder is based on code from Wojciech Muła's
+[base64simd](https://github.com/WojciechMula/base64simd) library.
+
 The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
 
 ## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
+`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
 A typical build invocation on x86 looks like this:
 
 ```sh
@@ -93,6 +100,15 @@ Example:
 AVX2_CFLAGS=-mavx2 make
 ```
 
+### AVX512
+
+To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
+Example:
+
+```sh
+AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
+```
+
 The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
 
 ### SSSE3
@@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
 The following constants can be used:
 
 - `BASE64_FORCE_AVX2`
+- `BASE64_FORCE_AVX512`
 - `BASE64_FORCE_NEON32`
 - `BASE64_FORCE_NEON64`
 - `BASE64_FORCE_PLAIN`
@@ -434,7 +451,7 @@ x86 processors
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
 | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
-| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 |
+| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243\* | 6233 | 4160\* | 6344 |
 | Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - |
 | Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - |
 | Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - |
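The wrapper and streaming functions the README describes are the same ones the rewritten `bin/base64.c` below calls (`base64_stream_encode_init`, `base64_stream_encode`, `base64_stream_encode_final`). A minimal self-contained sketch of that sequence; the input literal and buffer size are assumptions for the example:

```c
#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
    const char *chunk = "streamed input";   // example data (assumption)
    char out[128];                          // comfortably larger than 4/3 of the input
    size_t outlen;
    struct base64_state state;

    // Same init / encode / final sequence that bin/base64.c runs per buffer.
    base64_stream_encode_init(&state, 0);

    base64_stream_encode(&state, chunk, strlen(chunk), out, &outlen);
    fwrite(out, 1, outlen, stdout);

    // Flush the remaining bytes and padding.
    base64_stream_encode_final(&state, out, &outlen);
    fwrite(out, 1, outlen, stdout);

    putchar('\n');
    return 0;
}
```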
deps/base64/base64/bin/base64.c (vendored, 533 lines changed)
@ -1,128 +1,477 @@
|
||||
#include <stddef.h> // size_t
|
||||
#include <stdio.h> // fopen()
|
||||
#include <string.h> // strlen()
|
||||
#define _XOPEN_SOURCE // IOV_MAX
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <getopt.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <sys/uio.h>
|
||||
#include "../include/libbase64.h"
|
||||
|
||||
#define BUFSIZE 1024 * 1024
|
||||
// Size of the buffer for the "raw" (not base64-encoded) data in bytes.
|
||||
#define BUFFER_RAW_SIZE (1024 * 1024)
|
||||
|
||||
static char buf[BUFSIZE];
|
||||
static char out[(BUFSIZE * 5) / 3]; // Technically 4/3 of input, but take some margin
|
||||
size_t nread;
|
||||
size_t nout;
|
||||
// Size of the buffer for the base64-encoded data in bytes. The base64-encoded
|
||||
// data is 4/3 the size of the input, with some margin to be sure.
|
||||
#define BUFFER_ENC_SIZE (BUFFER_RAW_SIZE * 4 / 3 + 16)
|
||||
|
||||
static int
|
||||
enc (FILE *fp)
|
||||
// Global config structure.
|
||||
struct config {
|
||||
|
||||
// Name by which the program was called on the command line.
|
||||
const char *name;
|
||||
|
||||
// Name of the input file for logging purposes.
|
||||
const char *file;
|
||||
|
||||
// Input file handle.
|
||||
FILE *fp;
|
||||
|
||||
// Wrap width in characters, for encoding only.
|
||||
size_t wrap;
|
||||
|
||||
// Whether to run in decode mode.
|
||||
bool decode;
|
||||
|
||||
// Whether to just print the help text and exit.
|
||||
bool print_help;
|
||||
};
|
||||
|
||||
// Input/output buffer structure.
|
||||
struct buffer {
|
||||
|
||||
// Runtime-allocated buffer for raw (unencoded) data.
|
||||
char *raw;
|
||||
|
||||
// Runtime-allocated buffer for base64-encoded data.
|
||||
char *enc;
|
||||
};
|
||||
|
||||
static bool
|
||||
buffer_alloc (const struct config *config, struct buffer *buf)
|
||||
{
|
||||
int ret = 1;
|
||||
if ((buf->raw = malloc(BUFFER_RAW_SIZE)) == NULL ||
|
||||
(buf->enc = malloc(BUFFER_ENC_SIZE)) == NULL) {
|
||||
free(buf->raw);
|
||||
fprintf(stderr, "%s: malloc: %s\n",
|
||||
config->name, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
buffer_free (struct buffer *buf)
|
||||
{
|
||||
free(buf->raw);
|
||||
free(buf->enc);
|
||||
}
|
||||
|
||||
static bool
|
||||
writev_retry (const struct config *config, struct iovec *iov, size_t nvec)
|
||||
{
|
||||
// Writing nothing always succeeds.
|
||||
if (nvec == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
ssize_t nwrite;
|
||||
|
||||
// Try to write the vectors to stdout.
|
||||
if ((nwrite = writev(1, iov, nvec)) < 0) {
|
||||
|
||||
// Retry on EINTR.
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Quit on other errors.
|
||||
fprintf(stderr, "%s: writev: %s\n",
|
||||
config->name, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
// The return value of `writev' is the number of bytes written.
|
||||
// To check for success, we traverse the list and remove all
|
||||
// written vectors. The call succeeded if the list is empty.
|
||||
while (true) {
|
||||
|
||||
// Retry if this vector is not or partially written.
|
||||
if (iov->iov_len > (size_t) nwrite) {
|
||||
char *base = iov->iov_base;
|
||||
|
||||
iov->iov_base = (size_t) nwrite + base;
|
||||
iov->iov_len -= (size_t) nwrite;
|
||||
break;
|
||||
}
|
||||
|
||||
// Move to the next vector.
|
||||
nwrite -= iov->iov_len;
|
||||
iov++;
|
||||
|
||||
// Return successfully if all vectors were written.
|
||||
if (--nvec == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool
|
||||
iov_append (const struct config *config, struct iovec *iov,
|
||||
size_t *nvec, char *base, const size_t len)
|
||||
{
|
||||
// Add the buffer to the IO vector array.
|
||||
iov[*nvec].iov_base = base;
|
||||
iov[*nvec].iov_len = len;
|
||||
|
||||
// Increment the array index. Flush the array if it is full.
|
||||
if (++(*nvec) == IOV_MAX) {
|
||||
if (writev_retry(config, iov, IOV_MAX) == false) {
|
||||
return false;
|
||||
}
|
||||
*nvec = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
write_stdout (const struct config *config, const char *buf, size_t len)
|
||||
{
|
||||
while (len > 0) {
|
||||
ssize_t nwrite;
|
||||
|
||||
// Try to write the buffer to stdout.
|
||||
if ((nwrite = write(1, buf, len)) < 0) {
|
||||
|
||||
// Retry on EINTR.
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Quit on other errors.
|
||||
fprintf(stderr, "%s: write: %s\n",
|
||||
config->name, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Update the buffer position.
|
||||
buf += (size_t) nwrite;
|
||||
len -= (size_t) nwrite;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
write_wrapped (const struct config *config, char *buf, size_t len)
|
||||
{
|
||||
static size_t col = 0;
|
||||
|
||||
// Special case: if buf is NULL, print final trailing newline.
|
||||
if (buf == NULL) {
|
||||
if (config->wrap > 0 && col > 0) {
|
||||
return write_stdout(config, "\n", 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// If no wrap width is given, write the entire buffer.
|
||||
if (config->wrap == 0) {
|
||||
return write_stdout(config, buf, len);
|
||||
}
|
||||
|
||||
// Statically allocated IO vector buffer.
|
||||
static struct iovec iov[IOV_MAX];
|
||||
size_t nvec = 0;
|
||||
|
||||
while (len > 0) {
|
||||
|
||||
// Number of characters to fill the current line.
|
||||
size_t nwrite = config->wrap - col;
|
||||
|
||||
// Do not write more data than is available.
|
||||
if (nwrite > len) {
|
||||
nwrite = len;
|
||||
}
|
||||
|
||||
// Append the data to the IO vector array.
|
||||
if (iov_append(config, iov, &nvec, buf, nwrite) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Advance the buffer.
|
||||
len -= nwrite;
|
||||
buf += nwrite;
|
||||
col += nwrite;
|
||||
|
||||
// If the line is full, append a newline.
|
||||
if (col == config->wrap) {
|
||||
if (iov_append(config, iov, &nvec, "\n", 1) == false) {
|
||||
return false;
|
||||
}
|
||||
col = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Write the remaining vectors.
|
||||
if (writev_retry(config, iov, nvec) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
encode (const struct config *config, struct buffer *buf)
|
||||
{
|
||||
size_t nread, nout;
|
||||
struct base64_state state;
|
||||
|
||||
// Initialize the encoder's state structure.
|
||||
base64_stream_encode_init(&state, 0);
|
||||
|
||||
while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
|
||||
base64_stream_encode(&state, buf, nread, out, &nout);
|
||||
if (nout) {
|
||||
fwrite(out, nout, 1, stdout);
|
||||
}
|
||||
if (feof(fp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ferror(fp)) {
|
||||
fprintf(stderr, "read error\n");
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
base64_stream_encode_final(&state, out, &nout);
|
||||
// Read raw data into the buffer.
|
||||
while ((nread = fread(buf->raw, 1, BUFFER_RAW_SIZE, config->fp)) > 0) {
|
||||
|
||||
if (nout) {
|
||||
fwrite(out, nout, 1, stdout);
|
||||
// Encode the raw input into the encoded buffer.
|
||||
base64_stream_encode(&state, buf->raw, nread, buf->enc, &nout);
|
||||
|
||||
// Append the encoded data to the output stream.
|
||||
if (write_wrapped(config, buf->enc, nout) == false) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
out: fclose(fp);
|
||||
fclose(stdout);
|
||||
return ret;
|
||||
|
||||
// Check for stream errors.
|
||||
if (ferror(config->fp)) {
|
||||
fprintf(stderr, "%s: %s: read error\n",
|
||||
config->name, config->file);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Finalize the encoding by adding proper stream terminators.
|
||||
base64_stream_encode_final(&state, buf->enc, &nout);
|
||||
|
||||
// Append this tail to the output stream.
|
||||
if (write_wrapped(config, buf->enc, nout) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Print optional trailing newline.
|
||||
if (write_wrapped(config, NULL, 0) == false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
dec (FILE *fp)
|
||||
decode (const struct config *config, struct buffer *buf)
|
||||
{
|
||||
int ret = 1;
|
||||
size_t nread, nout;
|
||||
struct base64_state state;
|
||||
|
||||
// Initialize the decoder's state structure.
|
||||
base64_stream_decode_init(&state, 0);
|
||||
|
||||
while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
|
||||
if (!base64_stream_decode(&state, buf, nread, out, &nout)) {
|
||||
fprintf(stderr, "decoding error\n");
|
||||
ret = 0;
|
||||
goto out;
|
||||
// Read encoded data into the buffer. Use the smallest buffer size to
|
||||
// be on the safe side: the decoded output will fit the raw buffer.
|
||||
while ((nread = fread(buf->enc, 1, BUFFER_RAW_SIZE, config->fp)) > 0) {
|
||||
|
||||
// Decode the input into the raw buffer.
|
||||
if (base64_stream_decode(&state, buf->enc, nread,
|
||||
buf->raw, &nout) == 0) {
|
||||
fprintf(stderr, "%s: %s: decoding error\n",
|
||||
config->name, config->file);
|
||||
return false;
|
||||
}
|
||||
if (nout) {
|
||||
fwrite(out, nout, 1, stdout);
|
||||
|
||||
// Append the raw data to the output stream.
|
||||
if (write_stdout(config, buf->raw, nout) == false) {
|
||||
return false;
|
||||
}
|
||||
if (feof(fp)) {
|
||||
}
|
||||
|
||||
// Check for stream errors.
|
||||
if (ferror(config->fp)) {
|
||||
fprintf(stderr, "%s: %s: read error\n",
|
||||
config->name, config->file);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
usage (FILE *fp, const struct config *config)
|
||||
{
|
||||
const char *usage =
|
||||
"Usage: %s [OPTION]... [FILE]\n"
|
||||
"If no FILE is given or is specified as '-', "
|
||||
"read from standard input.\n"
|
||||
"Options:\n"
|
||||
" -d, --decode Decode a base64 stream.\n"
|
||||
" -h, --help Print this help text.\n"
|
||||
" -w, --wrap=COLS Wrap encoded lines at this column. "
|
||||
"Default 76, 0 to disable.\n";
|
||||
|
||||
fprintf(fp, usage, config->name);
|
||||
}
|
||||
|
||||
static bool
|
||||
get_wrap (struct config *config, const char *str)
|
||||
{
|
||||
char *eptr;
|
||||
|
||||
// Reject empty strings.
|
||||
if (*str == '\0') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Convert the input string to a signed long.
|
||||
const long wrap = strtol(str, &eptr, 10);
|
||||
|
||||
// Reject negative numbers.
|
||||
if (wrap < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reject strings containing non-digits.
|
||||
if (*eptr != '\0') {
|
||||
return false;
|
||||
}
|
||||
|
||||
config->wrap = (size_t) wrap;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
parse_opts (int argc, char **argv, struct config *config)
|
||||
{
|
||||
int c;
|
||||
static const struct option opts[] = {
|
||||
{ "decode", no_argument, NULL, 'd' },
|
||||
{ "help", no_argument, NULL, 'h' },
|
||||
{ "wrap", required_argument, NULL, 'w' },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
// Remember the program's name.
|
||||
config->name = *argv;
|
||||
|
||||
// Parse command line options.
|
||||
while ((c = getopt_long(argc, argv, ":dhw:", opts, NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'd':
|
||||
config->decode = true;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
config->print_help = true;
|
||||
return true;
|
||||
|
||||
case 'w':
|
||||
if (get_wrap(config, optarg) == false) {
|
||||
fprintf(stderr,
|
||||
"%s: invalid wrap value '%s'\n",
|
||||
config->name, optarg);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
case ':':
|
||||
fprintf(stderr, "%s: missing argument for '%c'\n",
|
||||
config->name, optopt);
|
||||
return false;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "%s: unknown option '%c'\n",
|
||||
config->name, optopt);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (ferror(fp)) {
|
||||
fprintf(stderr, "read error\n");
|
||||
ret = 0;
|
||||
|
||||
// Return successfully if no filename was given.
|
||||
if (optind >= argc) {
|
||||
return true;
|
||||
}
|
||||
out: fclose(fp);
|
||||
fclose(stdout);
|
||||
return ret;
|
||||
|
||||
// Return unsuccessfully if more than one filename was given.
|
||||
if (optind + 1 < argc) {
|
||||
fprintf(stderr, "%s: too many files\n", config->name);
|
||||
return false;
|
||||
}
|
||||
|
||||
// For compatibility with GNU Coreutils base64, treat a filename of '-'
|
||||
// as standard input.
|
||||
if (strcmp(argv[optind], "-") == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Save the name of the file.
|
||||
config->file = argv[optind];
|
||||
|
||||
// Open the file.
|
||||
if ((config->fp = fopen(config->file, "rb")) == NULL) {
|
||||
fprintf(stderr, "%s: %s: %s\n",
|
||||
config->name, config->file, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char **argv)
|
||||
{
|
||||
char *file;
|
||||
FILE *fp;
|
||||
int decode = 0;
|
||||
// Default program config.
|
||||
struct config config = {
|
||||
.file = "stdin",
|
||||
.fp = stdin,
|
||||
.wrap = 76,
|
||||
.decode = false,
|
||||
.print_help = false,
|
||||
};
|
||||
struct buffer buf;
|
||||
|
||||
// Parse options:
|
||||
for (;;)
|
||||
{
|
||||
int c;
|
||||
int opt_index = 0;
|
||||
static struct option opt_long[] = {
|
||||
{ "decode", 0, 0, 'd' },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
if ((c = getopt_long(argc, argv, "d", opt_long, &opt_index)) == -1) {
|
||||
break;
|
||||
}
|
||||
switch (c)
|
||||
{
|
||||
case 'd':
|
||||
decode = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// No options left on command line? Read from stdin:
|
||||
if (optind >= argc) {
|
||||
fp = stdin;
|
||||
}
|
||||
|
||||
// One option left on command line? Treat it as a file:
|
||||
else if (optind + 1 == argc) {
|
||||
file = argv[optind];
|
||||
if (strcmp(file, "-") == 0) {
|
||||
fp = stdin;
|
||||
}
|
||||
else if ((fp = fopen(file, "rb")) == NULL) {
|
||||
printf("cannot open %s\n", file);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// More than one option left on command line? Syntax error:
|
||||
else {
|
||||
printf("Usage: %s <file>\n", argv[0]);
|
||||
// Parse options from the command line.
|
||||
if (parse_opts(argc, argv, &config) == false) {
|
||||
usage(stderr, &config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Invert return codes to create shell return code:
|
||||
return (decode) ? !dec(fp) : !enc(fp);
|
||||
// Return early if the user just wanted the help text.
|
||||
if (config.print_help) {
|
||||
usage(stdout, &config);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Allocate buffers.
|
||||
if (buffer_alloc(&config, &buf) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Encode or decode the input based on the user's choice.
|
||||
const bool ret = config.decode
|
||||
? decode(&config, &buf)
|
||||
: encode(&config, &buf);
|
||||
|
||||
// Free the buffers.
|
||||
buffer_free(&buf);
|
||||
|
||||
// Close the input file.
|
||||
fclose(config.fp);
|
||||
|
||||
// Close the output stream.
|
||||
fclose(stdout);
|
||||
|
||||
// That's all, folks.
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
@@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
 set(COMPILE_FLAGS_SSE42 "-msse4.2")
 set(COMPILE_FLAGS_AVX "-mavx")
 set(COMPILE_FLAGS_AVX2 "-mavx2")
+set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
 
 #arm
 set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
 set(COMPILE_FLAGS_SSE42 " ")
 set(COMPILE_FLAGS_AVX "/arch:AVX")
 set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
+set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
 endif()
 endmacro(define_SIMD_compile_flags)
deps/base64/base64/cmake/config.h.in (vendored, 3 lines changed)
@@ -16,6 +16,9 @@
 #cmakedefine01 BASE64_WITH_AVX2
 #define HAVE_AVX2 BASE64_WITH_AVX2
 
+#cmakedefine01 BASE64_WITH_AVX512
+#define HAVE_AVX512 BASE64_WITH_AVX512
+
 #cmakedefine01 BASE64_WITH_NEON32
 #define HAVE_NEON32 BASE64_WITH_NEON32
 
deps/base64/base64/include/libbase64.h (vendored, 1 line changed)
@@ -53,6 +53,7 @@ extern "C" {
 #define BASE64_FORCE_SSE41 (1 << 5)
 #define BASE64_FORCE_SSE42 (1 << 6)
 #define BASE64_FORCE_AVX (1 << 7)
+#define BASE64_FORCE_AVX512 (1 << 8)
 
 struct base64_state {
 	int eof;
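The new `BASE64_FORCE_AVX512` flag joins the other `BASE64_FORCE_*` constants that can be OR-ed into the `flags` argument of the library's one-shot wrappers to override codec auto-detection (mainly for testing, per the README). A minimal sketch of forcing the AVX512 encoder; the input string and buffer size are assumptions for the example:

```c
#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
    const char *src = "hello, world";   // example input (assumption)
    char out[64];                       // larger than 4/3 of the input, plus padding
    size_t outlen = 0;

    // Forcing a codec is intended for testing; passing 0 lets the library's
    // runtime feature detection pick the fastest supported codec.
    base64_encode(src, strlen(src), out, &outlen, BASE64_FORCE_AVX512);

    printf("%.*s\n", (int) outlen, out);
    return 0;
}
```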
deps/base64/base64/lib/arch/avx/codec.c (vendored, 30 lines changed)
@@ -11,11 +11,25 @@
 #if HAVE_AVX
 #include <immintrin.h>
 
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+#ifndef BASE64_AVX_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+# define BASE64_AVX_USE_ASM 1
+# else
+# define BASE64_AVX_USE_ASM 0
+# endif
+#endif
+
 #include "../ssse3/dec_reshuffle.c"
 #include "../ssse3/dec_loop.c"
-#include "../ssse3/enc_translate.c"
-#include "../ssse3/enc_reshuffle.c"
-#include "../ssse3/enc_loop.c"
+
+#if BASE64_AVX_USE_ASM
+# include "enc_loop_asm.c"
+#else
+# include "../ssse3/enc_translate.c"
+# include "../ssse3/enc_reshuffle.c"
+# include "../ssse3/enc_loop.c"
+#endif
 
 #endif // HAVE_AVX
 
@@ -23,7 +37,17 @@ BASE64_ENC_FUNCTION(avx)
 {
 #if HAVE_AVX
 #include "../generic/enc_head.c"
+
+// For supported compilers, use a hand-optimized inline assembly
+// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
+// AVX flags to generate better optimized AVX code.
+
+#if BASE64_AVX_USE_ASM
+enc_loop_avx(&s, &slen, &o, &olen);
+#else
 enc_loop_ssse3(&s, &slen, &o, &olen);
+#endif
+
 #include "../generic/enc_tail.c"
 #else
 BASE64_ENC_STUB
deps/base64/base64/lib/arch/avx/enc_loop_asm.c (new vendored file, 264 lines)
@ -0,0 +1,264 @@
|
||||
// Apologies in advance for combining the preprocessor with inline assembly,
|
||||
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
|
||||
// code repetition. The preprocessor is used to template large sections of
|
||||
// inline assembly that differ only in the registers used. If the code was
|
||||
// written out by hand, it would become very large and hard to audit.
|
||||
|
||||
// Generate a block of inline assembly that loads register R0 from memory. The
|
||||
// offset at which the register is loaded is set by the given round.
|
||||
#define LOAD(R0, ROUND) \
|
||||
"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that deinterleaves and shuffles register
|
||||
// R0 using preloaded constants. Outputs in R0 and R1.
|
||||
#define SHUF(R0, R1, R2) \
|
||||
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
|
||||
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
|
||||
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
|
||||
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
|
||||
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
|
||||
"vpor %["R1"], %["R2"], %["R1"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that takes R0 and R1 and translates
|
||||
// their contents to the base64 alphabet, using preloaded constants.
|
||||
#define TRAN(R0, R1, R2) \
|
||||
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
|
||||
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
|
||||
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
|
||||
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
|
||||
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that stores the given register R0 at an
|
||||
// offset set by the given round.
|
||||
#define STOR(R0, ROUND) \
|
||||
"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
|
||||
|
||||
// Generate a block of inline assembly that generates a single self-contained
|
||||
// encoder round: fetch the data, process it, and store the result. Then update
|
||||
// the source and destination pointers.
|
||||
#define ROUND() \
|
||||
LOAD("a", 0) \
|
||||
SHUF("a", "b", "c") \
|
||||
TRAN("a", "b", "c") \
|
||||
STOR("a", 0) \
|
||||
"add $12, %[src] \n\t" \
|
||||
"add $16, %[dst] \n\t"
|
||||
|
||||
// Define a macro that initiates a three-way interleaved encoding round by
|
||||
// preloading registers a, b and c from memory.
|
||||
// The register graph shows which registers are in use during each step, and
|
||||
// is a visual aid for choosing registers for that step. Symbol index:
|
||||
//
|
||||
// + indicates that a register is loaded by that step.
|
||||
// | indicates that a register is in use and must not be touched.
|
||||
// - indicates that a register is decommissioned by that step.
|
||||
// x indicates that a register is used as a temporary by that step.
|
||||
// V indicates that a register is an input or output to the macro.
|
||||
//
|
||||
#define ROUND_3_INIT() /* a b c d e f */ \
|
||||
LOAD("a", 0) /* + */ \
|
||||
SHUF("a", "d", "e") /* | + x */ \
|
||||
LOAD("b", 1) /* | + | */ \
|
||||
TRAN("a", "d", "e") /* | | - x */ \
|
||||
LOAD("c", 2) /* V V V */
|
||||
|
||||
// Define a macro that translates, shuffles and stores the input registers A, B
|
||||
// and C, and preloads registers D, E and F for the next round.
|
||||
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
|
||||
// and F back into the next round as input registers A, B and C. The macro
|
||||
// carefully interleaves memory operations with data operations for optimal
|
||||
// pipelined performance.
|
||||
|
||||
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
|
||||
LOAD(D, (ROUND + 3)) /* V V V + */ \
|
||||
SHUF(B, E, F) /* | | | | + x */ \
|
||||
STOR(A, (ROUND + 0)) /* - | | | | */ \
|
||||
TRAN(B, E, F) /* | | | - x */ \
|
||||
LOAD(E, (ROUND + 4)) /* | | | + */ \
|
||||
SHUF(C, A, F) /* + | | | | x */ \
|
||||
STOR(B, (ROUND + 1)) /* | - | | | */ \
|
||||
TRAN(C, A, F) /* - | | | x */ \
|
||||
LOAD(F, (ROUND + 5)) /* | | | + */ \
|
||||
SHUF(D, A, B) /* + x | | | | */ \
|
||||
STOR(C, (ROUND + 2)) /* | - | | | */ \
|
||||
TRAN(D, A, B) /* - x V V V */
|
||||
|
||||
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
|
||||
// registers D, E and F, and translating, shuffling and storing them.
|
||||
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
|
||||
SHUF(E, A, B) /* + x V V V */ \
|
||||
STOR(D, (ROUND + 3)) /* | - | | */ \
|
||||
TRAN(E, A, B) /* - x | | */ \
|
||||
SHUF(F, C, D) /* + x | | */ \
|
||||
STOR(E, (ROUND + 4)) /* | - | */ \
|
||||
TRAN(F, C, D) /* - x | */ \
|
||||
STOR(F, (ROUND + 5)) /* - */
|
||||
|
||||
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
|
||||
#define ROUND_3_A(ROUND) \
|
||||
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
|
||||
|
||||
// Define a type B round. Inputs and outputs are swapped with regard to type A.
|
||||
#define ROUND_3_B(ROUND) \
|
||||
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Terminating macro for a type A round.
|
||||
#define ROUND_3_A_LAST(ROUND) \
|
||||
ROUND_3_A(ROUND) \
|
||||
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
|
||||
|
||||
// Terminating macro for a type B round.
|
||||
#define ROUND_3_B_LAST(ROUND) \
|
||||
ROUND_3_B(ROUND) \
|
||||
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Suppress clang's warning that the literal string in the asm statement is
|
||||
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
|
||||
// compilers). It may be true, but the goal here is not C99 portability.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
|
||||
static inline void
|
||||
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
||||
{
|
||||
// For a clearer explanation of the algorithm used by this function,
|
||||
// please refer to the plain (not inline assembly) implementation. This
|
||||
// function follows the same basic logic.
|
||||
|
||||
if (*slen < 16) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
|
||||
// bytes, so "reserve" four bytes from the input buffer to ensure that
|
||||
// we never read beyond the end of the input buffer.
|
||||
size_t rounds = (*slen - 4) / 12;
|
||||
|
||||
*slen -= rounds * 12; // 12 bytes consumed per round
|
||||
*olen += rounds * 16; // 16 bytes produced per round
|
||||
|
||||
// Number of times to go through the 36x loop.
|
||||
size_t loops = rounds / 36;
|
||||
|
||||
// Number of rounds remaining after the 36x loop.
|
||||
rounds %= 36;
|
||||
|
||||
// Lookup tables.
|
||||
const __m128i lut0 = _mm_set_epi8(
|
||||
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
|
||||
|
||||
const __m128i lut1 = _mm_setr_epi8(
|
||||
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
|
||||
|
||||
// Temporary registers.
|
||||
__m128i a, b, c, d, e, f;
|
||||
|
||||
__asm__ volatile (
|
||||
|
||||
// If there are 36 rounds or more, enter a 36x unrolled loop of
|
||||
// interleaved encoding rounds. The rounds interleave memory
|
||||
// operations (load/store) with data operations (table lookups,
|
||||
// etc) to maximize pipeline throughput.
|
||||
" test %[loops], %[loops] \n\t"
|
||||
" jz 18f \n\t"
|
||||
" jmp 36f \n\t"
|
||||
" \n\t"
|
||||
".balign 64 \n\t"
|
||||
"36: " ROUND_3_INIT()
|
||||
" " ROUND_3_A( 0)
|
||||
" " ROUND_3_B( 3)
|
||||
" " ROUND_3_A( 6)
|
||||
" " ROUND_3_B( 9)
|
||||
" " ROUND_3_A(12)
|
||||
" " ROUND_3_B(15)
|
||||
" " ROUND_3_A(18)
|
||||
" " ROUND_3_B(21)
|
||||
" " ROUND_3_A(24)
|
||||
" " ROUND_3_B(27)
|
||||
" " ROUND_3_A_LAST(30)
|
||||
" add $(12 * 36), %[src] \n\t"
|
||||
" add $(16 * 36), %[dst] \n\t"
|
||||
" dec %[loops] \n\t"
|
||||
" jnz 36b \n\t"
|
||||
|
||||
// Enter an 18x unrolled loop for rounds of 18 or more.
|
||||
"18: cmp $18, %[rounds] \n\t"
|
||||
" jl 9f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B(3)
|
||||
" " ROUND_3_A(6)
|
||||
" " ROUND_3_B(9)
|
||||
" " ROUND_3_A_LAST(12)
|
||||
" sub $18, %[rounds] \n\t"
|
||||
" add $(12 * 18), %[src] \n\t"
|
||||
" add $(16 * 18), %[dst] \n\t"
|
||||
|
||||
// Enter a 9x unrolled loop for rounds of 9 or more.
|
||||
"9: cmp $9, %[rounds] \n\t"
|
||||
" jl 6f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B_LAST(3)
|
||||
" sub $9, %[rounds] \n\t"
|
||||
" add $(12 * 9), %[src] \n\t"
|
||||
" add $(16 * 9), %[dst] \n\t"
|
||||
|
||||
// Enter a 6x unrolled loop for rounds of 6 or more.
|
||||
"6: cmp $6, %[rounds] \n\t"
|
||||
" jl 55f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A_LAST(0)
|
||||
" sub $6, %[rounds] \n\t"
|
||||
" add $(12 * 6), %[src] \n\t"
|
||||
" add $(16 * 6), %[dst] \n\t"
|
||||
|
||||
// Dispatch the remaining rounds 0..5.
|
||||
"55: cmp $3, %[rounds] \n\t"
|
||||
" jg 45f \n\t"
|
||||
" je 3f \n\t"
|
||||
" cmp $1, %[rounds] \n\t"
|
||||
" jg 2f \n\t"
|
||||
" je 1f \n\t"
|
||||
" jmp 0f \n\t"
|
||||
|
||||
"45: cmp $4, %[rounds] \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
// Block of non-interlaced encoding rounds, which can each
|
||||
// individually be jumped to. Rounds fall through to the next.
|
||||
"5: " ROUND()
|
||||
"4: " ROUND()
|
||||
"3: " ROUND()
|
||||
"2: " ROUND()
|
||||
"1: " ROUND()
|
||||
"0: \n\t"
|
||||
|
||||
// Outputs (modified).
|
||||
: [rounds] "+r" (rounds),
|
||||
[loops] "+r" (loops),
|
||||
[src] "+r" (*s),
|
||||
[dst] "+r" (*o),
|
||||
[a] "=&x" (a),
|
||||
[b] "=&x" (b),
|
||||
[c] "=&x" (c),
|
||||
[d] "=&x" (d),
|
||||
[e] "=&x" (e),
|
||||
[f] "=&x" (f)
|
||||
|
||||
// Inputs (not modified).
|
||||
: [lut0] "x" (lut0),
|
||||
[lut1] "x" (lut1),
|
||||
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
|
||||
[msk1] "x" (_mm_set1_epi32(0x04000040)),
|
||||
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
|
||||
[msk3] "x" (_mm_set1_epi32(0x01000010)),
|
||||
[n51] "x" (_mm_set1_epi8(51)),
|
||||
[n25] "x" (_mm_set1_epi8(25))
|
||||
|
||||
// Clobbers.
|
||||
: "cc", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
deps/base64/base64/lib/arch/avx2/codec.c (vendored, 20 lines changed)
@@ -11,11 +11,25 @@
 #if HAVE_AVX2
 #include <immintrin.h>
 
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+#ifndef BASE64_AVX2_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+# define BASE64_AVX2_USE_ASM 1
+# else
+# define BASE64_AVX2_USE_ASM 0
+# endif
+#endif
+
 #include "dec_reshuffle.c"
 #include "dec_loop.c"
-#include "enc_translate.c"
-#include "enc_reshuffle.c"
-#include "enc_loop.c"
+
+#if BASE64_AVX2_USE_ASM
+# include "enc_loop_asm.c"
+#else
+# include "enc_translate.c"
+# include "enc_reshuffle.c"
+# include "enc_loop.c"
+#endif
 
 #endif // HAVE_AVX2
 
deps/base64/base64/lib/arch/avx2/enc_loop_asm.c (new vendored file, 291 lines)
@ -0,0 +1,291 @@
|
||||
// Apologies in advance for combining the preprocessor with inline assembly,
|
||||
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
|
||||
// code repetition. The preprocessor is used to template large sections of
|
||||
// inline assembly that differ only in the registers used. If the code was
|
||||
// written out by hand, it would become very large and hard to audit.
|
||||
|
||||
// Generate a block of inline assembly that loads register R0 from memory. The
|
||||
// offset at which the register is loaded is set by the given round and a
|
||||
// constant offset.
|
||||
#define LOAD(R0, ROUND, OFFSET) \
|
||||
"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that deinterleaves and shuffles register
|
||||
// R0 using preloaded constants. Outputs in R0 and R1.
|
||||
#define SHUF(R0, R1, R2) \
|
||||
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
|
||||
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
|
||||
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
|
||||
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
|
||||
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
|
||||
"vpor %["R1"], %["R2"], %["R1"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that takes R0 and R1 and translates
|
||||
// their contents to the base64 alphabet, using preloaded constants.
|
||||
#define TRAN(R0, R1, R2) \
|
||||
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
|
||||
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
|
||||
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
|
||||
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
|
||||
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
|
||||
|
||||
// Generate a block of inline assembly that stores the given register R0 at an
|
||||
// offset set by the given round.
|
||||
#define STOR(R0, ROUND) \
|
||||
"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
|
||||
|
||||
// Generate a block of inline assembly that generates a single self-contained
|
||||
// encoder round: fetch the data, process it, and store the result. Then update
|
||||
// the source and destination pointers.
|
||||
#define ROUND() \
|
||||
LOAD("a", 0, -4) \
|
||||
SHUF("a", "b", "c") \
|
||||
TRAN("a", "b", "c") \
|
||||
STOR("a", 0) \
|
||||
"add $24, %[src] \n\t" \
|
||||
"add $32, %[dst] \n\t"
|
||||
|
||||
// Define a macro that initiates a three-way interleaved encoding round by
|
||||
// preloading registers a, b and c from memory.
|
||||
// The register graph shows which registers are in use during each step, and
|
||||
// is a visual aid for choosing registers for that step. Symbol index:
|
||||
//
|
||||
// + indicates that a register is loaded by that step.
|
||||
// | indicates that a register is in use and must not be touched.
|
||||
// - indicates that a register is decommissioned by that step.
|
||||
// x indicates that a register is used as a temporary by that step.
|
||||
// V indicates that a register is an input or output to the macro.
|
||||
//
|
||||
#define ROUND_3_INIT() /* a b c d e f */ \
|
||||
LOAD("a", 0, -4) /* + */ \
|
||||
SHUF("a", "d", "e") /* | + x */ \
|
||||
LOAD("b", 1, -4) /* | + | */ \
|
||||
TRAN("a", "d", "e") /* | | - x */ \
|
||||
LOAD("c", 2, -4) /* V V V */
|
||||
|
||||
// Define a macro that translates, shuffles and stores the input registers A, B
|
||||
// and C, and preloads registers D, E and F for the next round.
|
||||
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
|
||||
// and F back into the next round as input registers A, B and C. The macro
|
||||
// carefully interleaves memory operations with data operations for optimal
|
||||
// pipelined performance.
|
||||
|
||||
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
|
||||
LOAD(D, (ROUND + 3), -4) /* V V V + */ \
|
||||
SHUF(B, E, F) /* | | | | + x */ \
|
||||
STOR(A, (ROUND + 0)) /* - | | | | */ \
|
||||
TRAN(B, E, F) /* | | | - x */ \
|
||||
LOAD(E, (ROUND + 4), -4) /* | | | + */ \
|
||||
SHUF(C, A, F) /* + | | | | x */ \
|
||||
STOR(B, (ROUND + 1)) /* | - | | | */ \
|
||||
TRAN(C, A, F) /* - | | | x */ \
|
||||
LOAD(F, (ROUND + 5), -4) /* | | | + */ \
|
||||
SHUF(D, A, B) /* + x | | | | */ \
|
||||
STOR(C, (ROUND + 2)) /* | - | | | */ \
|
||||
TRAN(D, A, B) /* - x V V V */
|
||||
|
||||
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
|
||||
// registers D, E and F, and translating, shuffling and storing them.
|
||||
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
|
||||
SHUF(E, A, B) /* + x V V V */ \
|
||||
STOR(D, (ROUND + 3)) /* | - | | */ \
|
||||
TRAN(E, A, B) /* - x | | */ \
|
||||
SHUF(F, C, D) /* + x | | */ \
|
||||
STOR(E, (ROUND + 4)) /* | - | */ \
|
||||
TRAN(F, C, D) /* - x | */ \
|
||||
STOR(F, (ROUND + 5)) /* - */
|
||||
|
||||
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
|
||||
#define ROUND_3_A(ROUND) \
|
||||
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
|
||||
|
||||
// Define a type B round. Inputs and outputs are swapped with regard to type A.
|
||||
#define ROUND_3_B(ROUND) \
|
||||
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Terminating macro for a type A round.
|
||||
#define ROUND_3_A_LAST(ROUND) \
|
||||
ROUND_3_A(ROUND) \
|
||||
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
|
||||
|
||||
// Terminating macro for a type B round.
|
||||
#define ROUND_3_B_LAST(ROUND) \
|
||||
ROUND_3_B(ROUND) \
|
||||
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Suppress clang's warning that the literal string in the asm statement is
|
||||
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
|
||||
// compilers). It may be true, but the goal here is not C99 portability.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
|
||||
static inline void
|
||||
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
||||
{
|
||||
// For a clearer explanation of the algorithm used by this function,
|
||||
// please refer to the plain (not inline assembly) implementation. This
|
||||
// function follows the same basic logic.
|
||||
|
||||
if (*slen < 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
|
||||
// bytes at a time an offset of -4, ensure that there will be at least
|
||||
// 4 remaining bytes after the last round, so that the final read will
|
||||
// not pass beyond the bounds of the input buffer.
|
||||
size_t rounds = (*slen - 4) / 24;
|
||||
|
||||
*slen -= rounds * 24; // 24 bytes consumed per round
|
||||
*olen += rounds * 32; // 32 bytes produced per round
|
||||
|
||||
// Pre-decrement the number of rounds to get the number of rounds
|
||||
// *after* the first round, which is handled as a special case.
|
||||
rounds--;
|
||||
|
||||
// Number of times to go through the 36x loop.
|
||||
size_t loops = rounds / 36;
|
||||
|
||||
// Number of rounds remaining after the 36x loop.
|
||||
rounds %= 36;
|
||||
|
||||
// Lookup tables.
|
||||
const __m256i lut0 = _mm256_set_epi8(
|
||||
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
|
||||
14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);
|
||||
|
||||
const __m256i lut1 = _mm256_setr_epi8(
|
||||
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
|
||||
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
|
||||
|
||||
// Temporary registers.
|
||||
__m256i a, b, c, d, e;
|
||||
|
||||
// Temporary register f doubles as the shift mask for the first round.
|
||||
__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
|
||||
|
||||
__asm__ volatile (
|
||||
|
||||
// The first loop iteration requires special handling to ensure
|
||||
// that the read, which is normally done at an offset of -4,
|
||||
// does not underflow the buffer. Load the buffer at an offset
|
||||
// of 0 and permute the input to achieve the same effect.
|
||||
LOAD("a", 0, 0)
|
||||
"vpermd %[a], %[f], %[a] \n\t"
|
||||
|
||||
// Perform the standard shuffling and translation steps.
|
||||
SHUF("a", "b", "c")
|
||||
TRAN("a", "b", "c")
|
||||
|
||||
// Store the result and increment the source and dest pointers.
|
||||
"vmovdqu %[a], (%[dst]) \n\t"
|
||||
"add $24, %[src] \n\t"
|
||||
"add $32, %[dst] \n\t"
|
||||
|
||||
// If there are 36 rounds or more, enter a 36x unrolled loop of
|
||||
// interleaved encoding rounds. The rounds interleave memory
|
||||
// operations (load/store) with data operations (table lookups,
|
||||
// etc) to maximize pipeline throughput.
|
||||
" test %[loops], %[loops] \n\t"
|
||||
" jz 18f \n\t"
|
||||
" jmp 36f \n\t"
|
||||
" \n\t"
|
||||
".balign 64 \n\t"
|
||||
"36: " ROUND_3_INIT()
|
||||
" " ROUND_3_A( 0)
|
||||
" " ROUND_3_B( 3)
|
||||
" " ROUND_3_A( 6)
|
||||
" " ROUND_3_B( 9)
|
||||
" " ROUND_3_A(12)
|
||||
" " ROUND_3_B(15)
|
||||
" " ROUND_3_A(18)
|
||||
" " ROUND_3_B(21)
|
||||
" " ROUND_3_A(24)
|
||||
" " ROUND_3_B(27)
|
||||
" " ROUND_3_A_LAST(30)
|
||||
" add $(24 * 36), %[src] \n\t"
|
||||
" add $(32 * 36), %[dst] \n\t"
|
||||
" dec %[loops] \n\t"
|
||||
" jnz 36b \n\t"
|
||||
|
||||
// Enter an 18x unrolled loop for rounds of 18 or more.
|
||||
"18: cmp $18, %[rounds] \n\t"
|
||||
" jl 9f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B(3)
|
||||
" " ROUND_3_A(6)
|
||||
" " ROUND_3_B(9)
|
||||
" " ROUND_3_A_LAST(12)
|
||||
" sub $18, %[rounds] \n\t"
|
||||
" add $(24 * 18), %[src] \n\t"
|
||||
" add $(32 * 18), %[dst] \n\t"
|
||||
|
||||
// Enter a 9x unrolled loop for rounds of 9 or more.
|
||||
"9: cmp $9, %[rounds] \n\t"
|
||||
" jl 6f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B_LAST(3)
|
||||
" sub $9, %[rounds] \n\t"
|
||||
" add $(24 * 9), %[src] \n\t"
|
||||
" add $(32 * 9), %[dst] \n\t"
|
||||
|
||||
// Enter a 6x unrolled loop for rounds of 6 or more.
|
||||
"6: cmp $6, %[rounds] \n\t"
|
||||
" jl 55f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A_LAST(0)
|
||||
" sub $6, %[rounds] \n\t"
|
||||
" add $(24 * 6), %[src] \n\t"
|
||||
" add $(32 * 6), %[dst] \n\t"
|
||||
|
||||
// Dispatch the remaining rounds 0..5.
|
||||
"55: cmp $3, %[rounds] \n\t"
|
||||
" jg 45f \n\t"
|
||||
" je 3f \n\t"
|
||||
" cmp $1, %[rounds] \n\t"
|
||||
" jg 2f \n\t"
|
||||
" je 1f \n\t"
|
||||
" jmp 0f \n\t"
|
||||
|
||||
"45: cmp $4, %[rounds] \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
// Block of non-interlaced encoding rounds, which can each
|
||||
// individually be jumped to. Rounds fall through to the next.
|
||||
"5: " ROUND()
|
||||
"4: " ROUND()
|
||||
"3: " ROUND()
|
||||
"2: " ROUND()
|
||||
"1: " ROUND()
|
||||
"0: \n\t"
|
||||
|
||||
// Outputs (modified).
|
||||
: [rounds] "+r" (rounds),
|
||||
[loops] "+r" (loops),
|
||||
[src] "+r" (*s),
|
||||
[dst] "+r" (*o),
|
||||
[a] "=&x" (a),
|
||||
[b] "=&x" (b),
|
||||
[c] "=&x" (c),
|
||||
[d] "=&x" (d),
|
||||
[e] "=&x" (e),
|
||||
[f] "+x" (f)
|
||||
|
||||
// Inputs (not modified).
|
||||
: [lut0] "x" (lut0),
|
||||
[lut1] "x" (lut1),
|
||||
[msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
|
||||
[msk1] "x" (_mm256_set1_epi32(0x04000040)),
|
||||
[msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
|
||||
[msk3] "x" (_mm256_set1_epi32(0x01000010)),
|
||||
[n51] "x" (_mm256_set1_epi8(51)),
|
||||
[n25] "x" (_mm256_set1_epi8(25))
|
||||
|
||||
// Clobbers.
|
||||
: "cc", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
deps/base64/base64/lib/arch/avx512/codec.c (new vendored file, 42 lines)
@@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

#if HAVE_AVX512
#include <immintrin.h>

#include "../avx2/dec_reshuffle.c"
#include "../avx2/dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"

#endif // HAVE_AVX512

BASE64_ENC_FUNCTION(avx512)
{
#if HAVE_AVX512
	#include "../generic/enc_head.c"
	enc_loop_avx512(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	BASE64_ENC_STUB
#endif
}

// Reuse the AVX2 decoder; a dedicated AVX512 decoder is not implemented at present.
BASE64_DEC_FUNCTION(avx512)
{
#if HAVE_AVX512
	#include "../generic/dec_head.c"
	dec_loop_avx2(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	BASE64_DEC_STUB
#endif
}
deps/base64/base64/lib/arch/avx512/enc_loop.c (new vendored file, 61 lines)
@@ -0,0 +1,61 @@
static inline void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
	// Load input.
	__m512i src = _mm512_loadu_si512((__m512i *) *s);

	// Reshuffle, translate, store.
	src = enc_reshuffle_translate(src);
	_mm512_storeu_si512((__m512i *) *o, src);

	*s += 48;
	*o += 64;
}

static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 48 bytes at a time. Because blocks are loaded 64
	// bytes at a time, ensure that there will be at least 24 remaining
	// bytes after the last round, so that the final read will not pass
	// beyond the bounds of the input buffer.
	size_t rounds = (*slen - 24) / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_avx512_inner(s, o);
		break;
	}
}
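As a sanity check of the sizing logic in `enc_loop_avx512()` above, a small standalone sketch of the round arithmetic for an assumed 1000-byte input (not part of the library):

```c
#include <stdio.h>
#include <stddef.h>

// Mirrors the arithmetic in enc_loop_avx512(): each round consumes 48 input
// bytes and emits 64 base64 bytes, and 24 bytes are held back so the final
// 64-byte load cannot read past the end of the input buffer.
int main(void)
{
    size_t slen = 1000;                     // example input size (assumption)
    size_t rounds = (slen - 24) / 48;       // 20 rounds
    size_t consumed = rounds * 48;          // 960 bytes handled by the AVX512 loop
    size_t produced = rounds * 64;          // 1280 output bytes
    size_t tail = slen - consumed;          // 40 bytes left for the generic tail codec

    printf("rounds=%zu consumed=%zu produced=%zu tail=%zu\n",
           rounds, consumed, produced, tail);
    return 0;
}
```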
deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c (new vendored file, 50 lines)
@@ -0,0 +1,50 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.

static inline __m512i
enc_reshuffle_translate (const __m512i input)
{
	// 32-bit input
	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
	// output order [1, 2, 0, 1]
	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]

	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
	                                                0x04050304,
	                                                0x07080607,
	                                                0x0a0b090a,
	                                                0x0d0e0c0d,
	                                                0x10110f10,
	                                                0x13141213,
	                                                0x16171516,
	                                                0x191a1819,
	                                                0x1c1d1b1c,
	                                                0x1f201e1f,
	                                                0x22232122,
	                                                0x25262425,
	                                                0x28292728,
	                                                0x2b2c2a2b,
	                                                0x2e2f2d2e);

	// Reorder bytes
	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);

	// After multishift a single 32-bit lane has following layout
	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])

	// 48, 54, 36, 42, 16, 22, 4, 10
	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);

	// Translate immediately after reshuffling.
	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);

	// Translate 6-bit values to ASCII.
	return _mm512_permutexvar_epi8(shuffled_in, lookup);
}
@@ -100,7 +100,8 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 [n63] "w" (n63)
 
 // Clobbers.
-: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+  "cc", "memory"
 );
 }
 #endif
@ -160,7 +160,8 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
		// Clobbers.
		: "v2", "v3", "v4", "v5",
		  "v8", "v9", "v10", "v11",
		  "v12", "v13", "v14", "v15"
		  "v12", "v13", "v14", "v15",
		  "cc", "memory"
	);
}

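Both NEON hunks only extend the clobber lists with "cc" and "memory". A generic, hedged example (x86-64 here, not the library's NEON code) of why an asm block that stores through a pointer should declare those clobbers:

// Hypothetical example: an asm block that stores through a pointer should
// declare a "memory" clobber (and "cc" if it may change the flags),
// otherwise the compiler is free to cache *p in a register across it.
#include <stdio.h>

static void store_42 (int *p)
{
#if defined(__GNUC__) && defined(__x86_64__)
	__asm__ volatile (
		"movl $42, (%[ptr]) \n\t"
		: // no register outputs; the store is a side effect
		: [ptr] "r" (p)
		: "cc", "memory"   // tell the compiler memory and flags changed
	);
#else
	*p = 42;               // fallback so the sketch still builds elsewhere
#endif
}

int main(void)
{
	int x = 0;
	store_42(&x);
	printf("%d\n", x);     // prints 42
	return 0;
}
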
20
deps/base64/base64/lib/arch/sse41/codec.c
vendored
@ -11,11 +11,25 @@
#if HAVE_SSE41
#include <smmintrin.h>

// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE41_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
#  define BASE64_SSE41_USE_ASM 1
# else
#  define BASE64_SSE41_USE_ASM 0
# endif
#endif

#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#include "../ssse3/enc_translate.c"
#include "../ssse3/enc_reshuffle.c"
#include "../ssse3/enc_loop.c"

#if BASE64_SSE41_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif

#endif	// HAVE_SSE41

20
deps/base64/base64/lib/arch/sse42/codec.c
vendored
@ -11,11 +11,25 @@
#if HAVE_SSE42
#include <nmmintrin.h>

// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE42_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
#  define BASE64_SSE42_USE_ASM 1
# else
#  define BASE64_SSE42_USE_ASM 0
# endif
#endif

#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#include "../ssse3/enc_translate.c"
#include "../ssse3/enc_reshuffle.c"
#include "../ssse3/enc_loop.c"

#if BASE64_SSE42_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif

#endif	// HAVE_SSE42

22
deps/base64/base64/lib/arch/ssse3/codec.c
vendored
@ -11,11 +11,27 @@
#if HAVE_SSSE3
#include <tmmintrin.h>

// Only enable inline assembly on supported compilers and on 64-bit CPUs.
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
// registers, which is not enough to run the inline assembly.
#ifndef BASE64_SSSE3_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
#  define BASE64_SSSE3_USE_ASM 1
# else
#  define BASE64_SSSE3_USE_ASM 0
# endif
#endif

#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"
#include "enc_loop.c"

#if BASE64_SSSE3_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_translate.c"
# include "enc_loop.c"
#endif

#endif	// HAVE_SSSE3

268
deps/base64/base64/lib/arch/ssse3/enc_loop_asm.c
vendored
Normal file
@ -0,0 +1,268 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
#define LOAD(R0, ROUND) \
	"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1) \
	"pshufb %[lut0], %["R0"] \n\t" \
	"movdqa %["R0"], %["R1"] \n\t" \
	"pand %[msk0], %["R0"] \n\t" \
	"pand %[msk2], %["R1"] \n\t" \
	"pmulhuw %[msk1], %["R0"] \n\t" \
	"pmullw %[msk3], %["R1"] \n\t" \
	"por %["R1"], %["R0"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
	"movdqa %["R0"], %["R1"] \n\t" \
	"movdqa %["R0"], %["R2"] \n\t" \
	"psubusb %[n51], %["R1"] \n\t" \
	"pcmpgtb %[n25], %["R2"] \n\t" \
	"psubb %["R2"], %["R1"] \n\t" \
	"movdqa %[lut1], %["R2"] \n\t" \
	"pshufb %["R1"], %["R2"] \n\t" \
	"paddb %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0) \
	SHUF("a", "b") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $12, %[src] \n\t" \
	"add $16, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT()			/* a b c d e f */ \
	LOAD("a", 0)			/* + */ \
	SHUF("a", "d")			/* | + */ \
	LOAD("b", 1)			/* | + | */ \
	TRAN("a", "d", "e")		/* | | - x */ \
	LOAD("c", 2)			/* V V V */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.

#define ROUND_3(ROUND, A,B,C,D,E,F)	/* A B C D E F */ \
	LOAD(D, (ROUND + 3))		/* V V V + */ \
	SHUF(B, E)			/* | | | | + */ \
	STOR(A, (ROUND + 0))		/* - | | | | */ \
	TRAN(B, E, F)			/* | | | - x */ \
	LOAD(E, (ROUND + 4))		/* | | | + */ \
	SHUF(C, A)			/* + | | | | */ \
	STOR(B, (ROUND + 1))		/* | - | | | */ \
	TRAN(C, A, F)			/* - | | | x */ \
	LOAD(F, (ROUND + 5))		/* | | | + */ \
	SHUF(D, A)			/* + | | | | */ \
	STOR(C, (ROUND + 2))		/* | - | | | */ \
	TRAN(D, A, B)			/* - x V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F)	/* A B C D E F */ \
	SHUF(E, A)			/* + V V V */ \
	STOR(D, (ROUND + 3))		/* | - | | */ \
	TRAN(E, A, B)			/* - x | | */ \
	SHUF(F, C)			/* + | | */ \
	STOR(E, (ROUND + 4))		/* | - | */ \
	TRAN(F, C, D)			/* - x | */ \
	STOR(F, (ROUND + 5))		/* - */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.

	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
	// bytes, so "reserve" four bytes from the input buffer to ensure that
	// we never read beyond the end of the input buffer.
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;   // 12 bytes consumed per round
	*olen += rounds * 16;   // 16 bytes produced per round

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables.
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);

	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m128i a, b, c, d, e, f;

	__asm__ volatile (

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		" test %[loops], %[loops] \n\t"
		" jz 18f \n\t"
		" jmp 36f \n\t"
		" \n\t"
		".balign 64 \n\t"
		"36: " ROUND_3_INIT()
		" " ROUND_3_A( 0)
		" " ROUND_3_B( 3)
		" " ROUND_3_A( 6)
		" " ROUND_3_B( 9)
		" " ROUND_3_A(12)
		" " ROUND_3_B(15)
		" " ROUND_3_A(18)
		" " ROUND_3_B(21)
		" " ROUND_3_A(24)
		" " ROUND_3_B(27)
		" " ROUND_3_A_LAST(30)
		" add $(12 * 36), %[src] \n\t"
		" add $(16 * 36), %[dst] \n\t"
		" dec %[loops] \n\t"
		" jnz 36b \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds] \n\t"
		" jl 9f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B(3)
		" " ROUND_3_A(6)
		" " ROUND_3_B(9)
		" " ROUND_3_A_LAST(12)
		" sub $18, %[rounds] \n\t"
		" add $(12 * 18), %[src] \n\t"
		" add $(16 * 18), %[dst] \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9: cmp $9, %[rounds] \n\t"
		" jl 6f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B_LAST(3)
		" sub $9, %[rounds] \n\t"
		" add $(12 * 9), %[src] \n\t"
		" add $(16 * 9), %[dst] \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6: cmp $6, %[rounds] \n\t"
		" jl 55f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A_LAST(0)
		" sub $6, %[rounds] \n\t"
		" add $(12 * 6), %[src] \n\t"
		" add $(16 * 6), %[dst] \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds] \n\t"
		" jg 45f \n\t"
		" je 3f \n\t"
		" cmp $1, %[rounds] \n\t"
		" jg 2f \n\t"
		" je 1f \n\t"
		" jmp 0f \n\t"

		"45: cmp $4, %[rounds] \n\t"
		" je 4f \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r" (rounds),
		  [loops] "+r" (loops),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [a] "=&x" (a),
		  [b] "=&x" (b),
		  [c] "=&x" (c),
		  [d] "=&x" (d),
		  [e] "=&x" (e),
		  [f] "=&x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
		  [n51] "x" (_mm_set1_epi8(51)),
		  [n25] "x" (_mm_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}

#pragma GCC diagnostic pop

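For comparison with the templated assembly above, here is a rough intrinsics rendering of one SHUF + TRAN step, reconstructed from the lut0/lut1 tables and the msk0..msk3 constants in the asm. It is an illustration, not the library's enc_reshuffle.c/enc_translate.c verbatim, and it needs -mssse3 to compile.

#include <tmmintrin.h>   // SSSE3 intrinsics (pulls in SSE2 as well)

// One 16-byte block: reshuffle 12 input bytes into 16 six-bit indices
// (SHUF), then map indices 0..63 onto ASCII via per-range offsets (TRAN).
static __m128i encode_block_ssse3 (__m128i in)
{
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// SHUF: gather byte pairs, then split each 24-bit group into four
	// 6-bit fields using the same masks and multipliers as the asm.
	in = _mm_shuffle_epi8(in, lut0);                            // pshufb lut0
	__m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00)); // pand msk0
	__m128i t1 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0)); // pand msk2
	t0 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));       // pmulhuw msk1
	t1 = _mm_mullo_epi16(t1, _mm_set1_epi32(0x01000010));       // pmullw msk3
	__m128i indices = _mm_or_si128(t0, t1);                     // por

	// TRAN: classify each 6-bit index and add the right ASCII offset.
	__m128i r1 = _mm_subs_epu8(indices, _mm_set1_epi8(51));     // psubusb n51
	__m128i r2 = _mm_cmpgt_epi8(indices, _mm_set1_epi8(25));    // pcmpgtb n25
	r1 = _mm_sub_epi8(r1, r2);                                  // psubb
	__m128i offsets = _mm_shuffle_epi8(lut1, r1);               // pshufb lut1
	return _mm_add_epi8(indices, offsets);                      // paddb
}
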
36
deps/base64/base64/lib/codec_choose.c
vendored
@ -2,6 +2,7 @@
#include <stdint.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include "../include/libbase64.h"
#include "codecs.h"
@ -10,7 +11,7 @@

#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
#define BASE64_X86
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
#define BASE64_X86_SIMD
#endif
#endif
@ -31,7 +32,7 @@
	__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
#else
#include <cpuid.h>
#if HAVE_AVX2 || HAVE_AVX
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
static inline uint64_t _xgetbv (uint32_t index)
{
@ -45,6 +46,12 @@
#endif
#endif

#ifndef bit_AVX512vl
#define bit_AVX512vl (1 << 31)
#endif
#ifndef bit_AVX512vbmi
#define bit_AVX512vbmi (1 << 1)
#endif
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif
@ -75,6 +82,7 @@
	BASE64_ENC_FUNCTION(arch); \
	BASE64_DEC_FUNCTION(arch); \

BASE64_CODEC_FUNCS(avx512)
BASE64_CODEC_FUNCS(avx2)
BASE64_CODEC_FUNCS(neon32)
BASE64_CODEC_FUNCS(neon64)
@ -91,9 +99,10 @@ codec_choose_forced (struct codec *codec, int flags)
	// always allow it, even if the codec is a no-op.
	// For testing purposes.

	if (!(flags & 0xFF)) {
	if (!(flags & 0xFFFF)) {
		return false;
	}

	if (flags & BASE64_FORCE_AVX2) {
		codec->enc = base64_stream_encode_avx2;
		codec->dec = base64_stream_decode_avx2;
@ -134,6 +143,11 @@ codec_choose_forced (struct codec *codec, int flags)
		codec->dec = base64_stream_decode_avx;
		return true;
	}
	if (flags & BASE64_FORCE_AVX512) {
		codec->enc = base64_stream_encode_avx512;
		codec->dec = base64_stream_decode_avx512;
		return true;
	}
	return false;
}

@ -178,8 +192,8 @@ codec_choose_x86 (struct codec *codec)
	max_level = __get_cpuid_max(0, NULL);
	#endif

	#if HAVE_AVX2 || HAVE_AVX
	// Check for AVX/AVX2 support:
	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
	// Check for AVX/AVX2/AVX512 support:
	// Checking for AVX requires 3 things:
	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
	//    (allowing saving YMM registers on context switch)
@ -194,7 +208,17 @@ codec_choose_x86 (struct codec *codec)
	if (ecx & bit_XSAVE_XRSTORE) {
		uint64_t xcr_mask;
		xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
		if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
		if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
			#if HAVE_AVX512
			if (max_level >= 7) {
				__cpuid_count(7, 0, eax, ebx, ecx, edx);
				if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
					codec->enc = base64_stream_encode_avx512;
					codec->dec = base64_stream_decode_avx512;
					return true;
				}
			}
			#endif
			#if HAVE_AVX2
			if (max_level >= 7) {
				__cpuid_count(7, 0, eax, ebx, ecx, edx);

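The flag mask widening from 0xFF to 0xFFFF makes room for new BASE64_FORCE_* bits such as BASE64_FORCE_AVX512. A minimal usage sketch (mine, not from the patch) of forcing a codec through the one-shot API; forcing is intended for testing and assumes the CPU can actually run the chosen codec:

// Minimal usage sketch: force the AVX-512 codec via the flags argument.
// If that codec was not compiled in, the forced codec is a stub and no
// output is produced, so fall back to automatic selection (flags = 0).
#include <stdio.h>
#include <string.h>
#include "libbase64.h"   // include path depends on your setup; the tests use ../include/libbase64.h

int main(void)
{
	const char *msg = "hello";
	char enc[32];
	size_t enclen = 0;

	base64_encode(msg, strlen(msg), enc, &enclen, BASE64_FORCE_AVX512);
	if (enclen == 0) {
		// Stub codec: the AVX-512 path is unavailable in this build.
		base64_encode(msg, strlen(msg), enc, &enclen, 0);
	}
	printf("%.*s\n", (int)enclen, enc);   // prints "aGVsbG8="
	return 0;
}
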
2
deps/base64/base64/lib/lib.c
vendored
@ -68,7 +68,7 @@ void
base64_stream_decode_init (struct base64_state *state, int flags)
{
	// If any of the codec flags are set, redo choice:
	if (codec.dec == NULL || flags & 0xFF) {
	if (codec.dec == NULL || flags & 0xFFFF) {
		codec_choose(&codec, flags);
	}
	state->eof = 0;

5
deps/base64/base64/test/Makefile
vendored
@ -11,12 +11,15 @@ else
BENCH_LDFLAGS=-lrt
endif

.PHONY: clean test
.PHONY: clean test valgrind

test: clean test_base64 benchmark
	./test_base64
	./benchmark

valgrind: clean test_base64
	valgrind --error-exitcode=2 ./test_base64

test_base64: test_base64.c codec_supported.o ../lib/libbase64.o
	$(CC) $(CFLAGS) -o $@ $^

37
deps/base64/base64/test/ci/analysis.sh
vendored
Executable file
@ -0,0 +1,37 @@
#!/bin/bash
set -ve

MACHINE=$(uname -m)
export CC=gcc

uname -a
clang --version # make analyse
${CC} --version # make -C test valgrind

for USE_ASSEMBLY in 0 1; do
	if [ "${MACHINE}" == "x86_64" ]; then
		export SSSE3_CFLAGS="-mssse3 -DBASE64_SSSE3_USE_ASM=${USE_ASSEMBLY}"
		export SSE41_CFLAGS="-msse4.1 -DBASE64_SSE41_USE_ASM=${USE_ASSEMBLY}"
		export SSE42_CFLAGS="-msse4.2 -DBASE64_SSE42_USE_ASM=${USE_ASSEMBLY}"
		export AVX_CFLAGS="-mavx -DBASE64_AVX_USE_ASM=${USE_ASSEMBLY}"
		export AVX2_CFLAGS="-mavx2 -DBASE64_AVX2_USE_ASM=${USE_ASSEMBLY}"
		# Temporarily disable AVX512; it is not available in CI yet.
		# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
	elif [ "${MACHINE}" == "aarch64" ]; then
		export NEON64_CFLAGS="-march=armv8-a"
	elif [ "${MACHINE}" == "armv7l" ]; then
		export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon"
	fi

	if [ ${USE_ASSEMBLY} -eq 0 ]; then
		echo "::group::analyze"
		make analyze
		echo "::endgroup::"
	fi

	echo "::group::valgrind (USE_ASSEMBLY=${USE_ASSEMBLY})"
	make clean
	make
	make -C test valgrind
	echo "::endgroup::"
done
4
deps/base64/base64/test/ci/test.sh
vendored
@ -7,9 +7,11 @@ if [ "${MACHINE}" == "x86_64" ]; then
	export SSE41_CFLAGS=-msse4.1
	export SSE42_CFLAGS=-msse4.2
	export AVX_CFLAGS=-mavx
	# no AVX2 on GHA macOS
	# no AVX2 or AVX512 on GHA macOS
	if [ "$(uname -s)" != "Darwin" ]; then
		export AVX2_CFLAGS=-mavx2
		# Temporarily disable AVX512; it is not available in CI yet.
		# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
	fi
elif [ "${MACHINE}" == "aarch64" ]; then
	export NEON64_CFLAGS="-march=armv8-a"

1
deps/base64/base64/test/codec_supported.c
vendored
@ -11,6 +11,7 @@ static char *_codecs[] =
, "SSE41"
, "SSE42"
, "AVX"
, "AVX512"
, NULL
} ;

29
deps/base64/base64/test/test_base64.c
vendored
@ -1,6 +1,7 @@
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "../include/libbase64.h"
#include "codec_supported.h"
#include "moby_dick.h"
@ -92,7 +93,7 @@ assert_roundtrip (int flags, const char *src)
}

static int
test_char_table (int flags)
test_char_table (int flags, bool use_malloc)
{
	bool fail = false;
	char chr[256];
@ -107,8 +108,24 @@ test_char_table (int flags)
	for (int i = 0; i < 256; i++) {

		size_t chrlen = 256 - i;
		char* src = &chr[i];
		if (use_malloc) {
			src = malloc(chrlen); /* malloc/copy this so valgrind can find out-of-bound access */
			if (src == NULL) {
				printf(
					"FAIL: encoding @ %d: allocation of %lu bytes failed\n",
					i, (unsigned long)chrlen
				);
				fail = true;
				continue;
			}
			memcpy(src, &chr[i], chrlen);
		}

		base64_encode(&chr[i], chrlen, enc, &enclen, BASE64_FORCE_PLAIN);
		base64_encode(src, chrlen, enc, &enclen, flags);
		if (use_malloc) {
			free(src);
		}

		if (!base64_decode(enc, enclen, dec, &declen, flags)) {
			printf("FAIL: decoding @ %d: decoding error\n", i);
@ -198,6 +215,11 @@ test_streaming (int flags)
	while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) {
		enclen += partlen;
		inpos += bs;

		// Has the entire buffer been consumed?
		if (inpos >= 400) {
			break;
		}
	}
	if (enclen != 256) {
		printf("FAIL: stream decoding gave incorrect size: "
@ -336,7 +358,8 @@ test_one_codec (const char *codec, int flags)
		fail |= assert_roundtrip(flags, vec[i].out);
	}

	fail |= test_char_table(flags);
	fail |= test_char_table(flags, false); /* test with unaligned input buffer */
	fail |= test_char_table(flags, true); /* test for out-of-bound input read */
	fail |= test_streaming(flags);
	fail |= test_invalid_dec_input(flags);

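The use_malloc path above heap-allocates a buffer of exactly chrlen bytes so that valgrind can flag any read past the end of the input. A stripped-down illustration (mine, not from the patch) of why the heap copy is needed:

// Reading one byte past a heap allocation is something valgrind can flag
// as an invalid read, while the same overread inside a larger stack array
// usually goes unnoticed because it still lies within the enclosing object.
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char stackbuf[256];
	memset(stackbuf, 'x', sizeof(stackbuf));

	size_t len = 10;
	char *heapbuf = malloc(len);
	if (heapbuf == NULL) return 1;
	memcpy(heapbuf, stackbuf, len);

	// An encoder that read heapbuf[len] here would trigger a valgrind
	// error; reading stackbuf[10] would not, since the stack array is
	// 256 bytes long.

	free(heapbuf);
	return 0;
}
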
@ -10,7 +10,7 @@ This a list of all the dependencies:

* [acorn 8.11.2][]
* [ada 2.7.2][]
* [base64 0.5.0][]
* [base64 0.5.1][]
* [brotli 1.0.9][]
* [c-ares 1.20.1][]
* [cjs-module-lexer 1.2.2][]
@ -155,7 +155,7 @@ an abstract syntax tree walker for the ESTree format.
The [ada](https://github.com/ada-url/ada) dependency is a
fast and spec-compliant URL parser written in C++.

### base64 0.5.0
### base64 0.5.1

The [base64](https://github.com/aklomp/base64) dependency is a base64
stream encoding/decoding library in C99 with SIMD and OpenMP acceleration.
@ -320,7 +320,7 @@ performance improvements not currently available in standard zlib.

[acorn 8.11.2]: #acorn-8112
[ada 2.7.2]: #ada-272
[base64 0.5.0]: #base64-050
[base64 0.5.1]: #base64-051
[brotli 1.0.9]: #brotli-109
[c-ares 1.20.1]: #c-ares-1201
[cjs-module-lexer 1.2.2]: #cjs-module-lexer-122

@ -2,5 +2,5 @@
// Refer to tools/dep_updaters/update-base64.sh
#ifndef SRC_BASE64_VERSION_H_
#define SRC_BASE64_VERSION_H_
#define BASE64_VERSION "0.5.0"
#define BASE64_VERSION "0.5.1"
#endif // SRC_BASE64_VERSION_H_