Joseph Surin
3 years ago
7 changed files with 244751 additions and 5 deletions
@ -1,6 +1,6 @@
@@ -1,6 +1,6 @@
|
||||
[submodule "tgbot-cpp"] |
||||
path = tgbot-cpp |
||||
url = https://github.com/reo7sp/tgbot-cpp/ |
||||
[submodule "spdlog"] |
||||
path = spdlog |
||||
[submodule "lib/tgbot-cpp"] |
||||
path = lib/tgbot-cpp |
||||
url = https://github.com/reo7sp/tgbot-cpp.git |
||||
[submodule "lib/spdlog"] |
||||
path = lib/spdlog |
||||
url = https://github.com/gabime/spdlog.git |
||||
|
@ -0,0 +1,666 @@
@@ -0,0 +1,666 @@
|
||||
#pragma once |
||||
|
||||
#include <array>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
||||
|
||||
#ifdef __GNUG__ |
||||
#pragma GCC target("avx2") // GCC will only compile AVX2 if we tell it to.
|
||||
#endif |
||||
|
||||
#include <tmmintrin.h> |
||||
#include <immintrin.h> |
||||
|
||||
#include "CpuFeatures.hpp" |
||||
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1) |
||||
|
||||
namespace base64 { |
||||
// Selects which implementation the bulk encode/decode routines dispatch to.
enum class Codepath {
    Auto,   // 0: resolved at run time from the detected CPU features
    Basic,  // 1: no vectorized bulk pass; only the scalar look-up-table code runs
    SSSE3,  // 2: 128-bit SSSE3 kernels
    AVX2    // 3: 256-bit AVX2 kernels
};
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
namespace detail { |
||||
// Static look-up table mapping a 6-bit value (0-63) to its base64 character. All 64 indices are valid.
// Declared `inline` so that every translation unit including this header refers to the same object;
// a plain namespace-scope `constexpr` variable has internal linkage, which is an ODR hazard when it
// is used from the external-linkage inline functions in this header.
inline constexpr std::string_view Base64LUT{ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" };
||||
|
||||
// Static look-up table mapping an 8-bit base64 character to its 6-bit value. Characters outside the
// base64 alphabet (including '=') map to zero, i.e. no validation is performed.
// Declared `inline` so that every translation unit including this header refers to the same object;
// a plain namespace-scope `constexpr` variable has internal linkage, which is an ODR hazard when it
// is used from the external-linkage inline functions in this header.
inline constexpr std::array<uint8_t, 256> Base64InverseLUT = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0x00 - 0x0F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0x10 - 0x1F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 62,  0,  0,  0, 63, // 0x20 - 0x2F  ('+' and '/')
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  0,  0,  0,  0,  0,  0, // 0x30 - 0x3F  ('0'-'9')
     0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, // 0x40 - 0x4F  ('A'-'O')
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  0,  0,  0,  0, // 0x50 - 0x5F  ('P'-'Z')
     0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, // 0x60 - 0x6F  ('a'-'o')
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  0,  0,  0,  0,  0, // 0x70 - 0x7F  ('p'-'z')
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0x80 - 0x8F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0x90 - 0x9F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0xA0 - 0xAF
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0xB0 - 0xBF
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0xC0 - 0xCF
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0xD0 - 0xDF
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 0xE0 - 0xEF
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0  // 0xF0 - 0xFF
};
||||
|
||||
// Chooses the code path to use when the caller asked for Codepath::Auto:
// AVX2 if the CPU feature flags include it, otherwise SSSE3 if available,
// otherwise the scalar (Basic) path.
inline Codepath get_auto_codepath() {
    const auto detected = cpu_features::get_features();

    if (detected & cpu_features::Features::AVX2) {
        return Codepath::AVX2;
    }
    return (detected & cpu_features::Features::SSSE3) ? Codepath::SSSE3 : Codepath::Basic;
}
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
// SSSE3 bulk base64 encoder.
//
// Encodes 12 source bytes into 16 base64 characters per iteration, storing the
// characters at dest_ptr and advancing dest_ptr by 16 each time.
//
//   source_data, source_data_length  binary input buffer
//   dest_ptr                         in/out write cursor into the destination
//
// Returns the number of source bytes consumed (a multiple of 12, possibly 0).
// The caller is expected to encode whatever remains after that offset.
//
// Every iteration issues a full 16-byte load even though only 12 bytes are
// consumed, so an iteration is only executed while at least 16 source bytes
// remain. This keeps all loads inside [source_data, source_data + length).
inline size_t encode_bulk_ssse3(
    const uint8_t* source_data,
    const size_t source_data_length,
    uint8_t*& dest_ptr
) {
    constexpr size_t bytes_per_iteration = 12;
    constexpr size_t bytes_per_load = 16;

    if (source_data_length < bytes_per_load) {
        return 0;
    }

    // Largest count such that the final iteration's 16-byte load still ends
    // within the source buffer: 12 * (count - 1) + 16 <= length.
    const size_t loop_count =
        (source_data_length - (bytes_per_load - bytes_per_iteration)) / bytes_per_iteration;
    const size_t loop_end = loop_count * bytes_per_iteration;

    // Code based on work by Wojciech Muła
    // Ref: http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
    const __m128i preshuffle_128 = _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
    const __m128i t0Mask   = _mm_set1_epi32(0x0fc0fc00);
    const __m128i t1Values = _mm_set1_epi32(0x04000040);
    const __m128i t2Mask   = _mm_set1_epi32(0x003f03f0);
    const __m128i t3Values = _mm_set1_epi32(0x01000010);
    const __m128i all_51   = _mm_set1_epi8(51);
    const __m128i all_26   = _mm_set1_epi8(26);
    const __m128i all_13   = _mm_set1_epi8(13);
    // Per-range offsets that turn a 6-bit value into its ASCII character.
    // Index 0: 'a'-'z' (values 26-51), 1-10: digits (52-61), 11: '+', 12: '/',
    // 13: 'A'-'Z' (values 0-25).
    const __m128i shiftLUT = _mm_setr_epi8(
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0
    );

    for (size_t i = 0; i < loop_end; i += bytes_per_iteration, dest_ptr += 16) {
        // Load four sets of octets at once.
        // [????|dddc|ccbb|baaa]
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source_data[i]));

        // [?ddd|?ccc|?bbb|?aaa]
        b = _mm_shuffle_epi8(b, preshuffle_128);

        // t0       = [0000cccc|CC000000|aaaaaa00|00000000]
        // t1       = [00000000|00cccccc|00000000|00aaaaaa]
        // t2       = [00000000|00dddddd|000000bb|bbbb0000]
        // t3       = [00dddddd|00000000|00bbbbbb|00000000]
        // unpacked = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        const __m128i t0 = _mm_and_si128(b, t0Mask);
        const __m128i t2 = _mm_and_si128(b, t2Mask);
        const __m128i t1 = _mm_mulhi_epu16(t0, t1Values);
        const __m128i t3 = _mm_mullo_epi16(t2, t3Values);
        const __m128i unpacked = _mm_or_si128(t1, t3);

        // Reduce each 6-bit value to an index into shiftLUT: values above 51
        // become 1-12 via the saturating subtract, values below 26 become 13,
        // and everything else stays 0.
        const __m128i reduced = _mm_or_si128(
            _mm_subs_epu8(unpacked, all_51),
            _mm_and_si128(_mm_cmpgt_epi8(all_26, unpacked), all_13)
        );
        // Add the selected offset to obtain the base64 character.
        const __m128i result = _mm_add_epi8(_mm_shuffle_epi8(shiftLUT, reduced), unpacked);

        // Output
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_ptr), result);
    }

    return loop_end;
}
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
// AVX2 bulk base64 encoder.
//
// Encodes 24 source bytes into 32 base64 characters per iteration, storing the
// characters at dest_ptr and advancing dest_ptr by 32 each time.
//
//   source_data, source_data_length  binary input buffer
//   dest_ptr                         in/out write cursor into the destination
//
// Returns the number of source bytes consumed (a multiple of 24, possibly 0).
// The caller is expected to encode whatever remains after that offset.
//
// Every iteration issues two 16-byte loads, at offsets i and i + 12, so it
// touches source bytes [i, i + 28). An iteration is therefore only executed
// while at least 28 source bytes remain, which keeps all loads inside the
// source buffer.
inline size_t encode_bulk_avx2(
    const uint8_t* source_data,
    const size_t source_data_length,
    uint8_t*& dest_ptr
) {
    constexpr size_t bytes_per_iteration = 24;
    constexpr size_t bytes_touched = 28;  // second 16-byte load starts at offset 12

    if (source_data_length < bytes_touched) {
        return 0;
    }

    // Largest count such that the final iteration's loads still end within
    // the source buffer: 24 * (count - 1) + 28 <= length.
    const size_t loop_count =
        (source_data_length - (bytes_touched - bytes_per_iteration)) / bytes_per_iteration;
    const size_t loop_end = loop_count * bytes_per_iteration;

    // Code based on work by Wojciech Muła
    // Ref: http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
    const __m256i preshuffle_256 = _mm256_set_epi8(
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1
    );
    const __m256i t0Mask   = _mm256_set1_epi32(0x0fc0fc00);
    const __m256i t1Values = _mm256_set1_epi32(0x04000040);
    const __m256i t2Mask   = _mm256_set1_epi32(0x003f03f0);
    const __m256i t3Values = _mm256_set1_epi32(0x01000010);
    const __m256i all_51   = _mm256_set1_epi8(51);
    const __m256i all_26   = _mm256_set1_epi8(26);
    const __m256i all_13   = _mm256_set1_epi8(13);
    // Per-range offsets that turn a 6-bit value into its ASCII character;
    // the same 16-entry table is repeated in both 128-bit lanes because the
    // byte shuffle operates per lane.
    const __m256i shiftLUT = _mm256_setr_epi8(
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0,
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
        '/' - 63, 'A', 0, 0
    );

    for (size_t i = 0; i < loop_end; i += bytes_per_iteration, dest_ptr += 32) {
        // Load eight sets of octets at once.
        // b_low  = [????|dddc|ccbb|baaa]
        // b_high = [????|hhhg|ggff|feee]
        const __m128i b_low  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source_data[i]));
        const __m128i b_high = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source_data[i + 12]));

        // b = [?hhh|?ggg|?fff|?eee|?ddd|?ccc|?bbb|?aaa]
        const __m256i b = _mm256_shuffle_epi8(
            _mm256_inserti128_si256(_mm256_castsi128_si256(b_low), b_high, 1),
            preshuffle_256
        );

        // t0       = [0000cccc|CC000000|aaaaaa00|00000000]
        // t1       = [00000000|00cccccc|00000000|00aaaaaa]
        // t2       = [00000000|00dddddd|000000bb|bbbb0000]
        // t3       = [00dddddd|00000000|00bbbbbb|00000000]
        // unpacked = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        const __m256i t0 = _mm256_and_si256(b, t0Mask);
        const __m256i t2 = _mm256_and_si256(b, t2Mask);
        const __m256i t1 = _mm256_mulhi_epu16(t0, t1Values);
        const __m256i t3 = _mm256_mullo_epi16(t2, t3Values);
        const __m256i unpacked = _mm256_or_si256(t1, t3);

        // Reduce each 6-bit value to an index into shiftLUT: values above 51
        // become 1-12 via the saturating subtract, values below 26 become 13,
        // and everything else stays 0.
        const __m256i reduced = _mm256_or_si256(
            _mm256_subs_epu8(unpacked, all_51),
            _mm256_and_si256(_mm256_cmpgt_epi8(all_26, unpacked), all_13)
        );
        // Add the selected offset to obtain the base64 character.
        const __m256i result = _mm256_add_epi8(_mm256_shuffle_epi8(shiftLUT, reduced), unpacked);

        // Output
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_ptr), result);
    }

    return loop_end;
}
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
inline size_t encode_bulk( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
uint8_t*& dest_ptr, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
if (codepath == Codepath::Auto) { |
||||
static auto auto_codepath = get_auto_codepath(); |
||||
codepath = auto_codepath; |
||||
} |
||||
|
||||
switch (codepath) { |
||||
case Codepath::SSSE3: return encode_bulk_ssse3(source_data, source_data_length, dest_ptr); |
||||
case Codepath::AVX2: return encode_bulk_avx2(source_data, source_data_length, dest_ptr); |
||||
default: |
||||
case Codepath::Basic: return 0; |
||||
} |
||||
} |
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
// SSSE3 bulk base64 decoder.
//
// Decodes 16 base64 characters into 12 bytes per iteration, advancing
// dest_ptr by 12 each time. No validation is performed on the input.
//
// Each iteration stores a full 16-byte vector (12 payload bytes plus 4 zero
// bytes), so the final 16-character group of the input is always left for the
// caller; inputs shorter than 32 characters are not processed at all.
//
// Returns the number of source characters consumed (a multiple of 16).
inline size_t decode_bulk_ssse3(
    const uint8_t* source_data,
    const size_t source_data_length,
    uint8_t*& dest_ptr
) {
    const size_t group_count = source_data_length / 16;
    if (group_count < 2) {
        return 0;
    }

    // Hold back the last full group (see the note on the 16-byte store above).
    const size_t loop_end = (group_count - 1) * 16;

    // Code based on work by Wojciech Muła
    // Ref: http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);
    const __m128i slash_char  = _mm_set1_epi8(0x2f);
    const __m128i minus_three = _mm_set1_epi8(-3);
    // Offset to add to a character, indexed by the character's high nibble.
    const __m128i shiftLUT = _mm_setr_epi8(
        /* 0 */ 0x00,        /* 1 */ 0x00,        /* 2 */ 0x3e - 0x2b, /* 3 */ 0x34 - 0x30,
        /* 4 */ 0x00 - 0x41, /* 5 */ 0x0f - 0x50, /* 6 */ 0x1a - 0x61, /* 7 */ 0x29 - 0x70,
        /* 8 */ 0x00,        /* 9 */ 0x00,        /* a */ 0x00,        /* b */ 0x00,
        /* c */ 0x00,        /* d */ 0x00,        /* e */ 0x00,        /* f */ 0x00
    );
    const __m128i packValues1 = _mm_set1_epi32(0x01400140);
    const __m128i packValues2 = _mm_set1_epi32(0x00011000);
    const __m128i unshuffle_128 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);

    size_t offset = 0;
    while (offset < loop_end) {
        // Load four sets of octets at once.
        const __m128i chars = _mm_loadu_si128(reinterpret_cast<const __m128i*>(source_data + offset));

        // Base64 characters -> 6-bit unpacked.
        // '/' shares its high nibble with '+', so it receives an extra -3 correction.
        const __m128i higher_nibble = _mm_and_si128(_mm_srli_epi32(chars, 4), nibble_mask);
        const __m128i is_slash = _mm_cmpeq_epi8(chars, slash_char);
        const __m128i shifted = _mm_add_epi8(chars, _mm_shuffle_epi8(shiftLUT, higher_nibble));
        const __m128i unpacked = _mm_add_epi8(shifted, _mm_and_si128(is_slash, minus_three));

        // 6-bit unpacked -> 8-bit packed
        const __m128i pairs = _mm_maddubs_epi16(unpacked, packValues1);
        const __m128i packed = _mm_madd_epi16(pairs, packValues2);

        // 8-bit packed -> original order
        const __m128i unshuffled = _mm_shuffle_epi8(packed, unshuffle_128);

        // Output
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_ptr), unshuffled);

        offset += 16;
        dest_ptr += 12;
    }

    return loop_end;
}
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
// AVX2 bulk base64 decoder.
//
// Decodes 32 base64 characters into 24 bytes per iteration, advancing
// dest_ptr by 24 each time. No validation is performed on the input.
//
// The result is written with two 16-byte stores (at dest_ptr and
// dest_ptr + 12), so each iteration writes up to 28 bytes. For that reason
// the final 32-character group of the input is always left for the caller;
// inputs shorter than 64 characters are not processed at all.
//
// Returns the number of source characters consumed (a multiple of 32).
inline size_t decode_bulk_avx2(
    const uint8_t* source_data,
    const size_t source_data_length,
    uint8_t*& dest_ptr
) {
    const size_t group_count = source_data_length / 32;
    if (group_count < 2) {
        return 0;
    }

    // Hold back the last full group (see the note on the stores above).
    const size_t loop_end = (group_count - 1) * 32;

    // Code based on work by Wojciech Muła
    // Ref: http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
    const __m256i nibble_mask = _mm256_set1_epi8(0x0f);
    const __m256i slash_char  = _mm256_set1_epi8(0x2f);
    const __m256i minus_three = _mm256_set1_epi8(-3);
    // Offset to add to a character, indexed by the character's high nibble.
    // Repeated for both 128-bit lanes because the byte shuffle works per lane.
    const __m256i shiftLUT = _mm256_setr_epi8(
        /* 0 */ 0x00,        /* 1 */ 0x00,        /* 2 */ 0x3e - 0x2b, /* 3 */ 0x34 - 0x30,
        /* 4 */ 0x00 - 0x41, /* 5 */ 0x0f - 0x50, /* 6 */ 0x1a - 0x61, /* 7 */ 0x29 - 0x70,
        /* 8 */ 0x00,        /* 9 */ 0x00,        /* a */ 0x00,        /* b */ 0x00,
        /* c */ 0x00,        /* d */ 0x00,        /* e */ 0x00,        /* f */ 0x00,
        /* 0 */ 0x00,        /* 1 */ 0x00,        /* 2 */ 0x3e - 0x2b, /* 3 */ 0x34 - 0x30,
        /* 4 */ 0x00 - 0x41, /* 5 */ 0x0f - 0x50, /* 6 */ 0x1a - 0x61, /* 7 */ 0x29 - 0x70,
        /* 8 */ 0x00,        /* 9 */ 0x00,        /* a */ 0x00,        /* b */ 0x00,
        /* c */ 0x00,        /* d */ 0x00,        /* e */ 0x00,        /* f */ 0x00
    );
    const __m256i packValues1 = _mm256_set1_epi32(0x01400140);
    const __m256i packValues2 = _mm256_set1_epi32(0x00011000);
    const __m256i unshuffle_256 = _mm256_setr_epi8(
        2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
        2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1
    );

    size_t offset = 0;
    while (offset < loop_end) {
        // Load eight sets of octets at once.
        const __m256i chars = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(source_data + offset));

        // Base64 characters -> 6-bit unpacked.
        // '/' shares its high nibble with '+', so it receives an extra -3 correction.
        const __m256i higher_nibble = _mm256_and_si256(_mm256_srli_epi32(chars, 4), nibble_mask);
        const __m256i is_slash = _mm256_cmpeq_epi8(chars, slash_char);
        const __m256i shifted = _mm256_add_epi8(chars, _mm256_shuffle_epi8(shiftLUT, higher_nibble));
        const __m256i unpacked = _mm256_add_epi8(shifted, _mm256_and_si256(is_slash, minus_three));

        // 6-bit unpacked -> 8-bit packed
        const __m256i pairs = _mm256_maddubs_epi16(unpacked, packValues1);
        const __m256i packed = _mm256_madd_epi16(pairs, packValues2);

        // 8-bit packed -> original order (12 payload bytes at the bottom of each lane)
        const __m256i unshuffled = _mm256_shuffle_epi8(packed, unshuffle_256);

        // Output: low lane first, then the high lane 12 bytes further on.
        __m128i* const low_out  = reinterpret_cast<__m128i*>(dest_ptr);
        __m128i* const high_out = reinterpret_cast<__m128i*>(dest_ptr + 12);
        _mm_storeu_si128(low_out, _mm256_extracti128_si256(unshuffled, 0));
        _mm_storeu_si128(high_out, _mm256_extracti128_si256(unshuffled, 1));

        offset += 32;
        dest_ptr += 24;
    }

    return loop_end;
}
||||
|
||||
//----------------------------------------------------------------------------------------------------
|
||||
|
||||
inline size_t decode_bulk( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
uint8_t*& dest_ptr, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
if (codepath == Codepath::Auto) { |
||||
static auto auto_codepath = get_auto_codepath(); |
||||
codepath = auto_codepath; |
||||
} |
||||
|
||||
switch (codepath) { |
||||
case Codepath::SSSE3: return decode_bulk_ssse3(source_data, source_data_length, dest_ptr); |
||||
case Codepath::AVX2: return decode_bulk_avx2(source_data, source_data_length, dest_ptr); |
||||
default: |
||||
case Codepath::Basic: return 0; |
||||
} |
||||
} |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Returns the number of base64 characters needed to encode binary_length bytes.
// With padding the result is always a multiple of four; without padding a
// trailing group of one or two bytes takes two or three characters respectively.
inline size_t get_encoded_length(size_t binary_length, bool padded = true) {
    if (!padded) {
        const size_t full_groups = binary_length / 3;
        const size_t tail_bytes = binary_length % 3;
        const size_t tail_chars = (tail_bytes == 0) ? 0 : tail_bytes + 1;
        return full_groups * 4 + tail_chars;
    }

    return (binary_length + 2) / 3 * 4;
}
||||
|
||||
// Returns the number of bytes that decoding the given base64 text produces.
//
// A length that is not a multiple of four is treated as unpadded input. When
// the length is a multiple of four, the last two characters are inspected for
// '=' padding. The characters themselves are not validated.
inline size_t get_decoded_length(const uint8_t* data, const size_t data_length) {
    if (data_length == 0) {
        return 0;
    }

    const size_t full_groups = data_length / 4;
    const size_t leftover_chars = data_length % 4;

    // Unpadded data: the trailing partial group carries (leftover - 1) bytes.
    if (leftover_chars != 0) {
        return full_groups * 3 + (leftover_chars - 1);
    }

    // Either the binary length was a multiple of three, or the text is padded.
    const size_t decoded = full_groups * 3;
    if (data[data_length - 2] == '=') {
        return decoded - 2;
    }
    if (data[data_length - 1] == '=') {
        return decoded - 1;
    }
    return decoded;
}
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Primary base64 encoding method. Asserts that the destination buffer is _exactly_ the required size.
|
||||
inline void encode( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
uint8_t* dest_data, |
||||
const size_t dest_data_length, |
||||
bool padded = true, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
if (get_encoded_length(source_data_length, padded) != dest_data_length) { |
||||
throw std::logic_error("Dest buffer is incorrect size"); |
||||
} |
||||
|
||||
// Use bulk vectorized encoding for as much data as possible.
|
||||
auto dest_ptr = dest_data; |
||||
size_t loop_end = detail::encode_bulk(source_data, source_data_length, dest_ptr, codepath); |
||||
|
||||
size_t remainder = source_data_length - loop_end; |
||||
size_t octet_count = (remainder / 3); |
||||
size_t octet_end = loop_end + (octet_count * 3); |
||||
|
||||
// Process three source values at a time.
|
||||
for (size_t i = loop_end; i < octet_end; i += 3, dest_ptr += 4) { |
||||
uint8_t b0 = source_data[i ]; |
||||
uint8_t b1 = source_data[i+1]; |
||||
uint8_t b2 = source_data[i+2]; |
||||
|
||||
dest_ptr[0] = detail::Base64LUT[b0 >> 2]; |
||||
dest_ptr[1] = detail::Base64LUT[(b0 & 0x03) << 4 | b1 >> 4]; |
||||
dest_ptr[2] = detail::Base64LUT[(b1 & 0x0F) << 2 | b2 >> 6]; |
||||
dest_ptr[3] = detail::Base64LUT[b2 & 0x3F]; |
||||
} |
||||
|
||||
// Handle the remaining values separately to avoid branches the main loop.
|
||||
remainder = source_data_length - octet_end; |
||||
if (remainder == 2) { |
||||
uint8_t b0 = source_data[octet_end ]; |
||||
uint8_t b1 = source_data[octet_end+1]; |
||||
|
||||
dest_ptr[0] = detail::Base64LUT[b0 >> 2]; |
||||
dest_ptr[1] = detail::Base64LUT[(b0 & 0x03) << 4 | b1 >> 4]; |
||||
dest_ptr[2] = detail::Base64LUT[(b1 & 0x0F) << 2]; |
||||
if (padded) { |
||||
dest_ptr[3] = '='; |
||||
} |
||||
|
||||
} else if (remainder == 1) { |
||||
uint8_t b0 = source_data[octet_end]; |
||||
|
||||
dest_ptr[0] = detail::Base64LUT[b0 >> 2]; |
||||
dest_ptr[1] = detail::Base64LUT[(b0 & 0x03) << 4]; |
||||
|
||||
if (padded) { |
||||
dest_ptr[2] = '='; |
||||
dest_ptr[3] = '='; |
||||
} |
||||
} |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Helper to encode directly to a std::string.
|
||||
inline std::string encode_to_string( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
bool padded = true, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
std::string str( |
||||
get_encoded_length(source_data_length, padded), |
||||
'=' |
||||
); |
||||
|
||||
encode( |
||||
source_data, |
||||
source_data_length, |
||||
reinterpret_cast<uint8_t*>(str.data()), |
||||
str.size(), |
||||
padded, |
||||
codepath |
||||
); |
||||
|
||||
return str; |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Helper to encode directly to a std::vector. This is slightly faster than std::string as it doesn't
|
||||
// need to initialize the buffer before encoding.
|
||||
inline std::vector<uint8_t> encode_to_byte_vector( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
bool padded = true, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
std::vector<uint8_t> buf; |
||||
buf.resize(get_encoded_length(source_data_length, padded)); |
||||
|
||||
encode( |
||||
source_data, |
||||
source_data_length, |
||||
buf.data(), |
||||
buf.size(), |
||||
padded, |
||||
codepath |
||||
); |
||||
|
||||
return buf; |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Primary base64 decoding method. Asserts that the destination buffer is _exactly_ the required size.
|
||||
inline void decode( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
uint8_t* dest_data, |
||||
const size_t dest_data_length, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
size_t binary_length = get_decoded_length(source_data, source_data_length); |
||||
if (binary_length != dest_data_length) { |
||||
throw std::logic_error("Dest buffer is incorrect size"); |
||||
} |
||||
|
||||
auto dest_ptr = dest_data; |
||||
size_t loop_end = detail::decode_bulk(source_data, source_data_length, dest_ptr, codepath); |
||||
|
||||
size_t binary_remainder = dest_data_length - std::distance(dest_data, dest_ptr); |
||||
size_t octet_count = binary_remainder / 3; |
||||
size_t octet_end = loop_end + (octet_count * 4); |
||||
|
||||
// Process four source values at a time.
|
||||
for (size_t i = loop_end; i < octet_end; i += 4, dest_ptr += 3) { |
||||
uint8_t b0 = detail::Base64InverseLUT[source_data[i ]]; |
||||
uint8_t b1 = detail::Base64InverseLUT[source_data[i+1]]; |
||||
uint8_t b2 = detail::Base64InverseLUT[source_data[i+2]]; |
||||
uint8_t b3 = detail::Base64InverseLUT[source_data[i+3]]; |
||||
|
||||
dest_ptr[0] = b0 << 2 | b1 >> 4; |
||||
dest_ptr[1] = b1 << 4 | b2 >> 2; |
||||
dest_ptr[2] = b2 << 6 | b3; |
||||
} |
||||
|
||||
// Handle the remaining values separately to avoid branches the main loop.
|
||||
binary_remainder -= (octet_count * 3); |
||||
if (binary_remainder == 2) { |
||||
uint8_t b0 = detail::Base64InverseLUT[source_data[octet_end ]]; |
||||
uint8_t b1 = detail::Base64InverseLUT[source_data[octet_end+1]]; |
||||
uint8_t b2 = detail::Base64InverseLUT[source_data[octet_end+2]]; |
||||
|
||||
dest_ptr[0] = b0 << 2 | b1 >> 4; |
||||
dest_ptr[1] = b1 << 4 | b2 >> 2; |
||||
|
||||
} else if (binary_remainder == 1) { |
||||
uint8_t b0 = detail::Base64InverseLUT[source_data[octet_end ]]; |
||||
uint8_t b1 = detail::Base64InverseLUT[source_data[octet_end+1]]; |
||||
|
||||
dest_ptr[0] = b0 << 2 | b1 >> 4; |
||||
} |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Helper to decode directly to a std::string.
|
||||
inline std::string decode_to_string( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
std::string str(get_decoded_length(source_data, source_data_length), '\0'); |
||||
|
||||
decode( |
||||
source_data, |
||||
source_data_length, |
||||
reinterpret_cast<uint8_t*>(str.data()), |
||||
str.size(), |
||||
codepath |
||||
); |
||||
|
||||
return str; |
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Helper to decode directly to a std::vector. This is slightly faster than std::string as it doesn't
|
||||
// need to initialize the buffer before decoding.
|
||||
inline std::vector<uint8_t> decode_to_vector( |
||||
const uint8_t* source_data, |
||||
const size_t source_data_length, |
||||
Codepath codepath = Codepath::Auto |
||||
) { |
||||
std::vector<uint8_t> buf; |
||||
buf.resize(get_decoded_length(source_data, source_data_length)); |
||||
|
||||
decode( |
||||
source_data, |
||||
source_data_length, |
||||
buf.data(), |
||||
buf.size(), |
||||
codepath |
||||
); |
||||
|
||||
return buf; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,86 @@
@@ -0,0 +1,86 @@
|
||||
#pragma once |
||||
|
||||
#include <array> |
||||
#ifdef _MSC_VER |
||||
#include <intrin.h> |
||||
#endif |
||||
#ifdef __GNUG__ |
||||
#include <cpuid.h> |
||||
#endif |
||||
|
||||
namespace cpu_features { |
||||
|
||||
// Bit flags describing the instruction-set extensions reported for the CPU.
// Each enumerator occupies its own bit so that values can be OR-ed together
// and tested with '&'.
enum Features : uint64_t {
    None     = 0x000,
    SSE      = 0x001,
    SSE2     = 0x002,
    SSE3     = 0x004,
    SSSE3    = 0x008,
    SSE4_1   = 0x010,
    SSE4_2   = 0x020,
    AVX      = 0x040,
    AVX2     = 0x080,
    AVX512F  = 0x100,
    AVX512PF = 0x200,
    AVX512ER = 0x400,
    AVX512CD = 0x800,
};
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
namespace detail { |
||||
#ifdef _MSC_VER |
||||
// MSVC flavour: runs CPUID for the requested leaf through the __cpuid
// intrinsic, which fills the four ints of `info` with the result registers.
inline void cpuid(std::array<int, 4>& info, int level) {
    int* const registers = info.data();
    __cpuid(registers, level);
}
||||
#endif |
||||
#ifdef __GNUG__ |
||||
// GCC/Clang flavour: runs CPUID for the requested leaf and writes the four
// result registers into `info`.
// Leaf 1 goes through __get_cpuid; every other leaf is queried with
// sub-leaf 0 via __cpuid_count.
inline void cpuid(std::array<int, 4>& info, int level) {
    // <cpuid.h> works on unsigned registers; view the caller's ints as such.
    auto* const regs = reinterpret_cast<unsigned int*>(info.data());

    if (level != 1) {
        __cpuid_count(level, 0, regs[0], regs[1], regs[2], regs[3]);
        return;
    }
    __get_cpuid(level, &regs[0], &regs[1], &regs[2], &regs[3]);
}
||||
#endif |
||||
|
||||
// Reads extended control register 0 (XCR0) with the XGETBV instruction.
// Must only be called once CPUID leaf 1 has reported OSXSAVE (ECX bit 27);
// executing XGETBV without it faults.
inline uint64_t read_xcr0() {
#if defined(__GNUC__) || defined(__clang__)
    unsigned int low = 0;
    unsigned int high = 0;
    __asm__ volatile("xgetbv" : "=a"(low), "=d"(high) : "c"(0));
    return (static_cast<uint64_t>(high) << 32) | low;
#else
    return _xgetbv(0);
#endif
}

// Queries CPUID and builds the Features bit set for the running machine.
//
// SSE-family flags are taken directly from CPUID leaf 1. The AVX-family flags
// additionally require that the operating system has enabled the matching
// register state in XCR0: a CPU can advertise AVX/AVX2/AVX-512 while the OS
// does not save YMM/ZMM registers, in which case executing those instructions
// faults. AVX and AVX2 need XCR0 bits 1-2 (SSE + AVX state); the AVX-512 flags
// need bits 5-7 as well (opmask, ZMM_Hi256, Hi16_ZMM).
inline Features get_features_impl() {
    std::array<int, 4> leaf0 = {0};
    cpuid(leaf0, 0);
    const int feature_levels = leaf0[0];  // highest supported basic leaf

    // Feature level 1 always exists. A separate zeroed array is used so that
    // no leaf-0 data can be misread as feature bits if the query writes nothing.
    std::array<int, 4> leaf1 = {0};
    cpuid(leaf1, 1);

    std::underlying_type_t<Features> features = Features::None;
    if (leaf1[3] & (1 << 25)) { features |= Features::SSE; }
    if (leaf1[3] & (1 << 26)) { features |= Features::SSE2; }
    if (leaf1[2] & (1 << 0))  { features |= Features::SSE3; }
    if (leaf1[2] & (1 << 9))  { features |= Features::SSSE3; }
    if (leaf1[2] & (1 << 19)) { features |= Features::SSE4_1; }
    if (leaf1[2] & (1 << 20)) { features |= Features::SSE4_2; }

    // Determine which extended register state the OS has enabled.
    bool ymm_enabled = false;
    bool zmm_enabled = false;
    if (leaf1[2] & (1 << 27)) {  // OSXSAVE: XGETBV is available
        const uint64_t xcr0 = read_xcr0();
        ymm_enabled = (xcr0 & 0x06) == 0x06;
        zmm_enabled = (xcr0 & 0xE6) == 0xE6;
    }

    if (ymm_enabled && (leaf1[2] & (1 << 28))) { features |= Features::AVX; }

    // Feature level 7
    if (feature_levels >= 7) {
        std::array<int, 4> leaf7 = {0};
        cpuid(leaf7, 7);
        if (ymm_enabled && (leaf7[1] & (1 << 5)))  { features |= Features::AVX2; }
        if (zmm_enabled && (leaf7[1] & (1 << 16))) { features |= Features::AVX512F; }
        if (zmm_enabled && (leaf7[1] & (1 << 26))) { features |= Features::AVX512PF; }
        if (zmm_enabled && (leaf7[1] & (1 << 27))) { features |= Features::AVX512ER; }
        if (zmm_enabled && (leaf7[1] & (1 << 28))) { features |= Features::AVX512CD; }
    }

    return static_cast<Features>(features);
}
||||
} |
||||
|
||||
//--------------------------------------------------------------------------------------------------------
|
||||
|
||||
// Returns the CPU feature flags. Detection runs once, on the first call; the
// result is kept in a function-local static and returned on every later call.
inline Features get_features() {
    static const Features cached_features = detail::get_features_impl();
    return cached_features;
}
||||
|
||||
} |
Loading…
Reference in new issue