diff --git a/src/unicode/convert.h b/src/unicode/convert.h index 365f1e0..bb10017 100644 --- a/src/unicode/convert.h +++ b/src/unicode/convert.h @@ -13,12 +13,29 @@ */ #pragma once +#include +#include +#include +#include +//#include + #include #include #include #include #include +namespace unicode::accelerator +{ +#if defined(__AVX512BW__) + using optimal = avx512bw; +#elif defined(__SSE__) + using optimal = sse; +#else + using optimal = naive; +#endif +} + namespace unicode { template struct decoder; @@ -60,7 +77,60 @@ template<> struct decoder // {{{ unsigned expectedLength = 0; unsigned currentLength = 0; + LIBUNICODE_ALIGNED_FUNC + LIBUNICODE_FORCE_INLINE + decoder_status operator()(uint8_t const* _begin, + uint8_t const* _end, + char32_t* _output) + { + return consume(_begin, _end, _output); + } + + template + LIBUNICODE_ALIGNED_FUNC + LIBUNICODE_FORCE_INLINE + decoder_status consume(uint8_t const* _begin, + uint8_t const* _end, + char32_t* _output) + { + uint8_t const* inputBegin = _begin; + char32_t const* outputBegin = _output; + + if (!consumeUntilAligned(_begin, _end, _output)) + return decoder_status{ + false, + static_cast(_begin - inputBegin), + static_cast(_output - outputBegin), + }; + + // Accelerated processing (128-bit aligned blocks) + while (_begin <= _end - Accelerator::alignment) + { + if (*_begin < 0x80) + detail::convertAsciiBlockOnce(_begin, _output); + else if (auto const opt = consumeCodepoint(_begin, _end)) + *_output++ = *opt; + else + return decoder_status{ + false, + static_cast(_begin - inputBegin), + static_cast(_output - outputBegin), + }; + } + + return decoder_status{ + consumeTrailingBytes(_begin, _end, _output), + static_cast(_begin - inputBegin), + static_cast(_output - outputBegin), + }; + } + constexpr std::optional operator()(uint8_t _byte) + { + return consumeCodeunit(_byte); + } + + constexpr std::optional consumeCodeunit(uint8_t _byte) { if (!expectedLength) { @@ -88,7 +158,10 @@ template<> struct decoder // {{{ character = _byte & 0b0000'0111; } else + { + expectedLength = 0; // reset state return std::nullopt; // invalid + } } else { @@ -106,88 +179,62 @@ template<> struct decoder // {{{ template < typename InputIterator, + typename InputSentinel, std::enable_if_t()), char>, int> = 0 > - constexpr std::optional operator()(InputIterator& _input) + constexpr std::optional operator()(InputIterator& _input, InputSentinel _end) { - using std::nullopt; + return consumeCodepoint(_input, _end); + } - auto const ch0 = uint8_t(*_input++); - if (ch0 < 0x80) // 0xxx_xxxx - return static_cast(ch0); + template < + typename InputIterator, + typename InputSentinel, + std::enable_if_t()), char>, int> = 0 + > + LIBUNICODE_FORCE_INLINE + constexpr std::optional consumeCodepoint(InputIterator& _input, InputSentinel _end) + { + while (_input < _end) + if (auto codepoint = consumeCodeunit(*_input++)) + return codepoint; - if (ch0 < 0xC0) - return nullopt; + return std::nullopt; + } - if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx +private: + template + LIBUNICODE_FORCE_INLINE + bool consumeUntilAligned(uint8_t const*& _begin, + uint8_t const* _end, + char32_t*& _output) + { + // Consume until 128-bit aligned. + while (_begin < _end && !detail::is_aligned(_begin, Accelerator::alignment)) { - auto const ch1 = uint8_t(*_input++); - if ((ch1 >> 6) != 2) - return nullopt; - return static_cast((ch0 << 6) + ch1 - 0x3080); + if (auto const opt = consumeCodepoint(_begin, _end)) + *_output++ = *opt; + else + return false; } + return true; + } - if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx - { - auto const ch1 = uint8_t(*_input++); - if (ch1 >> 6 != 2) - return nullopt; - auto const ch2 = uint8_t(*_input++); - if (ch2 >> 6 != 2) - return nullopt; - return static_cast((ch0 << 12) + (ch1 << 6) + ch2 - 0xE2080); - } - if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx - { - auto const ch1 = uint8_t(*_input++); - if (ch1 >> 6 != 2) - return nullopt; - auto const ch2 = uint8_t(*_input++); - if (ch2 >> 6 != 2) - return nullopt; - auto const ch3 = uint8_t(*_input++); - if (ch3 >> 6 != 2) - return nullopt; - return static_cast((ch0 << 18) + (ch1 << 12) + (ch2 << 6) + ch3 - 0x3C82080); - } - if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx - { - auto const ch1 = uint8_t(*_input++); - if (ch1 >> 6 != 2) - return nullopt; - auto const ch2 = uint8_t(*_input++); - if (ch2 >> 6 != 2) - return nullopt; - auto const ch3 = uint8_t(*_input++); - if (ch3 >> 6 != 2) - return nullopt; - auto const ch4 = uint8_t(*_input++); - if (ch4 >> 6 != 2) - return nullopt; - auto const a = static_cast((ch0 << 24u) + (ch1 << 18u) + (ch2 << 12u) + (ch3 << 6u) + ch4); - return static_cast(a - 0xFA082080lu); - } - if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + LIBUNICODE_FORCE_INLINE + bool consumeTrailingBytes(uint8_t const*& _begin, + uint8_t const* _end, + char32_t*& _output) + { + while (_begin < _end) { - auto const ch1 = uint8_t(*_input++); - if (ch1 >> 6 != 2) - return nullopt; - auto const ch2 = uint8_t(*_input++); - if (ch2 >> 6 != 2) - return nullopt; - auto const ch3 = uint8_t(*_input++); - if (ch3 >> 6 != 2) - return nullopt; - auto const ch4 = uint8_t(*_input++); - if (ch4 >> 6 != 2) - return nullopt; - auto const ch5 = uint8_t(*_input++); - if (ch5 >> 6 != 2) - return nullopt; - auto const a = static_cast((ch0 << 30) + (ch1 << 24) + (ch2 << 18) + (ch3 << 12) + (ch4 << 6) + ch5); - return static_cast(a - 0x82082080); + if (*_begin < 0x80) + *_output++ = *_begin++; + else if (auto const opt = consumeCodepoint(_begin, _end)) + *_output++ = *opt; + else + return false; } - return nullopt; + return true; } }; // }}} template<> struct encoder // {{{ @@ -261,10 +308,13 @@ template<> struct encoder // {{{ (no-op) template<> struct decoder // {{{ (no-op) { - template - constexpr std::optional operator()(InputIterator& _input) + template + constexpr std::optional operator()(InputIterator& _input, InputSentinel _end) { - return *_input++; + if (_input != _end) + return *_input++; + else + return std::nullopt; } }; // }}} template<> struct encoder // {{{ @@ -321,7 +371,7 @@ OutputIterator convert_to(std::basic_string_view _input, OutputIterator _outp encoder write{}; while (i != e) { - auto const outChar = read(i); + auto const outChar = read(i, e); if (outChar.has_value()) _output = write(outChar.value(), _output); } diff --git a/src/unicode/convert_test.cpp b/src/unicode/convert_test.cpp index 24a9a74..7978201 100644 --- a/src/unicode/convert_test.cpp +++ b/src/unicode/convert_test.cpp @@ -70,6 +70,141 @@ TEST_CASE("convert.8_to_32", "[convert]") CHECK(output == U"[ö€😀"); } +TEST_CASE("convert.utf8.ascii_sse.1.0") +{ + auto const input = "0123456789ABCDEF"sv; + auto const expected = U"0123456789ABCDEF"sv; + + auto const* i = (char8_type const*) input.data(); + auto const* e = (char8_type const*) input.data() + input.size(); + + auto decoder = unicode::decoder{}; + + u32string output; + output.resize(16); + auto const [success, nread, nwritten] = decoder.consume(i, e, output.data()); + + CHECK(success); + CHECK(nread == 16); + output.resize(nwritten); + + CHECK(output == expected); +} + +TEST_CASE("convert.utf8.ascii_sse.1.1") +{ + auto constexpr input = "0123456789ABCDEFa"sv; + auto constexpr expected = U"0123456789ABCDEFa"sv; + + auto const* i = (char8_type const*) input.data(); + auto const* e = (char8_type const*) input.data() + input.size(); + + auto decoder = unicode::decoder{}; + + u32string output; + output.resize(input.size()); + auto const [success, nread, nwritten] = decoder.consume(i, e, output.data()); + + CHECK(success); + CHECK(nread == input.size()); + output.resize(nwritten); + + CHECK(output == expected); +} + +TEST_CASE("convert.utf8.ascii_sse.3.0") +{ + auto constexpr input = "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"sv; + auto constexpr expected = U"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"sv; + + auto const* i = (char8_type const*) input.data(); + auto const* e = (char8_type const*) input.data() + input.size(); + + auto decoder = unicode::decoder{}; + + u32string output; + output.resize(input.size()); + auto const [success, nread, nwritten] = decoder.consume(i, e, output.data()); + + CHECK(success); + CHECK(nread == input.size()); + output.resize(nwritten); + + CHECK(output == expected); +} + +template +std::string join(array _chunks) +{ + std::stringstream ss; + for (auto chunk: _chunks) + ss << chunk; + return ss.str(); +} + +std::u32string ascii2utf32(std::string _input) +{ + std::u32string s; + s.reserve(_input.size()); + for (char ch: _input) + s += static_cast(ch); + return s; +} + +TEST_CASE("convert.utf8.ascii_sse.chunked") +{ + auto constexpr input = array{ + "0123456789ABCDEF"sv, + "\\n\\033[1"sv, + ";2m"sv + }; + auto expected = ascii2utf32(join(input)); + + auto decoder = unicode::decoder{}; + + u32string output; + output.resize(expected.size()); + size_t totalWritten = 0; + for (size_t k = 0; k < input.size(); ++k) + { + auto chunk = input[k]; + auto const* i = (char8_type const*) chunk.data(); + auto const* e = (char8_type const*) chunk.data() + chunk.size(); + + auto chunkStr = string(chunk); + auto const [success, nread, nwritten] = decoder.consume(i, e, output.data() + totalWritten); + CHECK(success); + CHECK(nread == chunk.size()); + CHECK(nwritten == nread); // because it's ASCII + totalWritten += nwritten; + } + CHECK(totalWritten == expected.size()); // because it's ASCII + output.resize(expected.size()); + + CHECK(output == expected); +} + +TEST_CASE("convert.utf8.ascii_sse.2") +{ + auto constexpr input = "0123456789ABCDEF0123456789ABCDEF"sv; + auto constexpr expected = U"0123456789ABCDEF0123456789ABCDEF"sv; + + auto const* i = (char8_type const*) input.data(); + auto const* e = (char8_type const*) input.data() + input.size(); + + auto decoder = unicode::decoder{}; + + u32string output; + output.resize(input.size() * 2); + auto const [success, nread, nwritten] = decoder.consume(i, e, output.data()); + + CHECK(success); + CHECK(nread == input.size()); + output.resize(nwritten); + + CHECK(output == expected); +} + TEST_CASE("convert.utf8.incremental_decode", "[utf8]") { auto constexpr values = string_view{ diff --git a/src/unicode/detail/convert-avx512.hpp b/src/unicode/detail/convert-avx512.hpp new file mode 100644 index 0000000..cfc2ede --- /dev/null +++ b/src/unicode/detail/convert-avx512.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include + +#include + +namespace unicode::accelerator +{ + struct avx512 + { + static constexpr std::size_t alignment = 16; + }; +}; + +namespace unicode::detail +{ + +#if defined(__AVX512BW__) + +template <> +LIBUNICODE_FORCE_INLINE +void convertAsciiBlockOnce(unsigned char const*& _begin, char32_t*& _output) noexcept +{ + assert(is_aligned(_begin, accelerator::avx512::alignment)); + + // VPUNPCKLBW: _mm256_unpacklo_epi8 + + // Input: 16 codepoints input (128 bit) + // Output: up to 16 zero-extended 32-bit values, and pointers incremented accordingly + + __m128i input = _mm_loadu_si128((__m128i const*) _begin); // VMOVDQU: load 16 bytes + uint32_t mask = _mm_movemask_epi8(input); // VPMOVMSKB: Determine which octets have high bit set + + //__m256i extended = _mm128_cvtepu8_epi16(input); // VPMOVXZBD: packed zero-extend bytes to DWORD's + //_mm256_store_epi32(_output, extended); // VMOVDQA32: Write to memory + __m512i extended = _mm512_cvtepu8_epi32(input); // zero extend input bytes to words. + + auto zero = _mm_set1_epi8(0); //- Zero out the interleave register + //auto zero = _mm256_set1_epi16(0); //- Zero out the interleave register + auto a = _mm_unpacklo_epi8(input, zero); + _mm256_storeu_si256((__m256i *) _output, a); //- Write to memory + +#if 1 + auto const incr = /* mask == 0 ? 16 : */ trailingZeros(mask); + _begin += incr; + _output += incr; +#else + if (mask == 0) { + _begin += 16; + _output += 16; + } else { + auto const incr = trailingZeros(mask); + _begin += incr; + _output += incr; + } +#endif +} + +#endif // __AVX512BW__ + +} diff --git a/src/unicode/detail/convert-avx512bw.hpp b/src/unicode/detail/convert-avx512bw.hpp new file mode 100644 index 0000000..a0476f3 --- /dev/null +++ b/src/unicode/detail/convert-avx512bw.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace unicode::accelerator +{ + struct avx512bw + { + static constexpr std::size_t alignment = 16; + }; +} + +namespace unicode::detail +{ + +#if defined(__AVX512BW__) + +template <> +LIBUNICODE_FORCE_INLINE +void convertAsciiBlockOnce(unsigned char const*& _begin, char32_t*& _output) noexcept +{ + assert(is_aligned(_begin, accelerator::avx512bw::alignment)); + + // VMOVDQU: load 16 bytes + __m128i input = _mm_loadu_si128((__m128i const*) _begin); + + // VPMOVMSKB: Determine which octets have high bit set + uint32_t mask = _mm_movemask_epi8(input); + + // VPMOVXZBD: packed zero-extend bytes to DWORD's + __m512i extended = _mm512_cvtepu8_epi32(input); + + // VMOVDQA32: Write to memory + _mm512_store_epi64(_output, extended); + + auto const incr = mask == 0 ? 16 : trailingZeros(mask); + _begin += incr; + _output += incr; +} + +#endif // __AVX512BW__ + +} diff --git a/src/unicode/detail/convert-common.hpp b/src/unicode/detail/convert-common.hpp new file mode 100644 index 0000000..649eac3 --- /dev/null +++ b/src/unicode/detail/convert-common.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#if !defined(_MSC_VER) + #define LIBUNICODE_ALIGNED_FUNC __attribute__((aligned (128))) + #ifdef __OPTIMIZE__ + #define LIBUNICODE_FORCE_INLINE inline __attribute__((always_inline)) + #else + #define LIBUNICODE_FORCE_INLINE inline + #endif +#else + #define LIBUNICODE_ALIGNED_FUNC + #define LIBUNICODE_FORCE_INLINE inline +#endif + +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_IX86_FP) +#define LIBUNICODE_TARGET_SSE2 1 +#endif + +namespace unicode +{ + struct decoder_status + { + bool success; + std::size_t read_offset; + std::size_t write_offset; + }; +} + +namespace unicode::detail +{ + +template +LIBUNICODE_FORCE_INLINE +bool is_ascii(T _char) noexcept +{ + return _char < 0x80; +} + +inline bool is_aligned(void const* _pointer, size_t _byte_count) noexcept +{ + return uintptr_t(_pointer) % _byte_count == 0; +} + +LIBUNICODE_FORCE_INLINE uint32_t trailingZeros(int32_t _value) noexcept +{ +#if ((defined(__linux__) && (defined(__clang__) || defined(__GNUC__))) || \ + (defined(__WIN32) && !defined(_MSVC)) || \ + defined(__APPLE__)) + return __builtin_ctz((unsigned int) _value); +#elif defined(_WIN32) // && defined(_MSVC) + unsigned long count; + _BitScanForward(&count, (unsigned long) _value); + return static_cast(count); +#else + #error Unsupported Platform / Compiler +#endif +} + +template +void convertAsciiBlockOnce(unsigned char const*& _begin, char32_t*& _output) noexcept; + +} diff --git a/src/unicode/detail/convert-naive.hpp b/src/unicode/detail/convert-naive.hpp new file mode 100644 index 0000000..61cb51e --- /dev/null +++ b/src/unicode/detail/convert-naive.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace unicode::accelerator +{ + struct naive + { + static constexpr std::size_t alignment = 1; + }; +}; + +namespace unicode::detail +{ + +template <> +LIBUNICODE_FORCE_INLINE +void convertAsciiBlockOnce(unsigned char const*& _input, char32_t*& _output) noexcept +{ + for (size_t i = 0; i < accelerator::naive::alignment && is_ascii(*_input); ++i) + *_output++ = static_cast(*_input++); +} + +} diff --git a/src/unicode/detail/convert-sse.hpp b/src/unicode/detail/convert-sse.hpp new file mode 100644 index 0000000..c440927 --- /dev/null +++ b/src/unicode/detail/convert-sse.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +namespace unicode::accelerator +{ + struct sse + { + static constexpr std::size_t alignment = 16; + }; +}; + +namespace unicode::detail +{ + +#if defined(LIBUNICODE_TARGET_SSE2) + +template <> +LIBUNICODE_FORCE_INLINE +void convertAsciiBlockOnce(unsigned char const*& _begin, char32_t*& _output) noexcept +{ + // Ensure 16-byte alignment. + // If not aligned, _mm_loadu_si128 should be used. + assert(is_aligned(_begin, accelerator::sse::alignment)); + + __m128i zero = _mm_set1_epi8(0); // Zero out the interleave register + __m128i chunk = _mm_load_si128((__m128i const*) _begin); // Load a register with 8-bit bytes + int32_t mask = _mm_movemask_epi8(chunk); // Determine which octets have high bit set + + __m128i half = _mm_unpacklo_epi8(chunk, zero); // Unpack bytes 0-7 into 16-bit words + __m128i qrtr = _mm_unpacklo_epi16(half, zero); // Unpack words 0-3 into 32-bit dwords + _mm_storeu_si128((__m128i*) _output, qrtr); // Write to memory + qrtr = _mm_unpackhi_epi16(half, zero); // Unpack words 4-7 into 32-bit dwords + _mm_storeu_si128((__m128i*) (_output + 4), qrtr); // Write to memory + + half = _mm_unpackhi_epi8(chunk, zero); // Unpack bytes 8-15 into 16-bit words + qrtr = _mm_unpacklo_epi16(half, zero); // Unpack words 8-11 into 32-bit dwords + _mm_storeu_si128((__m128i*) (_output + 8), qrtr); // Write to memory + qrtr = _mm_unpackhi_epi16(half, zero); // Unpack words 12-15 into 32-bit dwords + _mm_storeu_si128((__m128i*) (_output + 12), qrtr); // Write to memory + + auto const incr = mask == 0 ? 16 : trailingZeros(mask); + _begin += incr; + _output += incr; +} + +#endif + +}