From fdfd1e61797846da94848699158f28e5ebb22c9f Mon Sep 17 00:00:00 2001 From: Alexander Pirogov Date: Mon, 18 Dec 2023 15:29:07 +0200 Subject: [PATCH] Fixing bad utf support in to_string implementation in RD by using https://github.com/ww898/utf-cpp for string processing --- .../src/rd_core_cpp/src/main/std/to_string.h | 13 +- rd-cpp/thirdparty/CMakeLists.txt | 4 + rd-cpp/thirdparty/utf-cpp/LICENSE.md | 21 ++ rd-cpp/thirdparty/utf-cpp/README.md | 58 ++++ .../utf-cpp/include/ww898/cp_utf16.hpp | 109 ++++++++ .../utf-cpp/include/ww898/cp_utf32.hpp | 67 +++++ .../utf-cpp/include/ww898/cp_utf8.hpp | 158 +++++++++++ .../utf-cpp/include/ww898/cp_utfw.hpp | 47 ++++ .../utf-cpp/include/ww898/utf_config.hpp | 41 +++ .../utf-cpp/include/ww898/utf_converters.hpp | 256 ++++++++++++++++++ .../utf-cpp/include/ww898/utf_selector.hpp | 54 ++++ .../utf-cpp/include/ww898/utf_sizes.hpp | 136 ++++++++++ 12 files changed, 956 insertions(+), 8 deletions(-) create mode 100644 rd-cpp/thirdparty/utf-cpp/LICENSE.md create mode 100644 rd-cpp/thirdparty/utf-cpp/README.md create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf16.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf32.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf8.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utfw.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/utf_config.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/utf_converters.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/utf_selector.hpp create mode 100644 rd-cpp/thirdparty/utf-cpp/include/ww898/utf_sizes.hpp diff --git a/rd-cpp/src/rd_core_cpp/src/main/std/to_string.h b/rd-cpp/src/rd_core_cpp/src/main/std/to_string.h index fd8b6725f..91fbab39f 100644 --- a/rd-cpp/src/rd_core_cpp/src/main/std/to_string.h +++ b/rd-cpp/src/rd_core_cpp/src/main/std/to_string.h @@ -1,15 +1,15 @@ +// ReSharper disable CppUE4CodingStandardNamingViolationWarning #ifndef 
RD_CPP_TO_STRING_H #define RD_CPP_TO_STRING_H #include -#include #include #include -#include #include #include #include -#include + +#include "ww898/utf_converters.hpp" #include @@ -31,9 +31,7 @@ inline std::string to_string(const char* val) inline std::string to_string(std::wstring const& val) { - using convert_type = std::codecvt_utf8; - std::wstring_convert converter; - return converter.to_bytes(val); + return ww898::utf::conv(val); } inline std::string to_string(std::thread::id const& id) @@ -123,8 +121,7 @@ using std::to_wstring; inline std::wstring to_wstring(std::string const& s) { - // TO-DO: fix this wrong implementation - return std::wstring(s.begin(), s.end()); + return ww898::utf::conv(s); } template diff --git a/rd-cpp/thirdparty/CMakeLists.txt b/rd-cpp/thirdparty/CMakeLists.txt index 455c32496..7fe938148 100644 --- a/rd-cpp/thirdparty/CMakeLists.txt +++ b/rd-cpp/thirdparty/CMakeLists.txt @@ -9,6 +9,9 @@ add_subdirectory(CTPL) set(SPDLOG_BUILD_SHARED ON CACHE BOOL "Build shared library" FORCE) add_subdirectory(spdlog) +add_library(utf-cpp INTERFACE) +target_include_directories(utf-cpp INTERFACE utf-cpp/include) + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/spdlog/include/ DESTINATION "${CMAKE_INSTALL_PUBLIC_HEADER_THIRDPARTY}" CONFIGURATIONS Release @@ -33,6 +36,7 @@ target_link_libraries(thirdparty PUBLIC ctpl countdownlatch spdlog::spdlog + utf-cpp ) install(FILES thirdparty.hpp diff --git a/rd-cpp/thirdparty/utf-cpp/LICENSE.md b/rd-cpp/thirdparty/utf-cpp/LICENSE.md new file mode 100644 index 000000000..c807a4214 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Mikhail Pilin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies 
of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/rd-cpp/thirdparty/utf-cpp/README.md b/rd-cpp/thirdparty/utf-cpp/README.md new file mode 100644 index 000000000..bbb2c22eb --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/README.md @@ -0,0 +1,58 @@ +# UTF-8/16/32 C++ library +This is the C++11 template based header only library under Windows/Linux/MacOs to convert UTF-8/16/32 symbols and strings. The library transparently supports `wchar_t` as UTF-16 for Windows and UTF-32 for Linux and MacOs. + +UTF-8 and UTF-32 (UCS-32) both support 31 bit wide code points `[0‥0x7FFFFFFF]` with no restriction. UTF-16 supports only unicode code points `[0‥0x10FFFF]`, where high `[0xD800‥0xDBFF]` and low `[0xDC00‥0xDFFF]` surrogate regions are prohibited. + +The maximum UTF-16 symbol size is 2 words (4 bytes, both words should be in the surrogate region). UTF-32 (UCS-32) is always 1 word (4 bytes). 
UTF-8 has the maximum symbol size (see [conversion table](#utf-8-conversion-table) for details): +- 4 bytes for unicode code points +- 6 bytes for 31bit code points + +###### UTF-16 surrogate decoder: +|High\Low|DC00|DC01|…|DFFF| +|:-:|:-:|:-:|:-:|:-:| +|**D800**|010000|010001|…|0103FF| +|**D801**|010400|010401|…|0107FF| +|**⋮**|⋮|⋮|⋱|⋮| +|**DBFF**|10FC00|10FC01|…|10FFFF| + +![UTF-16 Surrogates](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Utf-16.svg/512px-Utf-16.svg.png) + +## Supported compilers + +Tested on following compilers: +- [Visual Studio 2013 v12.0.40629.00 Update 5](perf/vc120_win.md) +- [Visual Studio 2015 v14.0.25431.01 Update 3](perf/vc140_win.md) +- [Visual Studio 2017 v15.6.7](perf/vc141_win.md) +- [Visual Studio 2019 v16.0.3](perf/vc142_win.md) +- [GNU v5.4.0](perf/gnu_linux.md) +- [Clang v6.0.1](perf/clang_linux.md) +- [Apple Clang v10.0.1](perf/clang_mac.md) + +## Usage example + +```cpp + // यूनिकोड + static char const u8s[] = "\xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1"; + using namespace ww898::utf; + std::u16string u16; + convz, utf16>(u8s, std::back_inserter(u16)); + std::u32string u32; + conv>(u16.begin(), u16.end(), std::back_inserter(u32)); + std::vector u8; + convz(u32.data(), std::back_inserter(u8)); + std::wstring uw; + conv(u8s, u8s + sizeof(u8s), std::back_inserter(uw)); + auto u8r = conv(uw); + auto u16r = conv(u16); + auto uwr = convz(u8s); + + auto u32r = conv(std::string_view(u8r.data(), u8r.size())); // C++17 only + + static_assert(std::is_same, utf_selector>::value, "Fail"); + static_assert( + std::is_same, utf_selector_t>::value != + std::is_same, utf_selector_t>::value, "Fail"); +``` + +## UTF-8 Conversion table +![UTF-8/32 table](https://upload.wikimedia.org/wikipedia/commons/3/38/UTF-8_Encoding_Scheme.png) diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf16.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf16.hpp new file mode 100644 index 
000000000..54332f9c4 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf16.hpp @@ -0,0 +1,109 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include +#include + +namespace ww898 { +namespace utf { + +// 1 0 +// 98765432109876543210 +// |||||||||||||||||||| +// 110110xxxxxxxxxx|||||||||| high surrogate +// 110111xxxxxxxxxx low surrogate +struct utf16 final +{ + static size_t const max_unicode_symbol_size = 2; + static size_t const max_supported_symbol_size = max_unicode_symbol_size; + + static uint32_t const max_supported_code_point = 0x10FFFF; + + using char_type = uint16_t; + + static char_type const min_surrogate = 0xD800; + static char_type const max_surrogate = 0xDFFF; + + static char_type const min_surrogate_high = 0xD800; + static char_type const max_surrogate_high = 0xDBFF; + + static char_type const min_surrogate_low = 0xDC00; + static char_type const max_surrogate_low = 0xDFFF; + + template + static size_t char_size(PeekFn && peek_fn) + { + char_type const ch0 = std::forward(peek_fn)(); + if (ch0 < 0xD800) // [0x0000‥0xD7FF] + return 1; + if (ch0 < 0xDC00) // [0xD800‥0xDBFF] [0xDC00‥0xDFFF] + return 2; + if (ch0 < 0xE000) + throw std::runtime_error("The high utf16 surrogate char is expected"); + // [0xE000‥0xFFFF] + return 1; + } + + template + static uint32_t read(ReadFn && read_fn) + { + char_type const ch0 = read_fn(); + if (ch0 < 0xD800) // [0x0000‥0xD7FF] + return ch0; + if (ch0 < 0xDC00) // [0xD800‥0xDBFF] [0xDC00‥0xDFFF] + { + char_type const ch1 = read_fn(); if (ch1 >> 10 != 0x37) throw std::runtime_error("The low utf16 surrogate char is expected"); + return static_cast((ch0 << 10) + ch1 - 0x35FDC00); + } + if (ch0 < 0xE000) + throw std::runtime_error("The high utf16 surrogate char is expected"); + // [0xE000‥0xFFFF] + return ch0; + } + + template + static void write(uint32_t const cp, WriteFn && write_fn) + { + if (cp < 0xD800) // [0x0000‥0xD7FF] + write_fn(static_cast(cp)); + else if (cp < 0x10000) + { + if (cp < 0xE000) + throw std::runtime_error("The utf16 code point can not be in surrogate range"); + // [0xE000‥0xFFFF] + write_fn(static_cast(cp)); + } + 
else if (cp < 0x110000) // [0xD800‥0xDBFF] [0xDC00‥0xDFFF] + { + write_fn(static_cast(0xD7C0 + (cp >> 10 ))); + write_fn(static_cast(0xDC00 + (cp & 0x3FF))); + } + else + throw std::runtime_error("Too large the utf16 code point"); + } +}; + +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf32.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf32.hpp new file mode 100644 index 000000000..90b11fad7 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf32.hpp @@ -0,0 +1,67 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include +#include + +namespace ww898 { +namespace utf { + +struct utf32 final +{ + static size_t const max_unicode_symbol_size = 1; + static size_t const max_supported_symbol_size = 1; + + static uint32_t const max_supported_code_point = 0x7FFFFFFF; + + using char_type = uint32_t; + + template + static size_t char_size(PeekFn &&) + { + return 1; + } + + template + static uint32_t read(ReadFn && read_fn) + { + char_type const ch = std::forward(read_fn)(); + if (ch < 0x80000000) + return ch; + throw std::runtime_error("Too large utf32 char"); + } + + template + static void write(uint32_t const cp, WriteFn && write_fn) + { + if (cp < 0x80000000) + std::forward(write_fn)(static_cast(cp)); + else + throw std::runtime_error("Too large utf32 code point"); + } +}; + +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf8.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf8.hpp new file mode 100644 index 000000000..7c8c68d03 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utf8.hpp @@ -0,0 +1,158 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include + +namespace ww898 { +namespace utf { + +// Supported combinations: +// 0xxx_xxxx +// 110x_xxxx 10xx_xxxx +// 1110_xxxx 10xx_xxxx 10xx_xxxx +// 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +// 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +// 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx +struct utf8 final +{ + static size_t const max_unicode_symbol_size = 4; + static size_t const max_supported_symbol_size = 6; + + static uint32_t const max_supported_code_point = 0x7FFFFFFF; + + using char_type = uint8_t; + + template + static size_t char_size(PeekFn && peek_fn) + { + char_type const ch0 = std::forward(peek_fn)(); + if (ch0 < 0x80) // 0xxx_xxxx + return 1; + if (ch0 < 0xC0) + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx + return 2; + if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx + return 3; + if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + return 4; + if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + return 5; + if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + return 6; + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + } + + template + static uint32_t read(ReadFn && read_fn) + { + char_type const ch0 = read_fn(); + if (ch0 < 0x80) // 0xxx_xxxx + return ch0; + if (ch0 < 0xC0) + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + if (ch0 < 0xE0) // 110x_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + return (ch0 << 6) + ch1 - 0x3080; + } + if (ch0 < 0xF0) // 1110_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = 
read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + return (ch0 << 12) + (ch1 << 6) + ch2 - 0xE2080; + } + if (ch0 < 0xF8) // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + return (ch0 << 18) + (ch1 << 12) + (ch2 << 6) + ch3 - 0x3C82080; + } + if (ch0 < 0xFC) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; + return (ch0 << 24) + (ch1 << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0xFA082080; + } + if (ch0 < 0xFE) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + char_type const ch1 = read_fn(); if (ch1 >> 6 != 2) goto _err; + char_type const ch2 = read_fn(); if (ch2 >> 6 != 2) goto _err; + char_type const ch3 = read_fn(); if (ch3 >> 6 != 2) goto _err; + char_type const ch4 = read_fn(); if (ch4 >> 6 != 2) goto _err; + char_type const ch5 = read_fn(); if (ch5 >> 6 != 2) goto _err; + return (ch0 << 30) + (ch1 << 24) + (ch2 << 18) + (ch3 << 12) + (ch4 << 6) + ch5 - 0x82082080; + } + throw std::runtime_error("The utf8 first char in sequence is incorrect"); + _err: throw std::runtime_error("The utf8 slave char in sequence is incorrect"); + } + + template + static void write(uint32_t const cp, WriteFn && write_fn) + { + if (cp < 0x80) // 0xxx_xxxx + write_fn(static_cast(cp)); + else if (cp < 0x800) // 110x_xxxx 10xx_xxxx + { + write_fn(static_cast(0xC0 | cp >> 6)); + goto _1; + } + else if (cp < 0x10000) // 1110_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast(0xE0 | cp >> 12)); + goto _2; + } + else if (cp < 0x200000) // 1111_0xxx 10xx_xxxx 
10xx_xxxx 10xx_xxxx + { + write_fn(static_cast(0xF0 | cp >> 18)); + goto _3; + } + else if (cp < 0x4000000) // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast(0xF8 | cp >> 24)); + goto _4; + } + else if (cp < 0x80000000) // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + { + write_fn(static_cast(0xFC | cp >> 30)); + goto _5; + } + else + throw std::runtime_error("Tool large UTF8 code point"); + return; + _5: write_fn(static_cast(0x80 | (cp >> 24 & 0x3F))); + _4: write_fn(static_cast(0x80 | (cp >> 18 & 0x3F))); + _3: write_fn(static_cast(0x80 | (cp >> 12 & 0x3F))); + _2: write_fn(static_cast(0x80 | (cp >> 6 & 0x3F))); + _1: write_fn(static_cast(0x80 | (cp & 0x3F))); + } +}; + +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utfw.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utfw.hpp new file mode 100644 index 000000000..b137d1d5c --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/cp_utfw.hpp @@ -0,0 +1,47 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#if defined(_WIN32) + +#include + +namespace ww898 { +namespace utf { +using utfw = utf16; +}} + +#elif defined(__linux__) || defined(__APPLE__) + +#include + +namespace ww898 { +namespace utf { +using utfw = utf32; +}} + +#else +#error Unsupported platform +#endif diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_config.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_config.hpp new file mode 100644 index 000000000..7b4c6c88a --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_config.hpp @@ -0,0 +1,41 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +// Normally `__cpp_lib_string_view` should be defined in string header +#include + +#if !defined(__cpp_lib_string_view) +#if defined(_MSVC_LANG) +#define __cpp_lib_string_view _MSVC_LANG +#else +#define __cpp_lib_string_view __cplusplus +#endif +#endif + +namespace ww898 { +namespace utf { +static uint32_t const max_unicode_code_point = 0x10FFFF; +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_converters.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_converters.hpp new file mode 100644 index 000000000..06088f64d --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_converters.hpp @@ -0,0 +1,256 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +#if __cpp_lib_string_view >= 201606 +#include +#endif + +namespace ww898 { +namespace utf { + +namespace detail { + +enum struct convz_impl { normal, binary_copy }; + +template< + typename Utf, + typename Outf, + typename It, + typename Oit, + convz_impl> +struct convz_strategy +{ + Oit operator()(It it, Oit oit) const + { + auto const read_fn = [&it] { return *it++; }; + auto const write_fn = [&oit] (typename Outf::char_type const ch) { *oit++ = ch; }; + while (true) + { + auto const cp = Utf::read(read_fn); + if (!cp) + return oit; + Outf::write(cp, write_fn); + } + } +}; + +template< + typename Utf, + typename Outf, + typename It, + typename Oit> +struct convz_strategy +{ + Oit operator()(It it, Oit oit) const + { + while (true) + { + auto const ch = *it++; + if (!ch) + return oit; + *oit++ = ch; + } + } +}; + +} + +template< + typename Utf, + typename Outf, + typename It, + typename Oit> +Oit convz(It && it, Oit && oit) +{ + return detail::convz_strategy::type, + typename std::decay::type, + std::is_same::value + ? 
detail::convz_impl::binary_copy + : detail::convz_impl::normal>()( + std::forward(it), + std::forward(oit)); +} + +namespace detail { + +enum struct conv_impl { normal, random_interator, binary_copy }; + +template< + typename Utf, + typename Outf, + typename It, + typename Oit, + conv_impl> +struct conv_strategy final +{ + Oit operator()(It it, It const eit, Oit oit) const + { + auto const read_fn = [&it, &eit] + { + if (it == eit) + throw std::runtime_error("Not enough input"); + return *it++; + }; + auto const write_fn = [&oit] (typename Outf::char_type const ch) { *oit++ = ch; }; + while (it != eit) + Outf::write(Utf::read(read_fn), write_fn); + return oit; + } +}; + +template< + typename Utf, + typename Outf, + typename It, + typename Oit> +struct conv_strategy final +{ + Oit operator()(It it, It const eit, Oit oit) const + { + auto const write_fn = [&oit] (typename Outf::char_type const ch) { *oit++ = ch; }; + if (eit - it >= static_cast::difference_type>(Utf::max_supported_symbol_size)) + { + auto const fast_read_fn = [&it] { return *it++; }; + auto const fast_eit = eit - Utf::max_supported_symbol_size; + while (it < fast_eit) + Outf::write(Utf::read(fast_read_fn), write_fn); + } + auto const read_fn = [&it, &eit] + { + if (it == eit) + throw std::runtime_error("Not enough input"); + return *it++; + }; + while (it != eit) + Outf::write(Utf::read(read_fn), write_fn); + return oit; + } +}; + +template< + typename Utf, + typename Outf, + typename It, + typename Oit> +struct conv_strategy final +{ + Oit operator()(It it, It const eit, Oit oit) const + { + while (it != eit) + *oit++ = *it++; + return oit; + } +}; + +} + +template< + typename Utf, + typename Outf, + typename It, + typename Eit, + typename Oit> +Oit conv(It && it, Eit && eit, Oit && oit) +{ + return detail::conv_strategy::type, + typename std::decay::type, + std::is_same::value + ? detail::conv_impl::binary_copy + : std::is_base_of::type>::iterator_category>::value + ? 
detail::conv_impl::random_interator + : detail::conv_impl::normal>()( + std::forward(it), + std::forward(eit), + std::forward(oit)); +} + +template< + typename Outf, + typename Ch, + typename Oit> +Oit convz(Ch const * const str, Oit && oit) +{ + return convz, Outf>(str, std::forward(oit)); +} + +template< + typename Och, + typename Str> +std::basic_string convz(Str && str) +{ + std::basic_string res; + convz>(std::forward(str), std::back_inserter(res)); + return res; +} + +template< + typename Outf, + typename Ch, + typename Oit> +Oit conv(std::basic_string const & str, Oit && oit) +{ + return conv, Outf>(str.cbegin(), str.cend(), std::forward(oit)); +} + +#if __cpp_lib_string_view >= 201606 +template< + typename Outf, + typename Ch, + typename Oit> +Oit conv(std::basic_string_view const & str, Oit && oit) +{ + return conv, Outf>(str.cbegin(), str.cend(), std::forward(oit)); +} +#endif + +template< + typename Och, + typename Str, + typename std::enable_if::type, std::basic_string>::value, void *>::type = nullptr> +std::basic_string conv(Str && str) +{ + std::basic_string res; + conv>(std::forward(str), std::back_inserter(res)); + return res; +} + +template< + typename Ch> +std::basic_string conv(std::basic_string str) throw() +{ + return str; +} + +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_selector.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_selector.hpp new file mode 100644 index 000000000..72302cb58 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_selector.hpp @@ -0,0 +1,54 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the 
Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include + +namespace ww898 { +namespace utf { +namespace detail { + +template +struct utf_selector final {}; + +template<> struct utf_selector< char> final { using type = utf8 ; }; +template<> struct utf_selector final { using type = utf8 ; }; +template<> struct utf_selector final { using type = utf8 ; }; +template<> struct utf_selector final { using type = utf16; }; +template<> struct utf_selector final { using type = utf32; }; +template<> struct utf_selector final { using type = utfw ; }; + +} + +template +using utf_selector = detail::utf_selector::type>; + +template +using utf_selector_t = typename utf_selector::type; + +}} diff --git a/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_sizes.hpp b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_sizes.hpp new file mode 100644 index 000000000..a370cc9f0 --- /dev/null +++ b/rd-cpp/thirdparty/utf-cpp/include/ww898/utf_sizes.hpp @@ -0,0 +1,136 @@ +/* + * MIT License + * + * Copyright (c) 2017-2019 Mikhail Pilin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +#if __cpp_lib_string_view >= 201606 +#include +#endif + +namespace ww898 { +namespace utf { + +template< + typename Utf, + typename It> +size_t char_size(It it) +{ + return Utf::char_size([&it] { return *it; }); +} + +template< + typename Utf, + typename It> +size_t size(It it) +{ + size_t total_cp = 0; + while (*it) + { + size_t size = Utf::char_size([&it] { return *it; }); + while (++it, --size > 0) + if (!*it) + throw std::runtime_error("Not enough input for the null-terminated string"); + ++total_cp; + } + return total_cp; +} + +namespace detail { + +enum struct iterator_impl { forward, random_access }; + +template< + typename It, + iterator_impl> +struct next_strategy final +{ + void operator()(It & it, It const & eit, size_t size) + { + while (++it, --size > 0) + if (it == eit) + throw std::runtime_error("Not enough input for the forward iterator"); + } +}; + +template +struct next_strategy final +{ + void operator()(It & it, It const & eit, typename std::iterator_traits::difference_type const size) + { + if (eit - it < size) + throw std::runtime_error("Not enough input for the 
random access iterator"); + it += size; + } +}; + +} + +template< + typename Utf, + typename It, + typename Eit> +size_t size(It it, Eit const eit) +{ + size_t total_cp = 0; + while (it != eit) + { + size_t const size = Utf::char_size([&it] { return *it; }); + detail::next_strategy< + typename std::decay::type, + std::is_base_of::type>::iterator_category>::value + ? detail::iterator_impl::random_access + : detail::iterator_impl::forward>()(it, eit, size); + ++total_cp; + } + return total_cp; +} + +template +size_t size(Ch const * str) +{ + return size>(str); +} + +template +size_t size(std::basic_string str) +{ + return size>(str.cbegin(), str.cend()); +} + +#if __cpp_lib_string_view >= 201606 +template +size_t size(std::basic_string_view str) +{ + return size>(str.cbegin(), str.cend()); +} +#endif + +}}