-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #457 from JetBrains/add_utf_cpp
Fix utf support in to_string
- Loading branch information
Showing
12 changed files
with
956 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2017 Mikhail Pilin | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# UTF-8/16/32 C++ library | ||
This is a C++11 template-based, header-only library for Windows/Linux/macOS that converts UTF-8/16/32 symbols and strings. The library transparently supports `wchar_t` as UTF-16 on Windows and as UTF-32 on Linux and macOS. | ||
|
||
UTF-8 and UTF-32 (UCS-4) both support 31-bit wide code points `[0‥0x7FFFFFFF]` with no restrictions. UTF-16 supports only Unicode code points `[0‥0x10FFFF]`, and the high `[0xD800‥0xDBFF]` and low `[0xDC00‥0xDFFF]` surrogate regions are prohibited as code points. | ||
|
||
The maximum UTF-16 symbol size is 2 words (4 bytes; both words must be in the surrogate region). UTF-32 (UCS-4) is always 1 word (4 bytes). The maximum UTF-8 symbol size is (see [conversion table](#utf-8-conversion-table) for details): | ||
- 4 bytes for unicode code points | ||
- 6 bytes for 31bit code points | ||
|
||
###### UTF-16 surrogate decoder: | ||
|High\Low|DC00|DC01|…|DFFF| | ||
|:-:|:-:|:-:|:-:|:-:| | ||
|**D800**|010000|010001|…|0103FF| | ||
|**D801**|010400|010401|…|0107FF| | ||
|**⋮**|⋮|⋮|⋱|⋮| | ||
|**DBFF**|10FC00|10FC01|…|10FFFF| | ||
|
||
![UTF-16 Surrogates](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Utf-16.svg/512px-Utf-16.svg.png) | ||
|
||
## Supported compilers | ||
|
||
Tested on the following compilers: | ||
- [Visual Studio 2013 v12.0.40629.00 Update 5](perf/vc120_win.md) | ||
- [Visual Studio 2015 v14.0.25431.01 Update 3](perf/vc140_win.md) | ||
- [Visual Studio 2017 v15.6.7](perf/vc141_win.md) | ||
- [Visual Studio 2019 v16.0.3](perf/vc142_win.md) | ||
- [GNU v5.4.0](perf/gnu_linux.md) | ||
- [Clang v6.0.1](perf/clang_linux.md) | ||
- [Apple Clang v10.0.1](perf/clang_mac.md) | ||
|
||
## Usage example | ||
|
||
```cpp | ||
// "यूनिकोड" is the Hindi word for "Unicode"; the literal below spells it as raw UTF-8 bytes. | ||
static char const u8s[] = "\xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1"; | ||
using namespace ww898::utf; | ||
std::u16string u16; | ||
// NOTE(review): convz receives only a start pointer, so it presumably reads up to a zero terminator - confirm against the library. | ||
convz<utf_selector_t<decltype(*u8s)>, utf16>(u8s, std::back_inserter(u16)); | ||
std::u32string u32; | ||
conv<utf16, utf_selector_t<decltype(u32)::value_type>>(u16.begin(), u16.end(), std::back_inserter(u32)); | ||
std::vector<char> u8; | ||
convz<utf32, utf8>(u32.data(), std::back_inserter(u8)); | ||
std::wstring uw; | ||
// sizeof(u8s) counts the array's terminating '\0', so the range passed here includes that terminator. | ||
conv<utf8, utfw>(u8s, u8s + sizeof(u8s), std::back_inserter(uw)); | ||
auto u8r = conv<char>(uw); | ||
auto u16r = conv<char16_t>(u16); | ||
auto uwr = convz<wchar_t>(u8s); | ||
|
||
auto u32r = conv<char32_t>(std::string_view(u8r.data(), u8r.size())); // C++17 only | ||
|
||
// The selector chosen for `char const &` (decltype(*u8s)) must be the same type as the one chosen for plain `char`. | ||
static_assert(std::is_same<utf_selector<decltype(*u8s)>, utf_selector<decltype(u8)::value_type>>::value, "Fail"); | ||
// Exactly one of the two comparisons may hold: wchar_t must select the same encoding as either char16_t or char32_t, never both or neither. | ||
static_assert( | ||
std::is_same<utf_selector_t<decltype(u16)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value != | ||
std::is_same<utf_selector_t<decltype(u32)::value_type>, utf_selector_t<decltype(uw)::value_type>>::value, "Fail"); | ||
``` | ||
## UTF-8 Conversion table | ||
![UTF-8/32 table](https://upload.wikimedia.org/wikipedia/commons/3/38/UTF-8_Encoding_Scheme.png) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
/* | ||
* MIT License | ||
* | ||
* Copyright (c) 2017-2019 Mikhail Pilin | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in all | ||
* copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
* SOFTWARE. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <stdexcept> | ||
|
||
namespace ww898 { | ||
namespace utf { | ||
|
||
// 1 0 | ||
// 98765432109876543210 | ||
// |||||||||||||||||||| | ||
// 110110xxxxxxxxxx|||||||||| high surrogate | ||
// 110111xxxxxxxxxx low surrogate | ||
// UTF-16 encoding traits: measures, decodes and encodes one code point at a
// time through caller-supplied callables, so no iterator or container type is
// assumed. All failures are reported by throwing std::runtime_error.
struct utf16 final
{
    // Largest number of 16-bit code units one code point can occupy.
    static size_t const max_unicode_symbol_size = 2;
    static size_t const max_supported_symbol_size = max_unicode_symbol_size;

    // Highest code point representable as a surrogate pair.
    static uint32_t const max_supported_code_point = 0x10FFFF;

    using char_type = uint16_t;

    // Whole surrogate region; values in it are never valid code points.
    static char_type const min_surrogate = 0xD800;
    static char_type const max_surrogate = 0xDFFF;

    // High (leading) surrogates: first code unit of a pair.
    static char_type const min_surrogate_high = 0xD800;
    static char_type const max_surrogate_high = 0xDBFF;

    // Low (trailing) surrogates: second code unit of a pair.
    static char_type const min_surrogate_low = 0xDC00;
    static char_type const max_surrogate_low = 0xDFFF;

    // Returns how many code units the symbol starting with the peeked unit
    // occupies: 1 outside the surrogate region, 2 for a high surrogate.
    // peek_fn is invoked exactly once and must yield the first code unit.
    // Throws std::runtime_error when that unit is a low surrogate.
    template<typename PeekFn>
    static size_t char_size(PeekFn && peek_fn)
    {
        char_type const ch0 = std::forward<PeekFn>(peek_fn)();
        if (ch0 < min_surrogate || ch0 > max_surrogate) // [0x0000‥0xD7FF] [0xE000‥0xFFFF]
            return 1;
        if (ch0 <= max_surrogate_high)                  // [0xD800‥0xDBFF]
            return 2;
        throw std::runtime_error("The high utf16 surrogate char is expected");
    }

    // Decodes one code point. read_fn yields the next code unit on each call;
    // it is called once for a unit outside the surrogate region and twice for
    // a surrogate pair. A leading low surrogate is rejected before a second
    // unit is requested.
    // Throws std::runtime_error on a leading low surrogate, or when a high
    // surrogate is not followed by a low surrogate.
    template<typename ReadFn>
    static uint32_t read(ReadFn && read_fn)
    {
        // read_fn is deliberately not forwarded: it may be invoked twice.
        char_type const ch0 = read_fn();
        if (ch0 < min_surrogate || ch0 > max_surrogate) // [0x0000‥0xD7FF] [0xE000‥0xFFFF]
            return ch0;
        if (ch0 > max_surrogate_high)                   // [0xDC00‥0xDFFF]
            throw std::runtime_error("The high utf16 surrogate char is expected");

        char_type const ch1 = read_fn();
        if (ch1 < min_surrogate_low || ch1 > max_surrogate_low)
            throw std::runtime_error("The low utf16 surrogate char is expected");

        // Each surrogate carries 10 payload bits; the pair encodes the offset
        // of the code point above the first supplementary code point 0x10000.
        uint32_t const first_supplementary = 0x10000;
        uint32_t const high_bits = static_cast<uint32_t>(ch0) - min_surrogate_high;
        uint32_t const low_bits = static_cast<uint32_t>(ch1) - min_surrogate_low;
        return first_supplementary + (high_bits << 10) + low_bits;
    }

    // Encodes code point cp, passing each resulting code unit to write_fn
    // (one call for [0‥0xFFFF], two calls for [0x10000‥0x10FFFF]).
    // Throws std::runtime_error, without calling write_fn, when cp lies in the
    // surrogate region or exceeds max_supported_code_point.
    template<typename WriteFn>
    static void write(uint32_t const cp, WriteFn && write_fn)
    {
        uint32_t const first_supplementary = 0x10000;

        if (cp > max_supported_code_point)
            throw std::runtime_error("Too large the utf16 code point");

        if (cp >= first_supplementary) // [0x10000‥0x10FFFF] -> surrogate pair
        {
            // write_fn is deliberately not forwarded: it is invoked twice.
            uint32_t const offset = cp - first_supplementary;
            write_fn(static_cast<char_type>(min_surrogate_high + (offset >> 10)));
            write_fn(static_cast<char_type>(min_surrogate_low + (offset & 0x3FF)));
            return;
        }

        if (cp >= min_surrogate && cp <= max_surrogate)
            throw std::runtime_error("The utf16 code point can not be in surrogate range");

        // [0x0000‥0xD7FF] [0xE000‥0xFFFF]
        write_fn(static_cast<char_type>(cp));
    }
};
|
||
}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* MIT License | ||
* | ||
* Copyright (c) 2017-2019 Mikhail Pilin | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in all | ||
* copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
* SOFTWARE. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <stdexcept> | ||
|
||
namespace ww898 { | ||
namespace utf { | ||
|
||
// UTF-32 encoding traits: every symbol is exactly one 32-bit code unit, and
// only the 31-bit range [0‥0x7FFFFFFF] is accepted in either direction.
struct utf32 final
{
    static size_t const max_unicode_symbol_size = 1;
    static size_t const max_supported_symbol_size = 1;

    static uint32_t const max_supported_code_point = 0x7FFFFFFF;

    using char_type = uint32_t;

    // A symbol always occupies one code unit; the peek callable is never invoked.
    template<typename PeekFn>
    static size_t char_size(PeekFn &&)
    {
        return 1;
    }

    // Obtains one code unit by invoking read_fn once and returns it as the
    // code point. Throws std::runtime_error if the unit has bit 31 set.
    template<typename ReadFn>
    static uint32_t read(ReadFn && read_fn)
    {
        char_type const unit = std::forward<ReadFn>(read_fn)();
        if (unit >= 0x80000000)
            throw std::runtime_error("Too large utf32 char");
        return unit;
    }

    // Hands cp to write_fn as a single code unit. Throws std::runtime_error,
    // without calling write_fn, if cp has bit 31 set.
    template<typename WriteFn>
    static void write(uint32_t const cp, WriteFn && write_fn)
    {
        if (cp >= 0x80000000)
            throw std::runtime_error("Too large utf32 code point");
        std::forward<WriteFn>(write_fn)(static_cast<char_type>(cp));
    }
};
|
||
}} |
Oops, something went wrong.