Skip to content

Commit

Permalink
gccrs: Implement quick-check for Unicode
Browse files Browse the repository at this point in the history
gcc/rust/ChangeLog:

	* rust-lang.cc (run_rust_tests): Add test.
	* rust-system.h: Add <algorithm>.
	* util/make-rust-unicode.py: Output NFC_Quick_Check table.
	* util/rust-codepoint.h (struct Codepoint): Add is_supplementary
	method.
	* util/rust-unicode-data.h: Generated.
	* util/rust-unicode.cc (binary_search_sorted_array): Removed.
	(lookup_cc): Remove namespace.
	(is_alphabetic): Use std::binary_search
	(nfc_quick_check): New function.
	(nfc_normalize): Use nfc_quick_check.
	(is_nfc_qc_maybe): New function.
	(is_nfc_qc_no): New function.
	(rust_nfc_qc_test): New test.
	* util/rust-unicode.h (is_nfc_qc_no): New function.
	(is_nfc_qc_maybe): New function.
	(enum class): New enum class.
	(nfc_quick_check): New function.
	(rust_nfc_qc_test): New test.

Signed-off-by: Raiki Tamura <[email protected]>
  • Loading branch information
tamaroning authored and P-E-P committed Jan 29, 2024
1 parent bef4967 commit d6e10d2
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 82 deletions.
1 change: 1 addition & 0 deletions gcc/rust/rust-lang.cc
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,7 @@ run_rust_tests ()
{
// Call tests for the rust frontend here
rust_input_source_test ();
rust_nfc_qc_test ();
rust_utf8_normalize_test ();
rust_punycode_encode_test ();
rust_cfg_parser_test ();
Expand Down
1 change: 1 addition & 0 deletions gcc/rust/rust-system.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include <utility>
#include <fstream>
#include <array>
#include <algorithm>

// Rust frontend requires C++11 minimum, so will have unordered_map and set
#include <unordered_map>
Expand Down
39 changes: 31 additions & 8 deletions gcc/rust/util/make-rust-unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,30 @@ def write_numeric() -> None:
print("}};")


def write_nfc_qc():
print(
"const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_NO_RANGES = {{{{".format(
len(nfc_qc_no_ranges)
)
)
print(" // clang-format off")
for r in nfc_qc_no_ranges:
print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
print(" // clang-format on")
print("}};")

print(
"const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_MAYBE_RANGES = {{{{".format(
len(nfc_qc_maybe_ranges)
)
)
print(" // clang-format off")
for r in nfc_qc_maybe_ranges:
print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
print(" // clang-format on")
print("}};")


def main() -> None:
if len(sys.argv) != 4:
print("too few arguments", file=sys.stderr)
Expand All @@ -265,13 +289,12 @@ def main() -> None:
print(COPYRIGHT)
print()

print('#include "rust-system.h"')
print()
print("namespace Rust {")
print()
print('#include "rust-system.h"\n')
print("namespace Rust {\n")
print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
print()
print(
"const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
)

write_decomposition()
print()
Expand All @@ -283,8 +306,8 @@ def main() -> None:
print()
write_numeric()
print()

# TODO: write NFC_QC table
write_nfc_qc()
print()

print("} // namespace Rust")

Expand Down
1 change: 1 addition & 0 deletions gcc/rust/util/rust-codepoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct Codepoint
static Codepoint eof () { return Codepoint (UINT32_MAX); }
bool is_eof () const { return value == UINT32_MAX; }
bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; }
bool is_supplementary_character () const { return value > 0xFFFF; }

// Returns a C++ string containing string value of codepoint.
std::string as_string ();
Expand Down
158 changes: 154 additions & 4 deletions gcc/rust/util/rust-unicode-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

namespace Rust {

const uint32_t NUM_ALPHABETIC_RANGES = 1117;
const uint32_t NUM_ALPHABETIC_RANGES = 1141;
const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;

const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
Expand Down Expand Up @@ -4167,6 +4167,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0bd7, 0x0bd8},
{0x0c00, 0x0c01},
{0x0c01, 0x0c04},
{0x0c04, 0x0c05},
{0x0c05, 0x0c0d},
{0x0c0e, 0x0c11},
{0x0c12, 0x0c29},
Expand Down Expand Up @@ -4202,6 +4203,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0ce0, 0x0ce2},
{0x0ce2, 0x0ce4},
{0x0cf1, 0x0cf3},
{0x0cf3, 0x0cf4},
{0x0d00, 0x0d02},
{0x0d02, 0x0d04},
{0x0d04, 0x0d0d},
Expand Down Expand Up @@ -4257,7 +4259,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0f49, 0x0f6d},
{0x0f71, 0x0f7f},
{0x0f7f, 0x0f80},
{0x0f80, 0x0f82},
{0x0f80, 0x0f84},
{0x0f88, 0x0f8d},
{0x0f8d, 0x0f98},
{0x0f99, 0x0fbd},
Expand Down Expand Up @@ -4758,6 +4760,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11071, 0x11073},
{0x11073, 0x11075},
{0x11075, 0x11076},
{0x11080, 0x11082},
{0x11082, 0x11083},
{0x11083, 0x110b0},
{0x110b0, 0x110b3},
Expand Down Expand Up @@ -4794,6 +4797,8 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11234, 0x11235},
{0x11237, 0x11238},
{0x1123e, 0x1123f},
{0x1123f, 0x11241},
{0x11241, 0x11242},
{0x11280, 0x11287},
{0x11288, 0x11289},
{0x1128a, 0x1128e},
Expand Down Expand Up @@ -4948,12 +4953,22 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11ee0, 0x11ef3},
{0x11ef3, 0x11ef5},
{0x11ef5, 0x11ef7},
{0x11f00, 0x11f02},
{0x11f02, 0x11f03},
{0x11f03, 0x11f04},
{0x11f04, 0x11f11},
{0x11f12, 0x11f34},
{0x11f34, 0x11f36},
{0x11f36, 0x11f3b},
{0x11f3e, 0x11f40},
{0x11f40, 0x11f41},
{0x11fb0, 0x11fb1},
{0x12000, 0x1239a},
{0x12400, 0x1246f},
{0x12480, 0x12544},
{0x12f90, 0x12ff1},
{0x13000, 0x1342f},
{0x13000, 0x13430},
{0x13441, 0x13447},
{0x14400, 0x14647},
{0x16800, 0x16a39},
{0x16a40, 0x16a5f},
Expand All @@ -4980,7 +4995,9 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1aff5, 0x1affc},
{0x1affd, 0x1afff},
{0x1b000, 0x1b123},
{0x1b132, 0x1b133},
{0x1b150, 0x1b153},
{0x1b155, 0x1b156},
{0x1b164, 0x1b168},
{0x1b170, 0x1b2fc},
{0x1bc00, 0x1bc6b},
Expand Down Expand Up @@ -5021,16 +5038,21 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1df00, 0x1df0a},
{0x1df0a, 0x1df0b},
{0x1df0b, 0x1df1f},
{0x1df25, 0x1df2b},
{0x1e000, 0x1e007},
{0x1e008, 0x1e019},
{0x1e01b, 0x1e022},
{0x1e023, 0x1e025},
{0x1e026, 0x1e02b},
{0x1e030, 0x1e06e},
{0x1e08f, 0x1e090},
{0x1e100, 0x1e12d},
{0x1e137, 0x1e13e},
{0x1e14e, 0x1e14f},
{0x1e290, 0x1e2ae},
{0x1e2c0, 0x1e2ec},
{0x1e4d0, 0x1e4eb},
{0x1e4eb, 0x1e4ec},
{0x1e7e0, 0x1e7e7},
{0x1e7e8, 0x1e7ec},
{0x1e7ed, 0x1e7ef},
Expand Down Expand Up @@ -5076,12 +5098,14 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1f150, 0x1f16a},
{0x1f170, 0x1f18a},
{0x20000, 0x2a6e0},
{0x2a700, 0x2b739},
{0x2a700, 0x2b73a},
{0x2b740, 0x2b81e},
{0x2b820, 0x2cea2},
{0x2ceb0, 0x2ebe1},
{0x2ebf0, 0x2ee5e},
{0x2f800, 0x2fa1e},
{0x30000, 0x3134b},
{0x31350, 0x323b0},
// clang-format on
}};

Expand Down Expand Up @@ -5205,4 +5229,130 @@ const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
// clang-format on
}};

const std::array<std::pair<uint32_t, uint32_t>, 74> NFC_QC_NO_RANGES = {{
// clang-format off
{0x0340, 0x0342},
{0x0343, 0x0345},
{0x0374, 0x0375},
{0x037e, 0x037f},
{0x0387, 0x0388},
{0x0958, 0x0960},
{0x09dc, 0x09de},
{0x09df, 0x09e0},
{0x0a33, 0x0a34},
{0x0a36, 0x0a37},
{0x0a59, 0x0a5c},
{0x0a5e, 0x0a5f},
{0x0b5c, 0x0b5e},
{0x0f43, 0x0f44},
{0x0f4d, 0x0f4e},
{0x0f52, 0x0f53},
{0x0f57, 0x0f58},
{0x0f5c, 0x0f5d},
{0x0f69, 0x0f6a},
{0x0f73, 0x0f74},
{0x0f75, 0x0f77},
{0x0f78, 0x0f79},
{0x0f81, 0x0f82},
{0x0f93, 0x0f94},
{0x0f9d, 0x0f9e},
{0x0fa2, 0x0fa3},
{0x0fa7, 0x0fa8},
{0x0fac, 0x0fad},
{0x0fb9, 0x0fba},
{0x1f71, 0x1f72},
{0x1f73, 0x1f74},
{0x1f75, 0x1f76},
{0x1f77, 0x1f78},
{0x1f79, 0x1f7a},
{0x1f7b, 0x1f7c},
{0x1f7d, 0x1f7e},
{0x1fbb, 0x1fbc},
{0x1fbe, 0x1fbf},
{0x1fc9, 0x1fca},
{0x1fcb, 0x1fcc},
{0x1fd3, 0x1fd4},
{0x1fdb, 0x1fdc},
{0x1fe3, 0x1fe4},
{0x1feb, 0x1fec},
{0x1fee, 0x1ff0},
{0x1ff9, 0x1ffa},
{0x1ffb, 0x1ffc},
{0x1ffd, 0x1ffe},
{0x2000, 0x2002},
{0x2126, 0x2127},
{0x212a, 0x212c},
{0x2329, 0x232a},
{0x232a, 0x232b},
{0x2adc, 0x2add},
{0xf900, 0xfa0e},
{0xfa10, 0xfa11},
{0xfa12, 0xfa13},
{0xfa15, 0xfa1f},
{0xfa20, 0xfa21},
{0xfa22, 0xfa23},
{0xfa25, 0xfa27},
{0xfa2a, 0xfa6e},
{0xfa70, 0xfada},
{0xfb1d, 0xfb1e},
{0xfb1f, 0xfb20},
{0xfb2a, 0xfb37},
{0xfb38, 0xfb3d},
{0xfb3e, 0xfb3f},
{0xfb40, 0xfb42},
{0xfb43, 0xfb45},
{0xfb46, 0xfb4f},
{0x1d15e, 0x1d165},
{0x1d1bb, 0x1d1c1},
{0x2f800, 0x2fa1e},
// clang-format on
}};
const std::array<std::pair<uint32_t, uint32_t>, 43> NFC_QC_MAYBE_RANGES = {{
// clang-format off
{0x0300, 0x0305},
{0x0306, 0x030d},
{0x030f, 0x0310},
{0x0311, 0x0312},
{0x0313, 0x0315},
{0x031b, 0x031c},
{0x0323, 0x0329},
{0x032d, 0x032f},
{0x0330, 0x0332},
{0x0338, 0x0339},
{0x0342, 0x0343},
{0x0345, 0x0346},
{0x0653, 0x0656},
{0x093c, 0x093d},
{0x09be, 0x09bf},
{0x09d7, 0x09d8},
{0x0b3e, 0x0b3f},
{0x0b56, 0x0b57},
{0x0b57, 0x0b58},
{0x0bbe, 0x0bbf},
{0x0bd7, 0x0bd8},
{0x0c56, 0x0c57},
{0x0cc2, 0x0cc3},
{0x0cd5, 0x0cd7},
{0x0d3e, 0x0d3f},
{0x0d57, 0x0d58},
{0x0dca, 0x0dcb},
{0x0dcf, 0x0dd0},
{0x0ddf, 0x0de0},
{0x102e, 0x102f},
{0x1161, 0x1176},
{0x11a8, 0x11c3},
{0x1b35, 0x1b36},
{0x3099, 0x309b},
{0x110ba, 0x110bb},
{0x11127, 0x11128},
{0x1133e, 0x1133f},
{0x11357, 0x11358},
{0x114b0, 0x114b1},
{0x114ba, 0x114bb},
{0x114bd, 0x114be},
{0x115af, 0x115b0},
{0x11930, 0x11931},
// clang-format on
}};

} // namespace Rust
Loading

0 comments on commit d6e10d2

Please sign in to comment.