Upgrade to unicode 16

ycm-core · Oct 10, 2024 · a3dfe56 · a3dfe56
1 parent 9cb5a84
commit a3dfe56
Show file tree

Hide file tree

Showing 8 changed files with 1,104 additions and 302 deletions.
diff --git a/cpp/ycm/Character.cpp b/cpp/ycm/Character.cpp
@@ -31,7 +31,7 @@ bool CodePointCompare( const CodePoint *left, const CodePoint *right ) {
 
 
 // Sort the code points according to the Canonical Ordering Algorithm.
-// See https://www.unicode.org/versions/latest/ch03.pdf#G49591
+// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591
 CodePointSequence CanonicalSort( CodePointSequence code_points ) {
  auto code_point_start = code_points.begin();
  auto code_point_end = code_points.end();
@@ -64,7 +64,7 @@ CodePointSequence CanonicalSort( CodePointSequence code_points ) {
 
 // Decompose a UTF-8 encoded string into a sequence of code points according to
 // Canonical Decomposition. See
-// https://www.unicode.org/versions/latest/ch03.pdf#G733
+// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G733
 CodePointSequence CanonicalDecompose( std::string_view text ) {
  assert( NormalizeInput( text ) == text );
  return CanonicalSort( BreakIntoCodePoints( text ) );
@@ -78,7 +78,7 @@ Character::Character( std::string_view character )
  is_punctuation_( false ),
  is_uppercase_( false ) {
  // Normalize the character through NFD (Normalization Form D). See
- // https://www.unicode.org/versions/latest/ch03.pdf#G49621
+ // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621
  CodePointSequence code_points = CanonicalDecompose( character );
 
  for ( const auto &code_point : code_points ) {

diff --git a/cpp/ycm/Character.h b/cpp/ycm/Character.h
@@ -27,7 +27,7 @@ namespace YouCompleteMe {
 // This class represents a UTF-8 character. It takes a UTF-8 encoded string
 // corresponding to a grapheme cluster (see
 // https://www.unicode.org/glossary/#grapheme_cluster), normalizes it through NFD
-// (see https://www.unicode.org/versions/latest/ch03.pdf#G49621), and
+// (see https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621), and
 // computes the folded and swapped case versions of the normalized character. It
 // also holds some properties, such as whether the character is a letter or
 // punctuation, and whether it is uppercase.

diff --git a/cpp/ycm/CodePoint.h b/cpp/ycm/CodePoint.h
@@ -91,7 +91,7 @@ struct RawCodePoint {
 // - its breaking property: used to split a word into characters.
 // - its combining class: used to sort a sequence of code points according to
 // the Canonical Ordering algorithm (see
-// https://www.unicode.org/versions/latest/ch03.pdf#G49591).
+// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591).
 class CodePoint {
 public:
  YCM_EXPORT explicit CodePoint( std::string_view code_point );

diff --git a/cpp/ycm/UnicodeTable.inc b/cpp/ycm/UnicodeTable.inc
diff --git a/cpp/ycm/tests/CodePoint_test.cpp b/cpp/ycm/tests/CodePoint_test.cpp
@@ -87,16 +87,21 @@ const TextCodePointPair tests[] = {
  GraphemeBreakProperty::EXTEND,
  IndicConjunctBreakProperty::EXTEND } },
  // Bengali vowel sign Aa
- { "া", { "া", "া", "া", false, false, false, GraphemeBreakProperty::EXTEND } },
+ { "া", { "া", "া", "া", false, false, false,
+ GraphemeBreakProperty::EXTEND,
+ IndicConjunctBreakProperty::EXTEND } },
  // Zero-width non-joiner
  { "‌", { "‌", "‌", "‌", false, false, false,
  GraphemeBreakProperty::EXTEND } },
  // Combining cyrillic millions sign
- { "҈", { "҈", "҈", "҈", false, false, false, GraphemeBreakProperty::EXTEND } },
+ { "҈", { "҈", "҈", "҈", false, false, false,
+ GraphemeBreakProperty::EXTEND,
+ IndicConjunctBreakProperty::EXTEND } },
 
  // Zero-width joiner
  { "‍", { "‍", "‍", "‍", false, false, false,
- GraphemeBreakProperty::ZWJ, IndicConjunctBreakProperty::EXTEND } },
+ GraphemeBreakProperty::ZWJ,
+ IndicConjunctBreakProperty::EXTEND } },
 
  // Regional indicator symbol letter b
  { "🇧", { "🇧", "🇧", "🇧", false, false, false,