Implement grapheme clusters (microsoft#16916)

First, this adds `GraphemeTableGen` which
* parses `ucd.nounihan.grouped.xml`
* computes the cluster break property for each codepoint
* computes the East Asian Width property for each codepoint
* compresses everything into a 4-stage trie
* computes a LUT of cluster break rules between 2 codepoints
* and serializes everything to C++ tables and helper functions

Next, this adds `GraphemeTestTableGen` which
* parses `GraphemeBreakTest.txt`
* splits each test into graphemes and break opportunities
* and serializes everything to a C++ table for use as unit tests

`CodepointWidthDetector.cpp` was rewritten from scratch to
* use an iterator struct (`GraphemeState`) to maintain state
* accumulate codepoints until a break opportunity arises
* accumulate the total width of a grapheme
* support 3 different measurement modes: Grapheme clusters, `wcswidth`-style, and a mode identical to the old conhost

With this in place the following changes were made:
* `ROW::WriteHelper::_replaceTextUnicode` now uses the new grapheme cluster text iterators
* The same function was modified to join new text with existing contents of the current cell if they join to form a cluster
* Otherwise, a ton of places were modified to funnel the selection of the measurement mode over from WT's settings to ConPTY

This is part of microsoft#1472

## Validation Steps Performed
* So many tests ✅
* https://github.com/apparebit/demicode works fantastic ✅
* UTF8-torture-test.txt works fantastic ✅
l3dlp-sandbox · Jun 26, 2024 · cb48bab · cb48bab
1 parent 174dcb9
commit cb48bab
Show file tree

Hide file tree

Showing 54 changed files with 3,794 additions and 727 deletions.
diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt
@@ -146,6 +146,7 @@ bytebuffer
 cac
 cacafire
 CALLCONV
+CANDRABINDU
 capslock
 CARETBLINKINGENABLED
 CARRIAGERETURN
@@ -156,6 +157,7 @@ CBash
 cbiex
 CBN
 cbt
+Ccc
 CCCBB
 cch
 CCHAR
@@ -293,7 +295,6 @@ CREATESTRUCT
 CREATESTRUCTW
 createvpack
 crisman
-CRLFs
 crloew
 CRTLIBS
 csbi
@@ -594,6 +595,7 @@ fesb
 FFAF
 ffd
 FFDE
+FFFD
 FFFDb
 fgbg
 FGCOLOR
@@ -614,6 +616,7 @@ FINDREGEX
 FINDSTRINGEXACT
 FINDUP
 FIter
+FITZPATRICK
 FIXEDFILEINFO
 Flg
 flyouts
@@ -882,10 +885,12 @@ jconcpp
 JLO
 JOBOBJECT
 JOBOBJECTINFOCLASS
+JONGSEONG
 JPN
 jsoncpp
 jsprovider
 jumplist
+JUNGSEONG
 KAttrs
 kawa
 Kazu
@@ -904,6 +909,7 @@ keyups
 KILLACTIVE
 KILLFOCUS
 kinda
+KIYEOK
 KLF
 KLMNO
 KLMNOPQRST
@@ -1013,6 +1019,7 @@ luma
 lval
 LVB
 LVERTICAL
+LVT
 LWA
 LWIN
 lwkmvj
@@ -1209,6 +1216,7 @@ ntuser
 NTVDM
 ntverp
 nugetversions
+NUKTA
 nullness
 nullonfailure
 nullopts
@@ -1471,7 +1479,6 @@ READMODE
 rectread
 redef
 redefinable
-Redir
 redist
 REDSCROLL
 REFCLSID
@@ -1489,6 +1496,7 @@ renderengine
 rendersize
 reparented
 reparenting
+REPH
 replatformed
 Replymessage
 reportfileaccesses
@@ -1519,6 +1527,7 @@ rgw
 RIGHTALIGN
 RIGHTBUTTON
 riid
+ris
 RIS
 roadmap
 robomac
@@ -1924,6 +1933,7 @@ vga
 vgaoem
 viewkind
 viewports
+VIRAMA
 Virt
 VIRTTERM
 vkey
@@ -1974,8 +1984,8 @@ wchars
 WCIA
 WCIW
 WCSHELPER
-wcsicmp
 wcsrev
+wcswidth
 wddm
 wddmcon
 WDDMCONSOLECONTEXT
@@ -2131,6 +2141,7 @@ XFORM
 XIn
 XManifest
 XMath
+XNamespace
 xorg
 XPan
 XResource
@@ -2162,6 +2173,7 @@ Zabcdefghijklmn
 Zabcdefghijklmnopqrstuvwxyz
 ZCmd
 ZCtrl
+ZWJs
 zxcvbnm
 ZYXWVU
 ZYXWVUTd
diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp
@@ -5,10 +5,8 @@
 #include "Row.hpp"
 
 #include <isa_availability.h>
-#include <til/unicode.h>
 
-#include "textBuffer.hpp"
-#include "../../types/inc/GlyphWidth.hpp"
+#include "../../types/inc/CodepointWidthDetector.hpp"
 
 // It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
 // performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
@@ -568,6 +566,7 @@ void ROW::ReplaceAttributes(const til::CoordType beginIndex, const til::CoordTyp
 void ROW::ReplaceCharacters(til::CoordType columnBegin, til::CoordType width, const std::wstring_view& chars)
 try
 {
+    assert(width >= 1 && width <= 2);
     WriteHelper h{ *this, columnBegin, _columnCount, chars };
     if (!h.IsValid())
     {
@@ -666,56 +665,91 @@ catch (...)
 
 [[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
 {
-    const auto end = chars.end();
+    auto& cwd = CodepointWidthDetector::Singleton();
 
-    while (it != end)
+    // Check if the new text joins with the existing contents of the row to form a single grapheme cluster.
+    if (it == chars.begin())
     {
-        unsigned int width = 1;
-        auto ptr = &*it;
-        const auto wch = *ptr;
-        size_t advance = 1;
+        auto colPrev = colBeg;
+        while (colPrev > 0 && row._uncheckedIsTrailer(--colPrev))
+        {
+        }
 
-        ++it;
+        const auto chPrev = row._uncheckedCharOffset(colPrev);
+        const std::wstring_view charsPrev{ row._chars.data() + chPrev, ch - chPrev };
 
-        // Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
-        // It also allows us to skip the surrogate pair decoding at the same time.
-        if (wch >= 0x80)
+        GraphemeState state;
+        cwd.GraphemeNext(state, charsPrev);
+        cwd.GraphemeNext(state, chars);
+
+        if (state.len > 0)
         {
-            if (til::is_surrogate(wch))
+            colBegDirty = colPrev;
+            colEnd = colPrev;
+
+            const auto width = std::max(1, state.width);
+            const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
+            if (colEndNew > colLimit)
             {
-                if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
-                {
-                    advance = 2;
-                    ++it;
-                }
-                else
-                {
-                    ptr = &UNICODE_REPLACEMENT;
-                }
+                colEndDirty = colLimit;
+                charsConsumed = ch - chBeg;
+                return;
             }
 
-            width = IsGlyphFullWidth({ ptr, advance }) + 1u;
-        }
+            // Fill our char-offset buffer with 1 entry containing the mapping from the
+            // current column (colEnd) to the start of the glyph in the string (ch)...
+            til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev);
+            // ...followed by 0-N entries containing an indication that the
+            // columns are just a wide-glyph extension of the preceding one.
+            while (colEnd < colEndNew)
+            {
+                til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev | CharOffsetsTrailer);
+            }
 
-        const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
-        if (colEndNew > colLimit)
-        {
-            colEndDirty = colLimit;
-            charsConsumed = ch - chBeg;
-            return;
+            ch += state.len;
+            it += state.len;
         }
+    }
+    else
+    {
+        // The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
+        // In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
+        // and let GraphemeNext() find the next proper grapheme boundary.
+        --colEnd;
+        --ch;
+        --it;
+    }
+
+    if (const auto end = chars.end(); it != end)
+    {
+        GraphemeState state{ .beg = &*it };
 
-        // Fill our char-offset buffer with 1 entry containing the mapping from the
-        // current column (colEnd) to the start of the glyph in the string (ch)...
-        til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
-        // ...followed by 0-N entries containing an indication that the
-        // columns are just a wide-glyph extension of the preceding one.
-        while (colEnd < colEndNew)
+        do
         {
-            til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
-        }
+            cwd.GraphemeNext(state, chars);
+
+            const auto width = std::max(1, state.width);
+            const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
+            if (colEndNew > colLimit)
+            {
+                colEndDirty = colLimit;
+                charsConsumed = ch - chBeg;
+                return;
+            }
+
+            // Fill our char-offset buffer with 1 entry containing the mapping from the
+            // current column (colEnd) to the start of the glyph in the string (ch)...
+            til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
+            // ...followed by 0-N entries containing an indication that the
+            // columns are just a wide-glyph extension of the preceding one.
+            while (colEnd < colEndNew)
+            {
+                til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
+            }
 
-        ch += advance;
+            ch += state.len;
+            it += state.len;
+        } while (it != end);
     }
 
     colEndDirty = colEnd;
@@ -1058,7 +1092,7 @@ std::wstring_view ROW::GetText() const noexcept
 
 std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
 {
-    const til::CoordType columns = _columnCount;
+    const auto columns = GetReadableColumnCount();
     const auto colBeg = clamp(columnBegin, 0, columns);
     const auto colEnd = clamp(columnEnd, colBeg, columns);
     const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));