Bugfix: infinite loop in term.wrap() with a width of 1

From #273: > The following code enters an infinite loop: `import blessed; blessed.Terminal().wrap('\u5973', 1)`. This is fixed with an explicit test: when the given individual sequence has a length of 2, the width is 1, and cur_len is 0, this "wide" character cannot be broken down any further, so it is allowed to flow outside the given cell. Also: 'faulthandler_timeout = 30' is added to [pytest] in tox.ini; tests for East-Asian, Emoji, and ZWJ are added; and it is further noted that blessed gets ZWJ wrong.
jquast · Jun 26, 2024 · 593c286 · 593c286
1 parent c28b53f
commit 593c286
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 9 deletions.
diff --git a/blessed/sequences.py b/blessed/sequences.py
@@ -184,11 +184,11 @@ def _wrap_chunks(self, chunks):
             while chunks:
                 chunk_len = Sequence(chunks[-1], term).length()
                 if cur_len + chunk_len > width:
+                    if chunk_len > width:
+                        self._handle_long_word(chunks, cur_line, cur_len, width)
                     break
                 cur_line.append(chunks.pop())
                 cur_len += chunk_len
-            if chunks and Sequence(chunks[-1], term).length() > width:
-                self._handle_long_word(chunks, cur_line, cur_len, width)
             if drop_whitespace and (
                     cur_line and Sequence(cur_line[-1], term).strip() == ''):
                 del cur_line[-1]
@@ -200,10 +200,18 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
         """
         Sequence-aware :meth:`textwrap.TextWrapper._handle_long_word`.
 
-        This simply ensures that word boundaries are not broken mid-sequence, as standard python
-        textwrap would incorrectly determine the length of a string containing sequences, and may
-        also break consider sequences part of a "word" that may be broken by hyphen (``-``), where
-        this implementation corrects both.
+        This method ensures that word boundaries are not broken mid-sequence.
+        Standard python textwrap would incorrectly determine the length of a
+        string containing sequences and wide characters, and it would also
+        consider sequences part of a "word" that may be broken by hyphen
+        (``-``); this implementation corrects both.
+
+        This is done by mutating the passed arguments, removing items from
+        'reversed_chunks' and appending them to 'cur_line'.
+
+        However, some characters (east-asian, emoji, etc.) cannot be split any
+        less than 2 cells, so in the case of a width of 1, we have no choice
+        but to allow those characters to flow outside of the given cell.
         """
         # Figure out when indent is larger than the specified width, and make
         # sure at least one character is stripped off on every pass
@@ -217,8 +225,14 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
             idx = nxt = 0
             for text, _ in iter_parse(term, chunk):
                 nxt += len(text)
-                if Sequence(chunk[:nxt], term).length() > space_left:
-                    break
+                seq_length = Sequence(chunk[:nxt], term).length()
+                if seq_length > space_left:
+                    if cur_len == 0 and width == 1 and seq_length == 2:
+                        # Emoji etc. cannot be split under 2 cells, so in the case of a width of 1, we have no choice
+                        # but to allow those characters to flow outside of the given cell.
+                        pass
+                    else:
+                        break
                 idx = nxt
             cur_line.append(chunk[:idx])
             reversed_chunks[-1] = chunk[idx:]

diff --git a/tests/test_length_sequence.py b/tests/test_length_sequence.py
@@ -37,6 +37,22 @@ def child():
     child()
 
 
+def test_length_with_zwj_is_wrong():
+    """Because of the way Zero-Width Joiner (ZWJ) is measured, blessed gets this wrong"""
+    # But for the time being, so do many terminals (~85%), so it's not a huge deal.
+    # https://ucs-detect.readthedocs.io/results.html
+    @as_subprocess
+    def child():
+        term = TestTerminal()
+        # RGI_Emoji_ZWJ_Sequence  ; family: woman, woman, girl, boy
+        given = term.bold_red(u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466')
+        expected = sum((2, 0, 2, 0, 2, 0, 2))
+
+        # exercise,
+        assert term.length(given) == expected
+
+
+
 def test_length_ansiart():
     """Test length of ANSI art"""
     @as_subprocess
@@ -60,7 +76,7 @@ def child(kind):
 
 
 def test_sequence_length(all_terms):
-    """Ensure T.length(string containing sequence) is correcterm."""
+    """Ensure T.length(string containing sequence) is correct."""
     # pylint: disable=too-complex,too-many-statements
     @as_subprocess
     def child(kind):

diff --git a/tests/test_wrap.py b/tests/test_wrap.py
@@ -113,3 +113,19 @@ def child():
         assert expected == result
 
     child()
+
+def test_east_asian_emojis_width_1():
+    """Tests edge-case of east-asian and emoji characters split into single columns."""
+    @as_subprocess
+    def child():
+        term = TestTerminal()
+        # by @grayjk from https://github.com/jquast/blessed/issues/273
+        result = term.wrap('\u5973', 1)
+        assert result == ['\u5973']
+
+        # much like test_length_with_zwj_is_wrong(), blessed gets ZWJ wrong when wrapping, also.
+        # RGI_Emoji_ZWJ_Sequence  ; family: woman, woman, girl, boy
+        result = term.wrap(u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466', 1)
+        assert result == ['\U0001F469\u200D', '\U0001F469\u200D', '\U0001F467\u200D', '\U0001F466']
+
+    child()
diff --git a/tox.ini b/tox.ini
@@ -275,6 +275,8 @@ addopts =
     --ignore=setup.py
     --ignore=.tox
     --junit-xml=.tox/results.{envname}.xml
+# if any test takes over 30 seconds, dump traceback
+faulthandler_timeout = 30
 filterwarnings = error
 junit_family = xunit1
 log_format=%(levelname)s %(relativeCreated)2.2f %(filename)s:%(lineno)d %(message)s