From 593c286b52537f519dfe3b7ff4cfcde4b72e6bf7 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 26 Jun 2024 11:59:16 -0400 Subject: [PATCH] Bugix infinite loop in term.wrap() with 1 length From https://github.com/jquast/blessed/issues/273, > The following code enters an infinite loop: import blessed blessed.Terminal().wrap('\u5973', 1) This fixes by explicit test: when the given individual sequence is of length '2', and the width is '1', and the cur_len is '0', we cannot break down this "Wide" character any further -- so it is allowed to flow outside the given cell. - 'faulthandler_timeout = 30' is added to [pytest] in tox.ini, - Tests for East-Asian, Emoji, and ZWJ are added - Further noting that blessed gets ZWJ wrong --- blessed/sequences.py | 30 ++++++++++++++++++++++-------- tests/test_length_sequence.py | 18 +++++++++++++++++- tests/test_wrap.py | 16 ++++++++++++++++ tox.ini | 2 ++ 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/blessed/sequences.py b/blessed/sequences.py index 486294cf..fca03bea 100644 --- a/blessed/sequences.py +++ b/blessed/sequences.py @@ -184,11 +184,11 @@ def _wrap_chunks(self, chunks): while chunks: chunk_len = Sequence(chunks[-1], term).length() if cur_len + chunk_len > width: + if chunk_len > width: + self._handle_long_word(chunks, cur_line, cur_len, width) break cur_line.append(chunks.pop()) cur_len += chunk_len - if chunks and Sequence(chunks[-1], term).length() > width: - self._handle_long_word(chunks, cur_line, cur_len, width) if drop_whitespace and ( cur_line and Sequence(cur_line[-1], term).strip() == ''): del cur_line[-1] @@ -200,10 +200,18 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): """ Sequence-aware :meth:`textwrap.TextWrapper._handle_long_word`. - This simply ensures that word boundaries are not broken mid-sequence, as standard python - textwrap would incorrectly determine the length of a string containing sequences, and may - also break consider sequences part of a "word" that may be broken by hyphen (``-``), where - this implementation corrects both. + This method ensures that word boundaries are not broken mid-sequence, as + standard python textwrap would incorrectly determine the length of a + string containing sequences and wide characters it would also break + these "words" that would be broken by hyphen (``-``), this + implementation corrects both. + + This is done by mutating the passed arguments, removing items from + 'reversed_chunks' and appending them to 'cur_line'. + + However, some characters (east-asian, emoji, etc.) cannot be split any + less than 2 cells, so in the case of a width of 1, we have no choice + but to allow those characters to flow outside of the given cell. """ # Figure out when indent is larger than the specified width, and make # sure at least one character is stripped off on every pass @@ -217,8 +225,14 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): idx = nxt = 0 for text, _ in iter_parse(term, chunk): nxt += len(text) - if Sequence(chunk[:nxt], term).length() > space_left: - break + seq_length = Sequence(chunk[:nxt], term).length() + if seq_length > space_left: + if cur_len == 0 and width == 1 and seq_length == 2: + # Emoji etc. cannot be split under 2 cells, so in the case of a width of 1, we have no choice + # but to allow those characters to flow outside of the given cell. + pass + else: + break idx = nxt cur_line.append(chunk[:idx]) reversed_chunks[-1] = chunk[idx:] diff --git a/tests/test_length_sequence.py b/tests/test_length_sequence.py index 03db8505..b65a1af8 100644 --- a/tests/test_length_sequence.py +++ b/tests/test_length_sequence.py @@ -37,6 +37,22 @@ def child(): child() +def test_length_with_zwj_is_wrong(): + """Because of the way Zero-Width Joiner (ZWJ) is measured, blessed gets this wrong""" + # But for the time being, so do many terminals (~85%), so its not a huge deal.. + # https://ucs-detect.readthedocs.io/results.html + @as_subprocess + def child(): + term = TestTerminal() + # RGI_Emoji_ZWJ_Sequence ; family: woman, woman, girl, boy + given = term.bold_red(u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466') + expected = sum((2, 0, 2, 0, 2, 0, 2)) + + # exercise, + assert term.length(given) == expected + + + def test_length_ansiart(): """Test length of ANSI art""" @as_subprocess @@ -60,7 +76,7 @@ def child(kind): def test_sequence_length(all_terms): - """Ensure T.length(string containing sequence) is correcterm.""" + """Ensure T.length(string containing sequence) is correct.""" # pylint: disable=too-complex,too-many-statements @as_subprocess def child(kind): diff --git a/tests/test_wrap.py b/tests/test_wrap.py index e74aa2a9..f31986e9 100644 --- a/tests/test_wrap.py +++ b/tests/test_wrap.py @@ -113,3 +113,19 @@ def child(): assert expected == result child() + +def test_east_asian_emojis_width_1(): + """Tests edge-case of east-asian and emoji characters split into single columns.""" + @as_subprocess + def child(): + term = TestTerminal() + # by @grayjk from https://github.com/jquast/blessed/issues/273 + result = term.wrap('\u5973', 1) + assert result == ['\u5973'] + + # much like test_length_with_zwj_is_wrong(), blessed gets ZWJ wrong when wrapping, also. + # RGI_Emoji_ZWJ_Sequence ; family: woman, woman, girl, boy + result = term.wrap(u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466', 1) + assert result == ['\U0001F469\u200D', '\U0001F469\u200D', '\U0001F467\u200D', '\U0001F466'] + + child() \ No newline at end of file diff --git a/tox.ini b/tox.ini index d3a44051..b18eb6a6 100644 --- a/tox.ini +++ b/tox.ini @@ -275,6 +275,8 @@ addopts = --ignore=setup.py --ignore=.tox --junit-xml=.tox/results.{envname}.xml +# if any test takes over 30 seconds, dump traceback +faulthandler_timeout = 30 filterwarnings = error junit_family = xunit1 log_format=%(levelname)s %(relativeCreated)2.2f %(filename)s:%(lineno)d %(message)s