From c6a7b4602f88fdd0d53f9ebc9e433c1a42e07cdc Mon Sep 17 00:00:00 2001 From: 5j9 <5j9@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:12:19 +0330 Subject: [PATCH] chore(_balanced_quotes_shadow): rewrite to return spans This branch was intended to improve performance, but my test results do not show meaningful enough performance improvements to convince me to merge it into the main branch. --- tests/wikitext/test_get_bolds_and_italics.py | 45 +++-- wikitextparser/_comment_bold_italic.py | 2 +- wikitextparser/_parameter.py | 2 +- wikitextparser/_parser_function.py | 2 +- wikitextparser/_tag.py | 2 +- wikitextparser/_template.py | 2 +- wikitextparser/_wikilink.py | 2 +- wikitextparser/_wikitext.py | 181 ++++++++++--------- 8 files changed, 130 insertions(+), 108 deletions(-) diff --git a/tests/wikitext/test_get_bolds_and_italics.py b/tests/wikitext/test_get_bolds_and_italics.py index 11b3720..5f4e27e 100644 --- a/tests/wikitext/test_get_bolds_and_italics.py +++ b/tests/wikitext/test_get_bolds_and_italics.py @@ -21,19 +21,10 @@ def test_get_bolds(): assert_bold("'''b'''", "'''b'''") assert_no_bold("''i1'''s") assert_no_bold("") - assert_bold( - "a'''" "b'''d", - "'''b'''", - ) assert_bold("'''b{{a|'''}}", "'''b{{a|'''}}") # ? assert_bold("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''") assert_bold("{{text|'''b'''}}", "'''b'''") assert_bold("{{text|'''b}}", "'''b") # ? - assert_bold("{{{PARAM|'''b}}} c", "'''b") # ? - assert ( - repr(parse("'''b\na'''c").get_bolds()) - == """[Bold("'''b"), Bold("'''c")]""" - ) assert_bold("'''b'''", "'''b'''") assert_bold("'''br'''c", "'''br'''") assert_bold("'''''b'''i", "'''b'''") @@ -53,19 +44,37 @@ def test_get_bolds(): assert_bold("{{text|{{text|'''b'''}}}}", "'''b'''") -def test_no_end_in_wikilink(): +def test_hald_bolds_with_newline_in_between(): + assert ( + repr(parse("'''b\na'''c").get_bolds()) + == """[Bold("'''b"), Bold("'''c")]""" + ) + + +def test_half_bold_in_param(): + assert_bold("{{{PARAM|'''b}}} c", "'''b") # ? 
+ + +def test_half_bold_in_wikilink(): assert_bold("[[a|'''b]] c", "'''b") -def test_get_italics(): - def ai(s: str, o: str, r: bool = True): - italics = parse(s).get_italics(r) - assert len(italics) == 1 - assert italics[0].string == o +def test_comment_before_and_after_bold(): + assert_bold( + "a'''" "b'''d", + "'''b'''", + ) + +def ai(s: str, o: str, r: bool = True): + italics = parse(s).get_italics(r) + assert len(italics) == 1 + assert italics[0].string == o + + +def test_get_italics(): ai("''i'''", "''i'''") ai("a''' '' b '' '''c", "'' b ''") - ai("'''''i'''''", "'''''i'''''") ai("a'' ''' ib ''' ''c", "'' ''' ib ''' ''") ai("''i''", "''i''") ai( @@ -81,6 +90,10 @@ def ai(s: str, o: str, r: bool = True): ai("''' ''i'''", "''i'''") +def test_get_italics_2(): + ai("'''''i'''''", "'''''i'''''") + + def test_bold_italic_index_change(): p = parse("'''b1''' ''i1'' '''b2'''") b1, b2 = p.get_bolds(recursive=False) diff --git a/wikitextparser/_comment_bold_italic.py b/wikitextparser/_comment_bold_italic.py index 607685d..c333cf7 100644 --- a/wikitextparser/_comment_bold_italic.py +++ b/wikitextparser/_comment_bold_italic.py @@ -49,7 +49,7 @@ def text(self, s: str): self[b:e] = s @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: # noinspection PyUnresolvedReferences return self._match.span(1) diff --git a/wikitextparser/_parameter.py b/wikitextparser/_parameter.py index f8055fa..2336151 100644 --- a/wikitextparser/_parameter.py +++ b/wikitextparser/_parameter.py @@ -104,5 +104,5 @@ def parameters(self) -> List['Parameter']: return super().parameters[1:] @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 3, -3 diff --git a/wikitextparser/_parser_function.py b/wikitextparser/_parser_function.py index d336961..066adff 100644 --- a/wikitextparser/_parser_function.py +++ b/wikitextparser/_parser_function.py @@ -19,7 +19,7 @@ class 
SubWikiTextWithArgs(SubWikiText): _first_arg_sep = 0 @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 2, -2 @property diff --git a/wikitextparser/_tag.py b/wikitextparser/_tag.py index 7fa8a65..2bd5bb5 100644 --- a/wikitextparser/_tag.py +++ b/wikitextparser/_tag.py @@ -214,6 +214,6 @@ def get_tags(self, name=None) -> List['Tag']: return super().get_tags(name)[1:] @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: s = self.string return s.find('>') + 1, s.rfind('<') diff --git a/wikitextparser/_template.py b/wikitextparser/_template.py index 8d10530..930be49 100644 --- a/wikitextparser/_template.py +++ b/wikitextparser/_template.py @@ -29,7 +29,7 @@ class Template(SubWikiTextWithArgs): _first_arg_sep = 124 @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 2, -2 def normal_name( diff --git a/wikitextparser/_wikilink.py b/wikitextparser/_wikilink.py index befd977..2b474af 100644 --- a/wikitextparser/_wikilink.py +++ b/wikitextparser/_wikilink.py @@ -22,7 +22,7 @@ class WikiLink(SubWikiText): __slots__ = '_cached_match' @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: s = self.string f = s.find rf = s.rfind diff --git a/wikitextparser/_wikitext.py b/wikitextparser/_wikitext.py index 9a1b449..2a2ef47 100644 --- a/wikitextparser/_wikitext.py +++ b/wikitextparser/_wikitext.py @@ -1,4 +1,4 @@ -from bisect import bisect_left, bisect_right, insort_right +from bisect import bisect_left, bisect_right, insort_left, insort_right from html import unescape from itertools import compress, islice from operator import attrgetter @@ -114,31 +114,7 @@ ).finditer BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens - rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)""", - MULTILINE | VERBOSE, -).finditer - 
-BOLD_FINDITER = rc( - rb""" - # start token - '\0*+'\0*+' - # content - (\0*+[^'\n]++.*?) - # end token - (?:'\0*+'\0*+'|$) -""", - MULTILINE | VERBOSE, -).finditer - -ITALIC_FINDITER = rc( - rb""" - # start token - '\0*+' - # content - (\0*+[^'\n]++.*?) - # end token - (?:'\0*+'|$) -""", + rb"""'\0*+(')(?:\0*+('))?+(?:\0*(')\0*')?+(?=[^']|$)""", MULTILINE | VERBOSE, ).finditer @@ -206,6 +182,9 @@ def _table_to_text(t: 'Table') -> str: ) +_MarkupSpans = List[tuple[int, int]] + + class WikiText: # In subclasses of WikiText _type is used as the key for _type_to_spans # Therefore: self._span can be found in self._type_to_spans[self._type]. @@ -575,8 +554,8 @@ def _nesting_level(self, parent_types) -> int: return level @property - def _content_span(self) -> Tuple[int, int]: - # return content_start, self_len, self_end + def _relative_content_span(self) -> Tuple[int, int]: + # return content_start, content_end return 0, len(self) @property @@ -601,7 +580,7 @@ def _shadow(self) -> bytearray: self._lststr[0][ss:se], 'ascii', 'replace' ) if self._type in SPAN_PARSER_TYPES: - cs, ce = self._content_span + cs, ce = self._relative_content_span head = shadow[:cs] tail = shadow[ce:] shadow[:cs] = b'_' * cs @@ -1008,69 +987,105 @@ def comments(self) -> List['Comment']: ] @property - def _balanced_quotes_shadow(self) -> bytearray: - """Return a byte array with non-markup-apostrophes removed. + def _bold_italic_marks( + self, + ) -> tuple[bytearray, _MarkupSpans, _MarkupSpans]: + """Return (shadow, bold markup spans, italic markup spans). 
The comments at /includes/parser/Parser.php:doQuotes are helpful: https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php https://phabricator.wikimedia.org/T15227#178834 """ - bold_matches = [] - odd_italics = False - odd_bold_italics = False - shadow_copy = self._shadow[:] - append_bold = bold_matches.append + bold_marks = [] + italic_marks: _MarkupSpans = [] + line_probably_bolds: _MarkupSpans = [] + line_italics: _MarkupSpans = [] + line_bolds = [] + shadow = self._shadow + find = shadow.find + cs, ce = self._relative_content_span + if ce < -1: + ce += self._span_data[1] def process_line(): - nonlocal odd_italics - if odd_italics and (len(bold_matches) + odd_bold_italics) % 2: - # one of the bold marks needs to be interpreted as italic + nonlocal bold_marks, italic_marks, line_bolds + if ( + len(line_italics) % 2 + and (len(line_bolds) + len(line_probably_bolds)) % 2 + ): + # one of the probably_bolds needs to be interpreted as italic first_multi_letter_word = first_space = None - for bold_match in bold_matches: - bold_start = bold_match.start() - if shadow_copy[bold_start - 1 : bold_start] == b' ': + for i, (lpbs, _) in enumerate(line_probably_bolds): + if shadow[lpbs - 1] == 32: # space if first_space is None: - first_space = bold_start + first_space = i continue - if shadow_copy[bold_start - 2 : bold_start - 1] == b' ': - shadow_copy[bold_start] = 95 # _ + if shadow[lpbs - 2] == 32: # space + s, e = line_probably_bolds.pop(i) + insort_left(line_italics, (s + 1, e)) break # first_single_letter_word if first_multi_letter_word is None: - first_multi_letter_word = bold_start + first_multi_letter_word = i continue else: # there was no first_single_letter_word if first_multi_letter_word is not None: - shadow_copy[first_multi_letter_word] = 95 # _ + s, e = line_probably_bolds.pop(first_multi_letter_word) + insort_left(line_italics, (s + 1, e)) elif first_space is not None: - shadow_copy[first_space] = 95 # _ - bold_matches.clear() - odd_italics 
= False + s, e = line_probably_bolds.pop(first_space) + insort_left(line_italics, (s + 1, e)) + + line_bolds += line_probably_bolds + line_bolds.sort() + if len(line_italics) % 2: + line_end = find(b'\n', line_italics[-1][1], ce) + if line_end == -1: + line_end = ce + line_italics.append((line_end, line_end)) + if len(line_bolds) % 2: + line_end = find(b'\n', line_bolds[-1][1], ce) + if line_end == -1: + line_end = ce + line_bolds.append((line_end, line_end)) + + bold_marks += line_bolds + italic_marks += line_italics + line_bolds.clear() + line_probably_bolds.clear() + line_italics.clear() + + def add_bold_italic(): + line_bolds.append((ms, m.end(2))) + line_italics.append((m.start(3), me)) + + def add_italic_bold(): + line_italics.append((ms, m.end(1))) + line_bolds.append((m.start(2), me)) last_end = 0 - find = shadow_copy.find - for m in BOLD_ITALIC_FINDITER(shadow_copy): - if find(b'\n', last_end, m.start()) > -1: # newline + for m in BOLD_ITALIC_FINDITER(shadow, cs, ce): + ms, me = span = m.span() + if find(b'\n', last_end, ms) > -1: # newline process_line() if m[2] is None: # italic - odd_italics ^= True + line_italics.append(span) elif m[3] is None: # bold - s, e = m.span(1) - if s != e: # four apostrophes, hide the first one - shadow_copy[s] = 95 # _ - append_bold(m) + line_probably_bolds.append(span) else: # bold-italic - s, e = m.span(1) - es = e - s - if es: # more than 5 apostrophes, hide the previous ones - shadow_copy[s:e] = b'_' * es - odd_bold_italics ^= True - odd_italics ^= True - last_end = m.end() + # this part might need more tuning or later correction + if len(line_italics) % 2: # odd italics + add_bold_italic() + else: # even italics + add_italic_bold() + + last_end = me process_line() # string end - return shadow_copy + return shadow, bold_marks, italic_marks - def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]): + def _bolds_italics_recurse( + self, result: List[Union['Bold', 'Italic']], filter_cls: Optional[type] + 
): for prop in ( 'templates', 'parser_functions', @@ -1113,19 +1128,19 @@ def get_bolds_and_italics( s = self._span_data[0] type_to_spans = self._type_to_spans tts_setdefault = type_to_spans.setdefault - balanced_shadow = self._balanced_quotes_shadow - rs, re = self._content_span + shadow, bold_marks, italic_marks = self._bold_italic_marks if filter_cls is None or filter_cls is Bold: bold_spans = tts_setdefault('Bold', []) get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get - bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re)) - for m in bold_matches: - ms, me = m.span() + bmi = iter(bold_marks) + + for start_mark, end_mark in zip(bmi, bmi): + ms, me = start_mark[0], end_mark[1] b, e = s + ms, s + me old_span = get_old_bold_span((b, e)) if old_span is None: - span = [b, e, None, balanced_shadow[ms:me]] + span = [b, e, None, shadow[ms:me]] insort_right(bold_spans, span) else: span = old_span @@ -1137,31 +1152,25 @@ def get_bolds_and_italics( return result elif filter_cls is Bold: return result - else: # filter_cls is Italic - bold_matches = BOLD_FINDITER(balanced_shadow, rs, re) # filter_cls is None or filter_cls is Italic - # remove bold tokens before searching for italics - for m in bold_matches: - ms, me = m.span() - cs, ce = m.span(1) # content - balanced_shadow[ms:cs] = b'_' * (cs - ms) - balanced_shadow[ce:me] = b'_' * (me - ce) - italic_spans = tts_setdefault('Italic', []) get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get - for m in ITALIC_FINDITER(balanced_shadow, rs, re): - ms, me = m.span() + imi = iter(italic_marks) + for start_mark, end_mark in zip(imi, imi): + ms, me = start_mark[0], end_mark[1] b, e = span = s + ms, s + me old_span = get_old_italic_span(span) if old_span is None: - span = [b, e, None, balanced_shadow[ms:me]] + span = [b, e, None, shadow[ms:me]] insort_right(italic_spans, span) else: span = old_span append( - Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1)) + Italic( + _lststr, 
type_to_spans, span, 'Italic', end_mark[0] != me + ) ) if recursive and filter_cls is Italic: self._bolds_italics_recurse(result, filter_cls)