From c6a7b4602f88fdd0d53f9ebc9e433c1a42e07cdc Mon Sep 17 00:00:00 2001
From: 5j9 <5j9@users.noreply.github.com>
Date: Thu, 11 Apr 2024 23:12:19 +0330
Subject: [PATCH] chore(_balanced_quotes_shadow): rewrite to return spans
This branch was intended to improve performance,
but my test results do not show performance improvements meaningful enough
to convince me to merge it into the main branch.
---
tests/wikitext/test_get_bolds_and_italics.py | 45 +++--
wikitextparser/_comment_bold_italic.py | 2 +-
wikitextparser/_parameter.py | 2 +-
wikitextparser/_parser_function.py | 2 +-
wikitextparser/_tag.py | 2 +-
wikitextparser/_template.py | 2 +-
wikitextparser/_wikilink.py | 2 +-
wikitextparser/_wikitext.py | 181 ++++++++++---------
8 files changed, 130 insertions(+), 108 deletions(-)
diff --git a/tests/wikitext/test_get_bolds_and_italics.py b/tests/wikitext/test_get_bolds_and_italics.py
index 11b3720..5f4e27e 100644
--- a/tests/wikitext/test_get_bolds_and_italics.py
+++ b/tests/wikitext/test_get_bolds_and_italics.py
@@ -21,19 +21,10 @@ def test_get_bolds():
assert_bold("'''b'''", "'''b'''")
assert_no_bold("''i1'''s")
assert_no_bold("")
- assert_bold(
- "a'''" "b'''d",
- "'''b'''",
- )
assert_bold("'''b{{a|'''}}", "'''b{{a|'''}}") # ?
assert_bold("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''")
assert_bold("{{text|'''b'''}}", "'''b'''")
assert_bold("{{text|'''b}}", "'''b") # ?
- assert_bold("{{{PARAM|'''b}}} c", "'''b") # ?
- assert (
- repr(parse("'''b\na'''c").get_bolds())
- == """[Bold("'''b"), Bold("'''c")]"""
- )
assert_bold("'''b'''", "'''b'''")
assert_bold("'''br'''c", "'''br'''")
assert_bold("'''''b'''i", "'''b'''")
@@ -53,19 +44,37 @@ def test_get_bolds():
assert_bold("{{text|{{text|'''b'''}}}}", "'''b'''")
-def test_no_end_in_wikilink():
+def test_hald_bolds_with_newline_in_between():
+ assert (
+ repr(parse("'''b\na'''c").get_bolds())
+ == """[Bold("'''b"), Bold("'''c")]"""
+ )
+
+
+def test_half_bold_in_param():
+ assert_bold("{{{PARAM|'''b}}} c", "'''b") # ?
+
+
+def test_half_bold_in_wikilink():
assert_bold("[[a|'''b]] c", "'''b")
-def test_get_italics():
- def ai(s: str, o: str, r: bool = True):
- italics = parse(s).get_italics(r)
- assert len(italics) == 1
- assert italics[0].string == o
+def test_comment_before_and_after_bold():
+ assert_bold(
+ "a'''" "b'''d",
+ "'''b'''",
+ )
+
+def ai(s: str, o: str, r: bool = True):
+ italics = parse(s).get_italics(r)
+ assert len(italics) == 1
+ assert italics[0].string == o
+
+
+def test_get_italics():
ai("''i'''", "''i'''")
ai("a''' '' b '' '''c", "'' b ''")
- ai("'''''i'''''", "'''''i'''''")
ai("a'' ''' ib ''' ''c", "'' ''' ib ''' ''")
ai("''i''", "''i''")
ai(
@@ -81,6 +90,10 @@ def ai(s: str, o: str, r: bool = True):
ai("''' ''i'''", "''i'''")
+def test_get_italics_2():
+ ai("'''''i'''''", "'''''i'''''")
+
+
def test_bold_italic_index_change():
p = parse("'''b1''' ''i1'' '''b2'''")
b1, b2 = p.get_bolds(recursive=False)
diff --git a/wikitextparser/_comment_bold_italic.py b/wikitextparser/_comment_bold_italic.py
index 607685d..c333cf7 100644
--- a/wikitextparser/_comment_bold_italic.py
+++ b/wikitextparser/_comment_bold_italic.py
@@ -49,7 +49,7 @@ def text(self, s: str):
self[b:e] = s
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
# noinspection PyUnresolvedReferences
return self._match.span(1)
diff --git a/wikitextparser/_parameter.py b/wikitextparser/_parameter.py
index f8055fa..2336151 100644
--- a/wikitextparser/_parameter.py
+++ b/wikitextparser/_parameter.py
@@ -104,5 +104,5 @@ def parameters(self) -> List['Parameter']:
return super().parameters[1:]
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 3, -3
diff --git a/wikitextparser/_parser_function.py b/wikitextparser/_parser_function.py
index d336961..066adff 100644
--- a/wikitextparser/_parser_function.py
+++ b/wikitextparser/_parser_function.py
@@ -19,7 +19,7 @@ class SubWikiTextWithArgs(SubWikiText):
_first_arg_sep = 0
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 2, -2
@property
diff --git a/wikitextparser/_tag.py b/wikitextparser/_tag.py
index 7fa8a65..2bd5bb5 100644
--- a/wikitextparser/_tag.py
+++ b/wikitextparser/_tag.py
@@ -214,6 +214,6 @@ def get_tags(self, name=None) -> List['Tag']:
return super().get_tags(name)[1:]
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
s = self.string
return s.find('>') + 1, s.rfind('<')
diff --git a/wikitextparser/_template.py b/wikitextparser/_template.py
index 8d10530..930be49 100644
--- a/wikitextparser/_template.py
+++ b/wikitextparser/_template.py
@@ -29,7 +29,7 @@ class Template(SubWikiTextWithArgs):
_first_arg_sep = 124
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 2, -2
def normal_name(
diff --git a/wikitextparser/_wikilink.py b/wikitextparser/_wikilink.py
index befd977..2b474af 100644
--- a/wikitextparser/_wikilink.py
+++ b/wikitextparser/_wikilink.py
@@ -22,7 +22,7 @@ class WikiLink(SubWikiText):
__slots__ = '_cached_match'
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
s = self.string
f = s.find
rf = s.rfind
diff --git a/wikitextparser/_wikitext.py b/wikitextparser/_wikitext.py
index 9a1b449..2a2ef47 100644
--- a/wikitextparser/_wikitext.py
+++ b/wikitextparser/_wikitext.py
@@ -1,4 +1,4 @@
-from bisect import bisect_left, bisect_right, insort_right
+from bisect import bisect_left, bisect_right, insort_left, insort_right
from html import unescape
from itertools import compress, islice
from operator import attrgetter
@@ -114,31 +114,7 @@
).finditer
BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens
- rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)""",
- MULTILINE | VERBOSE,
-).finditer
-
-BOLD_FINDITER = rc(
- rb"""
- # start token
- '\0*+'\0*+'
- # content
- (\0*+[^'\n]++.*?)
- # end token
- (?:'\0*+'\0*+'|$)
-""",
- MULTILINE | VERBOSE,
-).finditer
-
-ITALIC_FINDITER = rc(
- rb"""
- # start token
- '\0*+'
- # content
- (\0*+[^'\n]++.*?)
- # end token
- (?:'\0*+'|$)
-""",
+ rb"""'\0*+(')(?:\0*+('))?+(?:\0*(')\0*')?+(?=[^']|$)""",
MULTILINE | VERBOSE,
).finditer
@@ -206,6 +182,9 @@ def _table_to_text(t: 'Table') -> str:
)
+_MarkupSpans = List[tuple[int, int]]
+
+
class WikiText:
# In subclasses of WikiText _type is used as the key for _type_to_spans
# Therefore: self._span can be found in self._type_to_spans[self._type].
@@ -575,8 +554,8 @@ def _nesting_level(self, parent_types) -> int:
return level
@property
- def _content_span(self) -> Tuple[int, int]:
- # return content_start, self_len, self_end
+ def _relative_content_span(self) -> Tuple[int, int]:
+ # return content_start, content_end
return 0, len(self)
@property
@@ -601,7 +580,7 @@ def _shadow(self) -> bytearray:
self._lststr[0][ss:se], 'ascii', 'replace'
)
if self._type in SPAN_PARSER_TYPES:
- cs, ce = self._content_span
+ cs, ce = self._relative_content_span
head = shadow[:cs]
tail = shadow[ce:]
shadow[:cs] = b'_' * cs
@@ -1008,69 +987,105 @@ def comments(self) -> List['Comment']:
]
@property
- def _balanced_quotes_shadow(self) -> bytearray:
- """Return a byte array with non-markup-apostrophes removed.
+ def _bold_italic_marks(
+ self,
+ ) -> tuple[bytearray, _MarkupSpans, _MarkupSpans]:
+ """Return (shadow, bold markup spans, italic markup spans).
The comments at /includes/parser/Parser.php:doQuotes are helpful:
https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
https://phabricator.wikimedia.org/T15227#178834
"""
- bold_matches = []
- odd_italics = False
- odd_bold_italics = False
- shadow_copy = self._shadow[:]
- append_bold = bold_matches.append
+ bold_marks = []
+ italic_marks: _MarkupSpans = []
+ line_probably_bolds: _MarkupSpans = []
+ line_italics: _MarkupSpans = []
+ line_bolds = []
+ shadow = self._shadow
+ find = shadow.find
+ cs, ce = self._relative_content_span
+ if ce < -1:
+ ce += self._span_data[1]
def process_line():
- nonlocal odd_italics
- if odd_italics and (len(bold_matches) + odd_bold_italics) % 2:
- # one of the bold marks needs to be interpreted as italic
+ nonlocal bold_marks, italic_marks, line_bolds
+ if (
+ len(line_italics) % 2
+ and (len(line_bolds) + len(line_probably_bolds)) % 2
+ ):
+ # one of the probably_bolds needs to be interpreted as italic
first_multi_letter_word = first_space = None
- for bold_match in bold_matches:
- bold_start = bold_match.start()
- if shadow_copy[bold_start - 1 : bold_start] == b' ':
+ for i, (lpbs, _) in enumerate(line_probably_bolds):
+ if shadow[lpbs - 1] == 32: # space
if first_space is None:
- first_space = bold_start
+ first_space = i
continue
- if shadow_copy[bold_start - 2 : bold_start - 1] == b' ':
- shadow_copy[bold_start] = 95 # _
+ if shadow[lpbs - 2] == 32: # space
+ s, e = line_probably_bolds.pop(i)
+ insort_left(line_italics, (s + 1, e))
break # first_single_letter_word
if first_multi_letter_word is None:
- first_multi_letter_word = bold_start
+ first_multi_letter_word = i
continue
else: # there was no first_single_letter_word
if first_multi_letter_word is not None:
- shadow_copy[first_multi_letter_word] = 95 # _
+ s, e = line_probably_bolds.pop(first_multi_letter_word)
+ insort_left(line_italics, (s + 1, e))
elif first_space is not None:
- shadow_copy[first_space] = 95 # _
- bold_matches.clear()
- odd_italics = False
+ s, e = line_probably_bolds.pop(first_space)
+ insort_left(line_italics, (s + 1, e))
+
+ line_bolds += line_probably_bolds
+ line_bolds.sort()
+ if len(line_italics) % 2:
+ line_end = find(b'\n', line_italics[-1][1], ce)
+ if line_end == -1:
+ line_end = ce
+ line_italics.append((line_end, line_end))
+ if len(line_bolds) % 2:
+ line_end = find(b'\n', line_bolds[-1][1], ce)
+ if line_end == -1:
+ line_end = ce
+ line_bolds.append((line_end, line_end))
+
+ bold_marks += line_bolds
+ italic_marks += line_italics
+ line_bolds.clear()
+ line_probably_bolds.clear()
+ line_italics.clear()
+
+ def add_bold_italic():
+ line_bolds.append((ms, m.end(2)))
+ line_italics.append((m.start(3), me))
+
+ def add_italic_bold():
+ line_italics.append((ms, m.end(1)))
+ line_bolds.append((m.start(2), me))
last_end = 0
- find = shadow_copy.find
- for m in BOLD_ITALIC_FINDITER(shadow_copy):
- if find(b'\n', last_end, m.start()) > -1: # newline
+ for m in BOLD_ITALIC_FINDITER(shadow, cs, ce):
+ ms, me = span = m.span()
+ if find(b'\n', last_end, ms) > -1: # newline
process_line()
if m[2] is None: # italic
- odd_italics ^= True
+ line_italics.append(span)
elif m[3] is None: # bold
- s, e = m.span(1)
- if s != e: # four apostrophes, hide the first one
- shadow_copy[s] = 95 # _
- append_bold(m)
+ line_probably_bolds.append(span)
else: # bold-italic
- s, e = m.span(1)
- es = e - s
- if es: # more than 5 apostrophes, hide the previous ones
- shadow_copy[s:e] = b'_' * es
- odd_bold_italics ^= True
- odd_italics ^= True
- last_end = m.end()
+ # this part might need more tuning or later correction
+ if len(line_italics) % 2: # odd italics
+ add_bold_italic()
+ else: # even italics
+ add_italic_bold()
+
+ last_end = me
process_line() # string end
- return shadow_copy
+ return shadow, bold_marks, italic_marks
- def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
+ def _bolds_italics_recurse(
+ self, result: List[Union['Bold', 'Italic']], filter_cls: Optional[type]
+ ):
for prop in (
'templates',
'parser_functions',
@@ -1113,19 +1128,19 @@ def get_bolds_and_italics(
s = self._span_data[0]
type_to_spans = self._type_to_spans
tts_setdefault = type_to_spans.setdefault
- balanced_shadow = self._balanced_quotes_shadow
- rs, re = self._content_span
+ shadow, bold_marks, italic_marks = self._bold_italic_marks
if filter_cls is None or filter_cls is Bold:
bold_spans = tts_setdefault('Bold', [])
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
- bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
- for m in bold_matches:
- ms, me = m.span()
+ bmi = iter(bold_marks)
+
+ for start_mark, end_mark in zip(bmi, bmi):
+ ms, me = start_mark[0], end_mark[1]
b, e = s + ms, s + me
old_span = get_old_bold_span((b, e))
if old_span is None:
- span = [b, e, None, balanced_shadow[ms:me]]
+ span = [b, e, None, shadow[ms:me]]
insort_right(bold_spans, span)
else:
span = old_span
@@ -1137,31 +1152,25 @@ def get_bolds_and_italics(
return result
elif filter_cls is Bold:
return result
- else: # filter_cls is Italic
- bold_matches = BOLD_FINDITER(balanced_shadow, rs, re)
# filter_cls is None or filter_cls is Italic
- # remove bold tokens before searching for italics
- for m in bold_matches:
- ms, me = m.span()
- cs, ce = m.span(1) # content
- balanced_shadow[ms:cs] = b'_' * (cs - ms)
- balanced_shadow[ce:me] = b'_' * (me - ce)
-
italic_spans = tts_setdefault('Italic', [])
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
- for m in ITALIC_FINDITER(balanced_shadow, rs, re):
- ms, me = m.span()
+ imi = iter(italic_marks)
+ for start_mark, end_mark in zip(imi, imi):
+ ms, me = start_mark[0], end_mark[1]
b, e = span = s + ms, s + me
old_span = get_old_italic_span(span)
if old_span is None:
- span = [b, e, None, balanced_shadow[ms:me]]
+ span = [b, e, None, shadow[ms:me]]
insort_right(italic_spans, span)
else:
span = old_span
append(
- Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
+ Italic(
+ _lststr, type_to_spans, span, 'Italic', end_mark[0] != me
+ )
)
if recursive and filter_cls is Italic:
self._bolds_italics_recurse(result, filter_cls)