Skip to content

Commit

Permalink
fix(_shadow): use _content_span to strip head and tail
Browse files Browse the repository at this point in the history
fixes #126
  • Loading branch information
5j9 committed Dec 21, 2023
1 parent 6e305a3 commit 535ee20
Show file tree
Hide file tree
Showing 10 changed files with 90 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Unreleased
----------
- Fixed a bug in ``plain_text``. (#126)
- Fixed another bug in parsing tables that end without a ``|}`` mark. (#125)

v0.55.6
Expand Down
66 changes: 38 additions & 28 deletions tests/wikitext/test_get_bolds_and_italics.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
from wikitextparser import Bold, Italic, parse


def assert_bold(
input_string: str, expected_bold_string: str, recursive: bool = True, /
):
assert (
parse(input_string).get_bolds(recursive)[0].string
== expected_bold_string
)


def assert_no_bold(input_string: str):
assert not parse(input_string).get_bolds(True)


def test_get_bolds():
def ab(s: str, o: str, r: bool = True):
assert parse(s).get_bolds(r)[0].string == o

def anb(s: str):
assert not parse(s).get_bolds(True)

ab("A''''''''''B", "'''B")
ab("''''''a''''''", "'''a''''") # '<i><b>a'</b></i>
ab("a'''<!--b-->'''BI", "'''BI")
ab("'''b'''", "'''b'''")
anb("''i1'''s")
anb("<!--'''b'''-->")
ab(
assert_bold("A''''''''''B", "'''B")
assert_bold("''''''a''''''", "'''a''''") # '<i><b>a'</b></i>
assert_bold("a'''<!--b-->'''BI", "'''BI")
assert_bold("'''b'''", "'''b'''")
assert_no_bold("''i1'''s")
assert_no_bold("<!--'''b'''-->")
assert_bold(
"a<!---->'<!---->'<!---->'<!---->" "b<!---->'<!---->'<!---->'<!---->d",
"'<!---->'<!---->'<!---->b<!---->'<!---->'<!---->'",
)
ab("'''b{{a|'''}}", "'''b{{a|'''}}") # ?
ab("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''")
ab("{{text|'''b'''}}", "'''b'''")
ab("{{text|'''b}}", "'''b") # ?
ab("[[a|'''b]] c", "'''b")
ab("{{{PARAM|'''b}}} c", "'''b") # ?
assert_bold("'''b{{a|'''}}", "'''b{{a|'''}}") # ?
assert_bold("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''")
assert_bold("{{text|'''b'''}}", "'''b'''")
assert_bold("{{text|'''b}}", "'''b") # ?
assert_bold("{{{PARAM|'''b}}} c", "'''b") # ?
assert (
repr(parse("'''b\na'''c").get_bolds())
== """[Bold("'''b"), Bold("'''c")]"""
)
ab("'''<S>b</S>'''", "'''<S>b</S>'''")
ab("'''b<S>r'''c</S>", "'''b<S>r'''")
ab("'''''b'''i", "'''b'''")
assert_bold("'''<S>b</S>'''", "'''<S>b</S>'''")
assert_bold("'''b<S>r'''c</S>", "'''b<S>r'''")
assert_bold("'''''b'''i", "'''b'''")
assert (
repr(parse("'''b<ref>r'''c</ref>a").get_bolds())
== """[Bold("'''b<ref>r'''c</ref>a"), Bold("'''c")]"""
Expand All @@ -39,12 +45,16 @@ def anb(s: str):
repr(parse("'''b<ref>r'''c</ref>a").get_bolds(False))
== """[Bold("'''b<ref>r'''c</ref>a")]"""
)
ab("'''b{{{p|'''}}}", "'''b{{{p|'''}}}") # ?
ab("<nowiki>'''a</nowiki>'''b", "'''a")
anb("' ' ' a ' ' '")
ab("x''' '''y", "''' '''")
ab("x''''''y", "'''y")
ab("{{text|{{text|'''b'''}}}}", "'''b'''")
assert_bold("'''b{{{p|'''}}}", "'''b{{{p|'''}}}") # ?
assert_bold("<nowiki>'''a</nowiki>'''b", "'''a")
assert_no_bold("' ' ' a ' ' '")
assert_bold("x''' '''y", "''' '''")
assert_bold("x''''''y", "'''y")
assert_bold("{{text|{{text|'''b'''}}}}", "'''b'''")


def test_no_end_in_wikilink():
assert_bold("[[a|'''b]] c", "'''b")


def test_get_italics():
Expand Down
13 changes: 13 additions & 0 deletions tests/wikitext/test_plain_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,16 @@ def test_file_links():
# Fails for the following cases:
# assert parse('[[Media:Example.jpg]]').plain_text() == 'Media:Example.jpg'
# assert parse('[[Media:n.jpg|Sunflowers]]').plain_text() == 'Sunflowers'


def test_tag_containing_comment_with_no_end(): # 126
parsed = parse(
"""
[[a|b]]
<gallery>
<!--
</gallery>
"""
)
del parsed.wikilinks[0][:]
assert parsed.plain_text().strip() == ''
4 changes: 2 additions & 2 deletions wikitextparser/_comment_bold_italic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, MutableSequence, Optional, Union
from typing import Dict, List, MutableSequence, Optional, Union, Tuple

from regex import DOTALL, MULTILINE

Expand Down Expand Up @@ -49,7 +49,7 @@ def text(self, s: str):
self[b:e] = s

@property
def _relative_contents_end(self) -> tuple:
def _content_span(self) -> Tuple[int, int]:
# noinspection PyUnresolvedReferences
return self._match.span(1)

Expand Down
4 changes: 2 additions & 2 deletions wikitextparser/_parameter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List, Optional, Tuple

from ._wikitext import WS, SubWikiText

Expand Down Expand Up @@ -104,5 +104,5 @@ def parameters(self) -> List['Parameter']:
return super().parameters[1:]

@property
def _relative_contents_end(self) -> tuple:
def _content_span(self) -> Tuple[int, int]:
return 3, -3
10 changes: 5 additions & 5 deletions wikitextparser/_parser_function.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from bisect import insort
from typing import Iterable, List, Union
from typing import Iterable, List, Union, Tuple

from ._argument import Argument
from ._wikilist import WikiList
Expand All @@ -19,6 +19,10 @@ class SubWikiTextWithArgs(SubWikiText):
_name_args_matcher = NotImplemented
_first_arg_sep = 0

@property
def _content_span(self) -> Tuple[int, int]:
return 2, -2

@property
def nesting_level(self) -> int:
"""Return the nesting level of self.
Expand Down Expand Up @@ -95,10 +99,6 @@ def name(self) -> str:
def name(self, newname: str) -> None:
self[2 : 2 + len(self.name)] = newname

@property
def _relative_contents_end(self) -> tuple:
return 2, -2


class ParserFunction(SubWikiTextWithArgs):
__slots__ = ()
Expand Down
7 changes: 4 additions & 3 deletions wikitextparser/_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* https://www.mediawiki.org/wiki/HTML_restriction
"""

from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

from regex import DOTALL, VERBOSE

Expand Down Expand Up @@ -215,5 +215,6 @@ def get_tags(self, name=None) -> List['Tag']:
return super().get_tags(name)[1:]

@property
def _relative_contents_end(self) -> tuple:
return self._match.span('contents')
def _content_span(self) -> Tuple[int, int]:
s = self.string
return s.find('>') + 1, s.rfind('<')
4 changes: 4 additions & 0 deletions wikitextparser/_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class Template(SubWikiTextWithArgs):
_name_args_matcher = TL_NAME_ARGS_FULLMATCH
_first_arg_sep = 124

@property
def _content_span(self) -> Tuple[int, int]:
return 2, -2

def normal_name(
self,
rm_namespaces=('Template',),
Expand Down
13 changes: 8 additions & 5 deletions wikitextparser/_wikilink.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Define the WikiLink class."""


from typing import List, Optional
from typing import List, Optional, Tuple

from regex import DOTALL

Expand All @@ -22,6 +22,13 @@
class WikiLink(SubWikiText):
__slots__ = '_cached_match'

@property
def _content_span(self) -> Tuple[int, int]:
s = self.string
f = s.find
rf = s.rfind
return f('[', f('[') + 1) + 1, rf(']', None, rf(']'))

@property
def _match(self):
shadow = self._shadow
Expand Down Expand Up @@ -141,7 +148,3 @@ def title(self) -> None:
@property
def wikilinks(self) -> List['WikiLink']:
return super().wikilinks[1:]

@property
def _relative_contents_end(self) -> tuple:
return self._match.span(4)
21 changes: 13 additions & 8 deletions wikitextparser/_wikitext.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,11 @@ def _nesting_level(self, parent_types) -> int:
level += 1
return level

@property
def _content_span(self) -> Tuple[int, int]:
# return content_start, content_end (the whole string for plain WikiText)
return 0, len(self)

@property
def _shadow(self) -> bytearray:
"""Return a copy of self.string with specific sub-spans replaced.
Expand All @@ -597,12 +602,14 @@ def _shadow(self) -> bytearray:
self._lststr[0][ss:se], 'ascii', 'replace'
)
if self._type in SPAN_PARSER_TYPES:
head = shadow[:2]
tail = shadow[-2:]
shadow[:2] = shadow[-2:] = b'__'
cs, ce = self._content_span
head = shadow[:cs]
tail = shadow[ce:]
shadow[:cs] = b'_' * cs
shadow[ce:] = b'_' * len(tail)
parse_to_spans(shadow)
shadow[:2] = head
shadow[-2:] = tail
shadow[:cs] = head
shadow[ce:] = tail
else:
parse_to_spans(shadow)
return shadow
Expand Down Expand Up @@ -1002,8 +1009,6 @@ def comments(self) -> List['Comment']:
for span in self._subspans('Comment')
]

_relative_contents_end = span

@property
def _balanced_quotes_shadow(self):
"""Return bold and italic match objects according to MW's algorithm.
Expand Down Expand Up @@ -1110,7 +1115,7 @@ def get_bolds_and_italics(
type_to_spans = self._type_to_spans
tts_setdefault = type_to_spans.setdefault
balanced_shadow = self._balanced_quotes_shadow
rs, re = self._relative_contents_end
rs, re = self._content_span

if filter_cls is None or filter_cls is Bold:
bold_spans = tts_setdefault('Bold', [])
Expand Down

0 comments on commit 535ee20

Please sign in to comment.