From f725b3ac75860761dfb6fee41164f63f12c47955 Mon Sep 17 00:00:00 2001 From: Chris Caron Date: Fri, 6 Oct 2023 18:08:10 -0400 Subject: [PATCH] code & test improvements, added more coverage --- apprise/conversion.py | 36 ++++++++++++++++++++++++++++-------- test/test_conversion.py | 30 +++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/apprise/conversion.py b/apprise/conversion.py index 689697f0d2..891714336d 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object): # The following tags must start on a new line BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'div', 'td', 'th', 'code', 'pre', 'label', 'li',) + 'div', 'td', 'th', 'pre', 'samp', 'label', 'li',) # the folowing tags ignore any internal text IGNORE_TAGS = ( @@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter): """An HTML to markdown converter tuned for email messages.""" # Escape markdown characters - MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])', - re.DOTALL | re.MULTILINE) + MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE) + + # Detect Carriage Return + HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE) def __init__(self, **kwargs): super().__init__(**kwargs) @@ -225,6 +227,8 @@ def __init__(self, **kwargs): # Store href value self._link = "" + self._preserver_cr = False + def handle_data(self, data, *args, **kwargs): """ Store our data if it is not on the ignore list @@ -234,7 +238,8 @@ def handle_data(self, data, *args, **kwargs): if self._do_store: # Tidy our whitespace - content = self.WS_TRIM.sub(' ', data) + content = self.WS_TRIM.sub(' ', data) \ + if not self._preserver_cr else data content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content) # Add hyperlink @@ -287,19 +292,28 @@ def handle_starttag(self, tag, attrs): elif tag == 'h6': self._result.append('###### ') - elif tag in ['strong', 'b']: + elif tag in ('strong', 'b'): 
self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = True + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = True elif tag == 'a': for name, link in attrs: if name == 'href': self._link = '(' + link + ')' + # Take an early exit for speed (in case there are more + # parameters - no need to waste time looking at them) + break def handle_endtag(self, tag): """ @@ -311,11 +325,17 @@ def handle_endtag(self, tag): if tag in self.BLOCK_TAGS: self._result.append(self.BLOCK_END) - if tag in ['strong', 'b']: + if tag in ('strong', 'b'): self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = False + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = False diff --git a/test/test_conversion.py b/test/test_conversion.py index 0a8230d59e..c09eb86e8a 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -153,24 +153,24 @@ def to_markdown(body): """ return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body) - assert to_markdown("No HTML code here.") == "No HTML code here\." + assert to_markdown("No HTML code here.") == "No HTML code here." clist = to_markdown("") assert "- Lots and lots" in clist - assert "- of lists\." in clist + assert "- of lists." in clist - assert "> To be or not to be\." == to_markdown( + assert "> To be or not to be." == to_markdown( "
To be or not to be.
") cspace = to_markdown( "

Fancy heading

" "

And a paragraph too.
Plus line break.

") assert "# Fancy heading" in cspace - assert "And a paragraph too\.\nPlus line break\." in cspace + assert "And a paragraph too.\nPlus line break." in cspace assert to_markdown( "" - "

Some obnoxious text here.

") == "Some obnoxious text here\." + "

Some obnoxious text here.

") == "Some obnoxious text here." assert to_markdown( "

line 1

" @@ -194,9 +194,18 @@ def to_markdown(body): # first 2 entries are okay, but last will do as best as it can assert to_markdown( + "

<h1>Heading 1</h1>" + "<h2>Heading 2</h2>" + "<h3>Heading 3</h3>" + "<h4>Heading 4</h4>" + "<h5>Heading 5</h5>" + "<h6>Heading 6</h6>" "

line 1" - "

line 2" - "

line 3>") == "line 1\nline 2\nline 3\>" + "

line 2" + "

line 3>") == \ + "# Heading 1\n## Heading 2\n### Heading 3\n" \ + "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \ + "line 1\n*line 2*\nline 3>" # Make sure we ignore fields that aren't important to us assert to_markdown( @@ -205,6 +214,13 @@ def to_markdown(body): "Another line without being enclosed") == \ "line 1\nAnother line without being enclosed" + # Test <code> and <pre>

+    assert to_markdown(
+        "<code>multi-line 1\nmulti-line 2</code>more content"
+        "
<pre>multi-line 1\nmulti-line 2</pre>more content") == \ + '`multi-line 1\nmulti-line 2`more content' \ + '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content' + # Test cases when there are no new lines (we're dealing with just inline # entries); an empty entry as well assert to_markdown("test "