From 8d543a5eb32f6f65c6d814f1f00ecfd006376c6b Mon Sep 17 00:00:00 2001
From: Hector Rodriguez Medina
Date: Sun, 20 Aug 2023 18:16:47 -0300
Subject: [PATCH] Add conversion for HTML to markdown

---
 apprise/conversion.py   | 130 +++++++++++++++++++++++++++++++++++++++-
 test/test_conversion.py | 104 +++++++++++++++++++++++++++++++++
 2 files changed, 232 insertions(+), 2 deletions(-)

diff --git a/apprise/conversion.py b/apprise/conversion.py
index 77c9aa5e58..689697f0d2 100644
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
         (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
         (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
         (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
-        # For now; use same converter for Markdown support
-        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
     }
 
     convert = converters.get((from_format, to_format))
@@ -86,6 +85,17 @@ def html_to_text(content):
     return parser.converted
 
 
+def html_to_markdown(content):
+    """
+    Convert HTML content to Markdown and return the resulting string.
+    """
+
+    parser = HTMLMarkDownConverter()
+    parser.feed(content)
+    parser.close()
+    return parser.converted
+
+
 class HTMLConverter(HTMLParser, object):
     """An HTML to plain text converter tuned for email messages."""
 
@@ -200,3 +210,119 @@ def handle_endtag(self, tag):
 
         if tag in self.BLOCK_TAGS:
             self._result.append(self.BLOCK_END)
+
+
+class HTMLMarkDownConverter(HTMLConverter):
+    """
+    An HTML to Markdown converter built on top of HTMLConverter.
+
+    Text nodes are backslash-escaped so they render literally, block and
+    inline tags are mapped to their Markdown markers, and the text of an
+    anchor is emitted as ``[text](href)``.
+    """
+
+    # Characters that are special in Markdown; each match is prefixed with
+    # a backslash when found in plain text
+    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])')
+
+    # Markdown emitted when the given tag opens
+    TAG_PREFIX = {
+        'li': '- ',
+        'br': '\n',
+        'blockquote': '> ',
+        'h1': '# ',
+        'h2': '## ',
+        'h3': '### ',
+        'h4': '#### ',
+        'h5': '##### ',
+        'h6': '###### ',
+        'strong': '**',
+        'b': '**',
+        'em': '*',
+        'i': '*',
+        'code': '`',
+    }
+
+    # Markdown emitted when the given inline tag closes
+    TAG_SUFFIX = {
+        'strong': '**',
+        'b': '**',
+        'em': '*',
+        'i': '*',
+        'code': '`',
+    }
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Markdown link target, "(href)", of the anchor that was just
+        # opened; empty when the next text node is not a link
+        self._link = ""
+
+        # True while inside <code>; backslash escapes are not interpreted
+        # inside a Markdown code span so the text must be left untouched
+        self._in_code = False
+
+    def handle_data(self, data, *args, **kwargs):
+        """
+        Store our data if it is not on the ignore list
+        """
+        if not self._do_store:
+            return
+
+        # Tidy our whitespace
+        content = self.WS_TRIM.sub(' ', data)
+
+        if not self._in_code:
+            # Escape anything Markdown would otherwise treat as markup
+            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)
+
+        if self._link:
+            # Wrap the text as a hyperlink
+            content = "[" + content + "]" + self._link
+
+        self._result.append(content)
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Process our starting HTML Tag
+        """
+        # Toggle initial states
+        self._do_store = tag not in self.IGNORE_TAGS
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag == 'hr':
+            # Strip trailing spaces from the previous fragment before the rule
+            if self._result:
+                self._result[-1] = self._result[-1].rstrip(' ')
+
+            self._result.append('\n---\n')
+
+        elif tag == 'a':
+            for name, link in attrs:
+                # A valueless attribute (<a href>) is reported as None
+                if name == 'href' and link:
+                    self._link = '(' + link + ')'
+
+        elif tag in self.TAG_PREFIX:
+            self._result.append(self.TAG_PREFIX[tag])
+            if tag == 'code':
+                self._in_code = True
+
+    def handle_endtag(self, tag):
+        """
+        Edge case handling of open/close tags
+        """
+        self._do_store = True
+        self._link = ""
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
+        if tag in self.TAG_SUFFIX:
+            self._result.append(self.TAG_SUFFIX[tag])
+            if tag == 'code':
+                self._in_code = False
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 103ebea6a3..0a8230d59e 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -143,6 +143,110 @@ def to_html(body):
         assert to_html(object)
 
 
+def test_conversion_html_to_markdown():
+    """conversion: Test HTML to Markdown
+    """
+
+    def to_markdown(body):
+        """
+        A function to simplify html conversion tests
+        """
+        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)
+
+    assert to_markdown("No HTML code here.") == "No HTML code here\."
+
+    clist = to_markdown("")
+    assert "- Lots and lots" in clist
+    assert "- of lists\." in clist
+
+    assert "> To be or not to be\." == to_markdown(
+        "
To be or not to be.
") + + cspace = to_markdown( + "

Fancy heading

" + "

And a paragraph too.
Plus line break.

") + assert "# Fancy heading" in cspace + assert "And a paragraph too\.\nPlus line break\." in cspace + + assert to_markdown( + "" + "

Some obnoxious text here.

") == "Some obnoxious text here\." + + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # Case sensitivity + assert to_markdown( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # double new lines (testing
and
) + assert to_markdown( + "some information

and more information") == \ + "some information\n\nand more information" + + # + # Test bad tags + # + + # first 2 entries are okay, but last will do as best as it can + assert to_markdown( + "

line 1" + "

line 2" + "

line 3>") == "line 1\nline 2\nline 3\>" + + # Make sure we ignore fields that aren't important to us + assert to_markdown( + "" + "

line 1

" + "Another line without being enclosed") == \ + "line 1\nAnother line without being enclosed" + + # Test cases when there are no new lines (we're dealing with just inline + # entries); an empty entry as well + assert to_markdown("test " + "my link") == \ + "test [my link](#)" + + #

missing + assert to_markdown("
line 1 bold
" + " my link" + "

3rd line") == \ + "line 1 **bold**\n[my link](/link)\n3rd line" + + #


on it's own + assert to_markdown("
") == "---" + assert to_markdown("
") == "---" + + # We need to handle HTML Encodings + assert to_markdown(""" + + ignore this entry + + Let's handle special html encoding +
+ + """) == "Let's handle special html encoding\n---" + + # If you give nothing, you get nothing in return + assert to_markdown("") == "" + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(None) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(42) + + with pytest.raises(TypeError): + # Invalid input + assert to_markdown(object) + + def test_conversion_text_to(): """conversion: Test Text to all types """