Skip to content

Commit

Permalink
Add conversion for HTML to markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
hrodz committed Aug 20, 2023
1 parent 31caff1 commit 8d543a5
Show file tree
Hide file tree
Showing 2 changed files with 225 additions and 2 deletions.
123 changes: 121 additions & 2 deletions apprise/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
# For now; use same converter for Markdown support
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
}

convert = converters.get((from_format, to_format))
Expand Down Expand Up @@ -86,6 +85,17 @@ def html_to_text(content):
return parser.converted


def html_to_markdown(content):
    """
    Render the supplied HTML content as Markdown and return the result.
    """

    converter = HTMLMarkDownConverter()

    # Parse the whole input, then finalise the parser before the result
    # is read back from it.
    converter.feed(content)
    converter.close()

    return converter.converted


class HTMLConverter(HTMLParser, object):
"""An HTML to plain text converter tuned for email messages."""

Expand Down Expand Up @@ -200,3 +210,112 @@ def handle_endtag(self, tag):

if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)


class HTMLMarkDownConverter(HTMLConverter):
    """An HTML to markdown converter tuned for email messages.

    Builds on HTMLConverter: text is whitespace-normalised and has its
    Markdown metacharacters backslash-escaped, while a small set of tags
    (list items, line breaks, horizontal rules, block quotes, headings,
    bold/italic/code spans and hyperlinks) is rendered as Markdown syntax.

    NOTE: the pending hyperlink target is cleared by every start and end
    tag, so only text sitting directly inside an <a> element (not inside
    a nested tag) is rendered as a link.
    """

    # Escape markdown characters
    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])')

    # Markdown written to the output when the given tag opens
    MARKDOWN_OPEN = {
        'li': '- ',
        'br': '\n',
        'blockquote': '> ',
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'code': '`',
    }

    # Markdown written to the output when the given (inline) tag closes
    MARKDOWN_CLOSE = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'code': '`',
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Markdown link target, e.g. "(https://example.com)", taken from
        # the href of the most recently opened <a>; empty when no link is
        # pending
        self._link = ""

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
        """
        if not self._do_store:
            return

        # Tidy our whitespace
        content = self.WS_TRIM.sub(' ', data)

        # Prefix every markdown metacharacter with a backslash so it is
        # rendered literally
        content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

        # Add hyperlink
        if self._link:
            content = "[" + content + "]" + self._link

        self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag

        `attrs` is the list of (name, value) pairs supplied by the HTML
        parser; only the href of an <a> tag is used.
        """
        # Toggle initial states
        self._do_store = tag not in self.IGNORE_TAGS
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag == 'hr':
            # Drop trailing spaces from the text preceding the rule.  The
            # previous entry may be the BLOCK_END sentinel instead of
            # text, so only strings are touched.
            if self._result and isinstance(self._result[-1], str):
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'a':
            for name, link in attrs:
                # A valueless attribute (<a href>) is reported with a
                # value of None; there is no target to link to then.
                if name == 'href' and link is not None:
                    self._link = '(' + link + ')'

        else:
            markup = self.MARKDOWN_OPEN.get(tag)
            if markup is not None:
                self._result.append(markup)

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        self._do_store = True
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        # Close bold/italic/code spans with the same marker that opened
        # them
        markup = self.MARKDOWN_CLOSE.get(tag)
        if markup is not None:
            self._result.append(markup)
104 changes: 104 additions & 0 deletions test/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,110 @@ def to_html(body):
assert to_html(object)


def test_conversion_html_to_markdown():
    """conversion: Test HTML to markdown
    """

    def to_markdown(body):
        """
        A function to simplify html conversion tests
        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

    # NOTE: expected values spell the escaping backslash as "\\" so that
    # no invalid escape sequences (such as "\.") appear in the literals.

    assert to_markdown("No HTML code here.") == "No HTML code here\\."

    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "- Lots and lots" in clist
    assert "- of lists\\." in clist

    assert "> To be or not to be\\." == to_markdown(
        "<blockquote>To be or not to be.</blockquote>")

    cspace = to_markdown(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    # An <h2> must be rendered as a second level heading
    assert "## Fancy heading" in cspace
    assert "And a paragraph too\\.\nPlus line break\\." in cspace

    assert to_markdown(
        "<style>body { font: 200%; }</style>"
        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here\\."

    assert to_markdown(
        "<p>line 1</p>"
        "<p>line 2</p>"
        "<p>line 3</p>") == "line 1\nline 2\nline 3"

    # Case sensitivity
    assert to_markdown(
        "<p>line 1</P>"
        "<P>line 2</P>"
        "<P>line 3</P>") == "line 1\nline 2\nline 3"

    # double new lines (testing <br/> and <br>)
    assert to_markdown(
        "some information<br/><br>and more information") == \
        "some information\n\nand more information"

    #
    # Test bad tags
    #

    # first 2 entries are okay, but last will do as best as it can
    assert to_markdown(
        "<p>line 1</>"
        "<p>line 2</gar>"
        "<p>line 3>") == "line 1\nline 2\nline 3\\>"

    # Make sure we ignore fields that aren't important to us
    assert to_markdown(
        "<script>ignore this</script>"
        "<p>line 1</p>"
        "Another line without being enclosed") == \
        "line 1\nAnother line without being enclosed"

    # Test cases when there are no new lines (we're dealing with just inline
    # entries); an empty entry as well
    assert to_markdown("<span></span<<span>test</span> "
                       "<a href='#'>my link</a>") == \
        "test [my link](#)"

    # </p> missing
    assert to_markdown("<body><div>line 1 <b>bold</b></div> "
                       " <a href='/link'>my link</a>"
                       "<p>3rd line</body>") == \
        "line 1 **bold**\n[my link](/link)\n3rd line"

    # <hr/> on its own
    assert to_markdown("<hr/>") == "---"
    assert to_markdown("<hr>") == "---"

    # We need to handle HTML Encodings
    assert to_markdown("""
<html>
<title>ignore this entry</title>
<body>
Let&apos;s handle&nbsp;special html encoding
<hr/>
</body>
""") == "Let's handle special html encoding\n---"

    # If you give nothing, you get nothing in return
    assert to_markdown("") == ""

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(None)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(42)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(object)


def test_conversion_text_to():
"""conversion: Test Text to all types
"""
Expand Down

0 comments on commit 8d543a5

Please sign in to comment.