Skip to content

Commit

Permalink
code & test improvements, added more coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
caronc committed Oct 6, 2023
1 parent 8d543a5 commit f725b3a
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 15 deletions.
36 changes: 28 additions & 8 deletions apprise/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object):

# The following tags must start on a new line
BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)

# The following tags ignore any internal text
IGNORE_TAGS = (
Expand Down Expand Up @@ -216,15 +216,19 @@ class HTMLMarkDownConverter(HTMLConverter):
"""An HTML to markdown converter tuned for email messages."""

# Escape markdown characters
MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
re.DOTALL | re.MULTILINE)
MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)

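# Detect runs of carriage-return / line-feed characters
# NOTE(review): inside the character class the '*' is a literal asterisk,
# so this pattern also matches '*' - confirm that is intended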
HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE)

def __init__(self, **kwargs):
# Forward all keyword arguments to the parent converter
super().__init__(**kwargs)

# Store href value; handle_starttag() fills this in as '(<href>)' when an
# <a> tag carrying an href attribute is encountered
self._link = ""

# When True, handle_data() keeps the incoming text as-is instead of
# collapsing its whitespace with WS_TRIM; switched on by the <code>,
# <pre> and <samp> start tags and back off by their end tags
self._preserver_cr = False

def handle_data(self, data, *args, **kwargs):
"""
Store our data if it is not on the ignore list
Expand All @@ -234,7 +238,8 @@ def handle_data(self, data, *args, **kwargs):
if self._do_store:

# Tidy our whitespace
content = self.WS_TRIM.sub(' ', data)
content = self.WS_TRIM.sub(' ', data) \
if not self._preserver_cr else data
content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

# Add hyperlink
Expand Down Expand Up @@ -287,19 +292,28 @@ def handle_starttag(self, tag, attrs):
elif tag == 'h6':
self._result.append('###### ')

elif tag in ['strong', 'b']:
elif tag in ('strong', 'b'):
self._result.append('**')

elif tag in ['em', 'i']:
elif tag in ('em', 'i'):
self._result.append('*')

elif tag == 'code':
self._result.append('`')
self._preserver_cr = True

elif tag in ('pre', 'samp'):
self._result.append('```')
self._result.append(self.BLOCK_END)
self._preserver_cr = True

elif tag == 'a':
for name, link in attrs:
if name == 'href':
self._link = '(' + link + ')'
# Take an early exit for speed (in case there are more
# parameters - no need to waste time looking at them)
break

def handle_endtag(self, tag):
"""
Expand All @@ -311,11 +325,17 @@ def handle_endtag(self, tag):
if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)

if tag in ['strong', 'b']:
if tag in ('strong', 'b'):
self._result.append('**')

elif tag in ['em', 'i']:
elif tag in ('em', 'i'):
self._result.append('*')

elif tag == 'code':
self._result.append('`')
self._preserver_cr = False

elif tag in ('pre', 'samp'):
self._result.append('```')
self._result.append(self.BLOCK_END)
self._preserver_cr = False
30 changes: 23 additions & 7 deletions test/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,24 +153,24 @@ def to_markdown(body):
"""
return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

assert to_markdown("No HTML code here.") == "No HTML code here\."
assert to_markdown("No HTML code here.") == "No HTML code here."

clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
assert "- Lots and lots" in clist
assert "- of lists\." in clist
assert "- of lists." in clist

assert "> To be or not to be\." == to_markdown(
assert "> To be or not to be." == to_markdown(
"<blockquote>To be or not to be.</blockquote>")

cspace = to_markdown(
"<h2>Fancy heading</h2>"
"<p>And a paragraph too.<br>Plus line break.</p>")
assert "# Fancy heading" in cspace
assert "And a paragraph too\.\nPlus line break\." in cspace
assert "And a paragraph too.\nPlus line break." in cspace

assert to_markdown(
"<style>body { font: 200%; }</style>"
"<p>Some obnoxious text here.</p>") == "Some obnoxious text here\."
"<p>Some obnoxious text here.</p>") == "Some obnoxious text here."

assert to_markdown(
"<p>line 1</p>"
Expand All @@ -194,9 +194,18 @@ def to_markdown(body):

# The first two <p> entries are okay, but the last one is handled as best as
# it can be
assert to_markdown(
"<h1>Heading 1</h1>"
"<h2>Heading 2</h2>"
"<h3>Heading 3</h3>"
"<h4>Heading 4</h4>"
"<h5>Heading 5</h5>"
"<h6>Heading 6</h6>"
"<p>line 1</>"
"<p>line 2</gar>"
"<p>line 3>") == "line 1\nline 2\nline 3\>"
"<p><em>line 2</em></gar>"
"<p>line 3>") == \
"# Heading 1\n## Heading 2\n### Heading 3\n" \
"#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
"line 1\n*line 2*\nline 3>"

# Make sure we ignore fields that aren't important to us
assert to_markdown(
Expand All @@ -205,6 +214,13 @@ def to_markdown(body):
"Another line without being enclosed") == \
"line 1\nAnother line without being enclosed"

# Test <code> and <pre>
assert to_markdown(
"<code>multi-line 1\nmulti-line 2</code>more content"
"<pre>multi-line 1\nmulti-line 2</pre>more content") == \
'`multi-line 1\nmulti-line 2`more content' \
'\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'

# Test cases when there are no new lines (we're dealing with just inline
# entries); an empty entry as well
assert to_markdown("<span></span<<span>test</span> "
Expand Down

0 comments on commit f725b3a

Please sign in to comment.