Skip to content

Commit

Permalink
Make link_announcer respect page encoding headers (CloudBotIRC#206)
Browse files Browse the repository at this point in the history
* Make link_announcer.py respect HTML specified page encoding

* clean up re-encoding logic

* Add unit tests for parsing page encoding in link announcer
  • Loading branch information
linuxdaemon authored Apr 8, 2018
1 parent 3308aec commit 8fc4ba1
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
30 changes: 29 additions & 1 deletion plugins/link_announcer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,48 @@ def no_parens(pattern):
MAX_RECV = 1000000


def get_encoding(soup):
    """Return the character encoding declared inside a parsed HTML document.

    Checks for a ``<meta charset="...">`` tag first, then falls back to a
    ``<meta http-equiv="Content-Type" content="...">`` tag, mirroring the
    order in which browsers discover in-document encodings.

    :param soup: a BeautifulSoup-parsed document
    :return: the declared encoding name, or None if the document declares none
    """
    charset_tag = soup.find('meta', charset=True)
    if charset_tag:
        return charset_tag['charset']

    attrs = {
        'http-equiv': lambda t: t and t.lower() == 'content-type',
        'content': True,
    }
    content_type_tag = soup.find('meta', attrs)
    if content_type_tag:
        # Reuse requests' header parser on the meta tag's pseudo-header value
        headers = {'content-type': content_type_tag['content']}
        return requests.utils.get_encoding_from_headers(headers)

    return None


def parse_content(content, encoding=None):
    """Parse raw HTML bytes, honoring any encoding declared in the document.

    The content is first parsed with the caller's encoding hint (typically
    taken from the HTTP headers). If the document itself declares a
    different encoding, the content is re-parsed using that declared value,
    since the in-document declaration is more authoritative.

    :param content: raw HTML bytes
    :param encoding: optional encoding hint for the first parse
    :return: the parsed BeautifulSoup document
    """
    html = BeautifulSoup(content, "lxml", from_encoding=encoding)

    declared = get_encoding(html)
    if declared is not None and declared != encoding:
        html = BeautifulSoup(content, "lxml", from_encoding=declared)

    return html


@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
def print_url_title(message, match):
with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
r.raise_for_status()
if not r.encoding:
return

# TODO Switch to reading chunks until full title is found, up to MAX_RECV bytes
content = r.raw.read(MAX_RECV + 1, decode_content=True)
encoding = r.encoding

if len(content) > MAX_RECV:
return

html = BeautifulSoup(content, "lxml", from_encoding=encoding)
html = parse_content(content, encoding)

if html.title:
title = html.title.text
Expand Down
29 changes: 27 additions & 2 deletions tests/plugin_tests/test_link_announcer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from plugins.link_announcer import url_re
import codecs

from bs4 import BeautifulSoup

from plugins.link_announcer import url_re, get_encoding

MATCHES = (
"http://foo.com/blah_blah",
Expand Down Expand Up @@ -65,7 +69,8 @@
("(https://foo.bar)", "https://foo.bar"),
("[https://example.com]", "https://example.com"),
("<a hreh=\"https://example.com/test.page?#test\">", "https://example.com/test.page?#test"),
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>", "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>",
"https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
)


Expand All @@ -82,3 +87,23 @@ def test_search():
for text, out in SEARCH:
match = url_re.search(text)
assert match and match.group() == out


# Fixture pairs of (raw HTML snippet, expected codec) for test_encoding_parse.
# ``None`` as the expected value means the snippet declares no encoding.
ENCODINGS = (
    (b'<meta charset="utf8">', codecs.lookup('utf8')),
    (b'', None),
    (b'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">', codecs.lookup('utf8')),
)


def test_encoding_parse():
    """Verify get_encoding() extracts the encoding declared in each snippet."""
    for text, enc in ENCODINGS:
        soup = BeautifulSoup(text, "lxml")
        encoding = get_encoding(soup)
        if encoding is None:
            assert enc is None, "Got empty encoding from {!r} expected {!r}".format(text, enc)
            continue

        enc_obj = codecs.lookup(encoding)

        # BUG FIX: the original `assert enc, enc_obj` only checked that `enc`
        # was truthy (using `enc_obj` as the failure message) and never
        # compared the parsed encoding against the expected one. Compare
        # resolved codecs so aliases like 'utf8'/'utf-8' are treated as equal.
        assert enc == enc_obj, \
            "Parsed encoding {!r} from {!r}, expected {!r}".format(enc_obj, text, enc)

0 comments on commit 8fc4ba1

Please sign in to comment.