Skip to content

Commit

Permalink
Make link_announcer respect page encoding headers (CloudBotIRC#206)
Browse files Browse the repository at this point in the history
* Make link_announcer.py respect HTML specified page encoding

* clean up re-encoding logic

* Add unit tests for parsing page encoding in link announcer
  • Loading branch information
linuxdaemon authored Apr 8, 2018
1 parent 3308aec commit 8fc4ba1
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
30 changes: 29 additions & 1 deletion plugins/link_announcer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,48 @@ def no_parens(pattern):
MAX_RECV = 1000000


def get_encoding(soup):
    """Return the character encoding declared inside a parsed HTML document.

    Checks for a ``<meta charset="...">`` tag first, then falls back to a
    ``<meta http-equiv="Content-Type" content="...">`` tag, mirroring the
    order in which browsers discover in-document encodings.

    :param soup: a BeautifulSoup-parsed document
    :return: the declared encoding name, or None if the document declares none
    """
    charset_tag = soup.find('meta', charset=True)
    if charset_tag:
        return charset_tag['charset']

    attrs = {
        'http-equiv': lambda t: t and t.lower() == 'content-type',
        'content': True,
    }
    content_type_tag = soup.find('meta', attrs)
    if content_type_tag:
        # Reuse requests' header parser on the meta tag's pseudo-header value
        headers = {'content-type': content_type_tag['content']}
        return requests.utils.get_encoding_from_headers(headers)

    return None


def parse_content(content, encoding=None):
    """Parse raw HTML bytes, honoring any encoding declared in the document.

    The content is first parsed with the caller's encoding hint (typically
    taken from the HTTP headers). If the document itself declares a
    different encoding, the content is re-parsed using that declared value,
    since the in-document declaration is more authoritative.

    :param content: raw HTML bytes
    :param encoding: optional encoding hint for the first parse
    :return: the parsed BeautifulSoup document
    """
    html = BeautifulSoup(content, "lxml", from_encoding=encoding)

    declared = get_encoding(html)
    if declared is not None and declared != encoding:
        html = BeautifulSoup(content, "lxml", from_encoding=declared)

    return html


@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
def print_url_title(message, match):
with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
r.raise_for_status()
if not r.encoding:
return

# TODO Switch to reading chunks until full title is found, up to MAX_RECV bytes
content = r.raw.read(MAX_RECV + 1, decode_content=True)
encoding = r.encoding

if len(content) > MAX_RECV:
return

html = BeautifulSoup(content, "lxml", from_encoding=encoding)
html = parse_content(content, encoding)

if html.title:
title = html.title.text
Expand Down
29 changes: 27 additions & 2 deletions tests/plugin_tests/test_link_announcer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from plugins.link_announcer import url_re
import codecs

from bs4 import BeautifulSoup

from plugins.link_announcer import url_re, get_encoding

MATCHES = (
"http://foo.com/blah_blah",
Expand Down Expand Up @@ -65,7 +69,8 @@
("(https://foo.bar)", "https://foo.bar"),
("[https://example.com]", "https://example.com"),
("<a hreh=\"https://example.com/test.page?#test\">", "https://example.com/test.page?#test"),
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>", "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>",
"https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
)


Expand All @@ -82,3 +87,23 @@ def test_search():
for text, out in SEARCH:
match = url_re.search(text)
assert match and match.group() == out


# Fixture pairs of (raw HTML snippet, expected codec) for test_encoding_parse.
# ``None`` as the expected value means the snippet declares no encoding.
ENCODINGS = (
    (b'<meta charset="utf8">', codecs.lookup('utf8')),
    (b'', None),
    (b'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">', codecs.lookup('utf8')),
)


def test_encoding_parse():
    """Verify get_encoding() extracts the encoding declared in each snippet."""
    for text, enc in ENCODINGS:
        soup = BeautifulSoup(text, "lxml")
        encoding = get_encoding(soup)
        if encoding is None:
            assert enc is None, "Got empty encoding from {!r} expected {!r}".format(text, enc)
            continue

        enc_obj = codecs.lookup(encoding)

        # BUG FIX: the original `assert enc, enc_obj` only checked that `enc`
        # was truthy (using `enc_obj` as the failure message) and never
        # compared the parsed encoding against the expected one. Compare
        # resolved codecs so aliases like 'utf8'/'utf-8' are treated as equal.
        assert enc == enc_obj, \
            "Parsed encoding {!r} from {!r}, expected {!r}".format(enc_obj, text, enc)

0 comments on commit 8fc4ba1

Please sign in to comment.