diff --git a/plugins/link_announcer.py b/plugins/link_announcer.py
index d426ec819..639551fb5 100644
--- a/plugins/link_announcer.py
+++ b/plugins/link_announcer.py
@@ -60,6 +60,33 @@ def no_parens(pattern):
 MAX_RECV = 1000000
 
 
+def get_encoding(soup):
+    meta_charset = soup.find('meta', charset=True)
+
+    if meta_charset:
+        return meta_charset['charset']
+    else:
+        meta_content_type = soup.find(
+            'meta', {'http-equiv': lambda t: t and t.lower() == 'content-type', 'content': True}
+        )
+        if meta_content_type:
+            return requests.utils.get_encoding_from_headers({'content-type': meta_content_type['content']})
+
+    return None
+
+
+def parse_content(content, encoding=None):
+    html = BeautifulSoup(content, "lxml", from_encoding=encoding)
+    old_encoding = encoding
+
+    encoding = get_encoding(html)
+
+    if encoding is not None and encoding != old_encoding:
+        html = BeautifulSoup(content, "lxml", from_encoding=encoding)
+
+    return html
+
+
 @hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
 def print_url_title(message, match):
     with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
@@ -67,13 +94,14 @@ def print_url_title(message, match):
         if not r.encoding:
             return
 
+        # TODO Switch to reading chunks until full title is found, up to MAX_RECV bytes
         content = r.raw.read(MAX_RECV + 1, decode_content=True)
         encoding = r.encoding
 
         if len(content) > MAX_RECV:
             return
 
-        html = BeautifulSoup(content, "lxml", from_encoding=encoding)
+        html = parse_content(content, encoding)
 
         if html.title:
             title = html.title.text
diff --git a/tests/plugin_tests/test_link_announcer.py b/tests/plugin_tests/test_link_announcer.py
index 9bb8b145b..2b1d1a3db 100644
--- a/tests/plugin_tests/test_link_announcer.py
+++ b/tests/plugin_tests/test_link_announcer.py
@@ -1,4 +1,8 @@
-from plugins.link_announcer import url_re
+import codecs
+
+from bs4 import BeautifulSoup
+
+from plugins.link_announcer import url_re, get_encoding
 
 MATCHES = (
     "http://foo.com/blah_blah",
@@ -65,7 +69,8 @@
     ("(https://foo.bar)", "https://foo.bar"),
     ("[https://example.com]", "https://example.com"),
     ("<https://example.com/test.page?#test>", "https://example.com/test.page?#test"),
-    ("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>", "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
+    ("<https://www.example.com/this.is.a.test/blah.txt?a=1#123>",
+     "https://www.example.com/this.is.a.test/blah.txt?a=1#123"),
 )
 
 
@@ -82,3 +87,23 @@ def test_search():
     for text, out in SEARCH:
         match = url_re.search(text)
         assert match and match.group() == out
+
+
+ENCODINGS = (
+    (b'<meta charset="utf8">', codecs.lookup('utf8')),
+    (b'', None),
+    (b'<meta http-equiv="content-type" content="text/html; charset=utf-8">', codecs.lookup('utf8')),
+)
+
+
+def test_encoding_parse():
+    for text, enc in ENCODINGS:
+        soup = BeautifulSoup(text, "lxml")
+        encoding = get_encoding(soup)
+        if encoding is None:
+            assert enc is None, "Got empty encoding from {!r}, expected {!r}".format(text, enc)
+            continue
+
+        enc_obj = codecs.lookup(encoding)
+
+        assert enc == enc_obj, "Got encoding {!r} from {!r}, expected {!r}".format(enc_obj, text, enc)