diff --git a/ural/could_be_rss.py b/ural/could_be_rss.py
index f5f347d4..dd1a114a 100644
--- a/ural/could_be_rss.py
+++ b/ural/could_be_rss.py
@@ -10,8 +10,15 @@ from ural.utils import safe_urlsplit
 from ural.get_hostname import get_hostname
 
 
-HOSTNAMES_BAD = ["linkedin.com", "news.google.com"]
-HOSTNAMES_GOOD = ["feeds.feedburner.com", "feeds2.feedburner.com"]
+HOSTNAMES_BAD = [
+    ("linkedin.com", ""),
+    ("news.google.com", "__i/rss")
+]
+HOSTNAMES_GOOD = [
+    ("feeds.feedburner.com", ""),
+    ("feeds2.feedburner.com", ""),
+    ("news.google.com", "rss/topics")
+]
 QUERY_RE = re.compile(r"(?:feed|format|type)\=(?:xml|feed|atom|rss)", re.I)
 FILE_RE = re.compile(
     r"(((?:feeds?|rss|atoms?|blogs?)\.(?:xml|atom|rss|php|json)$)|((?:news|latest|index|posts?)\.(?:xml|atom|rss)$)|(.+\.(?:rss|atom)$)|((?:[\/\.\-_]|^)(?:rss|feeds?|atom)[0-9]{,10}(?:[\/\.\-_]|$)))",
@@ -23,10 +30,14 @@ def could_be_rss(url):
     split = safe_urlsplit(url)
     hostname = get_hostname(url)
 
-    if not hostname or hostname in HOSTNAMES_BAD:
+    for h, p in HOSTNAMES_GOOD:
+        if hostname == h and p in split.path:
+            return True
+    for h, p in HOSTNAMES_BAD:
+        if hostname == h and p in split.path:
+            return False
+    if not hostname:
         return False
-    if hostname in HOSTNAMES_GOOD:
-        return True
     if split.query and QUERY_RE.search(split.query):
         return True
     if split.path and FILE_RE.search(split.path):