Skip to content

Commit

Permalink
add url exceptions
Browse files Browse the repository at this point in the history
  • Loading branch information
16arpi committed Jul 12, 2023
1 parent a8b2a0d commit 8377f39
Showing 1 changed file with 16 additions and 5 deletions.
21 changes: 16 additions & 5 deletions ural/could_be_rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,15 @@
from ural.utils import safe_urlsplit
from ural.get_hostname import get_hostname

HOSTNAMES_BAD = ["linkedin.com", "news.google.com"]
HOSTNAMES_GOOD = ["feeds.feedburner.com", "feeds2.feedburner.com"]
# Hosts that could_be_rss rejects. Each entry is a (hostname, path fragment)
# pair: a URL is rejected (returns False) when its hostname equals the first
# item and the second item occurs anywhere in the URL path. An empty fragment
# is contained in every path, so it rejects the whole host.
HOSTNAMES_BAD = [
("linkedin.com", ""),
("news.google.com", "__i/rss")
]
# Hosts that could_be_rss accepts outright, using the same
# (hostname, path fragment) format and the same substring test on the path;
# a match returns True. In the visible lookup code this list is consulted
# before HOSTNAMES_BAD, so a URL matching both lists is accepted.
HOSTNAMES_GOOD = [
("feeds.feedburner.com", ""),
("feeds2.feedburner.com", ""),
("news.google.com", "rss/topics")
]
# Case-insensitive pattern searched in the URL query string: a `feed`,
# `format` or `type` key whose value starts with xml, feed, atom or rss
# (e.g. "format=rss").
QUERY_RE = re.compile(r"(?:feed|format|type)\=(?:xml|feed|atom|rss)", re.I)
FILE_RE = re.compile(
r"(((?:feeds?|rss|atoms?|blogs?)\.(?:xml|atom|rss|php|json)$)|((?:news|latest|index|posts?)\.(?:xml|atom|rss)$)|(.+\.(?:rss|atom)$)|((?:[\/\.\-_]|^)(?:rss|feeds?|atom)[0-9]{,10}(?:[\/\.\-_]|$)))",
Expand All @@ -23,10 +30,14 @@ def could_be_rss(url):
split = safe_urlsplit(url)
hostname = get_hostname(url)

if not hostname or hostname in HOSTNAMES_BAD:
for h, p in HOSTNAMES_GOOD:
if hostname == h and p in split.path:
return True
for h, p in HOSTNAMES_BAD:
if hostname == h and p in split.path:
return False
if not hostname:
return False
if hostname in HOSTNAMES_GOOD:
return True
if split.query and QUERY_RE.search(split.query):
return True
if split.path and FILE_RE.search(split.path):
Expand Down

0 comments on commit 8377f39

Please sign in to comment.