Skip to content

Commit

Permalink
Adding canonicalize_url
Browse files Browse the repository at this point in the history
Related to #55
  • Loading branch information
Yomguithereal committed Jul 15, 2023
1 parent 2e3a7c7 commit 8706984
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 3 deletions.
27 changes: 27 additions & 0 deletions test/canonicalize_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding: utf-8
# =============================================================================
# Ural URL Canonicalization Unit Tests
# =============================================================================
from __future__ import unicode_literals

from ural import canonicalize_url

# Table of (input url, expected canonical url) pairs consumed by the test
# class below.
TESTS = [
    # Leading and trailing whitespace is stripped.
    (" http://lemonde.fr/test.html ", "http://lemonde.fr/test.html"),
    # A NUL character embedded in the url is removed.
    ("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"),
    # A url without protocol gets the default "https" one.
    ("lemonde.fr", "https://lemonde.fr"),
    # The hostname is lowercased, the path keeps its case.
    ("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"),
    # Port 80 is dropped from an http url.
    ("http://lemonde.fr:80/test", "http://lemonde.fr/test"),
    # A punycode hostname is decoded to its unicode form.
    ("http://xn--tlrama-bvab.fr", "http://télérama.fr"),
    # A percent-encoded query is decoded.
    (
        "http://mozilla.org?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
        "http://mozilla.org?x=шеллы",
    ),
    # An already decoded query is left untouched.
    ("http://mozilla.org?x=шеллы", "http://mozilla.org?x=шеллы"),
]


class TestFingerprintUrl(object):
    """Table-driven checks of canonicalize_url against the TESTS pairs."""

    def test_canonicalize_url(self):
        # Every entry pairs a raw url with the canonical form it must yield.
        for raw_url, expected in TESTS:
            canonical = canonicalize_url(raw_url)
            assert canonical == expected
1 change: 1 addition & 0 deletions ural/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#
from ural.classes.hostname_trie_set import HostnameTrieSet

from ural.canonicalize_url import canonicalize_url
from ural.could_be_html import could_be_html
from ural.could_be_rss import could_be_rss
from ural.ensure_protocol import ensure_protocol
Expand Down
47 changes: 47 additions & 0 deletions ural/canonicalize_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from ural.utils import (
urlsplit,
urlunsplit,
SplitResult,
split_netloc,
unsplit_netloc,
space_aware_unquote,
decode_punycode_hostname,
)
from ural.ensure_protocol import ensure_protocol
from ural.patterns import CONTROL_CHARS_RE


def canonicalize_url(url, default_protocol="https", unsplit=True):
    """
    Function returning a canonical version of the given url.

    The url is stripped of surrounding whitespace and of anything matching
    CONTROL_CHARS_RE, given a protocol through ensure_protocol, then split.
    Its hostname is punycode-decoded and lowercased, the default port of its
    scheme is dropped, and its path, query and fragment are passed through
    space_aware_unquote.

    Args:
        url (str): Target url.
        default_protocol (str, optional): Protocol handed to ensure_protocol.
            Defaults to `https`.
        unsplit (bool, optional): Whether to serialize the result back to a
            string. Defaults to True.

    Returns:
        str or SplitResult: The canonical url as a string, or as a
            SplitResult when `unsplit` is False.
    """
    # Cleaning
    url = url.strip()
    url = CONTROL_CHARS_RE.sub("", url)

    # Ensuring a protocol
    url = ensure_protocol(url, default_protocol)

    # Parsing
    scheme, netloc, path, query, fragment = urlsplit(url)
    auth, hostname, port = split_netloc(netloc)

    # Decoding and normalizing hostname
    hostname = decode_punycode_hostname(hostname)
    hostname = hostname.lower()

    # Dropping the port only when it is the default one of the scheme:
    # `http://host:443` and `https://host:80` do not designate the same
    # resource as their portless counterparts, so those ports must be kept.
    protocol = scheme.lower()

    if (protocol == "http" and port == "80") or (
        protocol == "https" and port == "443"
    ):
        port = ""

    netloc = unsplit_netloc(auth, hostname, port)

    # Unquoting
    path = space_aware_unquote(path)
    query = space_aware_unquote(query)
    fragment = space_aware_unquote(fragment)

    result = SplitResult(scheme, netloc, path, query, fragment)

    if not unsplit:
        return result

    # Serializing
    return urlunsplit(result)
3 changes: 3 additions & 0 deletions ural/canonicalize_url.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Type stub for ural.canonicalize_url.canonicalize_url.
# NOTE(review): the implementation returns a SplitResult instead of a str
# when `unsplit` is False; the return type declared here does not cover
# that case.
def canonicalize_url(
    url: str, default_protocol: str = ..., unsplit: bool = ...
) -> str: ...
2 changes: 1 addition & 1 deletion ural/fingerprint_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


def lang_query_item_filter(key, _):
    """
    Query item filter telling whether the given key is not a language key.

    Args:
        key (str): Name of the query item.
        _: Value of the query item, unused.

    Returns:
        bool: True when `key` is not in LANG_QUERY_KEYS.
    """
    # Only one return can be live: the `key in LANG_QUERY_KEYS` line that
    # used to precede this one made it unreachable.
    return key not in LANG_QUERY_KEYS


def strip_lang_subdomains_from_netloc(netloc):
Expand Down
2 changes: 1 addition & 1 deletion ural/normalize_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def should_strip_query_item(
return domain_filter(key, value)

if query_item_filter is not None:
return query_item_filter(key, value)
return not query_item_filter(key, value)

return False

Expand Down
24 changes: 24 additions & 0 deletions ural/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,27 @@ def add_query_argument(url, name, value=None, quote=True):
url += "#" + fragment

return url


def split_netloc(netloc):
    """
    Function splitting a netloc into its auth, hostname and port parts.

    Args:
        netloc (str): Network location such as `user:pass@host:8080`.

    Returns:
        tuple: A `(auth, hostname, port)` tuple of strings. `auth` and
            `port` are empty strings when absent. A bracketed IPv6 hostname
            keeps its brackets.
    """
    # The last "@" delimits the userinfo, which may itself contain a "@"
    if "@" in netloc:
        auth, hostname = netloc.rsplit("@", 1)
    else:
        auth = ""
        hostname = netloc

    # A port is present after a bracketed IPv6 literal ("]:") or when the
    # host contains a single colon. Splitting from the right keeps the
    # colons of an IPv6 literal inside the hostname.
    if "]:" in hostname or hostname.count(":") == 1:
        hostname, port = hostname.rsplit(":", 1)
    else:
        port = ""

    return auth, hostname, port


def unsplit_netloc(auth, hostname, port):
    """
    Function assembling a netloc from its auth, hostname and port parts.

    Args:
        auth (str): Userinfo part, without the "@". Skipped when empty.
        hostname (str): Hostname part.
        port (str): Port part, without the ":". Skipped when empty.

    Returns:
        str: The assembled netloc, e.g. `user:pass@host:8080`.
    """
    netloc = hostname

    if auth:
        netloc = auth + "@" + netloc

    if port:
        netloc += ":" + port

    # Returning the assembled string, not the function object itself
    return netloc
4 changes: 3 additions & 1 deletion ural/utils.pyi
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Union, Dict, List, overload
from typing import Optional, Dict, List, Tuple, overload
from urllib.parse import SplitResult
from ural.types import AnyUrlTarget, Literal, QueryArgValue

Expand All @@ -20,3 +20,5 @@ def safe_parse_qs(query: str) -> Dict[str, List[str]]: ...
# Type stub for add_query_argument. `value` carries a default because the
# implementation declares it as `value=None`, so callers may omit it.
def add_query_argument(
    url: str, name: str, value: QueryArgValue = ..., quote: bool = ...
) -> str: ...
# Type stub for split_netloc: splits a netloc into an (auth, hostname, port)
# tuple of strings.
def split_netloc(netloc: str) -> Tuple[str, str, str]: ...
# Type stub for unsplit_netloc: declared to return the netloc string built
# from the auth, hostname and port parts.
def unsplit_netloc(auth: str, hostname: str, port: str) -> str: ...

0 comments on commit 8706984

Please sign in to comment.