From 870698406e5c095003a808df9fcf0a5db81b47ce Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Sat, 15 Jul 2023 15:02:20 +0200
Subject: [PATCH] Adding canonicalize_url

Related to #55
---
 test/canonicalize_url.py  | 27 ++++++++++++++++++++++
 ural/__init__.py          |  1 +
 ural/canonicalize_url.py  | 47 +++++++++++++++++++++++++++++++++++++++
 ural/canonicalize_url.pyi |  3 +++
 ural/fingerprint_url.py   |  2 +-
 ural/normalize_url.py     |  2 +-
 ural/utils.py             | 24 ++++++++++++++++++++
 ural/utils.pyi            |  4 +++-
 8 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 test/canonicalize_url.py
 create mode 100644 ural/canonicalize_url.py
 create mode 100644 ural/canonicalize_url.pyi

diff --git a/test/canonicalize_url.py b/test/canonicalize_url.py
new file mode 100644
index 00000000..70aa981a
--- /dev/null
+++ b/test/canonicalize_url.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+# =============================================================================
+# Ural URL Canonicalization Unit Tests
+# =============================================================================
+from __future__ import unicode_literals
+
+from ural import canonicalize_url
+
+TESTS = [
+    (" http://lemonde.fr/test.html ", "http://lemonde.fr/test.html"),
+    ("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"),
+    ("lemonde.fr", "https://lemonde.fr"),
+    ("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"),
+    ("http://lemonde.fr:80/test", "http://lemonde.fr/test"),
+    ("http://xn--tlrama-bvab.fr", "http://télérama.fr"),
+    (
+        "http://mozilla.org?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
+        "http://mozilla.org?x=шеллы",
+    ),
+    ("http://mozilla.org?x=шеллы", "http://mozilla.org?x=шеллы"),
+]
+
+
+class TestCanonicalizeUrl(object):
+    def test_canonicalize_url(self):
+        for url, result in TESTS:
+            assert canonicalize_url(url) == result
diff --git a/ural/__init__.py b/ural/__init__.py
index 3493d098..90210e64 100644
--- a/ural/__init__.py
+++ b/ural/__init__.py
@@ -4,6 +4,7 @@
 #
 from ural.classes.hostname_trie_set import HostnameTrieSet
 
+from ural.canonicalize_url import canonicalize_url
 from ural.could_be_html import could_be_html
 from ural.could_be_rss import could_be_rss
 from ural.ensure_protocol import ensure_protocol
diff --git a/ural/canonicalize_url.py b/ural/canonicalize_url.py
new file mode 100644
index 00000000..7136f6d5
--- /dev/null
+++ b/ural/canonicalize_url.py
@@ -0,0 +1,47 @@
+from ural.utils import (
+    urlsplit,
+    urlunsplit,
+    SplitResult,
+    split_netloc,
+    unsplit_netloc,
+    space_aware_unquote,
+    decode_punycode_hostname,
+)
+from ural.ensure_protocol import ensure_protocol
+from ural.patterns import CONTROL_CHARS_RE
+
+
+def canonicalize_url(url, default_protocol="https", unsplit=True):
+    # Cleaning (control chars first, so whitespace they shield is stripped too)
+    url = CONTROL_CHARS_RE.sub("", url)
+    url = url.strip()
+
+    # Ensuring a protocol
+    url = ensure_protocol(url, default_protocol)
+
+    # Parsing
+    scheme, netloc, path, query, fragment = urlsplit(url)
+    auth, hostname, port = split_netloc(netloc)
+
+    # Decoding and normalizing hostname
+    hostname = decode_punycode_hostname(hostname)
+    hostname = hostname.lower()
+
+    # Dropping HTTP/HTTPS ports
+    if port == "80" or port == "443":
+        port = ""
+
+    netloc = unsplit_netloc(auth, hostname, port)
+
+    # Unquoting
+    path = space_aware_unquote(path)
+    query = space_aware_unquote(query)
+    fragment = space_aware_unquote(fragment)
+
+    result = SplitResult(scheme, netloc, path, query, fragment)
+
+    if not unsplit:
+        return result
+
+    # Serializing
+    return urlunsplit(result)
diff --git a/ural/canonicalize_url.pyi b/ural/canonicalize_url.pyi
new file mode 100644
index 00000000..b0c1b39a
--- /dev/null
+++ b/ural/canonicalize_url.pyi
@@ -0,0 +1,3 @@
+def canonicalize_url(
+    url: str, default_protocol: str = ..., unsplit: bool = ...
+) -> str: ...
diff --git a/ural/fingerprint_url.py b/ural/fingerprint_url.py
index 820889c9..5055862b 100644
--- a/ural/fingerprint_url.py
+++ b/ural/fingerprint_url.py
@@ -10,7 +10,7 @@
 
 
 def lang_query_item_filter(key, _):
-    return key in LANG_QUERY_KEYS
+    return key not in LANG_QUERY_KEYS
 
 
 def strip_lang_subdomains_from_netloc(netloc):
diff --git a/ural/normalize_url.py b/ural/normalize_url.py
index 538be9d9..95898619 100644
--- a/ural/normalize_url.py
+++ b/ural/normalize_url.py
@@ -126,7 +126,7 @@ def should_strip_query_item(
         return domain_filter(key, value)
 
     if query_item_filter is not None:
-        return query_item_filter(key, value)
+        return not query_item_filter(key, value)
 
     return False
 
diff --git a/ural/utils.py b/ural/utils.py
index 0eb0becb..02de74b6 100644
--- a/ural/utils.py
+++ b/ural/utils.py
@@ -176,3 +176,27 @@ def add_query_argument(url, name, value=None, quote=True):
         url += "#" + fragment
 
     return url
+
+
+def split_netloc(netloc):
+    if "@" in netloc:
+        auth, hostname = netloc.rsplit("@", 1)
+    else:
+        auth = ""
+        hostname = netloc
+
+    if "]:" in hostname or hostname.count(":") == 1:
+        hostname, port = hostname.rsplit(":", 1)
+    else:
+        port = ""
+
+    return auth, hostname, port
+
+
+def unsplit_netloc(auth, hostname, port):
+    if auth:
+        hostname = auth + "@" + hostname
+    if port:
+        hostname += ":" + port
+
+    return hostname
diff --git a/ural/utils.pyi b/ural/utils.pyi
index d86fe892..bacfcd3a 100644
--- a/ural/utils.pyi
+++ b/ural/utils.pyi
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Dict, List, overload
+from typing import Optional, Dict, List, Tuple, overload
 from urllib.parse import SplitResult
 
 from ural.types import AnyUrlTarget, Literal, QueryArgValue
@@ -20,3 +20,5 @@ def safe_parse_qs(query: str) -> Dict[str, List[str]]: ...
 def add_query_argument(
     url: str, name: str, value: QueryArgValue, quote: bool = ...
 ) -> str: ...
+def split_netloc(netloc: str) -> Tuple[str, str, str]: ...
+def unsplit_netloc(auth: str, hostname: str, port: str) -> str: ...