Adding canonicalize_url

Related to #55
medialab · Jul 15, 2023 · 8706984 · 8706984
1 parent 2e3a7c7
commit 8706984
Show file tree

Hide file tree

Showing 8 changed files with 107 additions and 3 deletions.
diff --git a/test/canonicalize_url.py b/test/canonicalize_url.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+# =============================================================================
+# Ural URL Canonicalization Unit Tests
+# =============================================================================
+from __future__ import unicode_literals
+
+from ural import canonicalize_url
+
+TESTS = [  # (input url, expected canonical url) pairs
+    ("   http://lemonde.fr/test.html   ", "http://lemonde.fr/test.html"),
+    ("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"),
+    ("lemonde.fr", "https://lemonde.fr"),  # default protocol is added
+    ("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"),  # path case is kept
+    ("http://lemonde.fr:80/test", "http://lemonde.fr/test"),  # http port dropped
+    ("http://xn--tlrama-bvab.fr", "http://télérama.fr"),  # punycode decoded
+    (
+        "http://mozilla.org?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
+        "http://mozilla.org?x=шеллы",
+    ),
+    ("http://mozilla.org?x=шеллы", "http://mozilla.org?x=шеллы"),
+]
+
+
+class TestFingerprintUrl(object):
+    def test_canonicalize_url(self):  # checks every (input, expected) pair
+        for raw_url, expected in TESTS:
+            assert canonicalize_url(raw_url) == expected
diff --git a/ural/__init__.py b/ural/__init__.py
@@ -4,6 +4,7 @@
 #
 from ural.classes.hostname_trie_set import HostnameTrieSet
 
+from ural.canonicalize_url import canonicalize_url
 from ural.could_be_html import could_be_html
 from ural.could_be_rss import could_be_rss
 from ural.ensure_protocol import ensure_protocol

diff --git a/ural/canonicalize_url.py b/ural/canonicalize_url.py
@@ -0,0 +1,47 @@
+from ural.utils import (
+    urlsplit,
+    urlunsplit,
+    SplitResult,
+    split_netloc,
+    unsplit_netloc,
+    space_aware_unquote,
+    decode_punycode_hostname,
+)
+from ural.ensure_protocol import ensure_protocol
+from ural.patterns import CONTROL_CHARS_RE
+
+
+def canonicalize_url(url, default_protocol="https", unsplit=True):
+    """Return a canonicalized version of `url`.
+
+    Cleans the url, ensures it has a protocol (`default_protocol` is handed
+    to `ensure_protocol`), decodes and lowercases the hostname, drops a port
+    that is the scheme's default and unquotes path, query and fragment.
+    Returns the serialized url, or a `SplitResult` when `unsplit` is False.
+    """
+    # Cleaning
+    url = CONTROL_CHARS_RE.sub("", url.strip())
+
+    # Ensuring a protocol
+    url = ensure_protocol(url, default_protocol)
+
+    # Parsing
+    scheme, netloc, path, query, fragment = urlsplit(url)
+    auth, hostname, port = split_netloc(netloc)
+
+    # Decoding and normalizing hostname
+    hostname = decode_punycode_hostname(hostname).lower()
+
+    # Dropping the port only when it is the scheme's default one: a url such
+    # as http://host:443 does not target the same server as http://host
+    if (scheme.lower(), port) in (("http", "80"), ("https", "443")):
+        port = ""
+    netloc = unsplit_netloc(auth, hostname, port)
+
+    # Unquoting
+    path = space_aware_unquote(path)
+    query = space_aware_unquote(query)
+    fragment = space_aware_unquote(fragment)
+    result = SplitResult(scheme, netloc, path, query, fragment)
+    # Serializing, unless the caller asked for the split parts
+    return urlunsplit(result) if unsplit else result
diff --git a/ural/canonicalize_url.pyi b/ural/canonicalize_url.pyi
@@ -0,0 +1,3 @@
+def canonicalize_url(  # NOTE(review): returns a SplitResult if unsplit=False
+    url: str, default_protocol: str = ..., unsplit: bool = ...
+) -> str: ...
diff --git a/ural/fingerprint_url.py b/ural/fingerprint_url.py
@@ -10,7 +10,7 @@
 
 
 def lang_query_item_filter(key, _):
-    return key in LANG_QUERY_KEYS
+    return key not in LANG_QUERY_KEYS  # NOTE(review): True presumably means "keep"
 
 
 def strip_lang_subdomains_from_netloc(netloc):

diff --git a/ural/normalize_url.py b/ural/normalize_url.py
@@ -126,7 +126,7 @@ def should_strip_query_item(
         return domain_filter(key, value)
 
     if query_item_filter is not None:
-        return query_item_filter(key, value)
+        return not query_item_filter(key, value)
 
     return False
 

diff --git a/ural/utils.py b/ural/utils.py
@@ -176,3 +176,27 @@ def add_query_argument(url, name, value=None, quote=True):
         url += "#" + fragment
 
     return url
+
+
+def split_netloc(netloc):
+    """Split `netloc` into `(auth, hostname, port)`, using "" for missing parts."""
+    if "@" in netloc:
+        auth, hostname = netloc.split("@", 1)
+    else:
+        auth = ""
+        hostname = netloc
+    if "]:" in hostname or hostname.count(":") == 1:
+        # Splitting from the right keeps the colons of an IPv6 host intact
+        hostname, port = hostname.rsplit(":", 1)
+    else:
+        port = ""
+    return auth, hostname, port
+
+
+def unsplit_netloc(auth, hostname, port):
+    """Join auth, hostname and port into a netloc; empty parts are left out."""
+    if auth:
+        hostname = auth + "@" + hostname
+    if port:
+        hostname += ":" + port
+    return hostname
diff --git a/ural/utils.pyi b/ural/utils.pyi
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Dict, List, overload
+from typing import Optional, Dict, List, Tuple, overload
 from urllib.parse import SplitResult
 from ural.types import AnyUrlTarget, Literal, QueryArgValue
 
@@ -20,3 +20,5 @@ def safe_parse_qs(query: str) -> Dict[str, List[str]]: ...
 def add_query_argument(
     url: str, name: str, value: QueryArgValue, quote: bool = ...
 ) -> str: ...
+def split_netloc(netloc: str) -> Tuple[str, str, str]: ...  # (auth, hostname, port)
+def unsplit_netloc(auth: str, hostname: str, port: str) -> str: ...  # netloc