From a7d19351832d5828e4052a82510c1b7f80ece5ef Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 19 Jul 2023 13:51:54 +0200 Subject: [PATCH] Adding strip_fragment to canonicalize Fix #186 --- README.md | 2 ++ test/canonicalize_url_test.py | 7 ++++++- test/normalize_url_test.py | 2 +- ural/canonicalize_url.py | 16 +++++++++++----- ural/canonicalize_url.pyi | 7 ++++++- ural/normalize_url.py | 2 +- ural/quote.py | 2 ++ 7 files changed, 29 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 287e7af3..64af0a7d 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,8 @@ canonicalize_url('www.LEMONDE.fr') * **url** *string*: url to canonicalize. * **quoted** *?bool* [`False`]: by default the function will unquote the url as much as possible all while keeping the url safe. If this kwarg is set to `True`, the function will instead quote the url as much as possible all while ensuring nothing will be double-quoted. +* **default_protocol** *?str* [`https`]: default protocol to add when the url has none. +* **strip_fragment** *?bool* [`False`]: whether to strip the url's fragment. 
--- diff --git a/test/canonicalize_url_test.py b/test/canonicalize_url_test.py index 6f6701de..5ad2347f 100644 --- a/test/canonicalize_url_test.py +++ b/test/canonicalize_url_test.py @@ -10,6 +10,7 @@ (" http://lemonde.fr/test.html ", "http://lemonde.fr/test.html"), ("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"), ("lemonde.fr", "https://lemonde.fr"), + ("lemonde.fr#ok", "https://lemonde.fr/#ok"), ("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"), ("http://lemonde.fr:80/test", "http://lemonde.fr/test"), ("http://xn--tlrama-bvab.fr", "http://télérama.fr"), @@ -41,7 +42,10 @@ ("http://example.com?test", "http://example.com/?test"), ("http://example.com#test", "http://example.com/#test"), ("http://example.com?test#test", "http://example.com/?test#test"), - ('http://lemonde.fr/?test&test=&test=value', 'http://lemonde.fr/?test&test=&test=value') + ( + "http://lemonde.fr/?test&test=&test=value", + "http://lemonde.fr/?test&test=&test=value", + ), ] TESTS_ADVANCED = [ @@ -56,6 +60,7 @@ {"quoted": True}, ), ("http://lemonde.fr/t%c3%a9", "http://lemonde.fr/t%C3%A9", {"quoted": True}), + ("http://lemonde.fr#ok", "http://lemonde.fr", {"strip_fragment": True}), ] diff --git a/test/normalize_url_test.py b/test/normalize_url_test.py index 308614de..54bc5cb3 100644 --- a/test/normalize_url_test.py +++ b/test/normalize_url_test.py @@ -200,7 +200,7 @@ ), ("http://lemonde.fr?%3d=value", "lemonde.fr?%3D=value"), ("http://lemonde.fr/default.asp", "lemonde.fr"), - ('http://lemonde.fr/?test&test=&test=value', 'lemonde.fr?test&test=&test=value') + ("http://lemonde.fr/?test&test=&test=value", "lemonde.fr?test&test=&test=value"), ] diff --git a/ural/canonicalize_url.py b/ural/canonicalize_url.py index a59d891e..252d1fe5 100644 --- a/ural/canonicalize_url.py +++ b/ural/canonicalize_url.py @@ -21,7 +21,9 @@ from ural.patterns import CONTROL_CHARS_RE -def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False): +def canonicalize_url( + url, 
default_protocol="https", unsplit=True, quoted=False, strip_fragment=False +): # Cleaning url = CONTROL_CHARS_RE.sub("", url) url = url.strip() @@ -49,6 +51,9 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False): if port == 80 or port == 443: port = None + if strip_fragment: + fragment = None + # Empty path etc. if not path or path == "/": if not query and not fragment: @@ -87,10 +92,11 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False): query = safe_serialize_qsl(qsl) - if quoted: - fragment = safely_quote(fragment) - else: - fragment = safely_unquote_fragment(fragment) + if fragment: + if quoted: + fragment = safely_quote(fragment) + else: + fragment = safely_unquote_fragment(fragment) # Repacking netloc = unsplit_netloc(user, password, hostname, port) diff --git a/ural/canonicalize_url.pyi b/ural/canonicalize_url.pyi index 3dfc2c61..cbfa0af3 100644 --- a/ural/canonicalize_url.pyi +++ b/ural/canonicalize_url.pyi @@ -1,3 +1,8 @@ def canonicalize_url( - url: str, default_protocol: str = ..., unsplit: bool = ..., quoted: bool = ... + url: str, + default_protocol: str = ..., + quoted: bool = ..., + strip_fragment: bool = ..., + # + unsplit: bool = ..., ) -> str: ... diff --git a/ural/normalize_url.py b/ural/normalize_url.py index bb50acab..4d253f14 100644 --- a/ural/normalize_url.py +++ b/ural/normalize_url.py @@ -103,7 +103,7 @@ def qsl_sort_key(item): - return item[0], item[1] or '', 0 if item[1] is None else 1 + return item[0], item[1] or "", 0 if item[1] is None else 1 def should_strip_query_item( diff --git a/ural/quote.py b/ural/quote.py index c7e97c36..22b2d97e 100644 --- a/ural/quote.py +++ b/ural/quote.py @@ -141,8 +141,10 @@ def safely_quote_qsl(qsl): def upper_match(match): return match.group(0).upper() + LOWERCASE_QUOTED_RE = re.compile(r"%(?:[0-9A-F][a-f]|[a-f][0-9A-F]|[a-f]{2})") + def upper_quoted(string): return LOWERCASE_QUOTED_RE.sub(upper_match, string)