From a7d19351832d5828e4052a82510c1b7f80ece5ef Mon Sep 17 00:00:00 2001
From: Yomguithereal <guillaumeplique@gmail.com>
Date: Wed, 19 Jul 2023 13:51:54 +0200
Subject: [PATCH] Adding strip_fragment to canonicalize Fix #186

---
 README.md                     |  2 ++
 test/canonicalize_url_test.py |  7 ++++++-
 test/normalize_url_test.py    |  2 +-
 ural/canonicalize_url.py      | 16 +++++++++++-----
 ural/canonicalize_url.pyi     |  7 ++++++-
 ural/normalize_url.py         |  2 +-
 ural/quote.py                 |  2 ++
 7 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 287e7af3..64af0a7d 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,8 @@ canonicalize_url('www.LEMONDE.fr')
 
 * **url** *string*: url to canonicalize.
 * **quoted** *?bool* [`False`]: by default the function will unquote the url as much as possible all while keeping the url safe. If this kwarg is set to `True`, the function will instead quote the url as much as possible all while ensuring nothing will be double-quoted.
+* **default_protocol** *?str* [`https`]: default protocol to add when the url has none.
+* **strip_fragment** *?bool* [`False`]: whether to strip the url's fragment.
 
 ---
 
diff --git a/test/canonicalize_url_test.py b/test/canonicalize_url_test.py
index 6f6701de..5ad2347f 100644
--- a/test/canonicalize_url_test.py
+++ b/test/canonicalize_url_test.py
@@ -10,6 +10,7 @@
     ("   http://lemonde.fr/test.html   ", "http://lemonde.fr/test.html"),
     ("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"),
     ("lemonde.fr", "https://lemonde.fr"),
+    ("lemonde.fr#ok", "https://lemonde.fr/#ok"),
     ("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"),
     ("http://lemonde.fr:80/test", "http://lemonde.fr/test"),
     ("http://xn--tlrama-bvab.fr", "http://télérama.fr"),
@@ -41,7 +42,10 @@
     ("http://example.com?test", "http://example.com/?test"),
     ("http://example.com#test", "http://example.com/#test"),
     ("http://example.com?test#test", "http://example.com/?test#test"),
-    ('http://lemonde.fr/?test&test=&test=value', 'http://lemonde.fr/?test&test=&test=value')
+    (
+        "http://lemonde.fr/?test&test=&test=value",
+        "http://lemonde.fr/?test&test=&test=value",
+    ),
 ]
 
 TESTS_ADVANCED = [
@@ -56,6 +60,7 @@
         {"quoted": True},
     ),
     ("http://lemonde.fr/t%c3%a9", "http://lemonde.fr/t%C3%A9", {"quoted": True}),
+    ("http://lemonde.fr#ok", "http://lemonde.fr", {"strip_fragment": True}),
 ]
 
 
diff --git a/test/normalize_url_test.py b/test/normalize_url_test.py
index 308614de..54bc5cb3 100644
--- a/test/normalize_url_test.py
+++ b/test/normalize_url_test.py
@@ -200,7 +200,7 @@
     ),
     ("http://lemonde.fr?%3d=value", "lemonde.fr?%3D=value"),
     ("http://lemonde.fr/default.asp", "lemonde.fr"),
-    ('http://lemonde.fr/?test&test=&test=value', 'lemonde.fr?test&test=&test=value')
+    ("http://lemonde.fr/?test&test=&test=value", "lemonde.fr?test&test=&test=value"),
 ]
 
 
diff --git a/ural/canonicalize_url.py b/ural/canonicalize_url.py
index a59d891e..252d1fe5 100644
--- a/ural/canonicalize_url.py
+++ b/ural/canonicalize_url.py
@@ -21,7 +21,9 @@
 from ural.patterns import CONTROL_CHARS_RE
 
 
-def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):
+def canonicalize_url(
+    url, default_protocol="https", unsplit=True, quoted=False, strip_fragment=False
+):
     # Cleaning
     url = CONTROL_CHARS_RE.sub("", url)
     url = url.strip()
@@ -49,6 +51,9 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):
     if port == 80 or port == 443:
         port = None
 
+    if strip_fragment:
+        fragment = None
+
     # Empty path etc.
     if not path or path == "/":
         if not query and not fragment:
@@ -87,10 +92,11 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):
 
     query = safe_serialize_qsl(qsl)
 
-    if quoted:
-        fragment = safely_quote(fragment)
-    else:
-        fragment = safely_unquote_fragment(fragment)
+    if fragment:
+        if quoted:
+            fragment = safely_quote(fragment)
+        else:
+            fragment = safely_unquote_fragment(fragment)
 
     # Repacking
     netloc = unsplit_netloc(user, password, hostname, port)
diff --git a/ural/canonicalize_url.pyi b/ural/canonicalize_url.pyi
index 3dfc2c61..cbfa0af3 100644
--- a/ural/canonicalize_url.pyi
+++ b/ural/canonicalize_url.pyi
@@ -1,3 +1,8 @@
 def canonicalize_url(
-    url: str, default_protocol: str = ..., unsplit: bool = ..., quoted: bool = ...
+    url: str,
+    default_protocol: str = ...,
+    quoted: bool = ...,
+    strip_fragment: bool = ...,
+    #
+    unsplit: bool = ...,
 ) -> str: ...
diff --git a/ural/normalize_url.py b/ural/normalize_url.py
index bb50acab..4d253f14 100644
--- a/ural/normalize_url.py
+++ b/ural/normalize_url.py
@@ -103,7 +103,7 @@
 
 
 def qsl_sort_key(item):
-    return item[0], item[1] or '', 0 if item[1] is None else 1
+    return item[0], item[1] or "", 0 if item[1] is None else 1
 
 
 def should_strip_query_item(
diff --git a/ural/quote.py b/ural/quote.py
index c7e97c36..22b2d97e 100644
--- a/ural/quote.py
+++ b/ural/quote.py
@@ -141,8 +141,10 @@ def safely_quote_qsl(qsl):
 def upper_match(match):
     return match.group(0).upper()
 
+
 LOWERCASE_QUOTED_RE = re.compile(r"%(?:[0-9A-F][a-f]|[a-f][0-9A-F]|[a-f]{2})")
 
+
 def upper_quoted(string):
     return LOWERCASE_QUOTED_RE.sub(upper_match, string)