Skip to content

Commit

Permalink
Adding strip_fragment to canonicalize
Browse files Browse the repository at this point in the history
Fix #186
  • Loading branch information
Yomguithereal committed Jul 19, 2023
1 parent a7b58f6 commit a7d1935
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ canonicalize_url('www.LEMONDE.fr')

* **url** *string*: url to canonicalize.
* **quoted** *?bool* [`False`]: by default the function will unquote the url as much as possible all while keeping the url safe. If this kwarg is set to `True`, the function will instead quote the url as much as possible all while ensuring nothing will be double-quoted.
* **default_protocol** *?str* [`https`]: default protocol to add when the url has none.
* **strip_fragment** *?bool* [`False`]: whether to strip the url's fragment.

---

Expand Down
7 changes: 6 additions & 1 deletion test/canonicalize_url_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
(" http://lemonde.fr/test.html ", "http://lemonde.fr/test.html"),
("http://lemonde.fr/test\x00.html", "http://lemonde.fr/test.html"),
("lemonde.fr", "https://lemonde.fr"),
("lemonde.fr#ok", "https://lemonde.fr/#ok"),
("http://LEMONDE.FR/TEST", "http://lemonde.fr/TEST"),
("http://lemonde.fr:80/test", "http://lemonde.fr/test"),
("http://xn--tlrama-bvab.fr", "http://télérama.fr"),
Expand Down Expand Up @@ -41,7 +42,10 @@
("http://example.com?test", "http://example.com/?test"),
("http://example.com#test", "http://example.com/#test"),
("http://example.com?test#test", "http://example.com/?test#test"),
('http://lemonde.fr/?test&test=&test=value', 'http://lemonde.fr/?test&test=&test=value')
(
"http://lemonde.fr/?test&test=&test=value",
"http://lemonde.fr/?test&test=&test=value",
),
]

TESTS_ADVANCED = [
Expand All @@ -56,6 +60,7 @@
{"quoted": True},
),
("http://lemonde.fr/t%c3%a9", "http://lemonde.fr/t%C3%A9", {"quoted": True}),
("http://lemonde.fr#ok", "http://lemonde.fr", {"strip_fragment": True}),
]


Expand Down
2 changes: 1 addition & 1 deletion test/normalize_url_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@
),
("http://lemonde.fr?%3d=value", "lemonde.fr?%3D=value"),
("http://lemonde.fr/default.asp", "lemonde.fr"),
('http://lemonde.fr/?test&test=&test=value', 'lemonde.fr?test&test=&test=value')
("http://lemonde.fr/?test&test=&test=value", "lemonde.fr?test&test=&test=value"),
]


Expand Down
16 changes: 11 additions & 5 deletions ural/canonicalize_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
from ural.patterns import CONTROL_CHARS_RE


def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):
def canonicalize_url(
url, default_protocol="https", unsplit=True, quoted=False, strip_fragment=False
):
# Cleaning
url = CONTROL_CHARS_RE.sub("", url)
url = url.strip()
Expand Down Expand Up @@ -49,6 +51,9 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):
if port == 80 or port == 443:
port = None

if strip_fragment:
fragment = None

# Empty path etc.
if not path or path == "/":
if not query and not fragment:
Expand Down Expand Up @@ -87,10 +92,11 @@ def canonicalize_url(url, default_protocol="https", unsplit=True, quoted=False):

query = safe_serialize_qsl(qsl)

if quoted:
fragment = safely_quote(fragment)
else:
fragment = safely_unquote_fragment(fragment)
if fragment:
if quoted:
fragment = safely_quote(fragment)
else:
fragment = safely_unquote_fragment(fragment)

# Repacking
netloc = unsplit_netloc(user, password, hostname, port)
Expand Down
7 changes: 6 additions & 1 deletion ural/canonicalize_url.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
def canonicalize_url(
url: str, default_protocol: str = ..., unsplit: bool = ..., quoted: bool = ...
url: str,
default_protocol: str = ...,
quoted: bool = ...,
strip_fragment: bool = ...,
#
unsplit: bool = ...,
) -> str: ...
2 changes: 1 addition & 1 deletion ural/normalize_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@


def qsl_sort_key(item):
return item[0], item[1] or '', 0 if item[1] is None else 1
return item[0], item[1] or "", 0 if item[1] is None else 1


def should_strip_query_item(
Expand Down
2 changes: 2 additions & 0 deletions ural/quote.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,10 @@ def safely_quote_qsl(qsl):
def upper_match(match):
    """Return the whole text captured by *match*, converted to uppercase.

    Intended for use as the replacement callable of a regex substitution.
    """
    matched_text = match.group(0)
    return matched_text.upper()


# Matches a percent sign followed by two characters drawn from 0-9, A-F or a-f
# where at least one of the two characters is a lowercase a-f letter.
LOWERCASE_QUOTED_RE = re.compile(r"%(?:[0-9A-F][a-f]|[a-f][0-9A-F]|[a-f]{2})")


def upper_quoted(string):
    """Return *string* with every LOWERCASE_QUOTED_RE match uppercased.

    Text that the pattern does not match is left untouched.
    """
    uppercased = LOWERCASE_QUOTED_RE.sub(upper_match, string)
    return uppercased

Expand Down

0 comments on commit a7d1935

Please sign in to comment.