Skip to content

Commit

Permalink
urls_from_html now strips
Browse files (browse the repository at this point in the history)
  • Loading branch information
Yomguithereal committed Jul 19, 2023
1 parent b3a6c6d commit 2e56ad2
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
5 changes: 5 additions & 0 deletions test/urls_from_html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def test_edge_cases(self):
'<a href="" target="_blank">charte</a><a href=\'\' target="_blank">charte</a><a href= target="_blank">charte</a>'
)
) == ["", "", ""]
assert list(
urls_from_html(
"<a \nhref='http://lemonde.fr '></a><a \nhref=' http://lemonde.fr '></a><a \nhref=' http://lemonde.fr'></a>"
)
) == ["http://lemonde.fr", "http://lemonde.fr", "http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_SCRIPT_TAGS)) == ["http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_PREFIXED_HREF)) == ["http://lemonde.fr"]

Expand Down
9 changes: 7 additions & 2 deletions ural/urls_from_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ def __urls_finditer_binary(string, encoding="utf-8", errors="strict"):

def urls_from_html(string, encoding="utf-8", errors="strict"):
    """Yield the URLs found in an HTML document, stripped of outer whitespace.

    Args:
        string: The HTML to scan, either as text or as ``bytes``.
        encoding: Passed through to ``__urls_finditer_binary`` when
            ``string`` is ``bytes`` (presumably the codec used to decode
            the matches -- confirm against that helper).
        errors: Passed through to ``__urls_finditer_binary`` alongside
            ``encoding``.

    Yields:
        Each URL produced by the underlying finder, with leading and
        trailing whitespace removed.
    """
    # Choose the finder matching the input type; both are consumed the
    # same way below so the stripping applies to text and bytes input alike.
    if isinstance(string, bytes):
        iterator = __urls_finditer_binary(string, encoding=encoding, errors=errors)
    else:
        iterator = __urls_finditer(string)

    for url in iterator:
        # Attribute values such as ' http://lemonde.fr ' carry padding that
        # is not part of the URL itself.
        yield url.strip()

0 comments on commit 2e56ad2

Please sign in to comment.