diff --git a/test/urls_from_html_test.py b/test/urls_from_html_test.py
index 80e30431..6f6061c7 100644
--- a/test/urls_from_html_test.py
+++ b/test/urls_from_html_test.py
@@ -42,6 +42,9 @@ def test_edge_cases(self):
"http://lemonde.fr"
]
+ def test_binary(self):
+ assert set(urls_from_html(HTML.encode())) == REF_SET
+
def test_base_url(self):
assert set(
urls_from_html(
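
The new test simply feeds the suite's existing HTML fixture to urls_from_html as utf-8 encoded bytes and expects the same reference set of urls as the str version. A minimal standalone sketch of the behaviour it pins down (the markup below is illustrative, not the suite's fixture):

    from ural.urls_from_html import urls_from_html

    html = '<a href="http://lemonde.fr/economie">eco</a>'

    # str input and its utf-8 encoded bytes should extract the same urls
    assert set(urls_from_html(html)) == set(urls_from_html(html.encode()))
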
diff --git a/ural/urls_from_html.py b/ural/urls_from_html.py
index a996c00a..ec116ba4 100644
--- a/ural/urls_from_html.py
+++ b/ural/urls_from_html.py
@@ -6,39 +6,69 @@
 # argument.
 #
 from __future__ import unicode_literals
-import re
 
-from ural.patterns import URL_IN_HTML_RE
+from ural.patterns import URL_IN_HTML_RE, URL_IN_HTML_BINARY_RE
 from ural.utils import urljoin
 
 
-def urls_from_html(string, base_url=None):
-    """
-    Function returning an iterator over the urls present in the HTML string argument.
+def urls_finditer(string):
+    for match in URL_IN_HTML_RE.finditer(string):
+        url = match.group(1)
 
-    Args:
-        string (str): source html string.
-        base_url (str, optional): base_url to concatenate to the found urls.
-            Defaults to None.
+        if url is not None:
+            url = url.strip('"')
+        else:
+            url = match.group(2)
 
-    Yields:
-        str: an url.
+            if url is not None:
+                url = url.strip("'")
+            else:
+                url = match.group(3)
 
-    """
-    for match in re.finditer(URL_IN_HTML_RE, string):
+        assert url is not None
+
+        yield url
+
+
+def urls_finditer_binary(string, encoding="utf-8", errors="strict"):
+    for match in URL_IN_HTML_BINARY_RE.finditer(string):
         url = match.group(1)
 
         if url is not None:
-            url = url.strip('"')
+            url = url.strip(b'"')
         else:
             url = match.group(2)
 
             if url is not None:
-                url = url.strip("'")
+                url = url.strip(b"'")
             else:
                 url = match.group(3)
 
         assert url is not None
+
+        yield url.decode(encoding, errors=errors)
+
+
+def urls_from_html(string, base_url=None, encoding="utf-8", errors="strict"):
+    """
+    Function returning an iterator over the urls present in the HTML string argument.
+
+    Args:
+        string (str): source html string.
+        base_url (str, optional): base_url to concatenate to the found urls.
+            Defaults to None.
+
+    Yields:
+        str: an url.
+
+    """
+    iterator = (
+        urls_finditer_binary(string, encoding=encoding, errors=errors)
+        if isinstance(string, bytes)
+        else urls_finditer(string)
+    )
+
+    for url in iterator:
 
         if base_url is not None:
             url = urljoin(base_url, url)
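
In short, urls_from_html now dispatches on the input type: str goes through the original URL_IN_HTML_RE path, while bytes go through URL_IN_HTML_BINARY_RE and each matched url is decoded with the given encoding and errors policy before the optional base_url join. A hedged usage sketch (the markup and the /article.html url are made up for illustration):

    from ural.urls_from_html import urls_from_html

    page = b'<a href="/article.html">lire</a>'

    # bytes input works like str input; relative urls are joined to base_url
    print(list(urls_from_html(page, base_url="http://lemonde.fr")))

    # a page that is not utf-8: declare its encoding so matched urls decode correctly
    latin1_page = '<a href="http://lemonde.fr/économie">éco</a>'.encode("latin-1")
    print(list(urls_from_html(latin1_page, encoding="latin-1")))
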
diff --git a/ural/urls_from_html.pyi b/ural/urls_from_html.pyi
index 93eba32e..e380075f 100644
--- a/ural/urls_from_html.pyi
+++ b/ural/urls_from_html.pyi
@@ -1,5 +1,8 @@
 from typing import Optional, Iterator, Union
 
 def urls_from_html(
-    html: Union[bytes, str], base_url: Optional[str] = ...
+    html: Union[bytes, str],
+    base_url: Optional[str] = ...,
+    encoding: str = ...,
+    errors: str = ...,
 ) -> Iterator[str]: ...
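
The stub mirrors the new runtime signature: both str and bytes are accepted, and the decoding behaviour is tunable. A small hedged sketch of the errors parameter, which is forwarded to bytes.decode for binary input (the sample bytes are made up):

    from ural.urls_from_html import urls_from_html

    # an href containing a byte that is not valid utf-8
    broken = b'<a href="http://lemonde.fr/\xff">lien</a>'

    # errors="replace" substitutes the bad byte instead of raising UnicodeDecodeError
    urls = list(urls_from_html(broken, errors="replace"))
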