diff --git a/test/urls_from_html_test.py b/test/urls_from_html_test.py
index 80e30431..6f6061c7 100644
--- a/test/urls_from_html_test.py
+++ b/test/urls_from_html_test.py
@@ -42,6 +42,9 @@ def test_edge_cases(self):
             "http://lemonde.fr"
         ]
 
+    def test_binary(self):
+        assert set(urls_from_html(HTML.encode())) == REF_SET
+
     def test_base_url(self):
         assert set(
             urls_from_html(
diff --git a/ural/urls_from_html.py b/ural/urls_from_html.py
index a996c00a..ec116ba4 100644
--- a/ural/urls_from_html.py
+++ b/ural/urls_from_html.py
@@ -6,39 +6,69 @@
 # argument.
 #
 from __future__ import unicode_literals
-import re
 
-from ural.patterns import URL_IN_HTML_RE
+from ural.patterns import URL_IN_HTML_RE, URL_IN_HTML_BINARY_RE
 from ural.utils import urljoin
 
 
-def urls_from_html(string, base_url=None):
-    """
-    Function returning an iterator over the urls present in the HTML string argument.
+def urls_finditer(string):
+    for match in URL_IN_HTML_RE.finditer(string):
+        url = match.group(1)
 
-    Args:
-        string (str): source html string.
-        base_url (str, optional): base_url to concatenate to the found urls.
-            Defaults to None.
+        if url is not None:
+            url = url.strip('"')
+        else:
+            url = match.group(2)
 
-    Yields:
-        str: an url.
+        if url is not None:
+            url = url.strip("'")
+        else:
+            url = match.group(3)
 
-    """
-    for match in re.finditer(URL_IN_HTML_RE, string):
+        assert url is not None
+
+        yield url
+
+
+def urls_finditer_binary(string, encoding="utf-8", errors="strict"):
+    for match in URL_IN_HTML_BINARY_RE.finditer(string):
         url = match.group(1)
 
         if url is not None:
-            url = url.strip('"')
+            url = url.strip(b'"')
         else:
             url = match.group(2)
 
         if url is not None:
-            url = url.strip("'")
+            url = url.strip(b"'")
         else:
             url = match.group(3)
 
         assert url is not None
 
+        yield url.decode(encoding, errors=errors)
+
+
+def urls_from_html(string, base_url=None, encoding="utf-8", errors="strict"):
+    """
+    Function returning an iterator over the urls present in the HTML string argument.
+
+    Args:
+        string (str): source html string.
+        base_url (str, optional): base_url to concatenate to the found urls.
+            Defaults to None.
+
+    Yields:
+        str: an url.
+
+    """
+    iterator = (
+        urls_finditer_binary(string, encoding=encoding, errors=errors)
+        if isinstance(string, bytes)
+        else urls_finditer(string)
+    )
+
+    for url in iterator:
+
         if base_url is not None:
             url = urljoin(base_url, url)
diff --git a/ural/urls_from_html.pyi b/ural/urls_from_html.pyi
index 93eba32e..e380075f 100644
--- a/ural/urls_from_html.pyi
+++ b/ural/urls_from_html.pyi
@@ -1,5 +1,8 @@
 from typing import Optional, Iterator, Union
 
 def urls_from_html(
-    html: Union[bytes, str], base_url: Optional[str] = ...
+    html: Union[bytes, str],
+    base_url: Optional[str] = ...,
+    encoding: str = ...,
+    errors: str = ...,
 ) -> Iterator[str]: ...
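
For reviewers, a minimal usage sketch of the updated function follows. It is illustrative only and not part of the diff: the sample `html` string is made up, and the import path simply mirrors the module touched above; the `encoding` and `errors` keyword arguments are the ones introduced by this change.

```python
# Illustrative only: the same (hypothetical) HTML snippet run through the
# function touched by this diff, once as str and once as bytes.
from ural.urls_from_html import urls_from_html

html = '<p><a href="http://lemonde.fr">Le Monde</a></p>'

# str input goes through urls_finditer, as before
print(list(urls_from_html(html)))

# bytes input now goes through urls_finditer_binary; matched urls are
# decoded back to str using the `encoding`/`errors` keyword arguments
# (the defaults are spelled out here for clarity)
print(list(urls_from_html(html.encode("utf-8"), encoding="utf-8", errors="strict")))
```

The `.pyi` stub change above mirrors these keyword arguments so type checkers see the same signature for both str and bytes input.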