Binary support for urls_from_html

Related to #172
medialab committed on Jul 15, 2023
commit 3c9e329
1 parent 3bf348f
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 16 deletions.
diff --git a/test/urls_from_html_test.py b/test/urls_from_html_test.py
@@ -42,6 +42,9 @@ def test_edge_cases(self):
  "http://lemonde.fr"
  ]
 
+ def test_binary(self):
+ assert set(urls_from_html(HTML.encode())) == REF_SET
+
  def test_base_url(self):
  assert set(
  urls_from_html(

diff --git a/ural/urls_from_html.py b/ural/urls_from_html.py
@@ -6,39 +6,69 @@
 # argument.
 #
 from __future__ import unicode_literals
-import re
-from ural.patterns import URL_IN_HTML_RE
+from ural.patterns import URL_IN_HTML_RE, URL_IN_HTML_BINARY_RE
 from ural.utils import urljoin
 
 
-def urls_from_html(string, base_url=None):
- """
- Function returning an iterator over the urls present in the HTML string argument.
+def urls_finditer(string):
+ for match in URL_IN_HTML_RE.finditer(string):
+  url = match.group(1)
 
- Args:
- string (str): source html string.
- base_url (str, optional): base_url to concatenate to the found urls.
- Defaults to None.
+  if url is not None:
+  url = url.strip('"')
+ else:
+ url = match.group(2)
 
- Yields:
- str: an url.
+ if url is not None:
+ url = url.strip("'")
+ else:
+ url = match.group(3)
 
- """
- for match in re.finditer(URL_IN_HTML_RE, string):
+ assert url is not None
+
+ yield url
+
+
+def urls_finditer_binary(string, encoding="utf-8", errors="strict"):
+ for match in URL_IN_HTML_BINARY_RE.finditer(string):
  url = match.group(1)
 
  if url is not None:
- url = url.strip('"')
+ url = url.strip(b'"')
  else:
  url = match.group(2)
 
  if url is not None:
- url = url.strip("'")
+ url = url.strip(b"'")
  else:
  url = match.group(3)
 
  assert url is not None
 
+ yield url.decode(encoding, errors=errors)
+
+
+def urls_from_html(string, base_url=None, encoding="utf-8", errors="strict"):
+ """
+ Function returning an iterator over the urls present in the HTML string argument.
+
+ Args:
+ string (str): source html string.
+ base_url (str, optional): base_url to concatenate to the found urls.
+ Defaults to None.
+
+ Yields:
+ str: an url.
+
+ """
+ iterator = (
+ urls_finditer_binary(string, encoding=encoding, errors=errors)
+ if isinstance(string, bytes)
+ else urls_finditer(string)
+ )
+
+ for url in iterator:
+
  if base_url is not None:
  url = urljoin(base_url, url)
 

diff --git a/ural/urls_from_html.pyi b/ural/urls_from_html.pyi
@@ -1,5 +1,8 @@
 from typing import Optional, Iterator, Union
 
 def urls_from_html(
- html: Union[bytes, str], base_url: Optional[str] = ...
+ html: Union[bytes, str],
+ base_url: Optional[str] = ...,
+ encoding: str = ...,
+ errors: str = ...,
 ) -> Iterator[str]: ...