Skip to content

Commit

Permalink
Binary support for urls_from_html
Browse files Browse the repository at this point in the history
Related to #172
  • Loading branch information
Yomguithereal committed Jul 15, 2023
1 parent 3bf348f commit 3c9e329
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 16 deletions.
3 changes: 3 additions & 0 deletions test/urls_from_html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def test_edge_cases(self):
"http://lemonde.fr"
]

def test_binary(self):
    # Feed the encoded (bytes) form of HTML and expect exactly REF_SET.
    assert set(urls_from_html(HTML.encode())) == REF_SET

def test_base_url(self):
assert set(
urls_from_html(
Expand Down
60 changes: 45 additions & 15 deletions ural/urls_from_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,69 @@
# argument.
#
from __future__ import unicode_literals
import re
from ural.patterns import URL_IN_HTML_RE
from ural.patterns import URL_IN_HTML_RE, URL_IN_HTML_BINARY_RE
from ural.utils import urljoin


def urls_from_html(string, base_url=None):
"""
Function returning an iterator over the urls present in the HTML string argument.
def urls_finditer(string):
    """
    Generator yielding the urls matched by URL_IN_HTML_RE in a text string.

    Args:
        string (str): source html string.

    Yields:
        str: an url, with its surrounding quotes removed when it was
            captured by one of the two quoted groups.

    """
    for match in URL_IN_HTML_RE.finditer(string):
        # NOTE(review): the three groups are presumably the double-quoted,
        # single-quoted and unquoted alternatives of the pattern -- confirm
        # against ural.patterns.
        url = match.group(1)

        if url is not None:
            # Group 1: drop surrounding double quotes.
            url = url.strip('"')
        else:
            url = match.group(2)

            if url is not None:
                # Group 2: drop surrounding single quotes.
                url = url.strip("'")
            else:
                # Group 3: used as-is.
                url = match.group(3)

        # Invariant check: one of the three groups is expected to match.
        assert url is not None

        yield url


def urls_finditer_binary(string, encoding="utf-8", errors="strict"):
    """
    Generator yielding the urls matched by URL_IN_HTML_BINARY_RE in a bytes
    string, each one decoded to text.

    Args:
        string (bytes): source html as raw bytes.
        encoding (str, optional): encoding used to decode each found url.
            Defaults to "utf-8".
        errors (str, optional): error policy passed to `bytes.decode`.
            Defaults to "strict".

    Yields:
        str: a decoded url, with its surrounding quotes removed when it was
            captured by one of the two quoted groups.

    """
    for match in URL_IN_HTML_BINARY_RE.finditer(string):
        # NOTE(review): the three groups are presumably the double-quoted,
        # single-quoted and unquoted alternatives of the pattern -- confirm
        # against ural.patterns.
        url = match.group(1)

        if url is not None:
            # Match groups are bytes here, so the strip argument must be
            # bytes too (a str argument raises TypeError on Python 3).
            url = url.strip(b'"')
        else:
            url = match.group(2)

            if url is not None:
                url = url.strip(b"'")
            else:
                # Group 3: used as-is.
                url = match.group(3)

        # Invariant check: one of the three groups is expected to match.
        assert url is not None

        # Only the extracted url is decoded, not the whole document.
        yield url.decode(encoding, errors=errors)


def urls_from_html(string, base_url=None, encoding="utf-8", errors="strict"):
"""
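Function returning an iterator over the urls present in the HTML string argument.
Args:
string (str or bytes): source html, either as text or as raw bytes.
base_url (str, optional): base url joined with each found url.
Defaults to None.
encoding (str, optional): encoding used to decode urls found in bytes input.
Defaults to "utf-8".
errors (str, optional): decoding error policy used for bytes input.
Defaults to "strict".
Yields:
str: a url.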
"""
iterator = (
urls_finditer_binary(string, encoding=encoding, errors=errors)
if isinstance(string, bytes)
else urls_finditer(string)
)

for url in iterator:

if base_url is not None:
url = urljoin(base_url, url)

Expand Down
5 changes: 4 additions & 1 deletion ural/urls_from_html.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import Optional, Iterator, Union

# Accepts text or bytes html; `encoding` and `errors` are the decoding
# options declared alongside `base_url`.
def urls_from_html(
    html: Union[bytes, str],
    base_url: Optional[str] = ...,
    encoding: str = ...,
    errors: str = ...,
) -> Iterator[str]: ...

0 comments on commit 3c9e329

Please sign in to comment.