Skip to content

Commit

Permalink
urls_from_html now decode html entities
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Jul 19, 2023
1 parent 2e56ad2 commit b2c01bd
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
8 changes: 8 additions & 0 deletions test/urls_from_html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# =============================================================================
# Ural URL Extraction From HTML Unit Tests
# =============================================================================
from __future__ import unicode_literals
from ural import urls_from_html

HTML = """
Expand Down Expand Up @@ -74,6 +75,13 @@ def test_edge_cases(self):
) == ["http://lemonde.fr", "http://lemonde.fr", "http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_SCRIPT_TAGS)) == ["http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_PREFIXED_HREF)) == ["http://lemonde.fr"]
assert list(
urls_from_html(
'<a href="https://www.magazines.fr/nos-magazines/feminin/magazine-cosmopolitan.html?utm_medium=Site_&eacute;ditorial&amp;utm_source=editorial&amp;utm_campaign=conquete-COSMO&amp;utm_term=cta-haut&amp;utm_content=CTA"></a>'
)
) == [
"https://www.magazines.fr/nos-magazines/feminin/magazine-cosmopolitan.html?utm_medium=Site_éditorial&utm_source=editorial&utm_campaign=conquete-COSMO&utm_term=cta-haut&utm_content=CTA"
]

def test_binary(self):
assert set(urls_from_html(HTML.encode())) == REF_SET
11 changes: 11 additions & 0 deletions ural/urls_from_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@
SCRIPT_TAG_BINARY_RE,
)

# HTML entity decoding shim. Prefer the standard `html.unescape`; when the
# `html` module cannot be imported (legacy interpreters), fall back to the
# `unescape` method exposed by `HTMLParser` instances.
try:
    from html import unescape
except ImportError:
    from HTMLParser import HTMLParser

    # One parser instance, built once at import time and reused by every call.
    _html_parser = HTMLParser()

    def unescape(string):
        """Return `string` with HTML character references decoded.

        For instance, `&amp;` becomes `&` and `&eacute;` becomes `é`.
        """
        return _html_parser.unescape(string)


def __urls_finditer(string):
string = SCRIPT_TAG_RE.sub("", string)
Expand Down Expand Up @@ -64,5 +74,6 @@ def urls_from_html(string, encoding="utf-8", errors="strict"):

for url in iterator:
url = url.strip()
url = unescape(url)

yield url

0 comments on commit b2c01bd

Please sign in to comment.