diff --git a/test/urls_from_html_test.py b/test/urls_from_html_test.py
index 0c5a91c..f86ee8e 100644
--- a/test/urls_from_html_test.py
+++ b/test/urls_from_html_test.py
@@ -3,6 +3,7 @@
# =============================================================================
# Ural URL Extraction From HTML Unit Tests
# =============================================================================
+from __future__ import unicode_literals
from ural import urls_from_html
HTML = """
@@ -74,6 +75,13 @@ def test_edge_cases(self):
) == ["http://lemonde.fr", "http://lemonde.fr", "http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_SCRIPT_TAGS)) == ["http://lemonde.fr"]
assert list(urls_from_html(HTML_WITH_PREFIXED_HREF)) == ["http://lemonde.fr"]
+ assert list(
+ urls_from_html(
+ ''
+ )
+ ) == [
+ "https://www.magazines.fr/nos-magazines/feminin/magazine-cosmopolitan.html?utm_medium=Site_éditorial&utm_source=editorial&utm_campaign=conquete-COSMO&utm_term=cta-haut&utm_content=CTA"
+ ]
def test_binary(self):
assert set(urls_from_html(HTML.encode())) == REF_SET
diff --git a/ural/urls_from_html.py b/ural/urls_from_html.py
index 2aa811e..9410ffc 100644
--- a/ural/urls_from_html.py
+++ b/ural/urls_from_html.py
@@ -13,6 +13,16 @@
SCRIPT_TAG_BINARY_RE,
)
+try:  # Prefer the `unescape` function exported by the `html` module.
+ from html import unescape
+except ImportError:  # `html.unescape` is unavailable: fall back to the `HTMLParser` module.
+ from HTMLParser import HTMLParser
+ # A single parser instance is created at import time and reused by every call.
+ _html_parser = HTMLParser()
+ # Fallback `unescape(string)`: same one-argument call shape as the name imported above.
+ def unescape(string):
+ return _html_parser.unescape(string)  # Delegate to the shared parser's `unescape` method.
+
def __urls_finditer(string):
string = SCRIPT_TAG_RE.sub("", string)
@@ -64,5 +74,6 @@ def urls_from_html(string, encoding="utf-8", errors="strict"):
for url in iterator:
url = url.strip()
+ url = unescape(url)
yield url