From b2c01bdf4074d607aed99ff44b5fcaf590691bb6 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 19 Jul 2023 22:10:48 +0200 Subject: [PATCH] urls_from_html now decode html entities --- test/urls_from_html_test.py | 8 ++++++++ ural/urls_from_html.py | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/test/urls_from_html_test.py b/test/urls_from_html_test.py index 0c5a91c..f86ee8e 100644 --- a/test/urls_from_html_test.py +++ b/test/urls_from_html_test.py @@ -3,6 +3,7 @@ # ============================================================================= # Ural URL Extraction From HTML Unit Tests # ============================================================================= +from __future__ import unicode_literals from ural import urls_from_html HTML = """ @@ -74,6 +75,13 @@ def test_edge_cases(self): ) == ["http://lemonde.fr", "http://lemonde.fr", "http://lemonde.fr"] assert list(urls_from_html(HTML_WITH_SCRIPT_TAGS)) == ["http://lemonde.fr"] assert list(urls_from_html(HTML_WITH_PREFIXED_HREF)) == ["http://lemonde.fr"] + assert list( + urls_from_html( + '' + ) + ) == [ + "https://www.magazines.fr/nos-magazines/feminin/magazine-cosmopolitan.html?utm_medium=Site_éditorial&utm_source=editorial&utm_campaign=conquete-COSMO&utm_term=cta-haut&utm_content=CTA" + ] def test_binary(self): assert set(urls_from_html(HTML.encode())) == REF_SET diff --git a/ural/urls_from_html.py b/ural/urls_from_html.py index 2aa811e..9410ffc 100644 --- a/ural/urls_from_html.py +++ b/ural/urls_from_html.py @@ -13,6 +13,16 @@ SCRIPT_TAG_BINARY_RE, ) +try: + from html import unescape +except ImportError: + from HTMLParser import HTMLParser + + _html_parser = HTMLParser() + + def unescape(string): + return _html_parser.unescape(string) + def __urls_finditer(string): string = SCRIPT_TAG_RE.sub("", string) @@ -64,5 +74,6 @@ def urls_from_html(string, encoding="utf-8", errors="strict"): for url in iterator: url = url.strip() + url = unescape(url) yield url