diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py
index 8d276bd6..55129bdf 100644
--- a/scripts/generate_parser_test_files.py
+++ b/scripts/generate_parser_test_files.py
@@ -76,7 +76,7 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
             basic_logger.warn(f"Couldn't get article for {publisher.name}. Skipping")
             continue
         html = HTMLTestFile(
-            url=article.html.url,
+            url=article.html.responded_url,
             content=article.html.content,
             crawl_date=article.html.crawl_date,
             publisher=publisher,
diff --git a/setup.cfg b/setup.cfg
index 7093734d..0b889fd1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,6 +27,7 @@ install_requires =
     typing-extensions >= 4.0, < 5.0
     langdetect~=1.0.9
     aiohttp~=3.8.4
+    validators~=0.20.0
 
 python_requires = >=3.8
 zip_safe = no
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index 88f2099d..989f4253 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -54,7 +54,7 @@ def lang(self) -> Optional[str]:
         try:
             language = langdetect.detect(self.plaintext)
         except langdetect.LangDetectException:
-            basic_logger.debug(f"Unable to detect language for article '{self.html.url}'")
+            basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")
 
         # use @lang attribute of <html> tag as fallback
         if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py
index 7a247174..775848fc 100644
--- a/src/fundus/scraping/html.py
+++ b/src/fundus/scraping/html.py
@@ -19,6 +19,7 @@
 import aiohttp
 import feedparser
 import lxml.html
+import validators
 from aiohttp.client_exceptions import ClientError
 from aiohttp.http_exceptions import HttpProcessingError
 from aiohttp.web_exceptions import HTTPError
@@ -74,10 +75,6 @@ def supported_file_formats(self) -> List[str]:
         return list(self.archive_mapping.keys())
 
 
-def validate_url(url: str) -> bool:
-    return bool(re.match(r"https?://(?:[a-zA-Z]|\d|[$-_@.&+]|[!*(),]|%[\da-fA-F][\da-fA-F])+", url))
-
-
 @dataclass
 class URLSource(AsyncIterable[str], ABC):
     url: str
@@ -87,7 +84,7 @@ class URLSource(AsyncIterable[str], ABC):
     def __post_init__(self):
         if not self._request_header:
             self._request_header = _default_header
-        if not validate_url(self.url):
+        if not validators.url(self.url):
             raise ValueError(f"Invalid url '{self.url}'")
 
     def set_header(self, request_header: Dict[str, str]) -> None:
@@ -130,7 +127,7 @@ class Sitemap(URLSource):
     async def _get_pre_filtered_urls(self) -> AsyncIterator[str]:
         async def yield_recursive(sitemap_url: str) -> AsyncIterator[str]:
             session = await session_handler.get_session()
-            if not validate_url(sitemap_url):
+            if not validators.url(sitemap_url):
                 basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
             async with session.get(url=sitemap_url, headers=self._request_header) as response:
                 try:
@@ -200,7 +197,7 @@ def _filter(self, url: str) -> bool:
 
     async def fetch(self) -> AsyncIterator[HTML]:
         async for url in self.url_source:
-            if not validate_url(url):
+            if not validators.url(url):
                 basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
                 continue
 
@@ -211,7 +208,6 @@ async def fetch(self) -> AsyncIterator[HTML]:
 
             session = await session_handler.get_session()
             async with session.get(url, headers=self.request_header) as response:
-
                 if self._filter(str(response.url)):
                     basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter")
                     continue
diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py
index 79edbd69..784d78d9 100644
--- a/src/fundus/scraping/pipeline.py
+++ b/src/fundus/scraping/pipeline.py
@@ -111,7 +111,9 @@ def run(
         event_loop = asyncio.get_event_loop()
 
         def article_gen() -> Iterator[Article]:
-            interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(*async_article_iterators)
+            interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(
+                *async_article_iterators
+            )
             while True:
                 start_time = time.time()
                 batch: Optional[Iterable[Optional[Article]]] = event_loop.run_until_complete(
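
Note on the URL-validation change: the diff drops the hand-rolled `validate_url` regex in `html.py` in favor of the `validators` package (pinned as `validators~=0.20.0` in setup.cfg). A minimal sketch of the behavior the new guard relies on, assuming validators 0.20.x (the snippet is illustrative, not part of the diff): `validators.url()` returns `True` for a well-formed URL and a falsy `ValidationFailure` object otherwise, so `if not validators.url(url)` works as a drop-in replacement without a try/except.

```python
# Illustrative only; assumes validators ~= 0.20.0 as pinned in setup.cfg.
import validators

for candidate in ("https://example.com/news", "not-a-url"):
    result = validators.url(candidate)
    if not result:
        # Invalid URLs yield a falsy ValidationFailure instead of raising,
        # which is why `if not validators.url(url)` suffices as a guard.
        print(f"Skipped '{candidate}' because the URL is malformed")
    else:
        print(f"'{candidate}' is well-formed")
```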