
use validators instead of custom regex
MaxDall committed Jul 4, 2023
1 parent e693ea5 commit 777e156
Showing 5 changed files with 10 additions and 11 deletions.
2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
@@ -76,7 +76,7 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
basic_logger.warn(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
- url=article.html.url,
+ url=article.html.responded_url,
content=article.html.content,
crawl_date=article.html.crawl_date,
publisher=publisher,
1 change: 1 addition & 0 deletions setup.cfg
@@ -27,6 +27,7 @@ install_requires =
typing-extensions >= 4.0, < 5.0
langdetect~=1.0.9
aiohttp~=3.8.4
+ validators~=0.20.0
python_requires = >=3.8
zip_safe = no

2 changes: 1 addition & 1 deletion src/fundus/scraping/article.py
@@ -54,7 +54,7 @@ def lang(self) -> Optional[str]:
try:
language = langdetect.detect(self.plaintext)
except langdetect.LangDetectException:
basic_logger.debug(f"Unable to detect language for article '{self.html.url}'")
basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")

# use @lang attribute of <html> tag as fallback
if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
12 changes: 4 additions & 8 deletions src/fundus/scraping/html.py
@@ -19,6 +19,7 @@
import aiohttp
import feedparser
import lxml.html
+ import validators
from aiohttp.client_exceptions import ClientError
from aiohttp.http_exceptions import HttpProcessingError
from aiohttp.web_exceptions import HTTPError
@@ -74,10 +75,6 @@ def supported_file_formats(self) -> List[str]:
return list(self.archive_mapping.keys())


- def validate_url(url: str) -> bool:
-     return bool(re.match(r"https?://(?:[a-zA-Z]|\d|[$-_@.&+]|[!*(),]|%[\da-fA-F][\da-fA-F])+", url))


@dataclass
class URLSource(AsyncIterable[str], ABC):
url: str
@@ -87,7 +84,7 @@ class URLSource(AsyncIterable[str], ABC):
def __post_init__(self):
if not self._request_header:
self._request_header = _default_header
- if not validate_url(self.url):
+ if not validators.url(self.url):
raise ValueError(f"Invalid url '{self.url}'")

def set_header(self, request_header: Dict[str, str]) -> None:
@@ -130,7 +127,7 @@ class Sitemap(URLSource):
async def _get_pre_filtered_urls(self) -> AsyncIterator[str]:
async def yield_recursive(sitemap_url: str) -> AsyncIterator[str]:
session = await session_handler.get_session()
- if not validate_url(sitemap_url):
+ if not validators.url(sitemap_url):
basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
async with session.get(url=sitemap_url, headers=self._request_header) as response:
try:
@@ -200,7 +197,7 @@ def _filter(self, url: str) -> bool:

async def fetch(self) -> AsyncIterator[HTML]:
async for url in self.url_source:
- if not validate_url(url):
+ if not validators.url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
continue

@@ -211,7 +208,6 @@ async def fetch(self) -> AsyncIterator[HTML]:
session = await session_handler.get_session()

async with session.get(url, headers=self.request_header) as response:

if self._filter(str(response.url)):
basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter")
continue
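
Why the swap above works as a drop-in: in validators 0.20.0, validators.url() returns True for a well-formed URL and a falsy ValidationFailure object otherwise, so the existing "if not ..." guards behave exactly like the old boolean regex check. A minimal sketch of that behaviour, assuming validators ~= 0.20.0 is installed (the example URLs are made up for illustration):

import validators

for candidate in ("https://www.example.com/article", "not a url"):
    # validators.url() returns True or a falsy ValidationFailure, so the
    # same truthiness check used in html.py applies here.
    if not validators.url(candidate):
        print(f"Skipped requested URL '{candidate}' because the URL is malformed")
    else:
        print(f"'{candidate}' looks valid")
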
4 changes: 3 additions & 1 deletion src/fundus/scraping/pipeline.py
@@ -111,7 +111,9 @@ def run(
event_loop = asyncio.get_event_loop()

def article_gen() -> Iterator[Article]:
- interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(*async_article_iterators)
+ interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(
+     *async_article_iterators
+ )
while True:
start_time = time.time()
batch: Optional[Iterable[Optional[Article]]] = event_loop.run_until_complete(
