
use validators instead of custom regex
MaxDall committed Jul 4, 2023
1 parent e693ea5 commit 777e156
Showing 5 changed files with 10 additions and 11 deletions.
2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
@@ -76,7 +76,7 @@ def get_test_article(enum: PublisherEnum) -> Optional[Article]:
basic_logger.warn(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
- url=article.html.url,
+ url=article.html.responded_url,
content=article.html.content,
crawl_date=article.html.crawl_date,
publisher=publisher,
1 change: 1 addition & 0 deletions setup.cfg
@@ -27,6 +27,7 @@ install_requires =
typing-extensions >= 4.0, < 5.0
langdetect~=1.0.9
aiohttp~=3.8.4
+ validators~=0.20.0
python_requires = >=3.8
zip_safe = no

2 changes: 1 addition & 1 deletion src/fundus/scraping/article.py
@@ -54,7 +54,7 @@ def lang(self) -> Optional[str]:
try:
language = langdetect.detect(self.plaintext)
except langdetect.LangDetectException:
basic_logger.debug(f"Unable to detect language for article '{self.html.url}'")
basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")

# use @lang attribute of <html> tag as fallback
if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
12 changes: 4 additions & 8 deletions src/fundus/scraping/html.py
@@ -19,6 +19,7 @@
import aiohttp
import feedparser
import lxml.html
+ import validators
from aiohttp.client_exceptions import ClientError
from aiohttp.http_exceptions import HttpProcessingError
from aiohttp.web_exceptions import HTTPError
@@ -74,10 +75,6 @@ def supported_file_formats(self) -> List[str]:
return list(self.archive_mapping.keys())


- def validate_url(url: str) -> bool:
-     return bool(re.match(r"https?://(?:[a-zA-Z]|\d|[$-_@.&+]|[!*(),]|%[\da-fA-F][\da-fA-F])+", url))


@dataclass
class URLSource(AsyncIterable[str], ABC):
url: str
@@ -87,7 +84,7 @@ class URLSource(AsyncIterable[str], ABC):
def __post_init__(self):
if not self._request_header:
self._request_header = _default_header
- if not validate_url(self.url):
+ if not validators.url(self.url):
raise ValueError(f"Invalid url '{self.url}'")

def set_header(self, request_header: Dict[str, str]) -> None:
@@ -130,7 +127,7 @@ class Sitemap(URLSource):
async def _get_pre_filtered_urls(self) -> AsyncIterator[str]:
async def yield_recursive(sitemap_url: str) -> AsyncIterator[str]:
session = await session_handler.get_session()
- if not validate_url(sitemap_url):
+ if not validators.url(sitemap_url):
basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
async with session.get(url=sitemap_url, headers=self._request_header) as response:
try:
@@ -200,7 +197,7 @@ def _filter(self, url: str) -> bool:

async def fetch(self) -> AsyncIterator[HTML]:
async for url in self.url_source:
- if not validate_url(url):
+ if not validators.url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
continue

@@ -211,7 +208,6 @@ async def fetch(self) -> AsyncIterator[HTML]:
session = await session_handler.get_session()

async with session.get(url, headers=self.request_header) as response:

if self._filter(str(response.url)):
basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter")
continue
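
Why the swap above works as a drop-in: in validators 0.20.0, validators.url() returns True for a well-formed URL and a falsy ValidationFailure object otherwise, so the existing "if not ..." guards behave exactly like the old boolean regex check. A minimal sketch of that behaviour, assuming validators ~= 0.20.0 is installed (the example URLs are made up for illustration):

import validators

for candidate in ("https://www.example.com/article", "not a url"):
    # validators.url() returns True or a falsy ValidationFailure, so the
    # same truthiness check used in html.py applies here.
    if not validators.url(candidate):
        print(f"Skipped requested URL '{candidate}' because the URL is malformed")
    else:
        print(f"'{candidate}' looks valid")
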
4 changes: 3 additions & 1 deletion src/fundus/scraping/pipeline.py
@@ -111,7 +111,9 @@ def run(
event_loop = asyncio.get_event_loop()

def article_gen() -> Iterator[Article]:
- interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(*async_article_iterators)
+ interleave: AsyncIterator[Iterable[Optional[Article]]] = batched_interleave_longest(
+     *async_article_iterators
+ )
while True:
start_time = time.time()
batch: Optional[Iterable[Optional[Article]]] = event_loop.run_until_complete(
