From d546105084927f9adebaab8cb1c8bd52aa1b0d68 Mon Sep 17 00:00:00 2001 From: MaxDall Date: Mon, 10 Jul 2023 16:21:16 +0200 Subject: [PATCH 1/3] GH262: remove `add_url_filter` from `HTMLSession` and propagate parameter instead --- src/fundus/scraping/html.py | 19 +++++++++---------- src/fundus/scraping/pipeline.py | 21 +++++++-------------- src/fundus/scraping/scraper.py | 5 +++-- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 81ec7161..e0e74187 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -29,7 +29,7 @@ from fundus.logging import basic_logger from fundus.scraping.filter import URLFilter, inverse -from fundus.utils.more_async import async_next, make_iterable_async +from fundus.utils.more_async import make_iterable_async _default_header = {"user-agent": "Fundus"} @@ -240,24 +240,23 @@ def __init__( else: self.url_source = make_iterable_async(url_source) self.publisher = publisher - self.url_filter = [] if not url_filter else [url_filter] + self.url_filter = url_filter self.request_header = request_header or _default_header if isinstance(url_source, URLSource): url_source.set_header(self.request_header) - def add_url_filter(self, url_filter: URLFilter) -> None: - self.url_filter.append(url_filter) + async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[HTML]: + combined_filters = ([self.url_filter] if self.url_filter else []) + ([url_filter] if url_filter else []) - def _filter(self, url: str) -> bool: - return any(url_filter(url) for url_filter in self.url_filter) + def filter_url(u: str) -> bool: + return any(f(u) for f in combined_filters) - async def fetch(self) -> AsyncIterator[HTML]: async for url in self.url_source: if not validators.url(url): basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed") continue - if self._filter(url): + if filter_url(url): basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter") continue @@ -265,8 +264,8 @@ async def fetch(self) -> AsyncIterator[HTML]: try: async with session.get(url, headers=self.request_header) as response: - if self._filter(str(response.url)): - basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter") + if filter_url(str(response.url)): + basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") continue html = await response.text() response.raise_for_status() diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index 26e41752..c0147f36 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -111,27 +111,20 @@ def constant_delay() -> float: else: return delay - def build_unique_url_filter() -> URLFilter: - return lambda url: url in response_cache + def build_url_filter() -> URLFilter: + def _filter(url: str) -> bool: + return (url_filter and url_filter(url)) or (only_unique and url in response_cache) - # build filters and delay. 
this is for readability and typeguard reasons - extraction_filter = build_extraction_filter() - unique_url_filter = build_unique_url_filter() if only_unique else None - final_delay = build_delay() + return _filter + final_delay = build_delay() response_cache: Set[str] = set() - for scraper in self.scrapers: - for source in scraper.sources: - if url_filter: - source.add_url_filter(url_filter=url_filter) - if unique_url_filter: - source.add_url_filter(url_filter=unique_url_filter) - async_article_iterators: List[AsyncIterator[Optional[Article]]] = [ scraper.scrape( error_handling=error_handling, - extraction_filter=extraction_filter, + extraction_filter=build_extraction_filter(), + url_filter=build_url_filter(), ) for scraper in self.scrapers ] diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 7be3c976..464cca9f 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -5,7 +5,7 @@ from fundus.logging import basic_logger from fundus.parser import ParserProxy from fundus.scraping.article import Article -from fundus.scraping.filter import ExtractionFilter, Requires +from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter from fundus.scraping.html import HTMLSource @@ -22,6 +22,7 @@ async def scrape( self, error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, ) -> AsyncIterator[Optional[Article]]: # TODO: add docstring; especially explain why returned Article is Optional if isinstance(extraction_filter, Requires): @@ -43,7 +44,7 @@ async def scrape( return for html_source in self.sources: - async for html in html_source.fetch(): + async for html in html_source.fetch(url_filter=url_filter): try: extraction = self.parser(html.crawl_date).parse(html.content, error_handling) From d62f0e87b7f0374f574ec4f12ab0fbb921fbc5e9 Mon Sep 17 00:00:00 2001 From: Max Dallabetta <46926170+MaxDall@users.noreply.github.com> Date: Tue, 29 Aug 2023 16:55:02 +0200 Subject: [PATCH 2/3] Update src/fundus/scraping/html.py Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com> --- src/fundus/scraping/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index e0e74187..7a6eb78c 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -246,7 +246,7 @@ def __init__( url_source.set_header(self.request_header) async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[HTML]: - combined_filters = ([self.url_filter] if self.url_filter else []) + ([url_filter] if url_filter else []) + combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + ([url_filter] if url_filter else []) def filter_url(u: str) -> bool: return any(f(u) for f in combined_filters) From e87885d17a7d6f82846d569836292abed30aefcf Mon Sep 17 00:00:00 2001 From: Max Dallabetta <46926170+MaxDall@users.noreply.github.com> Date: Tue, 29 Aug 2023 16:55:09 +0200 Subject: [PATCH 3/3] Update src/fundus/scraping/pipeline.py Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com> --- src/fundus/scraping/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index c0147f36..dda935ae 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -113,7 +113,7 @@ def constant_delay() -> float: def 
build_url_filter() -> URLFilter: def _filter(url: str) -> bool: - return (url_filter and url_filter(url)) or (only_unique and url in response_cache) + return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache) return _filter
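
For reviewers, a minimal sketch (not part of the patch) of the behaviour this series introduces: the per-crawl `url_filter` built in `pipeline.py` is now passed through `Scraper.scrape` into `fetch`, where it is combined with the source-level filter, and a URL is skipped as soon as any filter matches. The `URLFilter` alias, the sample filters, and the URLs below are simplified placeholders for illustration, not the actual Fundus API.

# Illustrative sketch only -- mirrors the combined-filter logic added to fetch()
# and build_url_filter(); the alias and sample data are hypothetical.
from typing import Callable, List, Optional, Set

URLFilter = Callable[[str], bool]  # simplified stand-in for fundus.scraping.filter.URLFilter

def combine_filters(source_filter: Optional[URLFilter], call_filter: Optional[URLFilter]) -> URLFilter:
    # Same idea as `combined_filters` in fetch(): collect whichever filters exist
    # and reject a URL if any of them matches.
    filters: List[URLFilter] = [f for f in (source_filter, call_filter) if f is not None]
    return lambda url: any(f(url) for f in filters)

# A source-level filter plus a per-crawl uniqueness filter, analogous to how
# build_url_filter() combines `url_filter` with the `response_cache` check.
response_cache: Set[str] = set()
skip_tickers: URLFilter = lambda url: "/live-ticker/" in url
skip_cached: URLFilter = lambda url: url in response_cache

filter_url = combine_filters(skip_tickers, skip_cached)

for url in ("https://example.com/article-1", "https://example.com/live-ticker/2"):
    if filter_url(url):
        continue  # skipped, as in fetch()
    response_cache.add(url)
    print(f"would request {url}")

Propagating the filter as a call parameter (rather than mutating the source via `add_url_filter`) keeps the HTML source free of per-crawl state, so the same source instance can serve successive crawls with different filters.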