GH262: Rework URL filter logic [Based on #277] #278

Merged · 5 commits · Aug 29, 2023
src/fundus/scraping/html.py (11 additions, 10 deletions)
@@ -29,7 +29,7 @@
 
 from fundus.logging import basic_logger
 from fundus.scraping.filter import URLFilter, inverse
-from fundus.utils.more_async import async_next, make_iterable_async
+from fundus.utils.more_async import make_iterable_async
 
 _default_header = {"user-agent": "Fundus"}
 
@@ -252,25 +252,26 @@ def __init__(
         else:
             self.url_source = make_iterable_async(url_source)
         self.publisher = publisher
-        self.url_filter = [] if not url_filter else [url_filter]
+        self.url_filter = url_filter
         self.request_header = request_header or _default_header
         if isinstance(url_source, URLSource):
             url_source.set_header(self.request_header)
 
-    def add_url_filter(self, url_filter: URLFilter) -> None:
-        self.url_filter.append(url_filter)
+    async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[HTML]]:
+        combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + (
+            [url_filter] if url_filter else []
+        )
 
-    def _filter(self, url: str) -> bool:
-        return any(url_filter(url) for url_filter in self.url_filter)
+        def filter_url(u: str) -> bool:
+            return any(f(u) for f in combined_filters)
 
-    async def fetch(self) -> AsyncIterator[Optional[HTML]]:
         async for url in self.url_source:
             if not validators.url(url):
                 basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
                 yield None
                 continue
 
-            if self._filter(url):
+            if filter_url(url):
                 basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter")
                 yield None
                 continue
@@ -279,8 +280,8 @@ async def fetch(self) -> AsyncIterator[Optional[HTML]]:
 
             try:
                 async with session.get(url, headers=self.request_header) as response:
-                    if self._filter(str(response.url)):
-                        basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter")
+                    if filter_url(str(response.url)):
+                        basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
                         yield None
                         continue
                     html = await response.text()
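Note (not part of the diff): the reworked fetch combines the source-level filter set at construction with the per-call filter and skips a URL as soon as either one matches. Below is a minimal standalone sketch of that OR-combination; the filter names are hypothetical and the URLFilter alias is assumed to be a plain Callable[[str], bool], matching how the filters are called in the diff.

import re
from typing import Callable, List, Optional

# Assumed alias: a URL filter is a callable that takes a URL string and
# returns True when the URL should be skipped.
URLFilter = Callable[[str], bool]


def combine_filters(source_filter: Optional[URLFilter], call_filter: Optional[URLFilter]) -> URLFilter:
    """Mirror of the combined_filters logic above: skip if any active filter matches."""
    combined: List[URLFilter] = [f for f in (source_filter, call_filter) if f is not None]
    return lambda url: any(f(url) for f in combined)


# Hypothetical filters for illustration only.
skip_live_tickers: URLFilter = lambda url: bool(re.search(r"/live-ticker/", url))
already_seen = {"https://example.com/a"}
skip_seen: URLFilter = lambda url: url in already_seen

url_filter = combine_filters(skip_live_tickers, skip_seen)
print(url_filter("https://example.com/live-ticker/update"))  # True  -> skipped
print(url_filter("https://example.com/politics/article"))    # False -> fetched

Passing the per-call filter into fetch() keeps HTMLSource free of mutable filter state, replacing the removed add_url_filter/_filter pair.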
src/fundus/scraping/pipeline.py (7 additions, 13 deletions)
@@ -112,25 +112,19 @@ def constant_delay() -> float:
             else:
                 return delay
 
-        def build_unique_url_filter() -> URLFilter:
-            return lambda url: url in response_cache
+        def build_url_filter() -> URLFilter:
+            def _filter(url: str) -> bool:
+                return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache)
 
-        # build filters and delay. this is for readability and typeguard reasons
-        extraction_filter = build_extraction_filter()
-        unique_url_filter = build_unique_url_filter() if only_unique else None
-        final_delay = build_delay()
+            return _filter
 
-        for scraper in self.scrapers:
-            for source in scraper.sources:
-                if url_filter:
-                    source.add_url_filter(url_filter=url_filter)
-                if unique_url_filter:
-                    source.add_url_filter(url_filter=unique_url_filter)
+        final_delay = build_delay()
 
         async_article_iterators: List[AsyncIterator[Optional[Article]]] = [
             scraper.scrape(
                 error_handling=error_handling,
-                extraction_filter=extraction_filter,
+                extraction_filter=build_extraction_filter(),
+                url_filter=build_url_filter(),
             )
             for scraper in self.scrapers
         ]
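Note (not part of the diff): build_url_filter replaces the old add_url_filter plumbing by folding the user-supplied url_filter and the only_unique/response_cache check into one closure. A standalone rendering of that closure, with the surrounding crawl-method variables passed in explicitly for illustration:

from typing import Callable, Optional, Set

URLFilter = Callable[[str], bool]  # assumed alias, see fundus.scraping.filter


def make_build_url_filter(
    url_filter: Optional[URLFilter], only_unique: bool, response_cache: Set[str]
) -> Callable[[], URLFilter]:
    """Standalone version of the closure built in pipeline.py: a single URLFilter that
    rejects a URL if the user-supplied filter matches or, when only_unique is set,
    if the URL is already in the response cache."""

    def build_url_filter() -> URLFilter:
        def _filter(url: str) -> bool:
            return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache)

        return _filter

    return build_url_filter


cache: Set[str] = {"https://example.com/seen"}
f = make_build_url_filter(None, True, cache)()
print(f("https://example.com/seen"))   # True  -> skipped as duplicate
print(f("https://example.com/fresh"))  # False -> passed through

Because the closure reads response_cache at call time, URLs added to the cache during the crawl are filtered on later requests without rebuilding the filter.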
src/fundus/scraping/scraper.py (3 additions, 2 deletions)
@@ -5,7 +5,7 @@
 from fundus.logging import basic_logger
 from fundus.parser import ParserProxy
 from fundus.scraping.article import Article
-from fundus.scraping.filter import ExtractionFilter, Requires
+from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter
 from fundus.scraping.html import HTMLSource
 
 
@@ -22,6 +22,7 @@ async def scrape(
         self,
         error_handling: Literal["suppress", "catch", "raise"],
         extraction_filter: Optional[ExtractionFilter] = None,
+        url_filter: Optional[URLFilter] = None,
     ) -> AsyncIterator[Optional[Article]]:
         # TODO: add docstring; especially explain why returned Article is Optional
         if isinstance(extraction_filter, Requires):
@@ -43,7 +44,7 @@ async def scrape(
                 return
 
         for html_source in self.sources:
-            async for html in html_source.fetch():
+            async for html in html_source.fetch(url_filter=url_filter):
                 if html is None:
                     yield None
                     continue
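Note (not part of the diff): with this change the URL filter is threaded as an argument from the crawler through Scraper.scrape into HTMLSource.fetch, instead of being attached to sources up front. A toy stand-in (not fundus code) showing the parameter being forwarded through the async-generator chain:

import asyncio
from typing import AsyncIterator, Callable, List, Optional

URLFilter = Callable[[str], bool]  # assumed alias


async def fetch(urls: List[str], url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[str]]:
    # Stand-in for HTMLSource.fetch: the filter arrives per call instead of
    # being appended to mutable source state beforehand.
    for url in urls:
        if url_filter is not None and url_filter(url):
            yield None  # skipped, but keep the slot so callers can still count attempts
            continue
        yield f"<html for {url}>"


async def scrape(urls: List[str], url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[str]]:
    # Stand-in for Scraper.scrape: it only forwards the filter to fetch().
    async for html in fetch(urls, url_filter=url_filter):
        yield html


async def main() -> None:
    urls = ["https://example.com/a", "https://example.com/skip-me"]
    async for item in scrape(urls, url_filter=lambda u: "skip" in u):
        print(item)


asyncio.run(main())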