Merge pull request #278 from flairNLP/GH262-rework_url_filter
GH262: Rework URL filter logic [Based on #277]
MaxDall authored Aug 29, 2023
2 parents 374663c + 94454ea commit fea579a
Showing 3 changed files with 21 additions and 25 deletions.
21 changes: 11 additions & 10 deletions src/fundus/scraping/html.py
@@ -29,7 +29,7 @@

 from fundus.logging import basic_logger
 from fundus.scraping.filter import URLFilter, inverse
-from fundus.utils.more_async import async_next, make_iterable_async
+from fundus.utils.more_async import make_iterable_async

 _default_header = {"user-agent": "Fundus"}

@@ -252,25 +252,26 @@ def __init__(
         else:
             self.url_source = make_iterable_async(url_source)
         self.publisher = publisher
-        self.url_filter = [] if not url_filter else [url_filter]
+        self.url_filter = url_filter
         self.request_header = request_header or _default_header
         if isinstance(url_source, URLSource):
             url_source.set_header(self.request_header)

-    def add_url_filter(self, url_filter: URLFilter) -> None:
-        self.url_filter.append(url_filter)
+    async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[HTML]]:
+        combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + (
+            [url_filter] if url_filter else []
+        )

-    def _filter(self, url: str) -> bool:
-        return any(url_filter(url) for url_filter in self.url_filter)
+        def filter_url(u: str) -> bool:
+            return any(f(u) for f in combined_filters)

-    async def fetch(self) -> AsyncIterator[Optional[HTML]]:
         async for url in self.url_source:
             if not validators.url(url):
                 basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
                 yield None
                 continue

-            if self._filter(url):
+            if filter_url(url):
                 basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter")
                 yield None
                 continue
@@ -279,8 +280,8 @@ async def fetch(self) -> AsyncIterator[Optional[HTML]]:

             try:
                 async with session.get(url, headers=self.request_header) as response:
-                    if self._filter(str(response.url)):
-                        basic_logger.debug(f"Skipped responded URL '{url}' because of URL filter")
+                    if filter_url(str(response.url)):
+                        basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
                         yield None
                         continue
                     html = await response.text()
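The net effect of the html.py change is that HTMLSource.fetch now accepts an optional per-call URLFilter and combines it with whatever filter was passed at construction time; a URL is skipped if any active filter matches. A minimal standalone sketch of that combination logic, assuming URLFilter is simply a callable returning True for URLs to skip (the combine_filters helper and the example filters below are illustrative, not part of Fundus):

from typing import Callable, List, Optional

# Assumption: URLFilter behaves like a callable taking a URL string and
# returning True when the URL should be skipped.
URLFilter = Callable[[str], bool]


def combine_filters(source_filter: Optional[URLFilter], fetch_filter: Optional[URLFilter]) -> URLFilter:
    # Mirror the combination fetch() performs: skip if any active filter matches.
    combined: List[URLFilter] = [f for f in (source_filter, fetch_filter) if f is not None]

    def filter_url(url: str) -> bool:
        return any(f(url) for f in combined)

    return filter_url


# Hypothetical filters, for demonstration only
skip_tickers: URLFilter = lambda url: "liveticker" in url
skip_videos: URLFilter = lambda url: "/video/" in url

filter_url = combine_filters(skip_tickers, skip_videos)
print(filter_url("https://example.com/liveticker/42"))   # True  -> skipped
print(filter_url("https://example.com/politics/story"))  # False -> fetched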
20 changes: 7 additions & 13 deletions src/fundus/scraping/pipeline.py
@@ -112,25 +112,19 @@ def constant_delay() -> float:
             else:
                 return delay

-        def build_unique_url_filter() -> URLFilter:
-            return lambda url: url in response_cache
+        def build_url_filter() -> URLFilter:
+            def _filter(url: str) -> bool:
+                return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache)

-        # build filters and delay. this is for readability and typeguard reasons
-        extraction_filter = build_extraction_filter()
-        unique_url_filter = build_unique_url_filter() if only_unique else None
-        final_delay = build_delay()
+            return _filter

-        for scraper in self.scrapers:
-            for source in scraper.sources:
-                if url_filter:
-                    source.add_url_filter(url_filter=url_filter)
-                if unique_url_filter:
-                    source.add_url_filter(url_filter=unique_url_filter)
+        final_delay = build_delay()

         async_article_iterators: List[AsyncIterator[Optional[Article]]] = [
             scraper.scrape(
                 error_handling=error_handling,
-                extraction_filter=extraction_filter,
+                extraction_filter=build_extraction_filter(),
+                url_filter=build_url_filter(),
             )
             for scraper in self.scrapers
         ]
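In pipeline.py, the previously separate source-level filters (the user-supplied one and the uniqueness check) are folded into a single closure handed to each scraper. Roughly, and with the enclosing method's locals turned into explicit parameters so the sketch is self-contained (the signature below is an assumption for illustration, not the Fundus API):

from typing import Callable, Optional, Set

URLFilter = Callable[[str], bool]  # True means "skip this URL"


def build_url_filter(user_filter: Optional[URLFilter], only_unique: bool, response_cache: Set[str]) -> URLFilter:
    # Skip a URL if the user-supplied filter matches or, when only_unique is
    # set, if the URL was already crawled (i.e. sits in the response cache).
    def _filter(url: str) -> bool:
        return (user_filter is not None and user_filter(url)) or (only_unique and url in response_cache)

    return _filter


seen: Set[str] = {"https://example.com/a"}
url_filter = build_url_filter(user_filter=None, only_unique=True, response_cache=seen)
print(url_filter("https://example.com/a"))  # True  -> already seen, skip
print(url_filter("https://example.com/b"))  # False -> crawl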
5 changes: 3 additions & 2 deletions src/fundus/scraping/scraper.py
@@ -5,7 +5,7 @@
 from fundus.logging import basic_logger
 from fundus.parser import ParserProxy
 from fundus.scraping.article import Article
-from fundus.scraping.filter import ExtractionFilter, Requires
+from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter
 from fundus.scraping.html import HTMLSource


@@ -22,6 +22,7 @@ async def scrape(
         self,
         error_handling: Literal["suppress", "catch", "raise"],
         extraction_filter: Optional[ExtractionFilter] = None,
+        url_filter: Optional[URLFilter] = None,
     ) -> AsyncIterator[Optional[Article]]:
         # TODO: add docstring; especially explain why returned Article is Optional
         if isinstance(extraction_filter, Requires):
@@ -43,7 +44,7 @@ async def scrape(
                 return

         for html_source in self.sources:
-            async for html in html_source.fetch():
+            async for html in html_source.fetch(url_filter=url_filter):
                 if html is None:
                     yield None
                     continue
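scraper.py only threads the new parameter through: Scraper.scrape accepts an optional URLFilter and forwards it to every HTMLSource.fetch call. A toy async sketch of that pass-through (DummySource is a stand-in for illustration, not the Fundus class), which also mirrors fetch() yielding None for skipped URLs:

import asyncio
from typing import AsyncIterator, Callable, List, Optional

URLFilter = Callable[[str], bool]


class DummySource:
    # Stand-in for HTMLSource: yields URL strings instead of downloaded HTML.
    def __init__(self, urls: List[str], url_filter: Optional[URLFilter] = None) -> None:
        self.urls = urls
        self.url_filter = url_filter

    async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[str]]:
        combined: List[URLFilter] = [f for f in (self.url_filter, url_filter) if f is not None]
        for url in self.urls:
            if any(f(url) for f in combined):
                # Keep the slot for skipped URLs by yielding None, as fetch() does.
                yield None
                continue
            yield url


async def main() -> None:
    source = DummySource(["https://example.com/a", "https://example.com/b"])
    # Per-call filter, as Scraper.scrape now forwards it to fetch()
    async for item in source.fetch(url_filter=lambda url: url.endswith("/a")):
        print(item)  # None, then "https://example.com/b"


asyncio.run(main())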
