diff --git a/juddges/data/nsa/scraper.py b/juddges/data/nsa/scraper.py
index eba6d72..4e4cfff 100644
--- a/juddges/data/nsa/scraper.py
+++ b/juddges/data/nsa/scraper.py
@@ -1,20 +1,27 @@
 import re
 
 import mechanicalsoup
+import typer
 from bs4 import BeautifulSoup
-from datasets import tqdm
+from mpire import WorkerPool
 from tqdm import trange
 
 from datetime import datetime, timedelta
 import pymongo
 from loguru import logger
 
+from juddges.utils.tor import TorClient
+
 DB_URI = "mongodb://localhost:27017/"
 START_DATE = "1981-01-01"
 END_DATE = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")
 
 
-def main() -> None:
+def main(
+    socket_port_start: int = typer.Option(9000),
+    config_port_start: int = typer.Option(9900),
+    n_jobs: int = typer.Option(1),
+) -> None:
     client = pymongo.MongoClient(DB_URI)
     db = client["nsa"]
     dates_col = db["dates"]
@@ -32,39 +39,19 @@ def main() -> None:
     start_end_dates = list(reversed(list(zip(dates, dates[1:]))))
     start_end_dates = filter_done_dates(start_end_dates, done)
 
-    with tqdm(total=len(start_end_dates), desc="Searching for documents") as pbar:
-        for start_date, end_date in start_end_dates:
-            pbar.set_postfix({"Date": f"{start_date} - {end_date}"})
-
-            nsa_scraper = NSAScraper()
-
-            documents = nsa_scraper.search_documents(start_date, end_date)
-            if documents:
-                success = []
-                for page_id, document_ids in documents.items():
-                    page_success = "FOUND" if document_ids is not None else "ERROR"
-                    success.append(
-                        {
-                            "start_date": start_date,
-                            "end_date": end_date,
-                            "page_id": page_id,
-                            "success": page_success,
-                            "document_ids": document_ids,
-                        }
-                    )
-            else:
-                success = [
-                    {
-                        "start_date": start_date,
-                        "end_date": end_date,
-                        "page_id": None,
-                        "success": "NO_DOCUMENTS",
-                    }
-                ]
-            dates_col.insert_many(success)
-            pbar.update()
-
-
+    worker_torclient_assign = [
+        TorClient("", socket_port_start + i, config_port_start + i) for i in range(n_jobs)
+    ]
+    with WorkerPool(
+        n_jobs=n_jobs,
+        pass_worker_id=True,
+        shared_objects=worker_torclient_assign,
+        start_method="threading",
+    ) as pool:
+        for result in pool.map(
+            process_date_range, start_end_dates, progress_bar=True, chunk_size=1
+        ):
+            dates_col.insert_many(result)
 
 
 def generate_dates(start_date: str, end_date: str) -> list[str]:
@@ -86,22 +73,55 @@ def filter_done_dates(dates: list[tuple[str, str]], done: list[tuple[str, str]])
     return [date for date in dates if date not in done_dates]
 
 
-class NSAScraper:
+def process_date_range(
+    worker_id: int, worker_torclient_assign: list[TorClient], start_date, end_date
+):
+    tor_client = worker_torclient_assign[worker_id]
+    nsa_scraper = NSAScraper(tor_client)
+    documents = nsa_scraper.search_documents(start_date, end_date)
+    if documents:
+        success = []
+        for page_id, document_ids in documents.items():
+            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
+            success.append(
+                {
+                    "start_date": start_date,
+                    "end_date": end_date,
+                    "page_id": page_id,
+                    "success": page_success,
+                    "document_ids": document_ids,
+                }
+            )
+    else:
+        success = [
+            {
+                "start_date": start_date,
+                "end_date": end_date,
+                "page_id": None,
+                "success": "NO_DOCUMENTS",
+            }
+        ]
+    return success
+
+
+class NSAScraper:
+    def __init__(self, tor_client: TorClient | None = None) -> None:
+        self.browser = mechanicalsoup.StatefulBrowser(user_agent="MechanicalSoup")
+        if tor_client:
+            self.browser.session.proxies = tor_client.proxy_config
 
     def search_documents(self, start_date, end_date):
-        browser = mechanicalsoup.StatefulBrowser(user_agent="MechanicalSoup")
-        response = browser.open("https://orzeczenia.nsa.gov.pl/cbo")
+        response = self.browser.open("https://orzeczenia.nsa.gov.pl/cbo")
         if response.status_code != 200:
             raise Exception(f"Failed to open the website. Status code: {response.status_code}")
 
-        browser.select_form()
+        self.browser.select_form()
         # browser["symbole"] = "648"
 
-        browser["odDaty"] = start_date
-        browser["doDaty"] = end_date
-        browser.submit_selected()
-        if self.any_documents_found(browser):
-            documents = self.retrieve_documents(browser)
+        self.browser["odDaty"] = start_date
+        self.browser["doDaty"] = end_date
+        self.browser.submit_selected()
+        if self.any_documents_found(self.browser):
+            documents = self.retrieve_documents(self.browser)
             num_documents = sum(map(lambda x: len(x) if x else 0, documents.values()))
             print(f"Found {num_documents} documents on {len(documents)} pages.")
             return documents
@@ -113,7 +133,9 @@ def any_documents_found(self, browser: mechanicalsoup.StatefulBrowser) -> bool:
         warning_text = "Nie znaleziono orzeczeń spełniających podany warunek!"
         return warning_text not in browser.page.text
 
-    def retrieve_documents(self, browser: mechanicalsoup.StatefulBrowser) -> dict[int, list[str] | None]:
+    def retrieve_documents(
+        self, browser: mechanicalsoup.StatefulBrowser
+    ) -> dict[int, list[str] | None]:
         page_links = browser.links(url_regex="^/cbo/find\?p=")
         if not page_links:
             last_page = 1
@@ -128,7 +150,7 @@ def retrieve_documents(self, browser: mechanicalsoup.StatefulBrowser) -> dict[in
             if browser.url.endswith(f"{page_id}"):
                 page_documents = self.find_documents_on_page(browser.page)
                 assert (
-                    0 < len(page_documents) <= 10
+                    0 < len(page_documents) <= 10
                 ), f"Page {page_id} has {len(page_documents)} documents"
                 documents[page_id] = page_documents
             else:
@@ -151,5 +173,4 @@ def find_documents_on_page(self, page: BeautifulSoup) -> list[str]:
         return filtered_links
 
 
-
-main()
+typer.run(main)
diff --git a/juddges/utils/tor.py b/juddges/utils/tor.py
index 4d92e89..6e870f7 100644
--- a/juddges/utils/tor.py
+++ b/juddges/utils/tor.py
@@ -25,5 +25,3 @@ def get_session(self, **kwargs) -> requests.Session:
         http.mount("https://", adapter)
         http.mount("http://", adapter)
         return http
-
-
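
For reference, a minimal standalone sketch (not part of the patch) of the mpire calling convention the new main() relies on: with pass_worker_id=True and shared_objects set, each tuple from start_end_dates is unpacked after the worker id and the shared list, which is how process_date_range receives (worker_id, worker_torclient_assign, start_date, end_date). The date ranges and client placeholders below are illustrative stand-ins, not values from the repository.

from mpire import WorkerPool

def process_date_range(worker_id, shared_clients, start_date, end_date):
    # worker_id selects this worker's dedicated client from the shared list
    return f"worker {worker_id}: {start_date} - {end_date} via {shared_clients[worker_id]}"

date_ranges = [("2020-01-01", "2020-01-11"), ("2020-01-11", "2020-01-21")]  # illustrative
clients = ["proxy-9000", "proxy-9001"]  # stand-ins for the per-worker TorClient instances

with WorkerPool(
    n_jobs=2, pass_worker_id=True, shared_objects=clients, start_method="threading"
) as pool:
    for result in pool.map(process_date_range, date_ranges, chunk_size=1):
        print(result)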