From d63cd4988bff94648580a2469c5197f6b17b75bb Mon Sep 17 00:00:00 2001
From: Albert Sawczyn
Date: Mon, 2 Sep 2024 20:30:42 +0200
Subject: [PATCH] feat: final version of scraping

---
 docker/nsa/docker-compose.yml       |  19 ----
 juddges/data/nsa/scraper.py         | 138 +++-------------------------
 scripts/nsa/scrap_documents_list.py | 121 ++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 142 deletions(-)
 create mode 100644 scripts/nsa/scrap_documents_list.py

diff --git a/docker/nsa/docker-compose.yml b/docker/nsa/docker-compose.yml
index 26fb703..b2a8be0 100755
--- a/docker/nsa/docker-compose.yml
+++ b/docker/nsa/docker-compose.yml
@@ -6,24 +6,5 @@ services:
     volumes:
       - nsa_dbdata:/data/db
 
-  multiple-tor:
-    build:
-      context: ./multiple-tor
-      args:
-        SOCKET_START_PORT: 9000
-        SOCKET_END_PORT: 9100
-        CONTROL_START_PORT: 9900
-        CONTROL_END_PORT: 10000
-      dockerfile: ./Dockerfile
-    ports:
-      - 9000-9100:9000-9100
-      - 9900-10000:9900-10000
-    command: >
-      python3 main.py --num-tors 50
-
 volumes:
   nsa_dbdata:
-  multitor_data:
-
-
-
diff --git a/juddges/data/nsa/scraper.py b/juddges/data/nsa/scraper.py
index 9bbaef6..04cf683 100644
--- a/juddges/data/nsa/scraper.py
+++ b/juddges/data/nsa/scraper.py
@@ -1,103 +1,28 @@
 import random
 import re
 import time
-from pathlib import Path
 
 import mechanicalsoup
-import typer
 from bs4 import BeautifulSoup
-from mpire import WorkerPool
-from random_user_agent.user_agent import UserAgent
+from loguru import logger
 from requests import HTTPError, RequestException
 from requests.adapters import HTTPAdapter
 from retry import retry
-from datetime import datetime, timedelta
-import pymongo
-from loguru import logger
-import urllib3
 from urllib3 import Retry
 
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-DB_URI = "mongodb://localhost:27017/"
-
-START_DATE = "1981-01-01"
-END_DATE = datetime.now().strftime("%Y-%m-%d")
-
-
-def main(
-    n_jobs: int = typer.Option(30),
-) -> None:
-    client = pymongo.MongoClient(DB_URI)
-    db = client["nsa"]
-    dates_col = db["dates"]
-    errors_col = db["errors"]
-
-    done = dates_col.find().distinct("date")
-    logger.info(f"Found {len(done)} done dates in the database.")
-
-    dates = generate_dates(START_DATE, END_DATE)
-
-    random.shuffle(dates)
-    dates = filter_done_dates(dates, done)
-
-    success = 0
-    error = 0
-    with WorkerPool(
-        n_jobs=n_jobs,
-    ) as pool:
-        for result in pool.imap_unordered(
-            process_date,
-            dates,
-            progress_bar=True,
-            progress_bar_options={"smoothing": 0},
-            chunk_size=1,
-        ):
-            assert len(result) == 1
-            if "error" in result:
-                result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                errors_col.insert_one(result["error"])
-                error += 1
-            elif "success" in result:
-                for r in result["success"]:
-                    r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                dates_col.insert_many(result["success"])
-                success += 1
-            else:
-                raise ValueError(f"Invalid result: {result}")
-    logger.info(f"Success: {success}, Error: {error}")
-
-
-def generate_dates(start_date: str, end_date: str) -> list[str]:
-    date_format = "%Y-%m-%d"
-    start = datetime.strptime(start_date, date_format)
-    end = datetime.strptime(end_date, date_format)
-
-    date_list = []
-    current_date = start
-    while current_date <= end:
-        date_list.append(current_date.strftime(date_format))
-        current_date += timedelta(days=1)
-
-    return date_list
-
-
-def filter_done_dates(dates: list[str], done: list[str]):
-    done_dates = set(done)
-    return [date for date in dates if date not in done_dates]
+
+class IncorrectNumberOfDocumentsFound(Exception):
+    pass
 
 
-class IncorrectNumberOfDocumentsFound(Exception):
+class IncorrectPage(Exception):
     pass
 
 
 class NSAScraper:
-    def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
+    def __init__(self, user_agent: str, proxy_config: dict[str, str] | None = None) -> None:
         self.browser = mechanicalsoup.StatefulBrowser(
-            user_agent=UserAgent(limit=1000)
-            .get_random_user_agent()
-            .encode("utf-8")
-            .decode("utf-8"),
+            user_agent=user_agent,
             requests_adapters={
                 "https://": HTTPAdapter(
                     max_retries=Retry(
@@ -115,7 +40,10 @@ def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
         if proxy_config:
             self.browser.session.proxies = proxy_config
 
-    @retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
+    @retry(
+        tries=15,
+        exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
+    )
     def search_documents_for_date(self, date):
         self._browser_open("https://orzeczenia.nsa.gov.pl/cbo")
         self.browser.select_form()
@@ -145,7 +73,7 @@ def _post_call(self, response) -> None:
         time_to_wait = random.normalvariate(1, 0.5)
         time.sleep(time_to_wait if time_to_wait > 0 else 0)
         if not self._correct_page():
-            raise ValueError(f"Incorrect page: {self.browser.page.text}")
+            raise IncorrectPage(f"Incorrect page: {self.browser.page.text}")
 
     def _correct_page(self) -> bool:
         title = "Centralna Baza Orzeczeń Sądów Administracyjnych"
@@ -168,7 +96,10 @@ def _retrieve_documents(self) -> dict[int, list[str] | None]:
             documents[page_id] = self._retrieve_documents_from_page(page_id)
         return documents
 
-    @retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
+    @retry(
+        tries=15,
+        exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
+    )
     def _retrieve_documents_from_page(self, page_id: int) -> list[str] | None:
         self._browser_open(f"https://orzeczenia.nsa.gov.pl/cbo/find?p={page_id}")
         if self.browser.url.endswith(f"{page_id}"):
@@ -196,42 +127,3 @@ def _find_documents_on_page(self, page: BeautifulSoup) -> list[str]:
                 filtered_links.append(href)
 
         return filtered_links
-
-
-def process_date(date: str):
-    proxy = {
-        "http": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:woqmoq64k0ex@brd.superproxy.io:22225",
-        "https": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:woqmoq64k0ex@brd.superproxy.io:22225",
-    }
-    nsa_scraper = NSAScraper(proxy)
-    try:
-        documents = nsa_scraper.search_documents_for_date(date)
-    except Exception as e:
-        error_message = f"Failed to scrape documents for date {date}: {e}"
-        logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
-        return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
-    if documents:
-        success = []
-        for page_id, document_ids in documents.items():
-            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
-            success.append(
-                {
-                    "date": date,
-                    "page_id": page_id,
-                    "success": page_success,
-                    "document_ids": document_ids,
-                }
-            )
-    else:
-        success = [
-            {
-                "date": date,
-                "page_id": None,
-                "success": "NO_DOCUMENTS",
-            }
-        ]
-    return {"success": success}
-
-
-if __name__ == "__main__":
-    typer.run(main)
diff --git a/scripts/nsa/scrap_documents_list.py b/scripts/nsa/scrap_documents_list.py
new file mode 100644
index 0000000..b224d8c
--- /dev/null
+++ b/scripts/nsa/scrap_documents_list.py
@@ -0,0 +1,121 @@
+import random
+from datetime import datetime, timedelta
+
+import pymongo
+import typer
+import urllib3
+from loguru import logger
+from mpire import WorkerPool
+from random_user_agent.user_agent import UserAgent
+
+from juddges.data.nsa.scraper import NSAScraper
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+DB_URI = "mongodb://localhost:27017/"
+
+START_DATE = "1981-01-01"
+END_DATE = datetime.now().strftime("%Y-%m-%d")
+
+
+def main(
+    n_jobs: int = typer.Option(30),
+    proxy_address: str = typer.Option(...),
+    db_uri: str = typer.Option(DB_URI),
+    start_date: str = typer.Option(START_DATE),
+    end_date: str = typer.Option(END_DATE),
+) -> None:
+    client = pymongo.MongoClient(db_uri)
+    db = client["nsa"]
+    dates_col = db["dates"]
+    errors_col = db["errors"]
+
+    done = dates_col.find().distinct("date")
+    logger.info(f"Found {len(done)} done dates in the database.")
+
+    dates = generate_dates(start_date, end_date)
+
+    random.shuffle(dates)
+    dates = filter_done_dates(dates, done)
+
+    success = 0
+    error = 0
+    with WorkerPool(n_jobs=n_jobs, shared_objects=proxy_address) as pool:
+        for result in pool.imap_unordered(
+            process_date,
+            dates,
+            progress_bar=True,
+            progress_bar_options={"smoothing": 0},
+            chunk_size=1,
+        ):
+            assert len(result) == 1
+            if "error" in result:
+                result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                errors_col.insert_one(result["error"])
+                error += 1
+            elif "success" in result:
+                for r in result["success"]:
+                    r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                dates_col.insert_many(result["success"])
+                success += 1
+            else:
+                raise ValueError(f"Invalid result: {result}")
+    logger.info(f"Success: {success}, Error: {error}")
+
+
+def generate_dates(start_date: str, end_date: str) -> list[str]:
+    date_format = "%Y-%m-%d"
+    start = datetime.strptime(start_date, date_format)
+    end = datetime.strptime(end_date, date_format)
+
+    date_list = []
+    current_date = start
+    while current_date <= end:
+        date_list.append(current_date.strftime(date_format))
+        current_date += timedelta(days=1)
+
+    return date_list
+
+
+def filter_done_dates(dates: list[str], done: list[str]) -> list[str]:
+    done_dates = set(done)
+    return [date for date in dates if date not in done_dates]
+
+
+def process_date(proxy_address: str, date: str) -> dict[str, list[dict[str, str]] | dict[str, str]]:
+    proxy = {"http": proxy_address, "https": proxy_address}
+    nsa_scraper = NSAScraper(
+        user_agent=UserAgent(limit=1000).get_random_user_agent().encode("utf-8").decode("utf-8"),
+        proxy_config=proxy,
+    )
+    try:
+        documents = nsa_scraper.search_documents_for_date(date)
+    except Exception as e:
+        error_message = f"Failed to scrape documents for date {date}: {e}"
+        logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
+        return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
+    if documents:
+        success = []
+        for page_id, document_ids in documents.items():
+            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
+            success.append(
+                {
+                    "date": date,
+                    "page_id": page_id,
+                    "success": page_success,
+                    "document_ids": document_ids,
+                }
+            )
+    else:
+        success = [
+            {
+                "date": date,
+                "page_id": None,
+                "success": "NO_DOCUMENTS",
+            }
+        ]
+    return {"success": success}
+
+
+if __name__ == "__main__":
+    typer.run(main)
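Usage sketch. After this change, NSAScraper no longer knows about Typer, MongoDB, mpire, or the user-agent generator: the caller passes an explicit user agent and proxy configuration and asks for a single date, while scripts/nsa/scrap_documents_list.py owns the worker pool and persistence. A minimal illustration of that calling pattern; the proxy URL and the date below are placeholders, not values from this patch:

    from random_user_agent.user_agent import UserAgent

    from juddges.data.nsa.scraper import NSAScraper

    # Placeholder endpoint and credentials, not a real proxy.
    PROXY_ADDRESS = "http://user:password@proxy.example.com:8080"

    scraper = NSAScraper(
        user_agent=UserAgent(limit=1000).get_random_user_agent(),
        proxy_config={"http": PROXY_ADDRESS, "https": PROXY_ADDRESS},
    )
    # Mapping of page_id -> list of document ids (None for a redirected page),
    # or a falsy value when no documents exist for that date.
    documents = scraper.search_documents_for_date("2020-01-15")

The full pipeline is driven by the new script, e.g. python scripts/nsa/scrap_documents_list.py --n-jobs 30 --proxy-address <proxy-url>.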