Showing 3 changed files with 136 additions and 142 deletions.
@@ -1,103 +1,28 @@
 import random
 import re
 import time
-from pathlib import Path

 import mechanicalsoup
-import typer
 from bs4 import BeautifulSoup
-from mpire import WorkerPool
-from random_user_agent.user_agent import UserAgent
 from loguru import logger
 from requests import HTTPError, RequestException
 from requests.adapters import HTTPAdapter
 from retry import retry
-from datetime import datetime, timedelta
-import pymongo
-from loguru import logger
-import urllib3
 from urllib3 import Retry

-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-DB_URI = "mongodb://localhost:27017/"
-
-START_DATE = "1981-01-01"
-END_DATE = datetime.now().strftime("%Y-%m-%d")
-
-
-def main(
-    n_jobs: int = typer.Option(30),
-) -> None:
-    client = pymongo.MongoClient(DB_URI)
-    db = client["nsa"]
-    dates_col = db["dates"]
-    errors_col = db["errors"]
-
-    done = dates_col.find().distinct("date")
-    logger.info(f"Found {len(done)} done dates in the database.")
-
-    dates = generate_dates(START_DATE, END_DATE)
-
-    random.shuffle(dates)
-    dates = filter_done_dates(dates, done)
-
-    success = 0
-    error = 0
-    with WorkerPool(
-        n_jobs=n_jobs,
-    ) as pool:
-        for result in pool.imap_unordered(
-            process_date,
-            dates,
-            progress_bar=True,
-            progress_bar_options={"smoothing": 0},
-            chunk_size=1,
-        ):
-            assert len(result) == 1
-            if "error" in result:
-                result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                errors_col.insert_one(result["error"])
-                error += 1
-            elif "success" in result:
-                for r in result["success"]:
-                    r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                dates_col.insert_many(result["success"])
-                success += 1
-            else:
-                raise ValueError(f"Invalid result: {result}")
-    logger.info(f"Success: {success}, Error: {error}")
-
-
-def generate_dates(start_date: str, end_date: str) -> list[str]:
-    date_format = "%Y-%m-%d"
-    start = datetime.strptime(start_date, date_format)
-    end = datetime.strptime(end_date, date_format)
-
-    date_list = []
-    current_date = start
-    while current_date <= end:
-        date_list.append(current_date.strftime(date_format))
-        current_date += timedelta(days=1)
-
-    return date_list
-
-
-def filter_done_dates(dates: list[str], done: list[str]):
-    done_dates = set(done)
-    return [date for date in dates if date not in done_dates]
 class IncorrectNumberOfDocumentsFound(Exception):
     pass


+class IncorrectPage(Exception):
+    pass


 class NSAScraper:
-    def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
+    def __init__(self, user_agent: str, proxy_config: dict[str, str] | None = None) -> None:
         self.browser = mechanicalsoup.StatefulBrowser(
-            user_agent=UserAgent(limit=1000)
-            .get_random_user_agent()
-            .encode("utf-8")
-            .decode("utf-8"),
+            user_agent=user_agent,
             requests_adapters={
                 "https://": HTTPAdapter(
                     max_retries=Retry(
@@ -115,7 +40,10 @@ def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
         if proxy_config:
             self.browser.session.proxies = proxy_config

-    @retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
+    @retry(
+        tries=15,
+        exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
+    )
     def search_documents_for_date(self, date):
         self._browser_open("https://orzeczenia.nsa.gov.pl/cbo")
         self.browser.select_form()
@@ -145,7 +73,7 @@ def _post_call(self, response) -> None:
         time_to_wait = random.normalvariate(1, 0.5)
         time.sleep(time_to_wait if time_to_wait > 0 else 0)
         if not self._correct_page():
-            raise ValueError(f"Incorrect page: {self.browser.page.text}")
+            raise IncorrectPage(f"Incorrect page: {self.browser.page.text}")

     def _correct_page(self) -> bool:
         title = "Centralna Baza Orzeczeń Sądów Administracyjnych"
@@ -168,7 +96,10 @@ def _retrieve_documents(self) -> dict[int, list[str] | None]:
             documents[page_id] = self._retrieve_documents_from_page(page_id)
         return documents

-    @retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
+    @retry(
+        tries=15,
+        exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
+    )
     def _retrieve_documents_from_page(self, page_id: int) -> list[str] | None:
         self._browser_open(f"https://orzeczenia.nsa.gov.pl/cbo/find?p={page_id}")
         if self.browser.url.endswith(f"{page_id}"):
@@ -196,42 +127,3 @@ def _find_documents_on_page(self, page: BeautifulSoup) -> list[str]:
             filtered_links.append(href)

         return filtered_links
-
-
-def process_date(date: str):
-    proxy = {
-        "http": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:[email protected]:22225",
-        "https": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:[email protected]:22225",
-    }
-    nsa_scraper = NSAScraper(proxy)
-    try:
-        documents = nsa_scraper.search_documents_for_date(date)
-    except Exception as e:
-        error_message = f"Failed to scrape documents for date {date}: {e}"
-        logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
-        return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
-    if documents:
-        success = []
-        for page_id, document_ids in documents.items():
-            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
-            success.append(
-                {
-                    "date": date,
-                    "page_id": page_id,
-                    "success": page_success,
-                    "document_ids": document_ids,
-                }
-            )
-    else:
-        success = [
-            {
-                "date": date,
-                "page_id": None,
-                "success": "NO_DOCUMENTS",
-            }
-        ]
-    return {"success": success}
-
-
-if __name__ == "__main__":
-    typer.run(main)
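
With this change, NSAScraper no longer draws its own random user agent; the caller injects one, so both the user agent and the proxy choice now live with the entry point. A minimal sketch of the new construction pattern, assuming the module path from the new script's import below and using a hypothetical date:

from random_user_agent.user_agent import UserAgent

from juddges.data.nsa.scraper import NSAScraper

# The caller now supplies the user agent; proxy_config stays optional.
scraper = NSAScraper(user_agent=UserAgent(limit=1000).get_random_user_agent())
documents = scraper.search_documents_for_date("2001-06-15")  # hypothetical date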
@@ -0,0 +1,121 @@
import random
from datetime import datetime, timedelta

import pymongo
import typer
import urllib3
from loguru import logger
from mpire import WorkerPool
from random_user_agent.user_agent import UserAgent

from juddges.data.nsa.scraper import NSAScraper

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DB_URI = "mongodb://localhost:27017/"

START_DATE = "1981-01-01"
END_DATE = datetime.now().strftime("%Y-%m-%d")


def main(
    n_jobs: int = typer.Option(30),
    proxy_address: str = typer.Option(...),
    db_uri: str = typer.Option(DB_URI),
    start_date: str = typer.Option(START_DATE),
    end_date: str = typer.Option(END_DATE),
) -> None:
    client = pymongo.MongoClient(db_uri)
    db = client["nsa"]
    dates_col = db["dates"]
    errors_col = db["errors"]

    done = dates_col.find().distinct("date")
    logger.info(f"Found {len(done)} done dates in the database.")

    dates = generate_dates(start_date, end_date)

    random.shuffle(dates)
    dates = filter_done_dates(dates, done)

    success = 0
    error = 0
    with WorkerPool(n_jobs=n_jobs, shared_objects=proxy_address) as pool:
        for result in pool.imap_unordered(
            process_date,
            dates,
            progress_bar=True,
            progress_bar_options={"smoothing": 0},
            chunk_size=1,
        ):
            assert len(result) == 1
            if "error" in result:
                result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                errors_col.insert_one(result["error"])
                error += 1
            elif "success" in result:
                for r in result["success"]:
                    r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dates_col.insert_many(result["success"])
                success += 1
            else:
                raise ValueError(f"Invalid result: {result}")
    logger.info(f"Success: {success}, Error: {error}")


def generate_dates(start_date: str, end_date: str) -> list[str]:
    date_format = "%Y-%m-%d"
    start = datetime.strptime(start_date, date_format)
    end = datetime.strptime(end_date, date_format)

    date_list = []
    current_date = start
    while current_date <= end:
        date_list.append(current_date.strftime(date_format))
        current_date += timedelta(days=1)

    return date_list


def filter_done_dates(dates: list[str], done: list[str]) -> list[str]:
    done_dates = set(done)
    return [date for date in dates if date not in done_dates]


def process_date(proxy_address: str, date: str) -> dict[str, list[dict[str, str]] | dict[str, str]]:
    proxy = {"http": proxy_address, "https": proxy_address}
    nsa_scraper = NSAScraper(
        user_agent=UserAgent(limit=1000).get_random_user_agent().encode("utf-8").decode("utf-8"),
        proxy_config=proxy,
    )
    try:
        documents = nsa_scraper.search_documents_for_date(date)
    except Exception as e:
        error_message = f"Failed to scrape documents for date {date}: {e}"
        logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
        return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
    if documents:
        success = []
        for page_id, document_ids in documents.items():
            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
            success.append(
                {
                    "date": date,
                    "page_id": page_id,
                    "success": page_success,
                    "document_ids": document_ids,
                }
            )
    else:
        success = [
            {
                "date": date,
                "page_id": None,
                "success": "NO_DOCUMENTS",
            }
        ]
    return {"success": success}


if __name__ == "__main__":
    typer.run(main)
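
Since all configuration now arrives through typer options, a run can be scoped or resumed from the command line. --proxy-address is required (typer.Option(...)) and is passed to WorkerPool as shared_objects, which mpire hands to each worker as the first argument; that is why process_date now takes proxy_address before date. A hypothetical invocation (the diff does not show the new file's path, so the script name and proxy URL below are assumptions):

python scrape_nsa.py \
    --n-jobs 30 \
    --proxy-address "http://user:pass@proxy.example.com:22225" \
    --start-date 1981-01-01 \
    --end-date 2024-12-31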