Commit d63cd49
feat: final version of scraping
asawczyn committed Sep 2, 2024
1 parent a9e0cac commit d63cd49
Showing 3 changed files with 136 additions and 142 deletions.
19 changes: 0 additions & 19 deletions docker/nsa/docker-compose.yml
@@ -6,24 +6,5 @@ services:
volumes:
- nsa_dbdata:/data/db

multiple-tor:
build:
context: ./multiple-tor
args:
SOCKET_START_PORT: 9000
SOCKET_END_PORT: 9100
CONTROL_START_PORT: 9900
CONTROL_END_PORT: 10000
dockerfile: ./Dockerfile
ports:
- 9000-9100:9000-9100
- 9900-10000:9900-10000
command: >
python3 main.py --num-tors 50
volumes:
nsa_dbdata:
multitor_data:


138 changes: 15 additions & 123 deletions juddges/data/nsa/scraper.py
@@ -1,103 +1,28 @@
import random
import re
import time
from pathlib import Path

import mechanicalsoup
import typer
from bs4 import BeautifulSoup
from mpire import WorkerPool
from random_user_agent.user_agent import UserAgent
from loguru import logger
from requests import HTTPError, RequestException
from requests.adapters import HTTPAdapter
from retry import retry
from datetime import datetime, timedelta
import pymongo
from loguru import logger
import urllib3
from urllib3 import Retry

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DB_URI = "mongodb://localhost:27017/"

START_DATE = "1981-01-01"
END_DATE = datetime.now().strftime("%Y-%m-%d")


def main(
n_jobs: int = typer.Option(30),
) -> None:
client = pymongo.MongoClient(DB_URI)
db = client["nsa"]
dates_col = db["dates"]
errors_col = db["errors"]

done = dates_col.find().distinct("date")
logger.info(f"Found {len(done)} done dates in the database.")

dates = generate_dates(START_DATE, END_DATE)

random.shuffle(dates)
dates = filter_done_dates(dates, done)

success = 0
error = 0
with WorkerPool(
n_jobs=n_jobs,
) as pool:
for result in pool.imap_unordered(
process_date,
dates,
progress_bar=True,
progress_bar_options={"smoothing": 0},
chunk_size=1,
):
assert len(result) == 1
if "error" in result:
result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
errors_col.insert_one(result["error"])
error += 1
elif "success" in result:
for r in result["success"]:
r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dates_col.insert_many(result["success"])
success += 1
else:
raise ValueError(f"Invalid result: {result}")
logger.info(f"Success: {success}, Error: {error}")


def generate_dates(start_date: str, end_date: str) -> list[str]:
date_format = "%Y-%m-%d"
start = datetime.strptime(start_date, date_format)
end = datetime.strptime(end_date, date_format)

date_list = []
current_date = start
while current_date <= end:
date_list.append(current_date.strftime(date_format))
current_date += timedelta(days=1)

return date_list


def filter_done_dates(dates: list[str], done: list[str]):
done_dates = set(done)
return [date for date in dates if date not in done_dates]
class IncorrectNumberOfDocumentsFound(Exception):
pass


class IncorrectNumberOfDocumentsFound(Exception):
class IncorrectPage(Exception):
pass


class NSAScraper:
def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
def __init__(self, user_agent: str, proxy_config: dict[str, str] | None = None) -> None:
self.browser = mechanicalsoup.StatefulBrowser(
user_agent=UserAgent(limit=1000)
.get_random_user_agent()
.encode("utf-8")
.decode("utf-8"),
user_agent=user_agent,
requests_adapters={
"https://": HTTPAdapter(
max_retries=Retry(
@@ -115,7 +40,10 @@ def __init__(self, proxy_config: dict[str, str] | None = None) -> None:
if proxy_config:
self.browser.session.proxies = proxy_config

@retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
@retry(
tries=15,
exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
)
def search_documents_for_date(self, date):
self._browser_open("https://orzeczenia.nsa.gov.pl/cbo")
self.browser.select_form()
@@ -145,7 +73,7 @@ def _post_call(self, response) -> None:
time_to_wait = random.normalvariate(1, 0.5)
time.sleep(time_to_wait if time_to_wait > 0 else 0)
if not self._correct_page():
raise ValueError(f"Incorrect page: {self.browser.page.text}")
raise IncorrectPage(f"Incorrect page: {self.browser.page.text}")

def _correct_page(self) -> bool:
title = "Centralna Baza Orzeczeń Sądów Administracyjnych"
@@ -168,7 +96,10 @@ def _retrieve_documents(self) -> dict[int, list[str] | None]:
documents[page_id] = self._retrieve_documents_from_page(page_id)
return documents

@retry(tries=10, exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound))
@retry(
tries=15,
exceptions=(RequestException, HTTPError, IncorrectNumberOfDocumentsFound, IncorrectPage),
)
def _retrieve_documents_from_page(self, page_id: int) -> list[str] | None:
self._browser_open(f"https://orzeczenia.nsa.gov.pl/cbo/find?p={page_id}")
if self.browser.url.endswith(f"{page_id}"):
@@ -196,42 +127,3 @@ def _find_documents_on_page(self, page: BeautifulSoup) -> list[str]:
filtered_links.append(href)

return filtered_links


def process_date(date: str):
proxy = {
"http": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:[email protected]:22225",
"https": "http://brd-customer-hl_9b7bcfc3-zone-datacenter_proxy1:[email protected]:22225",
}
nsa_scraper = NSAScraper(proxy)
try:
documents = nsa_scraper.search_documents_for_date(date)
except Exception as e:
error_message = f"Failed to scrape documents for date {date}: {e}"
logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
if documents:
success = []
for page_id, document_ids in documents.items():
page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
success.append(
{
"date": date,
"page_id": page_id,
"success": page_success,
"document_ids": document_ids,
}
)
else:
success = [
{
"date": date,
"page_id": None,
"success": "NO_DOCUMENTS",
}
]
return {"success": success}


if __name__ == "__main__":
typer.run(main)
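
A minimal usage sketch of the refactored class (not part of the diff, and hedged: it assumes only the constructor signature and methods visible above, where the caller now supplies the user agent string and an optional proxy config):

# Hedged sketch, not from the commit: instantiate NSAScraper directly and fetch
# the document listing for a single, hypothetical date.
from random_user_agent.user_agent import UserAgent

from juddges.data.nsa.scraper import NSAScraper

scraper = NSAScraper(
    user_agent=UserAgent(limit=1000).get_random_user_agent().encode("utf-8").decode("utf-8"),
    proxy_config=None,  # or {"http": "<proxy-url>", "https": "<proxy-url>"} to route via a proxy
)
documents = scraper.search_documents_for_date("2005-06-01")
# per the type hints above, `documents` maps page_id -> list of document links, or None on redirect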
121 changes: 121 additions & 0 deletions scripts/nsa/scrap_documents_list.py
@@ -0,0 +1,121 @@
import random
from datetime import datetime, timedelta

import pymongo
import typer
import urllib3
from loguru import logger
from mpire import WorkerPool
from random_user_agent.user_agent import UserAgent

from juddges.data.nsa.scraper import NSAScraper

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

DB_URI = "mongodb://localhost:27017/"

START_DATE = "1981-01-01"
END_DATE = datetime.now().strftime("%Y-%m-%d")


def main(
n_jobs: int = typer.Option(30),
proxy_address: str = typer.Option(...),
db_uri: str = typer.Option(DB_URI),
start_date: str = typer.Option(START_DATE),
end_date: str = typer.Option(END_DATE),
) -> None:
client = pymongo.MongoClient(db_uri)
db = client["nsa"]
dates_col = db["dates"]
errors_col = db["errors"]

done = dates_col.find().distinct("date")
logger.info(f"Found {len(done)} done dates in the database.")

dates = generate_dates(start_date, end_date)

random.shuffle(dates)
dates = filter_done_dates(dates, done)

success = 0
error = 0
with WorkerPool(n_jobs=n_jobs, shared_objects=proxy_address) as pool:
for result in pool.imap_unordered(
process_date,
dates,
progress_bar=True,
progress_bar_options={"smoothing": 0},
chunk_size=1,
):
assert len(result) == 1
if "error" in result:
result["error"]["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
errors_col.insert_one(result["error"])
error += 1
elif "success" in result:
for r in result["success"]:
r["time_added"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dates_col.insert_many(result["success"])
success += 1
else:
raise ValueError(f"Invalid result: {result}")
logger.info(f"Success: {success}, Error: {error}")


def generate_dates(start_date: str, end_date: str) -> list[str]:
date_format = "%Y-%m-%d"
start = datetime.strptime(start_date, date_format)
end = datetime.strptime(end_date, date_format)

date_list = []
current_date = start
while current_date <= end:
date_list.append(current_date.strftime(date_format))
current_date += timedelta(days=1)

return date_list


def filter_done_dates(dates: list[str], done: list[str]) -> list[str]:
done_dates = set(done)
return [date for date in dates if date not in done_dates]


def process_date(proxy_address: str, date: str) -> dict[str, list[dict[str, str]] | dict[str, str]]:
proxy = {"http": proxy_address, "https": proxy_address}
nsa_scraper = NSAScraper(
user_agent=UserAgent(limit=1000).get_random_user_agent().encode("utf-8").decode("utf-8"),
proxy_config=proxy,
)
try:
documents = nsa_scraper.search_documents_for_date(date)
except Exception as e:
error_message = f"Failed to scrape documents for date {date}: {e}"
logger.error(f"Failed to scrape documents for date {date}: {e}; Error type: {type(e)}")
return {"error": {"date": date, "error": error_message, "error_type": type(e).__name__}}
if documents:
success = []
for page_id, document_ids in documents.items():
page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
success.append(
{
"date": date,
"page_id": page_id,
"success": page_success,
"document_ids": document_ids,
}
)
else:
success = [
{
"date": date,
"page_id": None,
"success": "NO_DOCUMENTS",
}
]
return {"success": success}


if __name__ == "__main__":
typer.run(main)
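
As a hedged follow-up (assumed, not part of the commit), the progress written by this script could be inspected through the same MongoDB collections it populates:

# Assumed check, not included in the diff: count scraped dates and logged errors
# in the "nsa" database that scrap_documents_list.py writes to.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["nsa"]
scraped_dates = db["dates"].distinct("date")
failed = list(db["errors"].find({}, {"date": 1, "error_type": 1, "_id": 0}))
print(f"{len(scraped_dates)} dates scraped, {len(failed)} errors logged")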
