feat: add utilizing multitor to scraper script
asawczyn committed Aug 5, 2024
1 parent a52b58a commit 908d013
Showing 2 changed files with 69 additions and 50 deletions.
117 changes: 69 additions & 48 deletions juddges/data/nsa/scraper.py
@@ -1,20 +1,27 @@
import re

import mechanicalsoup
import typer
from bs4 import BeautifulSoup
from datasets import tqdm
from mpire import WorkerPool
from tqdm import trange
from datetime import datetime, timedelta
import pymongo
from loguru import logger

from juddges.utils.tor import TorClient

DB_URI = "mongodb://localhost:27017/"

START_DATE = "1981-01-01"
END_DATE = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")


def main() -> None:
def main(
    socket_port_start: int = typer.Option(9000),
    config_port_start: int = typer.Option(9900),
    n_jobs: int = typer.Option(1),
) -> None:
    client = pymongo.MongoClient(DB_URI)
    db = client["nsa"]
    dates_col = db["dates"]
@@ -32,39 +39,19 @@ def main() -> None:
    start_end_dates = list(reversed(list(zip(dates, dates[1:]))))
    start_end_dates = filter_done_dates(start_end_dates, done)

    with tqdm(total=len(start_end_dates), desc="Searching for documents") as pbar:
        for start_date, end_date in start_end_dates:
            pbar.set_postfix({"Date": f"{start_date} - {end_date}"})

            nsa_scraper = NSAScraper()

            documents = nsa_scraper.search_documents(start_date, end_date)
            if documents:
                success = []
                for page_id, document_ids in documents.items():
                    page_success = "FOUND" if document_ids is not None else "ERROR"
                    success.append(
                        {
                            "start_date": start_date,
                            "end_date": end_date,
                            "page_id": page_id,
                            "success": page_success,
                            "document_ids": document_ids,
                        }
                    )
            else:
                success = [
                    {
                        "start_date": start_date,
                        "end_date": end_date,
                        "page_id": None,
                        "success": "NO_DOCUMENTS",
                    }
                ]
            dates_col.insert_many(success)
            pbar.update()


    worker_torclient_assign = [
        TorClient("", socket_port_start + i, config_port_start + i) for i in range(n_jobs)
    ]
    with WorkerPool(
        n_jobs=n_jobs,
        pass_worker_id=True,
        shared_objects=worker_torclient_assign,
        start_method="threading",
    ) as pool:
        for result in pool.map(
            process_date_range, start_end_dates, progress_bar=True, chunk_size=1
        ):
            dates_col.insert_many(result)


def generate_dates(start_date: str, end_date: str) -> list[str]:
@@ -86,22 +73,55 @@ def filter_done_dates(dates: list[tuple[str, str]], done: list[tuple[str, str]])
    return [date for date in dates if date not in done_dates]


class NSAScraper:
def process_date_range(
    worker_id: int, worker_torclient_assign: list[TorClient], start_date, end_date
):
    tor_client = worker_torclient_assign[worker_id]
    nsa_scraper = NSAScraper(tor_client)
    documents = nsa_scraper.search_documents(start_date, end_date)
    if documents:
        success = []
        for page_id, document_ids in documents.items():
            page_success = "FOUND" if document_ids is not None else "ERROR: Redirected"
            success.append(
                {
                    "start_date": start_date,
                    "end_date": end_date,
                    "page_id": page_id,
                    "success": page_success,
                    "document_ids": document_ids,
                }
            )
    else:
        success = [
            {
                "start_date": start_date,
                "end_date": end_date,
                "page_id": None,
                "success": "NO_DOCUMENTS",
            }
        ]
    return success


class NSAScraper:
    def __init__(self, tor_client: TorClient | None = None) -> None:
        self.browser = mechanicalsoup.StatefulBrowser(user_agent="MechanicalSoup")
        if tor_client:
            self.browser.session.proxies = tor_client.proxy_config

    def search_documents(self, start_date, end_date):
        browser = mechanicalsoup.StatefulBrowser(user_agent="MechanicalSoup")
        response = browser.open("https://orzeczenia.nsa.gov.pl/cbo")
        response = self.browser.open("https://orzeczenia.nsa.gov.pl/cbo")
        if response.status_code != 200:
            raise Exception(f"Failed to open the website. Status code: {response.status_code}")

        browser.select_form()
        self.browser.select_form()
        # browser["symbole"] = "648"
        browser["odDaty"] = start_date
        browser["doDaty"] = end_date
        browser.submit_selected()
        if self.any_documents_found(browser):
            documents = self.retrieve_documents(browser)
        self.browser["odDaty"] = start_date
        self.browser["doDaty"] = end_date
        self.browser.submit_selected()
        if self.any_documents_found(self.browser):
            documents = self.retrieve_documents(self.browser)
            num_documents = sum(map(lambda x: len(x) if x else 0, documents.values()))
            print(f"Found {num_documents} documents on {len(documents)} pages.")
            return documents
@@ -113,7 +133,9 @@ def any_documents_found(self, browser: mechanicalsoup.StatefulBrowser) -> bool:
        warning_text = "Nie znaleziono orzeczeń spełniających podany warunek!"
        return warning_text not in browser.page.text

    def retrieve_documents(self, browser: mechanicalsoup.StatefulBrowser) -> dict[int, list[str] | None]:
    def retrieve_documents(
        self, browser: mechanicalsoup.StatefulBrowser
    ) -> dict[int, list[str] | None]:
        page_links = browser.links(url_regex="^/cbo/find\?p=")
        if not page_links:
            last_page = 1
@@ -128,7 +150,7 @@ def retrieve_documents(self, browser: mechanicalsoup.StatefulBrowser) -> dict[in
            if browser.url.endswith(f"{page_id}"):
                page_documents = self.find_documents_on_page(browser.page)
                assert (
                    0 < len(page_documents) <= 10
                ), f"Page {page_id} has {len(page_documents)} documents"
                documents[page_id] = page_documents
            else:
@@ -151,5 +173,4 @@ def find_documents_on_page(self, page: BeautifulSoup) -> list[str]:
        return filtered_links



main()
typer.run(main)
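
The heart of this commit is the switch from one sequential loop to one Tor circuit per worker: main() now builds one TorClient per job, and mpire's WorkerPool (with pass_worker_id=True and shared_objects) passes the worker id and the client list ahead of the task arguments of every process_date_range call, so each worker scrapes through its own SOCKS port. Below is a minimal, self-contained sketch of that wiring; DummyTorClient and the toy task are illustrative stand-ins, not code from this repository.

```python
from mpire import WorkerPool


class DummyTorClient:
    """Stand-in for juddges.utils.tor.TorClient: one SOCKS port per worker."""

    def __init__(self, socket_port: int) -> None:
        self.proxy_config = {
            "http": f"socks5h://127.0.0.1:{socket_port}",
            "https": f"socks5h://127.0.0.1:{socket_port}",
        }


def scrape(
    worker_id: int, clients: list[DummyTorClient], start_date: str, end_date: str
) -> dict:
    # With pass_worker_id=True and shared_objects set, mpire prepends the worker id
    # and the shared object to the task arguments, so every worker can pick "its"
    # Tor client and therefore its own exit circuit.
    client = clients[worker_id]
    return {"range": (start_date, end_date), "proxy": client.proxy_config["https"]}


if __name__ == "__main__":
    n_jobs = 3
    clients = [DummyTorClient(9000 + i) for i in range(n_jobs)]
    ranges = [("1981-01-01", "1981-01-02"), ("1981-01-02", "1981-01-03")]
    with WorkerPool(
        n_jobs=n_jobs,
        pass_worker_id=True,
        shared_objects=clients,
        start_method="threading",
    ) as pool:
        for result in pool.map(scrape, ranges, progress_bar=True, chunk_size=1):
            print(result)
```

Because the new parameters are typer options, the script would presumably be invoked as, e.g., `python juddges/data/nsa/scraper.py --socket-port-start 9000 --config-port-start 9900 --n-jobs 4`.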
2 changes: 0 additions & 2 deletions juddges/utils/tor.py
@@ -25,5 +25,3 @@ def get_session(self, **kwargs) -> requests.Session:
        http.mount("https://", adapter)
        http.mount("http://", adapter)
        return http
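
The scraper relies on only two aspects of TorClient: a constructor taking (password, socket_port, config_port), as called in main() above, and a proxy_config mapping that NSAScraper assigns to its mechanicalsoup session. A rough sketch consistent with the get_session tail shown in this hunk might look as follows; the real implementation in juddges/utils/tor.py may differ, and SOCKS proxies additionally require the requests[socks] extra.

```python
import requests
from requests.adapters import HTTPAdapter, Retry


class TorClient:
    """Illustrative sketch only; the actual class lives in juddges/utils/tor.py."""

    def __init__(self, password: str, socket_port: int, config_port: int) -> None:
        self.password = password        # Tor control-port password ("" if unset)
        self.socket_port = socket_port  # SOCKS5 port of this Tor instance
        self.config_port = config_port  # control port, e.g. for requesting a new circuit

    @property
    def proxy_config(self) -> dict[str, str]:
        # socks5h also routes DNS resolution through the Tor instance
        return {
            "http": f"socks5h://127.0.0.1:{self.socket_port}",
            "https": f"socks5h://127.0.0.1:{self.socket_port}",
        }

    def get_session(self, **kwargs) -> requests.Session:
        http = requests.Session()
        http.proxies = self.proxy_config
        adapter = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.5), **kwargs)
        http.mount("https://", adapter)
        http.mount("http://", adapter)
        return http
```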

