Skip to content

Commit

Permalink
feat: reduce wait time
Browse files Browse the repository at this point in the history
  • Loading branch information
asawczyn committed Sep 29, 2024
1 parent 88317c7 commit a8644fc
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
1 change: 1 addition & 0 deletions docker/nsa/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ services:
- '27017:27017'
volumes:
- nsa_dbdata:/data/db
+ command: --wiredTigerCacheSizeGB 8

volumes:
nsa_dbdata:
16 changes: 10 additions & 6 deletions juddges/data/nsa/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@ class IncorrectPage(Exception):


class NSAScraper:
- def __init__(self, user_agent: str, proxy_config: dict[str, str] | None = None) -> None:
+ def __init__(
+ self, user_agent: str, proxy_config: dict[str, str] | None = None, wait: bool = False
+ ) -> None:
+ self.wait = wait
self.browser = mechanicalsoup.StatefulBrowser(
user_agent=user_agent,
requests_adapters={
"https://": HTTPAdapter(
max_retries=Retry(
- total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504, 403, 429]
+ total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504, 403, 429]
)
),
"http://": HTTPAdapter(
max_retries=Retry(
- total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504, 403, 429]
+ total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504, 403, 429]
)
),
},
Expand Down Expand Up @@ -80,9 +83,10 @@ def _browser_submit_selected(self) -> None:

def _post_call(self, response) -> None:
response.raise_for_status()
- # wait random from normal distribution
- time_to_wait = random.normalvariate(1, 0.5)
- time.sleep(time_to_wait if time_to_wait > 0 else 0)
+ if self.wait:
+ # wait random from normal distribution
+ time_to_wait = random.normalvariate(1, 0.5)
+ time.sleep(time_to_wait if time_to_wait > 0 else 0)
if not self._correct_page():
raise IncorrectPage(f"Incorrect page: {self.browser.page.text}")

Expand Down
2 changes: 1 addition & 1 deletion scripts/nsa/download_document_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def main(
progress_bar=True,
progress_bar_options={"smoothing": 0},
chunk_size=5,
- worker_lifespan=100,
):
assert len(result) == 1
if "error" in result:
Expand Down Expand Up @@ -115,6 +114,7 @@ def process_doc_id(
nsa_scraper = NSAScraper(
user_agent=random.choice(user_agents),
proxy_config=proxy,
+ wait=False,
)
try:
page = nsa_scraper.get_page_for_doc(doc_id)
Expand Down

0 comments on commit a8644fc

Please sign in to comment.