Skip to content

Commit

Permalink
Add runtime check to scrapes
Browse files Browse the repository at this point in the history
  • Loading branch information
msj committed May 10, 2024
1 parent 7cdcfcd commit 6766f76
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 1 deletion.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Default scrape year: the current calendar year.
year=$(shell date +%Y)
# UTC timestamp captured when make starts; exported to the spiders so they
# can measure elapsed wall-clock time against TIME_LIMIT.
START_TIME=$(shell export TZ=UTC; date -Iseconds)
# Overall runtime budget for a scrape, in seconds (21600 s = 6 hours).
TIME_LIMIT=21600

.PHONY: all
all: upload
Expand Down Expand Up @@ -54,11 +56,12 @@ CIVIL_SCRAPE_START_QUERY=$(shell tail -n +2 scripts/nightly_civil_start.sql)
# Scrape one civil subdivision: look up the resume point in cases.db, export
# the shared runtime budget (START_TIME/TIME_LIMIT) for the spider's
# out-of-time check, then run the civil spider for that subdivision.
civil-%.jl: cases.db
START=$$(sqlite-utils query --csv --no-headers cases.db \
"$(CIVIL_SCRAPE_START_QUERY)" -p subdivision $*); \
echo $$START; \
export START_TIME=$(START_TIME); export TIME_LIMIT=$(TIME_LIMIT); \
scrapy crawl civil -s CLOSESPIDER_TIMEOUT=3600 -a year=$(year) -a division=$* -a start=$$START -O $@;

# Scrape chancery cases: compute the resume point from cases.db, export the
# shared runtime budget (START_TIME/TIME_LIMIT), then run the chancery spider.
chancery.jl: cases.db
START=$$(sqlite3 cases.db < scripts/nightly_chancery_start.sql); \
export START_TIME=$(START_TIME); export TIME_LIMIT=$(TIME_LIMIT); \
scrapy crawl chancery -a year=$(year) -a start=$$START -O $@;

cases.db :
Expand Down
23 changes: 23 additions & 0 deletions courtscraper/spiders/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
from abc import ABC, abstractmethod
from datetime import datetime, timedelta, timezone

from scrapy import Spider
from scrapy.exceptions import CloseSpider
Expand Down Expand Up @@ -27,6 +29,14 @@ def __init__(
else:
self.case_numbers = self.get_case_numbers(self.year)

start_time_iso = os.getenv(
"START_TIME", datetime.now(tz=timezone.utc).isoformat()
)
self.start_time = datetime.fromisoformat(start_time_iso)

time_limit_in_secs = os.getenv("TIME_LIMIT", 21600)
self.time_limit = timedelta(seconds=int(time_limit_in_secs))

super().__init__(**kwargs)

@property
Expand All @@ -47,6 +57,19 @@ def start_requests(self):
def get_case_numbers(self):
pass

def out_of_time(self) -> bool:
    """
    Return True once the spider's wall-clock budget is exhausted.

    Compares the elapsed time since ``self.start_time`` (a timezone-aware
    UTC datetime) against ``self.time_limit`` (a timedelta). Callers use
    this to stop issuing new requests so the run can wind down cleanly.

    NOTE(review): the original docstring said the limit leaves ~30 minutes
    of headroom for post-scrape tasks — that buffer must be baked into
    TIME_LIMIT by the caller; nothing here subtracts it. TODO confirm.
    """
    runtime = datetime.now(tz=timezone.utc) - self.start_time
    # >= so that hitting the budget exactly also counts as out of time.
    return runtime >= self.time_limit

def case_numbers_from_file(self, filename):
with open(filename) as f:
for case_number in f:
Expand Down
3 changes: 3 additions & 0 deletions courtscraper/spiders/chancery.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def get_case_numbers(self, year):

def start_requests(self):
for case_number in self.case_numbers:
if self.out_of_time():
break

yield Request(
ChancerySpider.url,
meta={
Expand Down
3 changes: 3 additions & 0 deletions courtscraper/spiders/civil.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ def __init__(self, division="2", **kwargs):

def start_requests(self):
for case_number in self.case_numbers:
if self.out_of_time():
break

yield Request(
CivilSpider.url,
meta={
Expand Down

0 comments on commit 6766f76

Please sign in to comment.