diff --git a/Makefile b/Makefile index 4719af9..1d234a8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ year=$(shell date +%Y) +START_TIME=$(shell export TZ=UTC; date -Iseconds) +TIME_LIMIT=21600 .PHONY: all all: upload @@ -54,11 +56,12 @@ CIVIL_SCRAPE_START_QUERY=$(shell tail -n +2 scripts/nightly_civil_start.sql) civil-%.jl: cases.db START=$$(sqlite-utils query --csv --no-headers cases.db \ "$(CIVIL_SCRAPE_START_QUERY)" -p subdivision $*); \ - echo $$START; \ + export START_TIME=$(START_TIME); export TIME_LIMIT=$(TIME_LIMIT); \ scrapy crawl civil -s CLOSESPIDER_TIMEOUT=3600 -a year=$(year) -a division=$* -a start=$$START -O $@; chancery.jl: cases.db START=$$(sqlite3 cases.db < scripts/nightly_chancery_start.sql); \ + export START_TIME=$(START_TIME); export TIME_LIMIT=$(TIME_LIMIT); \ scrapy crawl chancery -a year=$(year) -a start=$$START -O $@; cases.db : diff --git a/courtscraper/spiders/base.py b/courtscraper/spiders/base.py index 73aa10a..0babffb 100644 --- a/courtscraper/spiders/base.py +++ b/courtscraper/spiders/base.py @@ -1,4 +1,6 @@ +import os from abc import ABC, abstractmethod +from datetime import datetime, timedelta, timezone from scrapy import Spider from scrapy.exceptions import CloseSpider @@ -27,6 +29,14 @@ def __init__( else: self.case_numbers = self.get_case_numbers(self.year) + start_time_iso = os.getenv( + "START_TIME", datetime.now(tz=timezone.utc).isoformat() + ) + self.start_time = datetime.fromisoformat(start_time_iso) + + time_limit_in_secs = os.getenv("TIME_LIMIT", 21600) + self.time_limit = timedelta(seconds=int(time_limit_in_secs)) + + super().__init__(**kwargs) @property @@ -47,6 +57,19 @@ def start_requests(self): def get_case_numbers(self): pass + def out_of_time(self) -> bool: + """ + Checks whether we have run out of time to continue scraping. + We'll assume we need at most 30 minutes to clean up and finish + post-scrape tasks. 
+ """ + + runtime = datetime.now(tz=timezone.utc) - self.start_time + if runtime >= self.time_limit: + return True + + return False + def case_numbers_from_file(self, filename): with open(filename) as f: for case_number in f: diff --git a/courtscraper/spiders/chancery.py b/courtscraper/spiders/chancery.py index a014f9e..b1dcec2 100644 --- a/courtscraper/spiders/chancery.py +++ b/courtscraper/spiders/chancery.py @@ -21,6 +21,9 @@ def get_case_numbers(self, year): def start_requests(self): for case_number in self.case_numbers: + if self.out_of_time(): + break + yield Request( ChancerySpider.url, meta={ diff --git a/courtscraper/spiders/civil.py b/courtscraper/spiders/civil.py index d1db6d7..750750c 100644 --- a/courtscraper/spiders/civil.py +++ b/courtscraper/spiders/civil.py @@ -13,6 +13,9 @@ def __init__(self, division="2", **kwargs): def start_requests(self): for case_number in self.case_numbers: + if self.out_of_time(): + break + yield Request( CivilSpider.url, meta={