diff --git a/.github/workflows/court_calls.yml b/.github/workflows/court_calls.yml
new file mode 100644
index 0000000..adff019
--- /dev/null
+++ b/.github/workflows/court_calls.yml
@@ -0,0 +1,116 @@
+name: Court call scrape
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '15 10 * * *'
+
+jobs:
+  scrape:
+    name: Scrape court calls
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set current date as env variable
+        run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV
+      - uses: actions/checkout@v3
+      - name: upgrade sqlite3
+        run: |
+          sudo apt-get update
+          sudo apt-get install sqlite3
+
+      - name: Install requirements
+        run: |
+          pip install -U pyopenssl cryptography
+          pip install -r requirements.txt
+
+      - name: Download latest database zip
+        uses: robinraju/release-downloader@v1.8
+        with:
+          latest: true
+          tag: "nightly"
+          fileName: "*.db.zip"
+
+      - name: Decrypt database
+        run: |
+          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
+
+      - name: Scrape court calls
+        run: |
+          make -f Makefile.courtcalls all
+
+      - name: Setup database for upload
+        run: |
+          zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
+
+      - name: Upload new release
+        uses: WebFreak001/deploy-nightly@v3.0.0
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
+          release_id: 131985702
+          asset_path: ./cases.db.zip
+          asset_name: cases.db.zip
+          asset_content_type: application/zip # required by GitHub API
+          max_releases: 7
+
+      - name: Keepalive
+        uses: gautamkrishnar/keepalive-workflow@v1
+
+  deploy:
+    name: Deploy to Heroku
+    needs: scrape
+    runs-on: ubuntu-latest
+
+    env:
+      HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install requirements
+        run: pip install -r requirements.txt
+
+      - name: Download latest database zip
+        uses: robinraju/release-downloader@v1.8
+        with:
+          latest: true
+          tag: "nightly"
+          fileName: "*.db.zip"
+
+      - name: Decrypt database
+        run: |
+          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip
+
+      - name: Install heroku-builds plugin
+        run: |
+          heroku plugins:install heroku-builds
+
+      - name: Login to Heroku CLI
+        uses: akhileshns/heroku-deploy@v3.12.14
+        with:
+          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
+          heroku_app_name: ""
+          heroku_email: ${{ secrets.HEROKU_EMAIL }}
+          justlogin: true
+
+      - name: Install Datasette plugins
+        run: |
+          datasette install datasette-auth-passwords datasette-auth-tokens
+
+      - name: Get hashed Datasette password
+        run: |
+          # Store hash as an environment variable
+          hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \
+            | datasette hash-password --no-confirm); \
+          echo "hash=$hash" >> $GITHUB_ENV
+
+      - name: Deploy Datasette instance to Heroku
+        run: |
+          datasette publish heroku cases.db \
+            -n court-scraper \
+            -m metadata.json \
+            --setting sql_time_limit_ms 60000 \
+            --install datasette-auth-passwords \
+            --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 0b7762d..407f5f0 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -35,7 +35,7 @@ jobs:
         run: |
           unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
 
-      - name: Run scrape
+      - name: Run case scrape
         run: |
           echo $BEGIN_COURTS_RUN
           make get_new_records
diff --git a/Makefile.courtcalls b/Makefile.courtcalls
new file mode 100644
index 0000000..8f2c247
--- /dev/null
+++ b/Makefile.courtcalls
@@ -0,0 +1,14 @@
+# Makefile for scraping court calls
+
+.PHONY : all
+all: court_calls.csv cases.db
+	cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail
+
+court_calls.csv: court_calls.json
+	cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@
+
+court_calls.json: court_calls.jl
+	cat $^ | jq --slurp '.' > $@
+
+court_calls.jl : cases.db
+	scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@
diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py
new file mode 100644
index 0000000..d7d2217
--- /dev/null
+++ b/courtscraper/spiders/court_calls.py
@@ -0,0 +1,221 @@
+from datetime import datetime, timedelta
+
+from scrapy import Spider, Request
+from scrapy.http import FormRequest
+from scrapy.exceptions import CloseSpider
+from scrapy.spidermiddlewares.httperror import HttpError
+
+from lxml import html
+
+from scripts.hash import dict_hash
+
+
+class CourtCallSpider(Spider):
+    name = "courtcalls"
+    url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx"
+
+    def __init__(self, **kwargs):
+        self.failures = set()
+        super().__init__(**kwargs)
+
+    def next_business_days(self, n):
+        """Returns the dates of the next n business days."""
+
+        current_date = datetime.today()
+        count = 0
+        while count <= n:
+            yield f"{current_date.month}/{current_date.day}/{current_date.year}"
+
+            next_date = current_date + timedelta(days=1)
+            while next_date.weekday() > 4:
+                # Skip weekends
+                next_date += timedelta(days=1)
+
+            current_date = next_date
+            count += 1
+
+    def start_requests(self):
+        for date in self.next_business_days(5):
+            yield Request(
+                CourtCallSpider.url,
+                meta={
+                    "zyte_api_automap": {
+                        "httpResponseHeaders": True,
+                        "browserHtml": True,
+                        "actions": [
+                            {
+                                "action": "waitForSelector",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_rblSearchType_2",
+                                },
+                                "timeout": 5,
+                                "onError": "return",
+                            },
+                            {
+                                "action": "click",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_rblSearchType_2",
+                                },
+                                "onError": "return",
+                            },
+                            {
+                                "action": "waitForSelector",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_dtTxt",
+                                },
+                                "timeout": 5,
+                                "onError": "return",
+                            },
+                            {
+                                "action": "select",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_ddlDivisionCode",
+                                },
+                                "values": ["CV"],
+                                "onError": "return",
+                            },
+                            {
+                                "action": "type",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_dtTxt",
+                                },
+                                "text": date,
+                                "onError": "return",
+                            },
+                            {
+                                "action": "click",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_btnSearch",
+                                },
+                                "onError": "return",
+                            },
+                            {
+                                "action": "waitForSelector",
+                                "selector": {
+                                    "type": "css",
+                                    "value": "#MainContent_pnlResults",
+                                },
+                                "timeout": 5,
+                                "onError": "return",
+                            },
+                        ],
+                    },
+                    "date": date,
+                    "result_page_num": 1,
+                },
+                errback=self.handle_error,
+                callback=self.parse_results,
+            )
+
+    def has_page_num(self, n, response):
+        """Check if there's an nth page of court call results."""
+
+        tree = html.fromstring(response.text)
+        page_table = tree.xpath("//table")[1]
+        next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]")
+        return bool(next_page_link)
+
+    def get_court_calls(self, response):
+        """Returns the court calls found on a result page."""
+
+        tree = html.fromstring(response.text)
+        results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]
+
+        no_results = results_table.xpath(
+            ".//*[text()[contains(.,'No cases found matching your selected"
+            "criteria.')]]"
+        )
+        if no_results:
+            return
+
+        rows = results_table.xpath(".//tr")
+        headers = rows[0].xpath(".//a/text()")
+        for row in rows[1:-1]:
+            cells = row.xpath(".//td/text()")
+            if cells:
+                yield dict(zip(headers, cells))
+
+    def extract_form(self, response, form_xpath):
+        """
+        ASP.NET pages are essentially forms that store the data needed to send
+        POST requests in hidden form inputs on the page.
+
+        From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with-
+        scrapy-a-real-example/
+        """
+
+        form_data = dict()
+
+        for hidden_input in response.xpath(form_xpath).xpath(
+            ".//input[@type='hidden']"
+        ):
+            name = hidden_input.attrib.get("name")
+            if name is None:
+                continue
+            value = hidden_input.attrib.get("value")
+            if value is None:
+                value = ""
+
+            form_data[name] = value
+
+        return form_data
+
+    def get_page_n_form_data(self, n, response):
+        """
+        Returns the form fields needed to send a POST request
+        for the nth page of court call results.
+        """
+
+        form_data = self.extract_form(response, "//form[@id='ctl01']")
+        form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
+        form_data["__EVENTARGUMENT"] = f"Page${n}"
+        return form_data
+
+    def parse_results(self, response):
+        results = self.get_court_calls(response)
+        if not results:
+            return
+
+        for court_call in results:
+            court_call["hash"] = dict_hash(court_call)
+            yield court_call
+
+        # Request the next page of results
+        next_page_num = response.meta["result_page_num"] + 1
+        next_page_exists = self.has_page_num(next_page_num, response)
+        if not next_page_exists:
+            return
+
+        next_page_form_data = self.get_page_n_form_data(next_page_num, response)
+        yield FormRequest.from_response(
+            response,
+            meta={"result_page_num": next_page_num},
+            formxpath="//form[@id='ctl01']",
+            formdata=next_page_form_data,
+            callback=self.parse_results,
+            dont_click=True,
+        )
+
+    def _failing_responses(self, response):
+        self.failures.add(
+            f"{response.meta['date']} page {response.meta['result_page_num']}"
+        )
+
+        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')
+
+        if len(self.failures) > 20:
+            raise CloseSpider("run of failures")
+
+    def handle_error(self, failure):
+        if failure.check(HttpError):
+            response = failure.value.response
+            if response.status in (404, 500):
+                self._failing_responses(response)
+        else:
+            self.logger.error(repr(failure))
diff --git a/scripts/import_court_calls.sql b/scripts/import_court_calls.sql
new file mode 100644
index 0000000..7d4b833
--- /dev/null
+++ b/scripts/import_court_calls.sql
@@ -0,0 +1,46 @@
+CREATE TEMPORARY TABLE raw_court_call (
+    case_number text,
+    division text,
+    plaintiff text,
+    defendant text,
+    court_date text,
+    room text,
+    district text,
+    sequence text,
+    time text,
+    hash text
+);
+
+-- noqa: disable=PRS
+.mode csv
+.import /dev/stdin raw_court_call
+-- noqa: enable=PRS
+
+-- Find and insert the new court calls
+INSERT INTO
+    court_call(
+        case_number,
+        division,
+        plaintiff,
+        defendant,
+        court_date,
+        room,
+        district,
+        sequence,
+        time,
+        hash
+    )
+SELECT
+    case_number,
+    division,
+    plaintiff,
+    defendant,
+    court_date,
+    room,
+    district,
+    sequence,
+    time,
+    hash
+FROM
+    raw_court_call
+WHERE raw_court_call.hash NOT IN (SELECT hash FROM court_call);
diff --git a/scripts/initialize_db.sql b/scripts/initialize_db.sql
index 1b9d9a3..fd53cf2 100644
--- a/scripts/initialize_db.sql
+++ b/scripts/initialize_db.sql
@@ -39,3 +39,17 @@ CREATE TABLE event(
     comments text,
     FOREIGN KEY(case_number) REFERENCES court_case(case_number)
 );
+
+CREATE TABLE court_call(
+    case_number text not null,
+    division,
+    plaintiff,
+    defendant,
+    court_date,
+    room,
+    district,
+    sequence,
+    time,
+    hash,
+    FOREIGN KEY(case_number) REFERENCES court_case(case_number)
+);
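
Note on the `hash` column used for deduplication: the spider imports `dict_hash` from `scripts/hash.py`, which is not touched by this diff. A minimal sketch of what such a helper could look like is below; the field handling and digest choice are illustrative assumptions, not the repository's actual implementation.

    # scripts/hash.py (hypothetical sketch, not part of this diff)
    import hashlib
    import json


    def dict_hash(dictionary):
        """Return a stable digest for a flat dict of court call fields.

        Serializing with sorted keys makes the digest independent of key
        order, so a re-scraped row hashes to the same value and is skipped
        by the `NOT IN (SELECT hash FROM court_call)` filter in
        scripts/import_court_calls.sql.
        """
        encoded = json.dumps(dictionary, sort_keys=True, default=str).encode("utf-8")
        return hashlib.md5(encoded).hexdigest()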