Merge pull request #38 from datamade/feature/37-court-call-scrape
Add court call scrape
antidipyramid authored Mar 15, 2024
2 parents 60570d2 + a5675f9 commit aa956d2
Showing 6 changed files with 412 additions and 1 deletion.
116 changes: 116 additions & 0 deletions .github/workflows/court_calls.yml
@@ -0,0 +1,116 @@
name: Court call scrape

on:
workflow_dispatch:
schedule:
- cron: '15 10 * * *'

jobs:
scrape:
name: Scrape court calls
runs-on: ubuntu-latest

steps:
- name: Set current date as env variable
run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV
- uses: actions/checkout@v3
- name: upgrade sqlite3
run: |
sudo apt-get update
sudo apt-get install sqlite3
- name: Install requirements
run: |
pip install -U pyopenssl cryptography
pip install -r requirements.txt
- name: Download latest database zip
uses: robinraju/[email protected]
with:
latest: true
tag: "nightly"
fileName: "*.db.zip"

- name: Decrypt database
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
- name: Scrape court calls
run: |
make -f Makefile.courtcalls all
- name: Setup database for upload
run: |
zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
- name: Upload new release
uses: WebFreak001/[email protected]
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
release_id: 131985702
asset_path: ./cases.db.zip
asset_name: cases.db.zip
asset_content_type: application/zip # required by GitHub API
max_releases: 7

- name: Keepalive
uses: gautamkrishnar/keepalive-workflow@v1

deploy:
name: Deploy to Heroku
needs: scrape
runs-on: ubuntu-latest

env:
HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}

steps:
- uses: actions/checkout@v3

- name: Install requirements
run: pip install -r requirements.txt

- name: Download latest database zip
uses: robinraju/[email protected]
with:
latest: true
tag: "nightly"
fileName: "*.db.zip"

- name: Decrypt database
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip
- name: Install heroku-builds plugin
run: |
heroku plugins:install heroku-builds
- name: Login to Heroku CLI
uses: akhileshns/[email protected]
with:
heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
heroku_app_name: ""
heroku_email: ${{ secrets.HEROKU_EMAIL }}
justlogin: true

- name: Install Datasette plugins
run: |
datasette install datasette-auth-passwords datasette-auth-tokens
- name: Get hashed Datasette password
run: |
# Store hash as an environment variable
hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \
| datasette hash-password --no-confirm); \
echo "hash=$hash" >> $GITHUB_ENV
- name: Deploy Datasette instance to Heroku
run: |
datasette publish heroku cases.db \
-n court-scraper \
-m metadata.json \
--setting sql_time_limit_ms 60000 \
--install datasette-auth-passwords \
--plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
2 changes: 1 addition & 1 deletion .github/workflows/nightly.yml
@@ -35,7 +35,7 @@ jobs:
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
-      - name: Run scrape
+      - name: Run case scrape
run: |
echo $BEGIN_COURTS_RUN
make get_new_records
14 changes: 14 additions & 0 deletions Makefile.courtcalls
@@ -0,0 +1,14 @@
# Makefile for scraping court calls

.PHONY : all
all: court_calls.csv cases.db
cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail

court_calls.csv: court_calls.json
cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@

court_calls.json: court_calls.jl
cat $^ | jq --slurp '.' > $@

court_calls.jl : cases.db
scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@
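
For readers less familiar with jq, the following is an illustrative Python equivalent of the court_calls.csv recipe above, not part of this commit: it reads the newline-delimited court_calls.jl that the spider writes and emits rows in the same column order the jq filter selects, with no header row.

# Hypothetical stand-in for the jq pipeline in Makefile.courtcalls; the
# column names come from the jq filter above.
import csv
import json

COLUMNS = [
    "Case Number", "Division", "Plaintiff", "Defendant", "Court Date",
    "Room", "District", "Sequence #", "Time", "hash",
]

with open("court_calls.jl") as infile, open(
    "court_calls.csv", "w", newline=""
) as outfile:
    writer = csv.writer(outfile)
    for line in infile:
        record = json.loads(line)
        # As with `@csv` in the jq filter, write one row per court call
        # in a fixed column order, without a header line.
        writer.writerow(record.get(column, "") for column in COLUMNS)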
221 changes: 221 additions & 0 deletions courtscraper/spiders/court_calls.py
@@ -0,0 +1,221 @@
from datetime import datetime, timedelta

from scrapy import Spider, Request
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from lxml import html

from scripts.hash import dict_hash


class CourtCallSpider(Spider):
name = "courtcalls"
url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx"

def __init__(self, **kwargs):
self.failures = set()
super().__init__(**kwargs)

def next_business_days(self, n):
"""Returns the dates of the next n business days."""

current_date = datetime.today()
count = 0
while count <= n:
yield f"{current_date.month}/{current_date.day}/{current_date.year}"

next_date = current_date + timedelta(days=1)
while next_date.weekday() > 4:
# Skip weekends
next_date += timedelta(days=1)

current_date = next_date
count += 1

def start_requests(self):
for date in self.next_business_days(5):
yield Request(
CourtCallSpider.url,
meta={
"zyte_api_automap": {
"httpResponseHeaders": True,
"browserHtml": True,
"actions": [
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_rblSearchType_2",
},
"timeout": 5,
"onError": "return",
},
{
"action": "click",
"selector": {
"type": "css",
"value": "#MainContent_rblSearchType_2",
},
"onError": "return",
},
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
},
"timeout": 5,
"onError": "return",
},
{
"action": "select",
"selector": {
"type": "css",
"value": "#MainContent_ddlDivisionCode",
},
"values": ["CV"],
"onError": "return",
},
{
"action": "type",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
},
"text": date,
"onError": "return",
},
{
"action": "click",
"selector": {
"type": "css",
"value": "#MainContent_btnSearch",
},
"onError": "return",
},
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_pnlResults",
},
"timeout": 5,
"onError": "return",
},
],
},
"date": date,
"result_page_num": 1,
},
errback=self.handle_error,
callback=self.parse_results,
)

def has_page_num(self, n, response):
"""Check if there's an nth page of court call results."""

tree = html.fromstring(response.text)
page_table = tree.xpath("//table")[1]
next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]")
return bool(next_page_link)

def get_court_calls(self, response):
"""Returns the court calls found on a result page."""

tree = html.fromstring(response.text)
results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]

no_results = results_table.xpath(
".//*[text()[contains(.,'No cases found matching your selected"
"criteria.')]]"
)
if no_results:
return

rows = results_table.xpath(".//tr")
headers = rows[0].xpath(".//a/text()")
for row in rows[1:-1]:
cells = row.xpath(".//td/text()")
if cells:
yield dict(zip(headers, cells))

def extract_form(self, response, form_xpath):
"""
ASP.NET pages are essentially forms that store the data needed to send
POST requests in hidden form inputs on the page.
From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with-
scrapy-a-real-example/
"""

form_data = dict()

for hidden_input in response.xpath(form_xpath).xpath(
".//input[@type='hidden']"
):
name = hidden_input.attrib.get("name")
if name is None:
continue
value = hidden_input.attrib.get("value")
if value is None:
value = ""

form_data[name] = value

return form_data

def get_page_n_form_data(self, n, response):
"""
Returns the form fields needed to send a POST request
for the nth page of court call results.
"""

form_data = self.extract_form(response, "//form[@id='ctl01']")
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Page${n}"
return form_data

def parse_results(self, response):
        # Materialize the generator so the emptiness check works.
        results = list(self.get_court_calls(response))
if not results:
return

for court_call in results:
court_call["hash"] = dict_hash(court_call)
yield court_call

# Request the next page of results
next_page_num = response.meta["result_page_num"] + 1
next_page_exists = self.has_page_num(next_page_num, response)
if not next_page_exists:
return

next_page_form_data = self.get_page_n_form_data(next_page_num, response)
yield FormRequest.from_response(
response,
meta={"result_page_num": next_page_num},
formxpath="//form[@id='ctl01']",
formdata=next_page_form_data,
callback=self.parse_results,
dont_click=True,
)

def _failing_responses(self, response):
self.failures.add(
f"{response.meta['date']} page {response.meta['result_page_num']}"
)

self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

if len(self.failures) > 20:
raise CloseSpider("run of failures")

def handle_error(self, failure):
if failure.check(HttpError):
response = failure.value.response
if response.status in (404, 500):
self._failing_responses(response)
else:
self.logger.error(repr(failure))
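
The spider fingerprints each court call with dict_hash from scripts.hash, which is not among the files shown above. A minimal sketch of what such a helper typically looks like, assuming it hashes a canonical, sorted-key JSON serialization of the scraped record (the committed implementation may differ):

# Hypothetical sketch of a dict_hash() helper; the real scripts/hash.py may
# differ. It derives a stable identifier from a record by hashing a
# canonical (sorted-key) JSON serialization.
import hashlib
import json


def dict_hash(record):
    """Return a stable hex digest for a dict of scraped court call fields."""
    canonical = json.dumps(record, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

In the spider above, this digest becomes the hash column that the Makefile pipeline carries through to cases.db.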
