From 7aa4e6e6b12a6ed285e51b8c58bf935789760a00 Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 12 Mar 2024 13:10:32 -0400 Subject: [PATCH 01/10] Add court call scraper --- courtscraper/spiders/court_calls.py | 174 ++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 courtscraper/spiders/court_calls.py diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py new file mode 100644 index 0000000..d969808 --- /dev/null +++ b/courtscraper/spiders/court_calls.py @@ -0,0 +1,174 @@ +from abc import ABC + +from scrapy import Spider, Request +from scrapy.http import FormRequest +from scrapy.spidermiddlewares.httperror import HttpError + +from lxml import html + +from scripts.hash import dict_hash + + +class CourtCallSpider(ABC, Spider): + name = "courtcalls" + url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx" + + def __init__(self, **kwargs): + self.current_page = 1 + super().__init__(**kwargs) + + def nextBusinessDays(self, n): + """Returns the dates of the next n business days.""" + + return ["3/12/2024"] + + def start_requests(self): + for date in self.nextBusinessDays(5): + self.current_page = 1 + yield Request( + CourtCallSpider.url, + meta={ + "zyte_api_automap": { + "httpResponseHeaders": True, + "browserHtml": True, + "actions": [ + { + "action": "waitForSelector", + "selector": { + "type": "css", + "value": "#MainContent_rblSearchType_2", + }, + "timeout": 5, + "onError": "return", + }, + { + "action": "click", + "selector": { + "type": "css", + "value": "#MainContent_rblSearchType_2", + }, + "onError": "return", + }, + { + "action": "waitForSelector", + "selector": { + "type": "css", + "value": "#MainContent_dtTxt", + }, + "timeout": 5, + "onError": "return", + }, + { + "action": "select", + "selector": { + "type": "css", + "value": "#MainContent_ddlDivisionCode", + }, + "values": ["CV"], + "onError": "return", + }, + { + "action": "type", + "selector": { + "type": "css", + "value": "#MainContent_dtTxt", + }, + "text": date, + "onError": "return", + }, + { + "action": "click", + "selector": { + "type": "css", + "value": "#MainContent_btnSearch", + }, + "onError": "return", + }, + { + "action": "waitForSelector", + "selector": { + "type": "css", + "value": "#MainContent_pnlResults", + }, + "timeout": 5, + "onError": "return", + }, + ], + }, + "date": date, + }, + errback=self.handle_error, + ) + + def has_page_num(self, n, response): + """Check if there's another page of court calls.""" + tree = html.fromstring(response.text) + page_table = tree.xpath("//table")[1] + next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]") + return bool(next_page_link) + + def get_court_calls(self, response): + tree = html.fromstring(response.text) + results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0] + + rows = results_table.xpath(".//tr") + headers = rows[0].xpath(".//a/text()") + for row in rows[1:-1]: + cells = row.xpath(".//td/text()") + if cells: + yield dict(zip(headers, cells)) + + def extract_form(self, response, form_xpath): + form_data = dict() + + for hidden_input in response.xpath(form_xpath).xpath( + ".//input[@type='hidden']" + ): + name = hidden_input.attrib.get("name") + if name is None: + continue + value = hidden_input.attrib.get("value") + if value is None: + value = "" + + form_data[name] = value + + return form_data + + def get_page_n_form_data(self, n, response): + form_data = self.extract_form(response, "//form[@id='ctl01']") + form_data["__EVENTTARGET"] = 
"ctl00$MainContent$grdRecords" + form_data["__EVENTARGUMENT"] = f"Page${n}" + return form_data + + def parse(self, response): + cases = self.get_court_calls(response) + for case in cases: + case["hash"] = dict_hash(case) + yield case + + breakpoint() + self.current_page += 1 + next_page = self.has_page_num(self.current_page, response) + if not next_page: + return + + # self._success(response) + next_page_form_data = self.get_page_n_form_data(self.current_page, response) + yield FormRequest.from_response( + response, + formxpath="//form[@id='ctl01']", + formdata=next_page_form_data, + callback=self.parse, + dont_click=True, + ) + + def handle_error(self, failure): + if failure.check(HttpError): + response = failure.value.response + if response.status == 404: + self._missing_case(response) + elif response.status == 500: + self._failing_responses(response) + else: + self.logger.error(repr(failure)) From 27bc7459e3c3f2948729c635fd056ce47f849f86 Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 12 Mar 2024 13:31:49 -0400 Subject: [PATCH 02/10] Add date function --- courtscraper/spiders/court_calls.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index d969808..e6c04a7 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -1,4 +1,5 @@ from abc import ABC +from datetime import datetime, timedelta from scrapy import Spider, Request from scrapy.http import FormRequest @@ -20,7 +21,18 @@ def __init__(self, **kwargs): def nextBusinessDays(self, n): """Returns the dates of the next n business days.""" - return ["3/12/2024"] + current_date = datetime.today() + count = 0 + while count <= n: + yield f"{current_date.month}/{current_date.day}/{current_date.year}" + + next_date = current_date + timedelta(days=1) + while next_date.weekday() > 4: + # Skip weekends + next_date += timedelta(days=1) + + current_date = next_date + count += 1 def start_requests(self): for date in self.nextBusinessDays(5): @@ -147,7 +159,6 @@ def parse(self, response): case["hash"] = dict_hash(case) yield case - breakpoint() self.current_page += 1 next_page = self.has_page_num(self.current_page, response) if not next_page: From c112deddfa0259ad1aad1a05e6c8d242559a6fcf Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 12 Mar 2024 13:37:52 -0400 Subject: [PATCH 03/10] Use request meta to store current page num --- courtscraper/spiders/court_calls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index e6c04a7..b84bcfb 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -15,7 +15,6 @@ class CourtCallSpider(ABC, Spider): url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx" def __init__(self, **kwargs): - self.current_page = 1 super().__init__(**kwargs) def nextBusinessDays(self, n): @@ -36,7 +35,6 @@ def nextBusinessDays(self, n): def start_requests(self): for date in self.nextBusinessDays(5): - self.current_page = 1 yield Request( CourtCallSpider.url, meta={ @@ -108,6 +106,7 @@ def start_requests(self): ], }, "date": date, + "result_page_num": 1, }, errback=self.handle_error, ) @@ -159,15 +158,16 @@ def parse(self, response): case["hash"] = dict_hash(case) yield case - self.current_page += 1 - next_page = self.has_page_num(self.current_page, response) - if not next_page: + next_page_num = response.meta["result_page_num"] + 1 + 
next_page_exists = self.has_page_num(next_page_num, response) + if not next_page_exists: return # self._success(response) - next_page_form_data = self.get_page_n_form_data(self.current_page, response) + next_page_form_data = self.get_page_n_form_data(next_page_num, response) yield FormRequest.from_response( response, + meta={"result_page_num": next_page_num}, formxpath="//form[@id='ctl01']", formdata=next_page_form_data, callback=self.parse, From 4a4b4933e04c901ef1e2bf68291c11c1dec2e763 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 13 Mar 2024 13:00:30 -0400 Subject: [PATCH 04/10] Add court call makefile and action --- .github/workflows/nightly.yml | 5 +++- Makefile.courtcalls | 14 +++++++++++ scripts/import_court_calls.sql | 46 ++++++++++++++++++++++++++++++++++ scripts/initialize_db.sql | 14 +++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 Makefile.courtcalls create mode 100644 scripts/import_court_calls.sql diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 214b44d..b14eefb 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -35,11 +35,14 @@ jobs: run: | unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip - - name: Run scrape + - name: Run case scrape run: | echo $BEGIN_COURTS_RUN make get_new_records + - name: Scrape court calls + make -f Makefile.courtcalls all + - name: Setup database for upload run: | zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db diff --git a/Makefile.courtcalls b/Makefile.courtcalls new file mode 100644 index 0000000..8f2c247 --- /dev/null +++ b/Makefile.courtcalls @@ -0,0 +1,14 @@ +# Makefile for scraping court calls + +.PHONY : all +all: court_calls.csv cases.db + cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail + +court_calls.csv: court_calls.json + cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@ + +court_calls.json: court_calls.jl + cat $^ | jq --slurp '.' 
> $@ + +court_calls.jl : cases.db + scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@ diff --git a/scripts/import_court_calls.sql b/scripts/import_court_calls.sql new file mode 100644 index 0000000..7d4b833 --- /dev/null +++ b/scripts/import_court_calls.sql @@ -0,0 +1,46 @@ +CREATE TEMPORARY TABLE raw_court_call ( + case_number text, + division text, + plaintiff text, + defendant text, + court_date text, + room text, + district text, + sequence text, + time text, + hash text +); + +-- noqa: disable=PRS +.mode csv +.import /dev/stdin raw_court_call +-- noqa: enable=PRS + +-- Find and insert the new court calls +INSERT INTO + court_call( + case_number, + division, + plaintiff, + defendant, + court_date, + room, + district, + sequence, + time, + hash + ) +SELECT + case_number, + division, + plaintiff, + defendant, + court_date, + room, + district, + sequence, + time, + hash +FROM + raw_court_call +WHERE raw_court_call.hash NOT IN (SELECT hash FROM court_call); diff --git a/scripts/initialize_db.sql b/scripts/initialize_db.sql index 164b5f6..76388e5 100644 --- a/scripts/initialize_db.sql +++ b/scripts/initialize_db.sql @@ -39,3 +39,17 @@ CREATE TABLE event( comments text, FOREIGN KEY(case_number) REFERENCES court_case(case_number) ); + +CREATE TABLE court_call( + case_number text not null, + division, + plaintiff, + defendant, + court_date, + room, + district, + sequence, + time, + hash, + FOREIGN KEY(case_number) REFERENCES court_case(case_number) +); From 016ce420b45b5db10bf6ce0792f200ed1bbe54f1 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 13 Mar 2024 13:09:16 -0400 Subject: [PATCH 05/10] Add comments --- courtscraper/spiders/court_calls.py | 34 ++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index b84bcfb..44a5564 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -109,19 +109,30 @@ def start_requests(self): "result_page_num": 1, }, errback=self.handle_error, + callback=self.parse_results, ) def has_page_num(self, n, response): - """Check if there's another page of court calls.""" + """Check if there's an nth page of court call results.""" + tree = html.fromstring(response.text) page_table = tree.xpath("//table")[1] next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]") return bool(next_page_link) def get_court_calls(self, response): + """Returns the court calls found on a result page.""" + tree = html.fromstring(response.text) results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0] + no_results = results_table.xpath( + ".//*[text()[contains(.,'No cases found matching your selected" + "criteria.')]]" + ) + if no_results: + return + rows = results_table.xpath(".//tr") headers = rows[0].xpath(".//a/text()") for row in rows[1:-1]: @@ -130,6 +141,14 @@ def get_court_calls(self, response): yield dict(zip(headers, cells)) def extract_form(self, response, form_xpath): + """ + ASP.NET pages are essentially forms that store the data needed to send + POST requests in hidden form inputs on the page. 
+ + From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with- + scrapy-a-real-example/ + """ + form_data = dict() for hidden_input in response.xpath(form_xpath).xpath( @@ -147,17 +166,26 @@ def extract_form(self, response, form_xpath): return form_data def get_page_n_form_data(self, n, response): + """ + Returns the form fields needed to send a POST request + for the nth page of court call results. + """ + form_data = self.extract_form(response, "//form[@id='ctl01']") form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords" form_data["__EVENTARGUMENT"] = f"Page${n}" return form_data - def parse(self, response): + def parse_results(self, response): cases = self.get_court_calls(response) + if not cases: + return + for case in cases: case["hash"] = dict_hash(case) yield case + # Request the next page of results next_page_num = response.meta["result_page_num"] + 1 next_page_exists = self.has_page_num(next_page_num, response) if not next_page_exists: @@ -170,7 +198,7 @@ def parse(self, response): meta={"result_page_num": next_page_num}, formxpath="//form[@id='ctl01']", formdata=next_page_form_data, - callback=self.parse, + callback=self.parse_results, dont_click=True, ) From e19e9a9fbf6a1051515de51815d1e6e34078dc84 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 13 Mar 2024 13:15:52 -0400 Subject: [PATCH 06/10] Add failure method --- courtscraper/spiders/court_calls.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index 44a5564..0f08acc 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -3,6 +3,7 @@ from scrapy import Spider, Request from scrapy.http import FormRequest +from scrapy.exceptions import CloseSpider from scrapy.spidermiddlewares.httperror import HttpError from lxml import html @@ -15,6 +16,7 @@ class CourtCallSpider(ABC, Spider): url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx" def __init__(self, **kwargs): + self.failures = set() super().__init__(**kwargs) def nextBusinessDays(self, n): @@ -191,7 +193,6 @@ def parse_results(self, response): if not next_page_exists: return - # self._success(response) next_page_form_data = self.get_page_n_form_data(next_page_num, response) yield FormRequest.from_response( response, @@ -202,12 +203,20 @@ def parse_results(self, response): dont_click=True, ) + def _failing_responses(self, response): + self.failures.add( + f"{response.meta['date']} page {response.meta['result_page_num']}" + ) + + self.logger.info(f'failures: {", ".join(sorted(self.failures))}') + + if len(self.failures) > 20: + raise CloseSpider("run of failures") + def handle_error(self, failure): if failure.check(HttpError): response = failure.value.response - if response.status == 404: - self._missing_case(response) - elif response.status == 500: + if response.status in (404, 500): self._failing_responses(response) else: self.logger.error(repr(failure)) From df9aed12a7edf341a6676d02466fbe1223ea9336 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 13 Mar 2024 15:09:22 -0400 Subject: [PATCH 07/10] Fix workflow --- .github/workflows/nightly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index b14eefb..3176c12 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -41,6 +41,7 @@ jobs: make get_new_records - name: Scrape court calls + run: | make -f Makefile.courtcalls all - name: Setup database for 
upload From 0378cfd2474c018d9833fa5a3a033fb3bc0eaa77 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 13 Mar 2024 15:41:58 -0400 Subject: [PATCH 08/10] Clarify variables --- courtscraper/spiders/court_calls.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index 0f08acc..4536716 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -179,13 +179,13 @@ def get_page_n_form_data(self, n, response): return form_data def parse_results(self, response): - cases = self.get_court_calls(response) - if not cases: + results = self.get_court_calls(response) + if not results: return - for case in cases: - case["hash"] = dict_hash(case) - yield case + for court_call in results: + court_call["hash"] = dict_hash(court_call) + yield court_call # Request the next page of results next_page_num = response.meta["result_page_num"] + 1 From 361e766947a6d284824a1aa72fb6acbcd4649c9c Mon Sep 17 00:00:00 2001 From: msj Date: Fri, 15 Mar 2024 14:44:31 -0400 Subject: [PATCH 09/10] Cleanup --- courtscraper/spiders/court_calls.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/courtscraper/spiders/court_calls.py b/courtscraper/spiders/court_calls.py index 4536716..d7d2217 100644 --- a/courtscraper/spiders/court_calls.py +++ b/courtscraper/spiders/court_calls.py @@ -1,4 +1,3 @@ -from abc import ABC from datetime import datetime, timedelta from scrapy import Spider, Request @@ -11,7 +10,7 @@ from scripts.hash import dict_hash -class CourtCallSpider(ABC, Spider): +class CourtCallSpider(Spider): name = "courtcalls" url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx" @@ -19,7 +18,7 @@ def __init__(self, **kwargs): self.failures = set() super().__init__(**kwargs) - def nextBusinessDays(self, n): + def next_business_days(self, n): """Returns the dates of the next n business days.""" current_date = datetime.today() @@ -36,7 +35,7 @@ def nextBusinessDays(self, n): count += 1 def start_requests(self): - for date in self.nextBusinessDays(5): + for date in self.next_business_days(5): yield Request( CourtCallSpider.url, meta={ From a5675f909c9ee01bcf458f296096148cf05559f8 Mon Sep 17 00:00:00 2001 From: msj Date: Fri, 15 Mar 2024 14:47:57 -0400 Subject: [PATCH 10/10] Separate court call scrape into its own action --- .github/workflows/court_calls.yml | 116 ++++++++++++++++++++++++++++++ .github/workflows/nightly.yml | 4 -- 2 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/court_calls.yml diff --git a/.github/workflows/court_calls.yml b/.github/workflows/court_calls.yml new file mode 100644 index 0000000..adff019 --- /dev/null +++ b/.github/workflows/court_calls.yml @@ -0,0 +1,116 @@ +name: Court call scrape + +on: + workflow_dispatch: + schedule: + - cron: '15 10 * * *' + +jobs: + scrape: + name: Scrape court calls + runs-on: ubuntu-latest + + steps: + - name: Set current date as env variable + run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV + - uses: actions/checkout@v3 + - name: upgrade sqlite3 + run: | + sudo apt-get update + sudo apt-get install sqlite3 + + - name: Install requirements + run: | + pip install -U pyopenssl cryptography + pip install -r requirements.txt + + - name: Download latest database zip + uses: robinraju/release-downloader@v1.8 + with: + latest: true + tag: "nightly" + fileName: "*.db.zip" + + - name: Decrypt database + run: | + unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip 
&& rm cases.db.zip + + - name: Scrape court calls + run: | + make -f Makefile.courtcalls all + + - name: Setup database for upload + run: | + zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db + + - name: Upload new release + uses: WebFreak001/deploy-nightly@v3.0.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label} + release_id: 131985702 + asset_path: ./cases.db.zip + asset_name: cases.db.zip + asset_content_type: application/zip # required by GitHub API + max_releases: 7 + + - name: Keepalive + uses: gautamkrishnar/keepalive-workflow@v1 + + deploy: + name: Deploy to Heroku + needs: scrape + runs-on: ubuntu-latest + + env: + HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }} + + steps: + - uses: actions/checkout@v3 + + - name: Install requirements + run: pip install -r requirements.txt + + - name: Download latest database zip + uses: robinraju/release-downloader@v1.8 + with: + latest: true + tag: "nightly" + fileName: "*.db.zip" + + - name: Decrypt database + run: | + unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip + + - name: Install heroku-builds plugin + run: | + heroku plugins:install heroku-builds + + - name: Login to Heroku CLI + uses: akhileshns/heroku-deploy@v3.12.14 + with: + heroku_api_key: ${{ secrets.HEROKU_API_KEY }} + heroku_app_name: "" + heroku_email: ${{ secrets.HEROKU_EMAIL }} + justlogin: true + + - name: Install Datasette plugins + run: | + datasette install datasette-auth-passwords datasette-auth-tokens + + - name: Get hashed Datasette password + run: | + # Store hash as an environment variable + hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \ + | datasette hash-password --no-confirm); \ + echo "hash=$hash" >> $GITHUB_ENV + + - name: Deploy Datasette instance to Heroku + run: | + datasette publish heroku cases.db \ + -n court-scraper \ + -m metadata.json \ + --setting sql_time_limit_ms 60000 \ + --install datasette-auth-passwords \ + --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}' diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3176c12..e5a3130 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -40,10 +40,6 @@ jobs: echo $BEGIN_COURTS_RUN make get_new_records - - name: Scrape court calls - run: | - make -f Makefile.courtcalls all - - name: Setup database for upload run: | zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
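
The spider in these patches imports dict_hash from scripts/hash.py, and import_court_calls.sql relies on that hash to skip court calls already present in the court_call table, but the helper itself is not part of this patch series. A minimal sketch of what such a function might look like, assuming it canonicalizes the scraped row and returns a hex digest (the real scripts/hash.py may differ):

    import hashlib
    import json


    def dict_hash(record: dict) -> str:
        """Return a stable hex digest for a scraped row, independent of key order."""
        # Serialize with sorted keys so logically identical rows hash the same.
        canonical = json.dumps(record, sort_keys=True, separators=(",", ":"))
        return hashlib.md5(canonical.encode("utf-8")).hexdigest()

Whatever the actual implementation, it needs to be deterministic across runs: the NOT IN (SELECT hash FROM court_call) filter in import_court_calls.sql is the only guard against inserting duplicate rows.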