UK data fixes #24

Merged (7 commits) on Jun 3, 2024
4 changes: 4 additions & 0 deletions data/datasets/en/.gitignore
@@ -0,0 +1,4 @@
+/england_wales_data_refined_7.jsonl
+/en_judgements_dataset
+/xml
+/csv
6 changes: 6 additions & 0 deletions data/datasets/en/csv.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 9dd651ab42dcab35b1431c4163a041ba.dir
+  size: 583602
+  nfiles: 1
+  hash: md5
+  path: csv
6 changes: 6 additions & 0 deletions data/datasets/en/en_judgements_dataset.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: ccf9486ab2d4b38836e50d06d5a080d9.dir
+  size: 168746581
+  nfiles: 3
+  hash: md5
+  path: en_judgements_dataset
5 changes: 5 additions & 0 deletions data/datasets/en/england_wales_data_refined_7.jsonl.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 0634fad494a0ec6837834d1b8fd28f5e
+  size: 172557440
+  hash: md5
+  path: england_wales_data_refined_7.jsonl
6 changes: 6 additions & 0 deletions data/datasets/en/xml.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 9203a565235f9431cc3beda483b5f727.dir
+  size: 75196782
+  nfiles: 1
+  hash: md5
+  path: xml
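
The .dvc metafiles above record only content hashes; the data itself lives in a DVC remote. A minimal sketch of reading one of the tracked files from Python, assuming a configured remote (the repo URL below is a placeholder, not taken from this PR):

import dvc.api

# Stream the refined JSONL dataset straight from the DVC remote,
# without checking out the whole repository locally.
with dvc.api.open(
    "data/datasets/en/england_wales_data_refined_7.jsonl",
    repo="https://github.com/<org>/<repo>",  # placeholder repo URL
    mode="r",
) as f:
    print(f.readline())  # first JSON record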

[Two additional files in this PR have large diffs that are not rendered by default.]

35 changes: 19 additions & 16 deletions scripts/england_wales/00_download_judgements.py
@@ -1,16 +1,17 @@
-import requests
-from bs4 import BeautifulSoup
-import pandas as pd
-from multiprocessing import Pool
 import os
 import time
+from multiprocessing import Pool
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
 from tqdm import tqdm
 
 # Define the base URL
 base_url = "https://caselaw.nationalarchives.gov.uk/judgments/advanced_search?query=&court=ewca%2Fcrim&order=date&per_page=50&page="
 num_pages = 124
 output_folder = "dump"
-csv_file = 'judgments.csv'
+csv_file = "judgments.csv"
 
 # Ensure the output directory exists
 os.makedirs(output_folder, exist_ok=True)
@@ -20,15 +21,15 @@
 def scrape_page(page_number):
     url = base_url + str(page_number)
     response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    soup = BeautifulSoup(response.text, "html.parser")
     results = []
 
-    for li in soup.select('ul.judgment-listing__list > li'):
-        title_tag = li.find('a')
-        date_tag = li.find('time')
+    for li in soup.select("ul.judgment-listing__list > li"):
+        title_tag = li.find("a")
+        date_tag = li.find("time")
 
         if title_tag and date_tag:
-            href = title_tag['href']
+            href = title_tag["href"]
             title = title_tag.text.strip()
             date = date_tag.text.strip()
             link = "https://caselaw.nationalarchives.gov.uk" + href
@@ -40,34 +41,36 @@ def scrape_page(page_number):
 # Download XML files
 def download_xml(data):
     title, link, date, sno = data
-    date_formatted = pd.to_datetime(date).strftime('%Y_%m_%d')
+    date_formatted = pd.to_datetime(date).strftime("%Y_%m_%d")
     xml_url = link + "/data.xml"
     file_name = f"{date_formatted}-{sno}.xml"
     file_path = os.path.join(output_folder, file_name)
 
     response = requests.get(xml_url)
-    with open(file_path, 'wb') as file:
+    with open(file_path, "wb") as file:
         file.write(response.content)
 
     time.sleep(1)  # Pause to avoid blocking IP address
 
 
 # Initialize CSV file
 if not os.path.exists(csv_file):
-    pd.DataFrame(columns=['Title', 'Link', 'Date', 'SNo']).to_csv(csv_file, index=False)
+    pd.DataFrame(columns=["Title", "Link", "Date", "SNo"]).to_csv(csv_file, index=False)
 
 # Scrape all pages and process data incrementally
 sno = 1
 for page in tqdm(range(1, num_pages + 1), desc="Scraping pages"):
     results = scrape_page(page)
 
     # Add serial number to each result
-    results_with_sno = [(title, link, date, sno + i) for i, (title, link, date) in enumerate(results)]
+    results_with_sno = [
+        (title, link, date, sno + i) for i, (title, link, date) in enumerate(results)
+    ]
    sno += len(results)
 
     # Save results to CSV incrementally
-    df = pd.DataFrame(results_with_sno, columns=['Title', 'Link', 'Date', 'SNo'])
-    df.to_csv(csv_file, mode='a', header=False, index=False)
+    df = pd.DataFrame(results_with_sno, columns=["Title", "Link", "Date", "SNo"])
+    df.to_csv(csv_file, mode="a", header=False, index=False)
 
     # Download XML files
     with Pool() as pool:
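
The diff is collapsed immediately after the with Pool() as pool: line, so the body of that block is not shown in the PR. A hypothetical completion, assuming the pool simply maps download_xml over the rows scraped on this page (the pool.map call is an assumption, not part of the diff):

# Hypothetical body of the collapsed block; not shown in this PR.
# download_xml and results_with_sno are defined in the script above.
with Pool() as pool:
    pool.map(download_xml, results_with_sno)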