
Commit

fix lint
laugustyniak committed Jun 3, 2024
1 parent e52d341 commit 1872843
Showing 2 changed files with 170 additions and 90 deletions.
35 changes: 19 additions & 16 deletions scripts/england_wales/00_download_judgements.py
@@ -1,16 +1,17 @@
-import requests
-from bs4 import BeautifulSoup
-import pandas as pd
-from multiprocessing import Pool
 import os
 import time
+from multiprocessing import Pool
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
 from tqdm import tqdm
 
 # Define the base URL
 base_url = "https://caselaw.nationalarchives.gov.uk/judgments/advanced_search?query=&court=ewca%2Fcrim&order=date&per_page=50&page="
 num_pages = 124
 output_folder = "dump"
-csv_file = 'judgments.csv'
+csv_file = "judgments.csv"
 
 # Ensure the output directory exists
 os.makedirs(output_folder, exist_ok=True)
@@ -20,15 +21,15 @@
 def scrape_page(page_number):
     url = base_url + str(page_number)
     response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    soup = BeautifulSoup(response.text, "html.parser")
     results = []
 
-    for li in soup.select('ul.judgment-listing__list > li'):
-        title_tag = li.find('a')
-        date_tag = li.find('time')
+    for li in soup.select("ul.judgment-listing__list > li"):
+        title_tag = li.find("a")
+        date_tag = li.find("time")
 
         if title_tag and date_tag:
-            href = title_tag['href']
+            href = title_tag["href"]
             title = title_tag.text.strip()
             date = date_tag.text.strip()
             link = "https://caselaw.nationalarchives.gov.uk" + href
@@ -40,34 +41,36 @@ def scrape_page(page_number):
 # Download XML files
 def download_xml(data):
     title, link, date, sno = data
-    date_formatted = pd.to_datetime(date).strftime('%Y_%m_%d')
+    date_formatted = pd.to_datetime(date).strftime("%Y_%m_%d")
     xml_url = link + "/data.xml"
     file_name = f"{date_formatted}-{sno}.xml"
     file_path = os.path.join(output_folder, file_name)
 
     response = requests.get(xml_url)
-    with open(file_path, 'wb') as file:
+    with open(file_path, "wb") as file:
         file.write(response.content)
 
     time.sleep(1) # Pause to avoid blocking IP address
 
 
 # Initialize CSV file
 if not os.path.exists(csv_file):
-    pd.DataFrame(columns=['Title', 'Link', 'Date', 'SNo']).to_csv(csv_file, index=False)
+    pd.DataFrame(columns=["Title", "Link", "Date", "SNo"]).to_csv(csv_file, index=False)
 
 # Scrape all pages and process data incrementally
 sno = 1
 for page in tqdm(range(1, num_pages + 1), desc="Scraping pages"):
     results = scrape_page(page)
 
     # Add serial number to each result
-    results_with_sno = [(title, link, date, sno + i) for i, (title, link, date) in enumerate(results)]
+    results_with_sno = [
+        (title, link, date, sno + i) for i, (title, link, date) in enumerate(results)
+    ]
     sno += len(results)
 
     # Save results to CSV incrementally
-    df = pd.DataFrame(results_with_sno, columns=['Title', 'Link', 'Date', 'SNo'])
-    df.to_csv(csv_file, mode='a', header=False, index=False)
+    df = pd.DataFrame(results_with_sno, columns=["Title", "Link", "Date", "SNo"])
+    df.to_csv(csv_file, mode="a", header=False, index=False)
 
     # Download XML files
     with Pool() as pool:
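The last hunk ends at the line "with Pool() as pool:", and the remainder of the file is not shown in this commit. As a rough sketch only, assuming (not confirmed by the diff) that the script continues by mapping download_xml over the rows collected for the page, the continuation would typically look like this:

    # Hypothetical continuation, not part of this commit: dispatch this page's
    # (title, link, date, sno) tuples to worker processes for download.
    with Pool() as pool:
        pool.map(download_xml, results_with_sno)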
