From 5428e163bd6c20592422a4d266024af3443664e6 Mon Sep 17 00:00:00 2001 From: tsantosh7 Date: Sat, 1 Jun 2024 02:30:40 +0100 Subject: [PATCH 1/2] added scripts to download xml files and convert to jsonl --- .../england_wales/00_download_judgements.py | 76 +++++++++++++++++++ scripts/england_wales/01_extract_jsonl.py | 59 ++++++++++++++ .../england_wales/02_extract_jsonl_refined.py | 63 +++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 scripts/england_wales/00_download_judgements.py create mode 100644 scripts/england_wales/01_extract_jsonl.py create mode 100644 scripts/england_wales/02_extract_jsonl_refined.py diff --git a/scripts/england_wales/00_download_judgements.py b/scripts/england_wales/00_download_judgements.py new file mode 100644 index 0000000..d33bb85 --- /dev/null +++ b/scripts/england_wales/00_download_judgements.py @@ -0,0 +1,76 @@ +import requests +from bs4 import BeautifulSoup +import pandas as pd +from multiprocessing import Pool +import os +import time +from tqdm import tqdm + +# Define the base URL +base_url = "https://caselaw.nationalarchives.gov.uk/judgments/advanced_search?query=&court=ewca%2Fcrim&order=date&per_page=50&page=" +num_pages = 124 +output_folder = "dump" +csv_file = 'judgments.csv' + +# Ensure the output directory exists +os.makedirs(output_folder, exist_ok=True) + + +# Scrape data from a single page +def scrape_page(page_number): + url = base_url + str(page_number) + response = requests.get(url) + soup = BeautifulSoup(response.text, 'html.parser') + results = [] + + for li in soup.select('ul.judgment-listing__list > li'): + title_tag = li.find('a') + date_tag = li.find('time') + + if title_tag and date_tag: + href = title_tag['href'] + title = title_tag.text.strip() + date = date_tag.text.strip() + link = "https://caselaw.nationalarchives.gov.uk" + href + results.append((title, link, date)) + + return results + + +# Download XML files +def download_xml(data): + title, link, date, sno = data + date_formatted = 
pd.to_datetime(date).strftime('%Y_%m_%d') + xml_url = link + "/data.xml" + file_name = f"{date_formatted}-{sno}.xml" + file_path = os.path.join(output_folder, file_name) + + response = requests.get(xml_url) + with open(file_path, 'wb') as file: + file.write(response.content) + + time.sleep(1) # Pause to avoid blocking IP address + + +# Initialize CSV file +if not os.path.exists(csv_file): + pd.DataFrame(columns=['Title', 'Link', 'Date', 'SNo']).to_csv(csv_file, index=False) + +# Scrape all pages and process data incrementally +sno = 1 +for page in tqdm(range(1, num_pages + 1), desc="Scraping pages"): + results = scrape_page(page) + + # Add serial number to each result + results_with_sno = [(title, link, date, sno + i) for i, (title, link, date) in enumerate(results)] + sno += len(results) + + # Save results to CSV incrementally + df = pd.DataFrame(results_with_sno, columns=['Title', 'Link', 'Date', 'SNo']) + df.to_csv(csv_file, mode='a', header=False, index=False) + + # Download XML files + with Pool() as pool: + pool.map(download_xml, results_with_sno) + +print("Scraping and downloading completed successfully!") diff --git a/scripts/england_wales/01_extract_jsonl.py b/scripts/england_wales/01_extract_jsonl.py new file mode 100644 index 0000000..735498a --- /dev/null +++ b/scripts/england_wales/01_extract_jsonl.py @@ -0,0 +1,59 @@ +import os +import json +from bs4 import BeautifulSoup +from tqdm import tqdm + +def extract_information_from_xml(xml_content): + soup = BeautifulSoup(xml_content, 'lxml') + + # Extract required fields + _id = soup.find('uk:hash').text + signature = soup.find('neutralcitation').text if soup.find('neutralcitation') else None + hearing_date = soup.find('hearingdate').text if soup.find('hearingdate') else None + date = hearing_date.strip() if hearing_date else None + publication_date = soup.find('frbrwork').find('frbrdate')['date'] + court_type = soup.find('courttype').text if soup.find('courttype') else None + + # Get the excerpt + 
header_text = soup.header.get_text(separator=' ', strip=True) + excerpt = header_text[:500] + + # Get the full content of the judgment body as XML string + judgment_body = soup.find('judgmentbody') + content = str(judgment_body) if judgment_body else None + + # Get the judges list + judges = [judge.get_text() for judge in soup.find_all('judge')] + + # Get case numbers + case_numbers = [case_number.get_text() for case_number in soup.find_all('p', class_='CoverText') if + 'Case Nos:' in case_number.text] + case_numbers = [num.strip() for sublist in case_numbers for num in sublist.replace('Case Nos:', '').split()] + + return { + "_id": _id, + "signature": signature, + "date": date, + "publicationDate": publication_date, + "type": court_type, + "excerpt": excerpt, + "content": content, + "judges": judges, + "caseNumbers": case_numbers + } + + +def process_directory(directory_path, output_file): + with open(output_file, 'w') as jsonl_file: + xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')] + for filename in tqdm(xml_files, desc="Processing XML files"): + file_path = os.path.join(directory_path, filename) + with open(file_path, 'r', encoding='utf-8') as xml_file: + xml_content = xml_file.read() + judgment_data = extract_information_from_xml(xml_content) + jsonl_file.write(json.dumps(judgment_data) + '\n') + +directory_path = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/dump/' +output_file = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/englad_wales_data.jsonl' + +process_directory(directory_path, output_file) diff --git a/scripts/england_wales/02_extract_jsonl_refined.py b/scripts/england_wales/02_extract_jsonl_refined.py new file mode 100644 index 0000000..6008656 --- /dev/null +++ b/scripts/england_wales/02_extract_jsonl_refined.py @@ -0,0 +1,63 @@ +import os +import json +from bs4 import BeautifulSoup +from tqdm import tqdm + + +def extract_information_from_xml(xml_content): + soup = 
BeautifulSoup(xml_content, 'html.parser') + + # Extract required fields + _id = soup.find('uk:hash').text + signature = soup.find('neutralcitation').text if soup.find('neutralcitation') else None + hearing_date = soup.find('hearingdate').text if soup.find('hearingdate') else None + date = hearing_date.strip() if hearing_date else None + publication_date = soup.find('frbrwork').find('frbrdate')['date'] + court_type = soup.find('courttype').text if soup.find('courttype') else None + + # Get the excerpt + header_text = soup.header.get_text(separator=' ', strip=True) + excerpt = header_text[:500] + + # Get the full content of the header and judgment body as text + header_content = soup.header.get_text(separator='\n', strip=True) + judgment_body_content = soup.find('judgmentbody').get_text(separator='\n', strip=True) if soup.find( + 'judgmentbody') else "" + content = header_content + "\n" + judgment_body_content + + # Get the judges list + judges = [judge.get_text() for judge in soup.find_all('judge')] + + # Get case numbers + case_numbers = [case_number.get_text() for case_number in soup.find_all('p', class_='CoverText') if + 'Case Nos:' in case_number.text] + case_numbers = [num.strip() for sublist in case_numbers for num in sublist.replace('Case Nos:', '').split()] + + return { + "_id": _id, + "signature": signature, + "date": date, + "publicationDate": publication_date, + "type": court_type, + "excerpt": excerpt, + "content": content, + "judges": judges, + "caseNumbers": case_numbers + } + + +def process_directory(directory_path, output_file): + with open(output_file, 'w') as jsonl_file: + xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')] + for filename in tqdm(xml_files, desc="Processing XML files"): + file_path = os.path.join(directory_path, filename) + with open(file_path, 'r', encoding='utf-8') as xml_file: + xml_content = xml_file.read() + judgment_data = extract_information_from_xml(xml_content) + 
jsonl_file.write(json.dumps(judgment_data) + '\n') + + +directory_path = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/dump/' +output_file = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/englad_wales_data_refined.jsonl' + +process_directory(directory_path, output_file) From b43904d77a3cc92e8ce8ed69bedf361be5553728 Mon Sep 17 00:00:00 2001 From: tsantosh7 Date: Sun, 2 Jun 2024 15:44:42 +0100 Subject: [PATCH 2/2] extracted jsons and analysed data --- .../01_Analyze_En_Judgements_Texts.ipynb | 745 ++++++++++++++++++ .../england-wales/02_Analyse_En_Dataset.ipynb | 480 +++++++++++ scripts/england_wales/01_extract_jsonl.py | 59 -- .../england_wales/01_extract_jsonl_refined.py | 216 +++++ .../england_wales/02_extract_jsonl_refined.py | 63 -- 5 files changed, 1441 insertions(+), 122 deletions(-) create mode 100644 nbs/Data/england-wales/01_Analyze_En_Judgements_Texts.ipynb create mode 100644 nbs/Data/england-wales/02_Analyse_En_Dataset.ipynb delete mode 100644 scripts/england_wales/01_extract_jsonl.py create mode 100644 scripts/england_wales/01_extract_jsonl_refined.py delete mode 100644 scripts/england_wales/02_extract_jsonl_refined.py diff --git a/nbs/Data/england-wales/01_Analyze_En_Judgements_Texts.ipynb b/nbs/Data/england-wales/01_Analyze_En_Judgements_Texts.ipynb new file mode 100644 index 0000000..55ff4ad --- /dev/null +++ b/nbs/Data/england-wales/01_Analyze_En_Judgements_Texts.ipynb @@ -0,0 +1,745 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9e365555", + "metadata": {}, + "source": [ + "# Analyze Text of England and Wales Judgements\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6b666da3-f393-4d88-8036-e818937d2305", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import string\n", + "from datasets import Dataset, DatasetDict, load_dataset, load_from_disk\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 2, + "id": "27d73a94-5cd3-4820-938c-a827b8c34bd0", + "metadata": {}, + "outputs": [], + "source": [ + "path_ = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/'\n", + "jsonl_file = path_+'england_wales_data_refined_7.jsonl'\n", + "dataset_path = path_+'en_judgements_dataset'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c2851986-f950-4a21-b3e1-7ce58f6fa4a4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ce2fe916d004bd099ef2a3b7a509d83", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Saving the dataset (0/1 shards): 0%| | 0/6154 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idtypeappeal_typeappeal_outcomecharsnum_dummy_tokensnum_non_ws_tokens
0ab0224364e4cf6562c82f8861d5268d4fa22b2ec45e0f7...crown_court<NA><NA>1244422292155
1d4630d93258ea51ecff4bc4015443b4eecf8d9b2e5b7c5...supreme_courtconviction<NA>2097736813586
237183a714b626cfe98081ac0250c804f992f340281f6d2...crown_court<NA><NA>4057071997097
3b41933b19505ab8767ce30faf8db9524f737ec5ac2c17e...crown_court<NA><NA>1945935153432
4418382a2a6c0c32d3d2bd4cb7b39e1ba259dc6bf56a78e...crown_court<NA>allowed1035218791793
\n", + "" + ], + "text/plain": [ + " _id type \\\n", + "0 ab0224364e4cf6562c82f8861d5268d4fa22b2ec45e0f7... crown_court \n", + "1 d4630d93258ea51ecff4bc4015443b4eecf8d9b2e5b7c5... supreme_court \n", + "2 37183a714b626cfe98081ac0250c804f992f340281f6d2... crown_court \n", + "3 b41933b19505ab8767ce30faf8db9524f737ec5ac2c17e... crown_court \n", + "4 418382a2a6c0c32d3d2bd4cb7b39e1ba259dc6bf56a78e... crown_court \n", + "\n", + " appeal_type appeal_outcome chars num_dummy_tokens num_non_ws_tokens \n", + "0 12444 2229 2155 \n", + "1 conviction 20977 3681 3586 \n", + "2 40570 7199 7097 \n", + "3 19459 3515 3432 \n", + "4 allowed 10352 1879 1793 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# | eval: false\n", + "stats = (\n", + " ds.select_columns([\"_id\", \"type\", \"appeal_type\", \"appeal_outcome\", \"chars\", \"num_dummy_tokens\", \"num_non_ws_tokens\"])\n", + " .to_pandas()\n", + " .convert_dtypes(dtype_backend=\"pyarrow\")\n", + ")\n", + "stats[\"type\"] = stats[\"type\"].astype(\"category\")\n", + "stats.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c6bb139f-4340-45b4-a277-43da9d31a8f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 1.0, '#tokens distribution')]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "ax = sns.histplot(\n", + " x=stats[\"num_non_ws_tokens\"],\n", + " log_scale=True,\n", + " bins=50,\n", + ")\n", + "ax.set(title=\"#tokens distribution\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e2d55b29-5e3a-4b80-827f-9a12e9ff4b97", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "court_type_card_order = stats[\"type\"].value_counts().index.tolist()\n", + "court_type_data = stats[\"type\"].value_counts().plot.barh(logx=True, title=\"Types cardinality\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e1051c95-4339-4a5e-bb1a-559ea811c5ec", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "appeal_type_card_order = stats[\"appeal_type\"].value_counts().index.tolist()\n", + "appeal_type_data = stats[\"appeal_type\"].value_counts().plot.barh(logx=True, title=\"Types cardinality\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c4a68318-5880-4d5f-9690-80235ed0bfe4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "appeal_outcome_card_order = stats[\"appeal_outcome\"].value_counts().index.tolist()\n", + "appeal_outcome_data = stats[\"appeal_outcome\"].value_counts().plot.barh(logx=True, title=\"Types cardinality\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ef0ec395-bd03-47bf-84b2-7adf338595f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "# sns.displot(data=stats, x=\"num_non_ws_tokens\", col=\"type\", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind=\"hist\", bins=25)\n", + "\n", + "_, ax = plt.subplots(figsize=(8, 12))\n", + "ax.set(title=\"Per type text length ditribution\")\n", + "sns.boxenplot(data=stats, y=\"type\", x=\"num_non_ws_tokens\", order=court_type_card_order, log_scale=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "06f8c2b2-8f87-4876-b58c-a164c3412c31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "# sns.displot(data=stats, x=\"num_non_ws_tokens\", col=\"type\", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind=\"hist\", bins=25)\n", + "\n", + "_, ax = plt.subplots(figsize=(8, 12))\n", + "ax.set(title=\"Per type text length ditribution\")\n", + "sns.boxenplot(data=stats, y=\"appeal_type\", x=\"num_non_ws_tokens\", order=appeal_type_card_order, log_scale=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "1de5e68f-8ae4-4a67-bdd1-c84146d2475e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "# sns.displot(data=stats, x=\"num_non_ws_tokens\", col=\"type\", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind=\"hist\", bins=25)\n", + "\n", + "_, ax = plt.subplots(figsize=(8, 12))\n", + "ax.set(title=\"Per type text length ditribution\")\n", + "sns.boxenplot(data=stats, y=\"appeal_outcome\", x=\"num_non_ws_tokens\", order=appeal_outcome_card_order, log_scale=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ea06ef3f-c12d-4da6-9fc6-45f1809dabad", + "metadata": {}, + "source": [ + "# Tokenize\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "08c70fdc-0b03-4983-8da9-8d065161d3e7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], + "source": [ + "# | eval: false\n", + "from transformers import AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0af8c3ba-aa89-4e1a-bfcb-65b618c4559e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9f849bfbaab840c7883c4e321f589d87", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/418 [00:00 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (7729 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (4093 > 512). 
Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (968 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2180 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (1937 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2857 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (8490 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (17735 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (22812 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3021 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2964 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (1604 > 512). 
Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2726 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3342 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3668 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (4760 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (14217 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (1346 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (5781 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (12451 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (11813 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (6959 > 512). 
Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2493 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3168 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (12022 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3316 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (6039 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (5440 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (14833 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3606 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (4197 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3538 > 512). 
Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (4618 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3974 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (14842 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3610 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (6583 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2124 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (9074 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (11635 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (7935 > 512). Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (4170 > 512). 
Running this sequence through the model will result in indexing errors\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (3503 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ], + "source": [ + "# | eval: false\n", + "tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-large\")\n", + "ds = ds.map(\n", + " lambda examples: tokenizer(examples[\"content\"], padding=False, truncation=False),\n", + " batched=True,\n", + " num_proc=44,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6f822fae-f91c-4ee1-a114-97a021bf1e81", + "metadata": {}, + "outputs": [], + "source": [ + "# | eval: false\n", + "tokenized = []\n", + "for item in ds:\n", + " tokenized.append({\"num_tokens\": len(item[\"input_ids\"])})" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "3c059b5a-5c25-4381-aad7-d69ef0b90320", + "metadata": {}, + "outputs": [], + "source": [ + "num_tokens = [item['num_tokens'] for item in tokenized]\n", + "filtered_tokens = [token for token in num_tokens if token <= 40000]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "cdac696f-056a-4b12-a48e-ac8f8dac9eeb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "sns.histplot(filtered_tokens, bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "c890ee73", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the box plot\n", + "plt.figure(figsize=(6, 6))\n", + "sns.boxplot(filtered_tokens)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08572d7e-8b02-4b5b-a078-24d88beb1378", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (ml4legal)", + "language": "python", + "name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/Data/england-wales/02_Analyse_En_Dataset.ipynb b/nbs/Data/england-wales/02_Analyse_En_Dataset.ipynb new file mode 100644 index 0000000..8e6748b --- /dev/null +++ b/nbs/Data/england-wales/02_Analyse_En_Dataset.ipynb @@ -0,0 +1,480 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a98d226c", + "metadata": {}, + "source": [ + "# Analyse Polish Dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "initial_id", + "metadata": {}, + "outputs": [], + "source": [ + "# | eval: false\n", + "import polars as pl\n", + "from datasets import Dataset, DatasetDict, load_dataset, load_from_disk" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c8a2c7d4858169a2", + "metadata": {}, + "outputs": [], + "source": [ + "# | eval: false\n", + "path_ = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/'\n", + "dataset_path = path_+'en_judgements_dataset'\n", + "ds = load_from_disk(dataset_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dd788638-6a7d-4f31-bfed-8845eb4cfbd0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['_id', 'citation', 'signature', 
'date', 'publicationDate', 'type', 'excerpt', 'content', 'judges', 'caseNumbers', 'citation_references', 'legislation', 'file_name', 'appeal_type', 'appeal_outcome', 'xml_uri', 'uri'],\n", + " num_rows: 6154\n", + "})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "167b28d4-1e8a-4bf3-a2f3-bea277fb448f", + "metadata": {}, + "outputs": [], + "source": [ + "df = ds.to_pandas()\n", + "pl_df = pl.DataFrame(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9e3c70ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 17)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ _id ┆ citation ┆ signature ┆ date ┆ … ┆ appeal_ty ┆ appeal_ou ┆ xml_uri ┆ uri │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ pe ┆ tcome ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ datetime[ ┆ ┆ --- ┆ --- ┆ str ┆ str │\n", + "│ ┆ ┆ ┆ ns] ┆ ┆ cat ┆ cat ┆ ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ ab0224364 ┆ [2008] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │\n", + "│ e4cf6562c ┆ EWCA Crim ┆ _2952 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ 82f8861d5 ┆ 2952 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ 268… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "│ d4630d932 ┆ [2006] ┆ EWCA_Crim ┆ null ┆ … ┆ convictio ┆ null ┆ https://c ┆ https:// │\n", + "│ 58ea51ecf ┆ EWCA Crim ┆ _3187 ┆ ┆ ┆ n ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ f4bc40154 ┆ 3187 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ 43b… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "│ 37183a714 ┆ [2012] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │\n", + "│ b626cfe98 ┆ EWCA Crim ┆ _1840 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. 
│\n", + "│ 081ac0250 ┆ 1840 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ c80… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "│ b41933b19 ┆ [2014] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │\n", + "│ 505ab8767 ┆ EWCA Crim ┆ _1730 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ ce30faf8d ┆ 1730 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ b95… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "│ 418382a2a ┆ [2018] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ allowed ┆ https://c ┆ https:// │\n", + "│ 6c0c32d3d ┆ EWCA Crim ┆ _2189 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ 2bd4cb7b3 ┆ 2189 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ 9e1… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n" + ] + } + ], + "source": [ + "pl_df = pl_df.with_columns([\n", + " pl.col(\"date\").cast(pl.Utf8),\n", + " pl.col(\"publicationDate\").cast(pl.Utf8),\n", + "])\n", + "\n", + "# Define date format\n", + "dt_fmt = \"%Y-%m-%d %H:%M:%S%.f %Z\"\n", + "\n", + "# Perform column transformations\n", + "pl_df = pl_df.with_columns([\n", + " pl.col(\"date\").str.strptime(pl.Datetime, format=dt_fmt),\n", + " pl.col(\"publicationDate\").str.strptime(pl.Datetime, format=dt_fmt),\n", + " pl.col(\"type\").cast(pl.Categorical),\n", + " pl.col(\"appeal_type\").cast(pl.Categorical),\n", + " pl.col(\"appeal_outcome\").cast(pl.Categorical)\n", + "])\n", + "\n", + "# Display the first few rows of the transformed DataFrame\n", + "print(pl_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "35e65fe2dd9a4bce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (9, 18)
statistic_idcitationsignaturedatepublicationDatetypeexcerptcontentjudgescaseNumberscitation_referenceslegislationfile_nameappeal_typeappeal_outcomexml_uriuri
strstrstrstrstrstrstrstrstrf64f64f64f64strstrstrstrstr
"count""6154""6154""6154""0""6154""6154""6058""6154"6115.04934.01392.01826.0"6154""834""1368""6154""6154"
"null_count""0""0""0""6154""0""0""96""0"39.01220.04762.04328.0"0""5320""4786""0""0"
"mean"nullnullnullnull"2013-10-13 09:46:09.320766"nullnullnullnullnullnullnullnullnullnullnullnull
"std"nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
"min""001d3b389f60bfd101c581fe8f1a9a…"[2003] EWCA Crim 1""EWCA_(Crim)_1478"null"2003-01-04 00:00:00"null"********REPORTING RESTRICTIONS…"\n", + "2020] EWCA Crim 570\n", + "No: 20190…nullnullnullnull"2003_01_04-1.xml"nullnull"https://caselaw.nationalarchiv…"https://caselaw.nationalarchiv…
"25%"nullnullnullnull"2008-06-11 00:00:00"nullnullnullnullnullnullnullnullnullnullnullnull
"50%"nullnullnullnull"2012-11-29 00:00:00"nullnullnullnullnullnullnullnullnullnullnullnull
"75%"nullnullnullnull"2019-06-07 00:00:00"nullnullnullnullnullnullnullnullnullnullnullnull
"max""ffffb6552ad89849b5d2767708b5c2…"[2024] EWCA Crim 99""Ewca_Crim_664"null"2024-05-22 00:00:00"null"…WARNING: reporting restrictio…"…WARNING: reporting restrictio…nullnullnullnull"2024_05_22-6154.xml"nullnull"https://caselaw.nationalarchiv…"https://caselaw.nationalarchiv…
" + ], + "text/plain": [ + "shape: (9, 18)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ statistic ┆ _id ┆ citation ┆ signature ┆ … ┆ appeal_ty ┆ appeal_ou ┆ xml_uri ┆ uri │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ pe ┆ tcome ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str ┆ ┆ --- ┆ --- ┆ str ┆ str │\n", + "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ count ┆ 6154 ┆ 6154 ┆ 6154 ┆ … ┆ 834 ┆ 1368 ┆ 6154 ┆ 6154 │\n", + "│ null_coun ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 5320 ┆ 4786 ┆ 0 ┆ 0 │\n", + "│ t ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ mean ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ std ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ min ┆ 001d3b389 ┆ [2003] ┆ EWCA_(Cri ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │\n", + "│ ┆ f60bfd101 ┆ EWCA Crim ┆ m)_1478 ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ ┆ c581fe8f1 ┆ 1 ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ ┆ a9a… ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "│ 25% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 50% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ 75% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", + "│ max ┆ ffffb6552 ┆ [2024] ┆ Ewca_Crim ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │\n", + "│ ┆ ad89849b5 ┆ EWCA Crim ┆ _664 ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │\n", + "│ ┆ d2767708b ┆ 99 ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │\n", + "│ ┆ 5c2… ┆ ┆ ┆ ┆ ┆ ┆ hiv… ┆ archiv… │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# | eval: false\n", + "pl_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ab23ff37327a377a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (7, 2)
typecount
catu32
"crown_court"5472
"supreme_court"660
"martial_court"11
"high_court_administrative_cour…2
"high_court_division_court"7
"civil_criminal_court"1
"division_court"1
" + ], + "text/plain": [ + "shape: (7, 2)\n", + "┌─────────────────────────────────┬───────┐\n", + "│ type ┆ count │\n", + "│ --- ┆ --- │\n", + "│ cat ┆ u32 │\n", + "╞═════════════════════════════════╪═══════╡\n", + "│ crown_court ┆ 5472 │\n", + "│ supreme_court ┆ 660 │\n", + "│ martial_court ┆ 11 │\n", + "│ high_court_administrative_cour… ┆ 2 │\n", + "│ high_court_division_court ┆ 7 │\n", + "│ civil_criminal_court ┆ 1 │\n", + "│ division_court ┆ 1 │\n", + "└─────────────────────────────────┴───────┘" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# | eval: false\n", + "pl_df[\"type\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "11883338-0a71-48ff-8699-6a4dd8cc085d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 2)
appeal_typecount
catu32
"conviction"496
"sentence"338
null5320
" + ], + "text/plain": [ + "shape: (3, 2)\n", + "┌─────────────┬───────┐\n", + "│ appeal_type ┆ count │\n", + "│ --- ┆ --- │\n", + "│ cat ┆ u32 │\n", + "╞═════════════╪═══════╡\n", + "│ conviction ┆ 496 │\n", + "│ sentence ┆ 338 │\n", + "│ null ┆ 5320 │\n", + "└─────────────┴───────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl_df[\"appeal_type\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0cdfb0a9-c761-44c5-8fa0-17508df966e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
appeal_outcomecount
catu32
"allowed"697
"refused"65
"dismissed"586
null4786
"granted"20
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌────────────────┬───────┐\n", + "│ appeal_outcome ┆ count │\n", + "│ --- ┆ --- │\n", + "│ cat ┆ u32 │\n", + "╞════════════════╪═══════╡\n", + "│ allowed ┆ 697 │\n", + "│ refused ┆ 65 │\n", + "│ dismissed ┆ 586 │\n", + "│ null ┆ 4786 │\n", + "│ granted ┆ 20 │\n", + "└────────────────┴───────┘" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl_df[\"appeal_outcome\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "11446c299cdf1700", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing content: 0.0\n", + "Missing excerpt: 0.015599610009749756\n" + ] + } + ], + "source": [ + "# | eval: false\n", + "print(f\"Missing content: {pl_df['content'].null_count() / len(pl_df)}\")\n", + "print(f\"Missing excerpt: {pl_df['excerpt'].null_count() / len(pl_df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "891ffbad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjoAAAGzCAYAAAAmH71NAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzC0lEQVR4nO3de3hU1b3/8c8kIYGEJFwSAuEWBcQGJBxDoFhQKNEYKRW8FKuVQBFUBguN1ULPqRFvoFYK6ihaK2jPURFrsRW5iQgi9BBABIyiIDdJSIJKAkECSdbvD3+Z45AEkskkk6x5v55nnoe99p69v3vNrvl077VmHMYYIwAAAAsF+bsAAACAhkLQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdAB0GQ4HA5NnTrV32V45f7775fD4fBoS0hI0Pjx4xv82Pv375fD4dCiRYvcbePHj1fr1q0b/NiVHA6H7r///kY7HlBbBB3gHBYtWiSHw1Hj69///re/S6y3V155RfPmzfP6/WPGjNEvf/lLSZIxRm3btvX4g4u6eeedd5psYGjKtQE1CfF3AUBz8MADD+iCCy6o0t6zZ08/VONbr7zyinbt2qXp06d79f7Nmzfr3nvvlSR9+umnOnbsmH784x/7sMLma/fu3QoKqtv/n3znnXfkcrnqFCi6d++u7777Ti1atKhjhXVzrtq+++47hYTwJwVND1clUAvp6ekaMGCAv8s4r7KyMlVUVCg0NLRRjvfVV18pNzfXHWw2bdqk6Oho9e7du1GO742SkhJFREQ0yrHCwsIadP8//LxbtmzZoMc6H38fH6gJj64AH8jKylJQUJDWrFnj0T558mSFhobq448/drcdPnxYEydOVHx8vMLCwnTBBRfozjvv1OnTp93bHDt2TNOnT1fXrl0VFhamnj176tFHH1VFRYV7m8pxGX/60580b9489ejRQ2FhYcrJydH7778vh8OhxYsX6w9/+IM6duyoiIgI/fznP9ehQ4fc+xg2bJiWLVumAwcOuB/HJSQknPNcS0tLdfToUR09elRr165VixYt1LVrVx09elTr169Xv3799PXXX+vo0aMe9VZUVGj+/Pm65JJL1LJlS8XGxurqq6/Wli1bqhxj6dKl6tu3r8LCwtSnTx+tWLHCY/2BAwc0ZcoU9e7dW61atVL79u114403av/+/R7bVT56XLdunaZMmaIOHTqoS5cukqTjx49r+vTpSkhIUFhYmDp06KArr7xS27ZtO+f5S9KGDRuUkpKili1bqkePHnruueeq3e7sMTpnzpzRrFmz1KtXL7Vs2VLt27fXkCFDtHr1aknfj6txuVyS5PGIVDr3513dGJ1KX375pdLS0hQREaH4+Hg98MADMsa411deK++//77H+87e57lqq2w7+07PRx99pPT0dEVFRal169YaMWJElce9lZ/Rhx9+qMzMTMXGxioiIkJjxoxRYWFh9R8AUAfc0QFqoaioSEePHvVoczgcat++vSTpv/7rv/Svf/1LEydO1M6dOxUZGamVK1fqL3/5ix588EElJSVJknJzczVw4EAdO3ZMkydP1sUXX6zDhw/rjTfe0MmTJxUaGqqTJ0/qiiuu0OHDh3X77berW7du2rhxo2bOnKm8vLwq42kWLlyoU6dOafLkyQoLC1O7du107NgxSdLDDz8sh8Oh3//+9yooKNC8efOUmpqq7du3q1WrVvrP//xPFRUV6auvvtKf//xnSTrvANZXX31VEyZM8Gjr3Lmzx3JsbKwkad++fe7gNHHiRC1atEjp6em67bbbVFZWpg8++ED//ve/Pe6WbdiwQW+++aamTJmiyMhIPfnkk7r++ut18OBBd39nZ2dr48aNuummm9SlSxft379fzz77rIYNG6acnByFh4d71DNlyhTFxsbqvvvuU0lJiSTpjjvu0BtvvKGpU6cqMTFRX3/9tTZs2KBPP/1Ul156aY3
nv3PnTl111VWKjY3V/fffr7KyMmVlZSkuLu6c/SZ9P2B59uzZuu222zRw4EAVFxdry5Yt2rZtm6688krdfvvtys3N1erVq/W3v/2t2n1U93n/MFD+UHl5ua6++mr9+Mc/1mOPPaYVK1YoKytLZWVleuCBB85b7w/VprYf+uSTTzR06FBFRUXp3nvvVYsWLfTcc89p2LBhWrdunQYNGuSx/V133aW2bdsqKytL+/fv17x58zR16lQtXry4TnUCVRgANVq4cKGRVO0rLCzMY9udO3ea0NBQc9ttt5lvv/3WdO7c2QwYMMCcOXPGvc24ceNMUFCQyc7OrnKsiooKY4wxDz74oImIiDCff/65x/oZM2aY4OBgc/DgQWOMMfv27TOSTFRUlCkoKPDYdu3atUaS6dy5sykuLna3v/7660aSmT9/vrtt5MiRpnv37rXuk9zcXLN69WqzevVq0717dzNu3DizevVq8+qrrxpJ5sknn3Sv/+6774wxxrz33ntGkvnNb35T43kbY4wkExoaavbs2eNu+/jjj40k89RTT7nbTp48WWU/mzZtMpLMyy+/7G6r/PyGDBliysrKPLaPjo42Tqez1uddafTo0aZly5bmwIED7racnBwTHBxszv5Pavfu3U1GRoZ7OSkpyYwcOfKc+3c6nVX2Y8y5P+/KdQsXLnS3ZWRkGEnmrrvucrdVVFSYkSNHmtDQUFNYWGiM+b9rZe3atefdZ021GfP9Z5eVleVeHj16tAkNDTV79+51t+Xm5prIyEhz+eWXu9sqP6PU1FSPa+G3v/2tCQ4ONseOHav2eEBt8egKqAWXy6XVq1d7vJYvX+6xTd++fTVr1iy98MILSktL09GjR/XSSy+5B2hWVFRo6dKlGjVqVLXjfSofAyxZskRDhw5V27Zt3Y+Ijh49qtTUVJWXl2v9+vUe77v++uvdd1DONm7cOEVGRrqXb7jhBnXq1EnvvPOO133RqVMnpaamasCAATp06JBuueUWpaamKiQkRC1bttTkyZOVmpqq1NRU97iNv//973I4HMrKyqrxvCulpqaqR48e7uV+/fopKipKX375pbutVatW7n+fOXNGX3/9tXr27Kk2bdpU++hp0qRJCg4O9mhr06aN/vd//1e5ubm1Pvfy8nKtXLlSo0ePVrdu3dztP/rRj5SWlnbe97dp00affPKJvvjii1of82zn+ryr88Pp+pXT90+fPq13333X6xrOp7y8XKtWrdLo0aN14YUXuts7deqkm2++WRs2bFBxcbHHeyZPnuxxLQwdOlTl5eU6cOBAg9WJwMCjK6AWBg4cWKvByPfcc49ee+01bd68WY888ogSExPd6woLC1VcXKy+ffuecx9ffPGFduzYUeMfs4KCAo/l6maDVerVq5fHssPhUM+ePauMZamtM2fOqKioSJK0cuVKBQUF6eKLL9bRo0e1cuVK/cd//IeOHz+u48ePKzo62j0LaO/evYqPj1e7du3Oe4wfBohKbdu21bfffute/u677zR79mwtXLhQhw8f9hhzUlnfD1XXR4899pgyMjLUtWtXJScn65prrtG4ceM8/jCfrbCwUN99912VfpWk3r17nzdAPvDAA7r22mt10UUXqW/fvrr66qt16623ql+/fud83/nOpSZBQUFVzueiiy6SJK+vgdooLCzUyZMnqx2U/qMf/UgVFRU6dOiQ+vTp424/+3Nv27atJHl87oA3CDqAD3355Zfu/7e+c+dOr/ZRUVGhK6+80j1l+2yVf6gq/fDuRkP78MMPNXz4cI+27t27eyxXBrS1a9dq2LBhdT7G2XdeKv0wzNx1111auHChpk+frsGDBys6OloOh0M33XRTteNVquujX/ziFxo6dKj+8Y9/aNWqVXr88cf16KOP6s0331R6enqd666Nyy+/XHv37tVbb72lVatW6YUXXtCf//xnLViwQLfddlut9uHrz/vsO2qVysvLfXqc86nN5w54g6AD+EhFRYXGjx+
vqKgoTZ8+XY888ohuuOEGXXfddZK+DwBRUVHatWvXOffTo0cPnThxQqmpqfWu6exHJMYY7dmzx+MOQk1/6KqTlJTkniF055136sc//rEyMjJUVFSkG264QfPnz3ffxaocgC19f04rV67UN998U6u7OufzxhtvKCMjQ0888YS77dSpU+5B2LXVqVMnTZkyRVOmTFFBQYEuvfRSPfzwwzUGndjYWLVq1araR0+7d++u1THbtWunCRMmaMKECTpx4oQuv/xy3X///e6gU5fP43wqKir05ZdfeoTjzz//XJLcg8Qr75yc3XfVPTKqbW2xsbEKDw+vtk8+++wzBQUFqWvXrrXaF1BfjNEBfGTu3LnauHGjnn/+eT344IO67LLLdOedd7pnawUFBWn06NH617/+Ve2U6sr/5/qLX/xCmzZt0sqVK6tsc+zYMZWVldW6ppdfflnHjx93L7/xxhvKy8vz+EMeERFR7eOe6rRt21apqakaMmSIDh48qOuvv16pqamKiIhQcHCwJk6c6B6fU/kHVPp+XIkxRrNmzarxvOsiODi4yvueeuqpWt+FKC8vr3LOHTp0UHx8vEpLS8953LS0NC1dulQHDx50t3/66afVfl5n+/rrrz2WW7durZ49e3ocs/I7fuoa2mry9NNPu/9tjNHTTz+tFi1aaMSIEZK+vyMXHBxcZezXM888U2Vfta0tODhYV111ld566y2PR2T5+fl65ZVXNGTIEEVFRXl5RkDdcEcHqIXly5frs88+q9J+2WWX6cILL9Snn36qP/7xjxo/frxGjRol6fvvB+nfv7+mTJmi119/XZL0yCOPaNWqVbriiis0efJk/ehHP1JeXp6WLFmiDRs2qE2bNrrnnnv0z3/+Uz/72c80fvx4JScnq6SkRDt37tQbb7yh/fv3KyYmplZ1t2vXTkOGDNGECROUn5+vefPmqWfPnpo0aZJ7m+TkZC1evFiZmZlKSUlR69at3edQky1btuj06dO67LLLJEkbN25Uv379avwivuHDh+vWW2/Vk08+qS+++EJXX321Kioq9MEHH2j48OF1/n2rn/3sZ/rb3/6m6OhoJSYmatOmTXr33Xfd08/P5/jx4+rSpYtuuOEGJSUlqXXr1nr33XeVnZ3tcZeoOrNmzdKKFSs0dOhQTZkyRWVlZXrqqafUp08f7dix45zvTUxM1LBhw5ScnKx27dppy5Yt7inulZKTkyVJv/nNb5SWlqbg4GDddNNNtTqvs7Vs2VIrVqxQRkaGBg0apOXLl2vZsmX6wx/+4H7EGB0drRtvvFFPPfWUHA6HevToobfffrvKWLC61vbQQw9p9erVGjJkiKZMmaKQkBA999xzKi0t1WOPPebV+QBe8dt8L6AZONf0cv3/qbdlZWUmJSXFdOnSpcpU2Pnz5xtJZvHixe62AwcOmHHjxpnY2FgTFhZmLrzwQuN0Ok1paal7m+PHj5uZM2eanj17mtDQUBMTE2Muu+wy86c//cmcPn3aGPN/038ff/zxKnVXThl+9dVXzcyZM02HDh1Mq1atzMiRIz2mRRtjzIkTJ8zNN99s2rRpYyTVaqr5nDlzTI8ePdzLqamp552qXVZWZh5//HFz8cUXm9DQUBMbG2vS09PN1q1b3dtIqnY/Z0/T/vbbb82ECRNMTEyMad26tUlLSzOfffZZle0qP7+zp/OXlpaae+65xyQlJZnIyEgTERFhkpKSzDPPPHPeczfGmHXr1pnk5GQTGhpqLrzwQrNgwQKTlZV13unlDz30kBk4cKBp06aNadWqlbn44ovNww8/7P5MK/vprrvuMrGxscbhcLj3ea7Pu6bp5REREWbv3r3mqquuMuHh4SYuLs5kZWWZ8vJyj/cXFhaa66+/3oSHh5u2bdua22+/3ezatavKPmuqzZiq08uNMWbbtm0mLS3NtG7d2oSHh5vhw4ebjRs3emxT02dU07R3oK4cxjDSC7DN+++/r+HDh2vJkiW64YYb/F0OAPg
NY3QAAIC1CDoAAMBaBB0AAGAtxugAAABrcUcHAABYi6ADAACsFfBfGFhRUaHc3FxFRkb69KvXAQBAwzHG6Pjx44qPj1dQUM33bQI+6OTm5vKbKwAANFOHDh1Sly5dalwfsEHH5XLJ5XK5fzfo0KFD/PYKAADNRHFxsbp27arIyMhzbhfws66Ki4sVHR2toqIigg4AAM1Ebf9+MxgZAABYi6ADAACsRdABAADWCtig43K5lJiYqJSUFH+XAgAAGgiDkRmMDABAs8NgZAAAEPAIOgAAwFoEHQAAYK2ADToMRgYAwH4MRmYwMgAAzQ6DkQEAQMAj6AAAAGsRdAAAgLVC/F0AAABonhJmLDvvNvvnjGyESmoWsHd0mHUFAID9AjboOJ1O5eTkKDs729+lAACABhKwQQcAANiPoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUCNugwvRwAAPsFbNBhejkAAPYL2KADAADsR9ABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGCtgA06fI8OAAD2C9igw/foAABgv4ANOgAAwH4EHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgrYANOvzWFQAA9gvYoMNvXQEAYL+ADToAAMB+BB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArBWwQcflcikxMVEpKSn+LgUAADSQgA06TqdTOTk5ys7O9ncpAACggQRs0AEAAPYj6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1mn3QOXbsmAYMGKD+/furb9+++stf/uLvkgAAQBMR4u8C6isyMlLr169XeHi4SkpK1LdvX1133XVq3769v0sDAAB+1uzv6AQHBys8PFySVFpaKmOMjDF+rgoAADQFfg8669ev16hRoxQfHy+Hw6GlS5dW2cblcikhIUEtW7bUoEGDtHnzZo/1x44dU1JSkrp06aJ77rlHMTExjVQ9AABoyvwedEpKSpSUlCSXy1Xt+sWLFyszM1NZWVnatm2bkpKSlJaWpoKCAvc2bdq00ccff6x9+/bplVdeUX5+fmOVDwAAmjC/B5309HQ99NBDGjNmTLXr586dq0mTJmnChAlKTEzUggULFB4erhdffLHKtnFxcUpKStIHH3xQ4/FKS0tVXFzs8QIAAHbye9A5l9OnT2vr1q1KTU11twUFBSk1NVWbNm2SJOXn5+v48eOSpKKiIq1fv169e/eucZ+zZ89WdHS0+9W1a9eGPQkAAOA3TTroHD16VOXl5YqLi/Noj4uL05EjRyRJBw4c0NChQ5WUlKShQ4fqrrvu0iWXXFLjPmfOnKmioiL369ChQw16DgAAwH+a/fTygQMHavv27bXePiwsTGFhYQ1XEAAAaDKa9B2dmJgYBQcHVxlcnJ+fr44dO/qpKgAA0Fw06aATGhqq5ORkrVmzxt1WUVGhNWvWaPDgwfXat8vlUmJiolJSUupbJgAAaKL8/ujqxIkT2rNnj3t537592r59u9q1a6du3bopMzNTGRkZGjBggAYOHKh58+appKREEyZMqNdxnU6nnE6niouLFR0dXd/TAAAATZDfg86WLVs0fPhw93JmZqYkKSMjQ4sWLdLYsWNVWFio++67T0eOHFH//v21YsWKKgOUAQAAzuYwAf57CZV3dIqKihQVFeXvcgAAaDYSZiw77zb754xskGPX9u93kx6jAwAAUB8BG3QYjAwAgP0CNug4nU7l5OQoOzvb36UAAIAGErBBBwAA2I+gAwAArEXQAQAA1grYoMNgZAAA7BewQYfByAAA2C9ggw4AALAfQQc
AAFiLoAMAAKxF0AEAANYK2KDDrCsAAOwXsEGHWVcAANgvYIMOAACwH0EHAABYi6ADAACsRdABAADWIugAAABrBWzQYXo5AAD2C9igw/RyAADsF7BBBwAA2I+gAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgEbdPgeHQAA7BewQYfv0QEAwH4BG3QAAID9CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUCNujwExAAANgvYIMOPwEBAID9AjboAAAA+xF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFgrYIOOy+VSYmKiUlJS/F0KAABoIAEbdJxOp3JycpSdne3vUgAAQAMJ2KADAADsR9ABAADW8irofPnll76uAwAAwOe8Cjo9e/bU8OHD9d///d86deqUr2sCAADwCa+CzrZt29SvXz9lZmaqY8eOuv3227V582Zf1wYAAFAvXgWd/v37a/78+crNzdWLL76ovLw8DRkyRH379tXcuXNVWFjo6zoBAADqrF6DkUNCQnTddddpyZIlevTRR7Vnzx797ne/U9euXTVu3Djl5eX5qk4AAIA6q1fQ2bJli6ZMmaJOnTpp7ty5+t3vfqe9e/dq9erVys3N1bXXXuurOgEAAOosxJs3zZ07VwsXLtTu3bt1zTXX6OWXX9Y111yjoKDvc9MFF1ygRYsWKSEhwZe1AgAA1IlXQefZZ5/Vr3/9a40fP16dOnWqdpsOHTror3/9a72KAwAAqA+vgs4XX3xx3m1CQ0OVkZHhze4BAAB8wqsxOgsXLtSSJUuqtC9ZskQvvfRSvYsCAADwBa+CzuzZsxUTE1OlvUOHDnrkkUfqXRQAAIAveBV0Dh48qAsuuKBKe/fu3XXw4MF6FwUAAOALXgWdDh06aMeOHVXaP/74Y7Vv377eRQEAAPiCV0Hnl7/8pX7zm99o7dq1Ki8vV3l5ud577z1NmzZNN910k69rBAAA8IpXs64efPBB7d+/XyNGjFBIyPe7qKio0Lhx4xijAwAAmgyvgk5oaKgWL16sBx98UB9//LFatWqlSy65RN27d/d1fQAAAF7zKuhUuuiii3TRRRf5qhYAAACf8irolJeXa9GiRVqzZo0KCgpUUVHhsf69997zSXG1cejQId16660qKChQSEiI/vjHP+rGG29stOMDAICmy6ugM23aNC1atEgjR45U37595XA4fF1XrYWEhGjevHnq37+/jhw5ouTkZF1zzTWKiIjwW00AAKBp8CrovPbaa3r99dd1zTXX+LqeOuvUqZP797Y6duyomJgYffPNNwQdAADg3fTy0NBQ9ezZ0ycFrF+/XqNGjVJ8fLwcDoeWLl1aZRuXy6WEhAS1bNlSgwYN0ubNm6vd19atW1VeXq6uXbv6pDYAANC8eRV07r77bs2fP1/GmHoXUFJSoqSkJLlcrmrXL168WJmZmcrKytK2bduUlJSktLQ0FRQUeGz3zTffaNy4cXr++efrXRMAALCDV4+uNmzYoLVr12r58uXq06ePWrRo4bH+zTffrPW+0tPTlZ6eXuP6uXPnatKkSZowYYIkacGCBVq2bJlefPFFzZgxQ5JUWlqq0aNHa8aMGbrsssvOebzS0lKVlpa6l4uLi2tdKwAAaF68Cjpt2rTRmDFjfF1LFadPn9bWrVs1c+ZMd1tQUJBSU1O1adMmSZIxRuPHj9dPf/pT3Xrrrefd5+zZszVr1qwGqxkAADQdXgWdhQsX+rqOah09elTl5eWKi4vzaI+Li9Nnn30mSfrwww+1ePFi9evXzz2+529/+5suueSSavc5c+ZMZWZmupeLi4sZ0wMAgKW8/sLAsrIyvf/++9q7d69uvvlmRUZGKjc3V1FRUWrdurUvazynIUOGVPken3MJCwtTWFhYA1YEAAC
aCq+CzoEDB3T11Vfr4MGDKi0t1ZVXXqnIyEg9+uijKi0t1YIFC3xSXExMjIKDg5Wfn+/Rnp+fr44dO/rkGAAAwF5ezbqaNm2aBgwYoG+//VatWrVyt48ZM0Zr1qzxWXGhoaFKTk722GdFRYXWrFmjwYMH12vfLpdLiYmJSklJqW+ZAACgifLqjs4HH3ygjRs3KjQ01KM9ISFBhw8frtO+Tpw4oT179riX9+3bp+3bt6tdu3bq1q2bMjMzlZGRoQEDBmjgwIGaN2+eSkpK3LOwvOV0OuV0OlVcXKzo6Oh67QsAADRNXgWdiooKlZeXV2n/6quvFBkZWad9bdmyRcOHD3cvVw4UzsjI0KJFizR27FgVFhbqvvvu05EjR9S/f3+tWLGiygBlAACAszmMF9/6N3bsWEVHR+v5559XZGSkduzYodjYWF177bXq1q1bo83K8oXKOzpFRUWKiorydzkAADQbCTOWnXeb/XNGNsixa/v326s7Ok888YTS0tKUmJioU6dO6eabb9YXX3yhmJgYvfrqq14X3ZhcLpdcLle1d6YAAIAdvLqjI30/vfy1117Tjh07dOLECV166aW65ZZbPAYnNwfc0QEAwDvW3tGRpJCQEP3qV7/y9u0AAAANzqug8/LLL59z/bhx47wqBgAAwJe8CjrTpk3zWD5z5oxOnjyp0NBQhYeHE3QAAECT4NUXBn777bcerxMnTmj37t0aMmRIsxmMDAAA7OdV0KlOr169NGfOnCp3e5oqvhkZAAD7+SzoSN8PUM7NzfXlLhuM0+lUTk6OsrOz/V0KAABoIF6N0fnnP//psWyMUV5enp5++mn95Cc/8UlhAAAA9eVV0Bk9erTHssPhUGxsrH7605/qiSee8EVdAAAA9eb1b10BAAA0dT4do9OcMBgZAAD7eXVHp/IXxmtj7ty53hyiwTmdTjmdTvdXSAMAAPt4FXQ++ugjffTRRzpz5ox69+4tSfr8888VHBysSy+91L2dw+HwTZUAAABe8CrojBo1SpGRkXrppZfUtm1bSd9/ieCECRM0dOhQ3X333T4tEgAAwBtejdF54oknNHv2bHfIkaS2bdvqoYceYtYVAABoMrwKOsXFxSosLKzSXlhYqOPHj9e7KAAAAF/wKuiMGTNGEyZM0JtvvqmvvvpKX331lf7+979r4sSJuu6663xdIwAAgFe8GqOzYMEC/e53v9PNN9+sM2fOfL+jkBBNnDhRjz/+uE8LBAAA8JZXQSc8PFzPPPOMHn/8ce3du1eS1KNHD0VERPi0uIbkcrnkcrlUXl7u71IAAEADqdcXBubl5SkvL0+9evVSRESEjDG+qqvB8aOeAADYz6ug8/XXX2vEiBG66KKLdM011ygvL0+SNHHiRKaWAwCAJsOroPPb3/5WLVq00MGDBxUeHu5uHzt2rFasWOGz4gAAAOrDqzE6q1at0sqVK9WlSxeP9l69eunAgQM+KQwAAKC+vLqjU1JS4nEnp9I333yjsLCwehcFAADgC14FnaFDh+rll192LzscDlVUVOixxx7T8OHDfVYcAABAfXj16Oqxxx7TiBEjtGXLFp0+fVr33nuvPvnkE33zzTf68MMPfV0jAACAV7y6o9O3b199/vnnGjJkiK699lqVlJTouuuu00cffaQePXr4usYG4XK5lJiYqJSUFH+XAgAAGojD1PHLb86cOaOrr75aCxYsUK9evRqqrkZTXFys6OhoFRUVKSoqyt/lAADQbCTMWHbebfbPGdkgx67t3+8639Fp0aKFduzYUa/iAAAAGoNXj65+9atf6a9//auvawEAAPAprwYjl5WV6cUXX9S7776r5OTkKr9xNXfuXJ8UBwAAUB91CjpffvmlEhIStGvXLl166aWSpM8//9xjG4fD4bvqAAAA6qFOQadXr17Ky8vT2rVrJX3/kw9PPvmk4uLiGqQ4AACA+qjTGJ2zJ2gtX75cJSUlPi0IAADAV7wajFypjjPTAQAAGlWdgo7
D4agyBocxOQAAoKmq0xgdY4zGjx/v/uHOU6dO6Y477qgy6+rNN9/0XYUAAABeqlPQycjI8Fj+1a9+5dNiGpPL5ZLL5VJ5ebm/SwEAAA2kzj8BYRt+AgIAAO9Y+RMQAAAAzQVBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1AjbouFwuJSYmKiUlxd+lAACABhKwQcfpdConJ0fZ2dn+LgUAADSQgA06AADAfgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsJYVQWfMmDFq27atbrjhBn+XAgAAmhArgs60adP08ssv+7sMAADQxFgRdIYNG6bIyEh/lwEAAJoYvwed9evXa9SoUYqPj5fD4dDSpUurbONyuZSQkKCWLVtq0KBB2rx5c+MXCgAAmh2/B52SkhIlJSXJ5XJVu37x4sXKzMxUVlaWtm3bpqSkJKWlpamgoKCRKwUAAM1NiL8LSE9PV3p6eo3r586dq0mTJmnChAmSpAULFmjZsmV68cUXNWPGjDofr7S0VKWlpe7l4uLiuhcNAACaBb/f0TmX06dPa+vWrUpNTXW3BQUFKTU1VZs2bfJqn7Nnz1Z0dLT71bVrV1+VCwAAmpgmHXSOHj2q8vJyxcXFebTHxcXpyJEj7uXU1FTdeOONeuedd9SlS5dzhqCZM2eqqKjI/Tp06FCD1Q8AAPzL74+ufOHdd9+t9bZhYWEKCwtrwGoAAEBT0aTv6MTExCg4OFj5+fke7fn5+erYsaOfqgIAAM1Fkw46oaGhSk5O1po1a9xtFRUVWrNmjQYPHlyvfbtcLiUmJiolJaW+ZQIAgCbK74+uTpw4oT179riX9+3bp+3bt6tdu3bq1q2bMjMzlZGRoQEDBmjgwIGaN2+eSkpK3LOwvOV0OuV0OlVcXKzo6Oj6ngYAAGiC/B50tmzZouHDh7uXMzMzJUkZGRlatGiRxo4dq8LCQt133306cuSI+vfvrxUrVlQZoAwAAHA2hzHG+LsIf6q8o1NUVKSoqCh/lwMAQLORMGPZebfZP2dkgxy7tn+/m/QYnYbEGB0AAOwXsEHH6XQqJydH2dnZ/i4FAAA0kIANOgAAwH4EHQAAYC2CDgAAsFbABh0GIwMAYL+ADToMRgYAwH4BG3QAAID9CDoAAMBaBB0AAGAtgg4AALBWwAYdZl0BAGC/gA06zLoCAMB+ARt0AACA/Qg6AADAWgQdAABgLYIOAACwFkEHAABYK2CDDtPLAQCwX8AGHaaXAwBgv4ANOgAAwH4EHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAawVs0OELAwEAsF/ABh2+MBAAAPsFbNABAAD2I+gAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYK2KDDT0AAAGC/gA06/AQEAAD2C9igAwAA7EfQAQAA1iLoAAAAaxF0AACAtQg6AADAWgQdAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGCtEH8X4C8ul0sul0vl5eX+LgUBKGHGsvNus3/OyEaoBADsFrB3dJxOp3JycpSdne3vUgAAQAMJ2KADAADsR9ABAADWIugAAABrEXQAAIC1CDoAAMBaBB0AAGAtgg4AALAWQQcAAFiLoAMAAKxF0AEAANYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAa1kRdN5++2317t1bvXr10gsvvODvcgAAQBMR4u8C6qusrEyZmZlau3atoqOjlZy
crDFjxqh9+/b+Lg0AAPhZs7+js3nzZvXp00edO3dW69atlZ6erlWrVvm7LAAA0AT4PeisX79eo0aNUnx8vBwOh5YuXVplG5fLpYSEBLVs2VKDBg3S5s2b3etyc3PVuXNn93Lnzp11+PDhxigdAAA0cX4POiUlJUpKSpLL5ap2/eLFi5WZmamsrCxt27ZNSUlJSktLU0FBgVfHKy0tVXFxsccLAADYye9jdNLT05Wenl7j+rlz52rSpEmaMGGCJGnBggVatmyZXnzxRc2YMUPx8fEed3AOHz6sgQMH1ri/2bNna9asWb47gXNImLHsvNvsnzOyESqBrbjGmh8+M9QH10/d+f2OzrmcPn1aW7duVWpqqrstKChIqamp2rRpkyRp4MCB2rVrlw4fPqwTJ05o+fLlSktLq3GfM2fOVFFRkft16NChBj8PAADgH36/o3MuR48eVXl5ueLi4jza4+Li9Nlnn0mSQkJC9MQTT2j48OGqqKjQvffee84ZV2FhYQoLC2vQugEAQNPQpINObf385z/Xz3/+c3+XAQAAmpgm/egqJiZGwcHBys/P92jPz89Xx44d/VQVAABoLpp00AkNDVVycrLWrFnjbquoqNCaNWs0ePDgeu3b5XIpMTFRKSkp9S0TAAA0UX5/dHXixAnt2bPHvbxv3z5t375d7dq1U7du3ZSZmamMjAwNGDBAAwcO1Lx581RSUuKeheUtp9Mpp9Op4uJiRUdH1/c0AABAE+T3oLNlyxYNHz7cvZyZmSlJysjI0KJFizR27FgVFhbqvvvu05EjR9S/f3+tWLGiygBlAACAs/k96AwbNkzGmHNuM3XqVE2dOrWRKgIAALZo0mN0GhJjdAAAsF/ABh2n06mcnBxlZ2f7uxQAANBAAjboAAAA+xF0AACAtQg6AADAWgEbdBiMDACA/QI26DAYGQAA+/n9e3T8rfI7fIqLi32+74rSk+fdpiGOi6bPV9cG11jzw2eG+mhq148/66nc7/m+i89hzreF5b766it17drV32UAAAAvHDp0SF26dKlxfcAHnYqKCuXm5ioyMlIOh+Oc2xYXF6tr1646dOiQoqKiGqnCwEV/Nz76vHHR342PPm9cDdnfxhgdP35c8fHxCgqqeSROwD+6CgoKOmcSrE5UVBT/A2lE9Hfjo88bF/3d+OjzxtVQ/V2bH+UO2MHIAADAfgQdAABgLYJOHYSFhSkrK0thYWH+LiUg0N+Njz5vXPR346PPG1dT6O+AH4wMAADsxR0dAABgLYIOAACwFkEHAABYi6ADAACsRdABAADWCvigM3v2bKWkpCgyMlIdOnTQ6NGjtXv3bo9thg0bJofD4fG64447PLY5ePCgRo4cqfDwcHXo0EH33HOPysrKGvNUmoVnn31W/fr1c39L5uDBg7V8+XL3+lOnTsnpdKp9+/Zq3bq1rr/+euXn53vsg76um/P1Odd3w5ozZ44cDoemT5/ubuM6bzjV9TfXuG/df//9Vfrz4osvdq9vatd3wP8ExLp16+R0OpWSkqKysjL94Q9/0FVXXaWcnBxFRES4t5s0aZIeeOAB93J4eLj73+Xl5Ro5cqQ6duyojRs3Ki8vT+PGjVOLFi30yCOPNOr5NHVdunTRnDlz1KtXLxlj9NJLL+naa6/VRx99pD59+ui3v/2tli1bpiVLlig6OlpTp07Vddddpw8//FASfe2N8/W5xPXdULKzs/Xcc8+pX79+Hu1c5w2jpv6WuMZ9rU+fPnr33XfdyyEh/xcnmtz1beChoKDASDLr1q1zt11xxRVm2rRpNb7nnXfeMUFBQebIkSPutmeffdZERUWZ0tLShizXCm3btjUvvPCCOXbsmGnRooVZsmSJe92nn35qJJlNmzYZY+hrX6nsc2O4vhvK8ePHTa9evczq1as9+pjrvGHU1N/GcI37WlZWlklKSqp2XVO8vgP+0dXZioqKJEnt2rXzaP+f//kfxcTEqG/fvpo5c6ZOnjzpXrdp0yZ
dcskliouLc7elpaWpuLhYn3zySeMU3gyVl5frtddeU0lJiQYPHqytW7fqzJkzSk1NdW9z8cUXq1u3btq0aZMk+rq+zu7zSlzfvud0OjVy5EiP61kS13kDqam/K3GN+9YXX3yh+Ph4XXjhhbrlllt08OBBSU3z+g74R1c/VFFRoenTp+snP/mJ+vbt626/+eab1b17d8XHx2vHjh36/e9/r927d+vNN9+UJB05csTjA5PkXj5y5EjjnUAzsXPnTg0ePFinTp1S69at9Y9//EOJiYnavn27QkND1aZNG4/t4+Li3P1IX3unpj6XuL4bwmuvvaZt27YpOzu7yrojR45wnfvYufpb4hr3tUGDBmnRokXq3bu38vLyNGvWLA0dOlS7du1qktc3QecHnE6ndu3apQ0bNni0T5482f3vSy65RJ06ddKIESO0d+9e9ejRo7HLbPZ69+6t7du3q6ioSG+88YYyMjK0bt06f5dltZr6PDExkevbxw4dOqRp06Zp9erVatmypb/LsV5t+ptr3LfS09Pd/+7Xr58GDRqk7t276/XXX1erVq38WFn1eHT1/02dOlVvv/221q5dqy5dupxz20GDBkmS9uzZI0nq2LFjlRHllcsdO3ZsgGqbt9DQUPXs2VPJycmaPXu2kpKSNH/+fHXs2FGnT5/WsWPHPLbPz8939yN97Z2a+rw6XN/1s3XrVhUUFOjSSy9VSEiIQkJCtG7dOj355JMKCQlRXFwc17kPna+/y8vLq7yHa9y32rRpo4suukh79uxpkv8dD/igY4zR1KlT9Y9//EPvvfeeLrjggvO+Z/v27ZKkTp06SZIGDx6snTt3qqCgwL3N6tWrFRUV5X48gJpVVFSotLRUycnJatGihdasWeNet3v3bh08eNA9noS+9o3KPq8O13f9jBgxQjt37tT27dvdrwEDBuiWW25x/5vr3HfO19/BwcFV3sM17lsnTpzQ3r171alTp6b533GfD29uZu68804THR1t3n//fZOXl+d+nTx50hhjzJ49e8wDDzxgtmzZYvbt22feeustc+GFF5rLL7/cvY+ysjLTt29fc9VVV5nt27ebFStWmNjYWDNz5kx/nVaTNWPGDLNu3Tqzb98+s2PHDjNjxgzjcDjMqlWrjDHG3HHHHaZbt27mvffeM1u2bDGDBw82gwcPdr+fvq67c/U513fjOHvWD9d5w/phf3ON+97dd99t3n//fbNv3z7z4YcfmtTUVBMTE2MKCgqMMU3v+g74oCOp2tfChQuNMcYcPHjQXH755aZdu3YmLCzM9OzZ09xzzz2mqKjIYz/79+836enpplWrViYmJsbcfffd5syZM344o6bt17/+tenevbsJDQ01sbGxZsSIEe6QY4wx3333nZkyZYpp27atCQ8PN2PGjDF5eXke+6Cv6+Zcfc713TjODjpc5w3rh/3NNe57Y8eONZ06dTKhoaGmc+fOZuzYsWbPnj3u9U3t+nYYY4zv7xMBAAD4X8CP0QEAAPYi6AAAAGsRdAAAgLUIOgAAwFoEHQAAYC2CDgAAsBZBBwAAWIugAwAArEXQAQAA1iLoAAAAaxF0AACAtf4fiVInC+FH654AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# | eval: false\n", + "pl_df[\"excerpt\"].str.strip_chars().str.len_chars().to_pandas().plot.hist(\n", + " bins=50, log=True, title=\"Excerpt #chars distribution\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "82355195-bcd0-47fe-9cea-8feb680dd650", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6_154,)
excerpt
str
"No. 2008/03296/A9 2008/03350/A…
"Neutral Citation Number: [2006…
"Neutral Citation Number: [2012…
"Neutral Citation Number: [2014…
"No: 201802356 A2 Neutral Citat…
"Neutral Citation Number: [2018…
"Case No: 2002/04091/D1 Neutral…
"Neutral Citation Number: [2010…
"Case No: 200305991 D2 Neutral …
"2017/05382/B1 Neutral Citation…
" + ], + "text/plain": [ + "shape: (6_154,)\n", + "Series: 'excerpt' [str]\n", + "[\n", + "\t\"No. 2008/03296/A9 2008/03350/A…\n", + "\t\"Neutral Citation Number: [2006…\n", + "\t\"Neutral Citation Number: [2012…\n", + "\t\"Neutral Citation Number: [2014…\n", + "\t\"No: 201802356 A2 Neutral Citat…\n", + "\t…\n", + "\t\"Neutral Citation Number: [2018…\n", + "\t\"Case No: 2002/04091/D1 Neutral…\n", + "\t\"Neutral Citation Number: [2010…\n", + "\t\"Case No: 200305991 D2 Neutral …\n", + "\t\"2017/05382/B1 Neutral Citation…\n", + "]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl_df[\"excerpt\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "bb009db3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 500.0\n", + "1 500.0\n", + "2 499.0\n", + "3 500.0\n", + "4 499.0\n", + " ... \n", + "6149 499.0\n", + "6150 500.0\n", + "6151 500.0\n", + "6152 499.0\n", + "6153 499.0\n", + "Name: excerpt, Length: 6154, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl_df[\"excerpt\"].str.strip_chars().str.len_chars().to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b8d0b77-9a61-481f-ba4b-ac738250df3e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/england_wales/01_extract_jsonl.py b/scripts/england_wales/01_extract_jsonl.py deleted file mode 100644 index 735498a..0000000 --- 
a/scripts/england_wales/01_extract_jsonl.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import json -from bs4 import BeautifulSoup -from tqdm import tqdm - -def extract_information_from_xml(xml_content): - soup = BeautifulSoup(xml_content, 'lxml') - - # Extract required fields - _id = soup.find('uk:hash').text - signature = soup.find('neutralcitation').text if soup.find('neutralcitation') else None - hearing_date = soup.find('hearingdate').text if soup.find('hearingdate') else None - date = hearing_date.strip() if hearing_date else None - publication_date = soup.find('frbrwork').find('frbrdate')['date'] - court_type = soup.find('courttype').text if soup.find('courttype') else None - - # Get the excerpt - header_text = soup.header.get_text(separator=' ', strip=True) - excerpt = header_text[:500] - - # Get the full content of the judgment body as XML string - judgment_body = soup.find('judgmentbody') - content = str(judgment_body) if judgment_body else None - - # Get the judges list - judges = [judge.get_text() for judge in soup.find_all('judge')] - - # Get case numbers - case_numbers = [case_number.get_text() for case_number in soup.find_all('p', class_='CoverText') if - 'Case Nos:' in case_number.text] - case_numbers = [num.strip() for sublist in case_numbers for num in sublist.replace('Case Nos:', '').split()] - - return { - "_id": _id, - "signature": signature, - "date": date, - "publicationDate": publication_date, - "type": court_type, - "excerpt": excerpt, - "content": content, - "judges": judges, - "caseNumbers": case_numbers - } - - -def process_directory(directory_path, output_file): - with open(output_file, 'w') as jsonl_file: - xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')] - for filename in tqdm(xml_files, desc="Processing XML files"): - file_path = os.path.join(directory_path, filename) - with open(file_path, 'r', encoding='utf-8') as xml_file: - xml_content = xml_file.read() - judgment_data = 
extract_information_from_xml(xml_content) - jsonl_file.write(json.dumps(judgment_data) + '\n') - -directory_path = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/dump/' -output_file = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/englad_wales_data.jsonl' - -process_directory(directory_path, output_file) diff --git a/scripts/england_wales/01_extract_jsonl_refined.py b/scripts/england_wales/01_extract_jsonl_refined.py new file mode 100644 index 0000000..a1389ce --- /dev/null +++ b/scripts/england_wales/01_extract_jsonl_refined.py @@ -0,0 +1,216 @@ +import os +import json +from bs4 import BeautifulSoup +from tqdm import tqdm +import re +from multiprocessing import Pool + +def extract_appeal_type(text): + patterns = [ + (r'appeal\s+against\s+\S+\s+sentence\s+or\s+\S+\s+conviction', 'conviction_sentence'), + (r'appeal\s+against\s+\S+\s+conviction\s+or\s+\S+\s+sentence', 'conviction_sentence'), + (r'appeal\s+against\s+\S+\s+conviction', 'conviction'), + (r'appeal\s+against\s+\S+\s+sentence', 'sentence') + ] + + for pattern, appeal_type in patterns: + if re.search(pattern, text, re.IGNORECASE): + return appeal_type + return None + + +def extract_appeal_outcome(text): + outcome_patterns = { + 'granted': r'appeal\s+is\s+granted', + 'dismissed': r'appeal\s+is\s+dismissed', + 'refused': r'appeal\s+is\s+refused', + 'allowed': r'appeal\s+is\s+allowed' + } + + for outcome, pattern in outcome_patterns.items(): + if re.search(pattern, text, re.IGNORECASE): + return outcome + return None + +def extract_and_clean_judges(paragraphs): + judges = [] + for para in paragraphs: + text = para.get_text(strip=True) + if re.search(r'\bJustice\b|\bJudge\b|\bSIR\b|\bHonour\b|\bHHJ\b', text, re.IGNORECASE): + # Remove text within parentheses + cleaned_text = re.sub(r'\([^)]*\)', '', text).strip() + # Remove dashes and any text following them + cleaned_text = re.sub(r'-.*', '', cleaned_text).strip() + # Check for specific keywords and ensure it's not empty or 
unwanted text + if cleaned_text and 'Royal Courts of Justice' not in cleaned_text and cleaned_text != "THE LORD CHIEF JUSTICE OF ENGLAND AND WALES": + judges.append(cleaned_text) + return judges + +def categorize_court(court_name): + if 'SUPREME_COURT' in court_name: + return 'supreme_court' + elif "HIGH_COURT" in court_name and "ADMINISTRATIVE_COURT" in court_name: + return 'high_court_administrative_court' + elif 'HIGH_COURT' in court_name and 'DIVISIONAL_COURT' in court_name: + return 'high_court_division_court' + elif 'HIGH_COURT' in court_name: + return 'high_court' + elif 'CIVIL_AND_CRIMINAL' in court_name: + return 'civil_criminal_court' + elif 'MARTIAL' in court_name: + return 'martial_court' + elif 'DIVISIONAL_COURT' in court_name: + return 'division_court' + else: + return 'crown_court' + +def extract_information_from_xml(xml_content, file_name): + soup = BeautifulSoup(xml_content, 'xml') # Using 'xml' parser for handling namespaces + + # Extract required fields + _id = soup.find('uk:hash').text if soup.find('uk:hash') else None + citation = soup.find('uk:cite').text if soup.find('uk:cite') else None + signature = citation.split('] ')[1] if citation else None # Removing the year part + if signature: + signature = signature.replace(' ', '_') + hearing_date = soup.find('hearingdate').text if soup.find('hearingdate') else None + date = hearing_date.strip() if hearing_date else None + publication_date = soup.find('FRBRdate', {'name': 'judgment'})['date'] if soup.find('FRBRdate', + {'name': 'judgment'}) else None + + court_type_tags = soup.find_all('courtType') + # Use a set to collect unique court types + unique_court_types = set( + re.sub(r'\([^)]*\)', '', tag.get_text(strip=True)).replace(' ', '_') for tag in court_type_tags) + + # Join the unique court types + court_type_ = "_".join(unique_court_types) + court_type_ = re.sub(r'_+', '_', court_type_).strip('_') + + # Categorize the combined court types + court_type = categorize_court(court_type_) + + # Get 
the excerpt + header_text = soup.header.get_text(separator=' ', strip=True) if soup.header else "" + excerpt = header_text[:500] + + # Get the full content of the header and judgment body as text + header_content = soup.header.get_text(separator='\n', strip=True) if soup.header else "" + judgment_body_content = soup.find('judgmentBody').get_text(separator='\n', strip=True) if soup.find( + 'judgmentBody') else "" + content = header_content + "\n" + judgment_body_content + + # Get the judges list + # Get the judges list from TLCPerson elements + judges = [judge['showAs'] for judge in soup.find_all('TLCPerson') if 'showAs' in judge.attrs and re.search(r'\bJustice\b|\bJudge\b|\bSIR\b|\bHonour\b|\bHHJ\b', judge['showAs'], re.IGNORECASE)] + # Filter judges using regex criteria + judges = [judge for judge in judges if + re.search(r'\bJustice\b|\bJudge\b|\bSIR\b|\bHonour\b|\bHHJ\b', judge, re.IGNORECASE)] + + # If no judges found, get text from <judge> elements + if not judges: + judges = [judge.get_text(strip=True) for judge in soup.find_all('judge')] + + # If no judges found, use regex to extract them from header content + if not judges and soup.header: + # Extract all <p>
 tags + paragraphs = soup.header.find_all('p') + judges = extract_and_clean_judges(paragraphs) + + # If still no judges found, look for text in <p>
 tags with style="text-align:center" + if not judges: + centered_paragraphs = soup.find_all('p', style=lambda x: x and 'text-align:center' in x) + judges.extend(extract_and_clean_judges(centered_paragraphs)) + + # If still no judges found, look for text in <p>
 tags with style="text-align:right" + if not judges: + right_aligned_paragraphs = soup.find_all('p', style=lambda x: x and 'text-align:right' in x) + judges.extend(extract_and_clean_judges(right_aligned_paragraphs)) + + # Filter judges using regex criteria + judges = [judge for judge in judges if + re.search(r'\bJustice\b|\bJudge\b|\bSIR\b|\bHonour\b|\bHHJ\b', judge, re.IGNORECASE)] + + # Extract URIs + xml_uri = soup.find('FRBRManifestation').find('FRBRuri')['value'] if soup.find('FRBRManifestation') and soup.find('FRBRManifestation').find('FRBRuri') else None + uri = soup.find('FRBRWork').find('FRBRuri')['value'] if soup.find('FRBRWork') and soup.find('FRBRWork').find('FRBRuri') else None + + # Extract legislation texts + legislation_tags = soup.find_all('ref', {'uk:type': 'legislation'}) + legislation_texts = set(tag.get_text() for tag in legislation_tags) + legislation_list = list(legislation_texts) # Convert set to list to remove duplicates + + # Extract case references + case_tags = soup.find_all('ref', {'uk:type': 'case'}) + case_references = set(tag.get_text() for tag in case_tags) + case_references_list = list(case_references) # Convert set to list to remove duplicates + + # Extract case numbers + case_numbers = set() + docket_number_tags = soup.find_all('docketNumber') + for tag in docket_number_tags: + case_numbers.add(tag.get_text()) + + # Extract case numbers from <p>
 tags containing "Case No:" + cover_text_tags = soup.find_all('p', class_='CoverText') + case_no_pattern = re.compile(r'Case No:\s*(.*)') + for tag in cover_text_tags: + match = case_no_pattern.search(tag.get_text()) + if match: + case_numbers.update([num.strip() for num in match.group(1).split(',')]) + + # If no case numbers found, look for text in <p>
tags with style="text-align:right" + if not case_numbers: + right_aligned_paragraphs = soup.find_all('p', style=lambda x: x and 'text-align:right' in x) + case_no_pattern = re.compile(r'\b\d{4}/\d{4}/\w+\b|\d{6}') + for tag in right_aligned_paragraphs: + matches = case_no_pattern.findall(tag.get_text()) + case_numbers.update(matches) + + # Extract appeal type + appeal_type = extract_appeal_type(content) + + # Extract appeal outcome + appeal_outcome = extract_appeal_outcome(content) + + def null_if_empty(value): + return value if value else None + + return { + "_id": null_if_empty(_id), + "citation": null_if_empty(citation), + "signature": null_if_empty(signature), + "date": null_if_empty(date), + "publicationDate": null_if_empty(publication_date), + "type": null_if_empty(court_type), + "excerpt": null_if_empty(excerpt), + "content": null_if_empty(content), + "judges": null_if_empty(judges), + "caseNumbers": null_if_empty(list(case_numbers)), + "citation_references": null_if_empty(case_references_list), + "legislation": null_if_empty(legislation_list), + "file_name": null_if_empty(file_name), + "appeal_type": null_if_empty(appeal_type), + "appeal_outcome": null_if_empty(appeal_outcome), + "xml_uri": null_if_empty(xml_uri), + "uri": null_if_empty(uri) + } + +def process_file(file_path): + with open(file_path, 'r', encoding='utf-8') as xml_file: + xml_content = xml_file.read() + file_name = os.path.basename(file_path) + return extract_information_from_xml(xml_content, file_name) + +def process_directory(directory_path, output_file): + xml_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.xml')] + + with Pool() as pool, open(output_file, 'w') as jsonl_file: + for judgment_data in tqdm(pool.imap(process_file, xml_files), total=len(xml_files), + desc="Processing XML files"): + jsonl_file.write(json.dumps(judgment_data) + '\n') + +directory_path = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/dump/' +output_file 
= '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/england_wales_data_refined_7.jsonl' + +process_directory(directory_path, output_file) diff --git a/scripts/england_wales/02_extract_jsonl_refined.py b/scripts/england_wales/02_extract_jsonl_refined.py deleted file mode 100644 index 6008656..0000000 --- a/scripts/england_wales/02_extract_jsonl_refined.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import json -from bs4 import BeautifulSoup -from tqdm import tqdm - - -def extract_information_from_xml(xml_content): - soup = BeautifulSoup(xml_content, 'html.parser') - - # Extract required fields - _id = soup.find('uk:hash').text - signature = soup.find('neutralcitation').text if soup.find('neutralcitation') else None - hearing_date = soup.find('hearingdate').text if soup.find('hearingdate') else None - date = hearing_date.strip() if hearing_date else None - publication_date = soup.find('frbrwork').find('frbrdate')['date'] - court_type = soup.find('courttype').text if soup.find('courttype') else None - - # Get the excerpt - header_text = soup.header.get_text(separator=' ', strip=True) - excerpt = header_text[:500] - - # Get the full content of the header and judgment body as text - header_content = soup.header.get_text(separator='\n', strip=True) - judgment_body_content = soup.find('judgmentbody').get_text(separator='\n', strip=True) if soup.find( - 'judgmentbody') else "" - content = header_content + "\n" + judgment_body_content - - # Get the judges list - judges = [judge.get_text() for judge in soup.find_all('judge')] - - # Get case numbers - case_numbers = [case_number.get_text() for case_number in soup.find_all('p', class_='CoverText') if - 'Case Nos:' in case_number.text] - case_numbers = [num.strip() for sublist in case_numbers for num in sublist.replace('Case Nos:', '').split()] - - return { - "_id": _id, - "signature": signature, - "date": date, - "publicationDate": publication_date, - "type": court_type, - "excerpt": excerpt, - "content": content, 
- "judges": judges, - "caseNumbers": case_numbers - } - - -def process_directory(directory_path, output_file): - with open(output_file, 'w') as jsonl_file: - xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')] - for filename in tqdm(xml_files, desc="Processing XML files"): - file_path = os.path.join(directory_path, filename) - with open(file_path, 'r', encoding='utf-8') as xml_file: - xml_content = xml_file.read() - judgment_data = extract_information_from_xml(xml_content) - jsonl_file.write(json.dumps(judgment_data) + '\n') - - -directory_path = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/dump/' -output_file = '/home/stirunag/work/github/ML4-legal-documents/judgements_xml/englad_wales_data_refined.jsonl' - -process_directory(directory_path, output_file)