Skip to content

Commit

Permalink
Parse pl judgements (#4)
Browse files Browse the repository at this point in the history
* Improve error handling in download_pl_content.py

* Add dataset dump script

* Add pl dataset to DVC

* Add simple data analysis notebook

* Extract text from pl judgements

* Refine text extraction and add analysis

* Add additional details download and ingest

* Refine extraction and ingest extracted data to mongo

* Add script for chunked embeddings

---------

Co-authored-by: Jakub Binkowski <[email protected]>
  • Loading branch information
binkjakub and Jakub Binkowski authored Apr 3, 2024
1 parent 1355b9c commit 981501c
Show file tree
Hide file tree
Showing 19 changed files with 886 additions and 112 deletions.
1 change: 0 additions & 1 deletion data/.gitignore

This file was deleted.

2 changes: 2 additions & 0 deletions data/datasets/pl/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/raw
/text
6 changes: 6 additions & 0 deletions data/datasets/pl/raw.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 801ebfe4c29d0564abfce7006536adc8.dir
size: 5466475038
nfiles: 9
hash: md5
path: raw
6 changes: 6 additions & 0 deletions data/datasets/pl/text.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: cac0bc44e36e68d606eff7500d627bd1.dir
size: 22741832080
nfiles: 11
hash: md5
path: text
5 changes: 0 additions & 5 deletions data/dummy_file.txt.dvc

This file was deleted.

2 changes: 1 addition & 1 deletion juddges/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
'doc_host': 'https://laugustyniak.github.io',
'git_url': 'https://github.com/laugustyniak/juddges',
'lib_path': 'juddges'},
'syms': {'juddges.data.pl_court_api': {}}}
'syms': {'juddges.data.pl_court_api': {}, 'juddges.preprocessing.parser_base': {}, 'juddges.preprocessing.pl_court_parser': {}}}
69 changes: 66 additions & 3 deletions juddges/data/pl_court_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,41 @@
import requests
import xmltodict
from loguru import logger
from requests import HTTPError


class PolishCourtAPI:
def __init__(self) -> None:
    """Store the base URL to which the endpoint paths are appended."""
    self.url = "https://apiorzeczenia.wroclaw.sa.gov.pl/ncourt-api"

@property
def schema(self) -> dict[str, list[str]]:
return {
"judgement": [
"_id",
"signature",
"date",
"publicationDate",
"lastUpdate",
"courtId",
"departmentId",
"type",
"excerpt",
],
"content": ["content"],
"details": [
"chairman",
"judges",
"themePhrases",
"references",
"legalBases",
"recorder",
"decision",
"reviser",
"publisher",
],
}

def get_number_of_judgements(self, params: dict[str, Any] | None = None) -> int:
if params is None:
params = {}
Expand All @@ -33,11 +62,45 @@ def get_judgements(self, params: dict[str, Any]) -> list[dict[str, Any]]:

return judgements

def get_content(self, id: str) -> str:
def get_content(self, id: str) -> dict[str, Any]:
params = {"id": id}
endpoint = f"{self.url}/judgement/content"
res = requests.get(endpoint, params=params)
res.raise_for_status()

try:
res.raise_for_status()
except HTTPError as err:
if err.response.status_code == 404:
raise DataNotFoundError(f"Not found content for document: {id}")
raise

content = res.content.decode("utf-8")

return content
return {"content": content}

def get_cleaned_details(self, id: str) -> dict[str, Any]:
"""Downloads details without repeating fields retrieved in get_judgements."""
details = self.get_details(id)
return {k: v for k, v in details.items() if k in self.schema["details"]}

def get_details(self, id: str) -> dict[str, Any]:
    """Fetch the details record of a single judgement.

    Sends a GET request to ``{self.url}/judgement/details`` with ``id`` as
    a query parameter, parses the XML response body with ``xmltodict`` and
    returns the value stored under its top-level ``"judgement"`` key.

    Raises:
        HTTPError: If ``raise_for_status`` rejects the response.
        DataNotFoundError: If the parsed response has no ``"judgement"``
            key but does have an ``"error"`` key.
        KeyError: If the parsed response has neither key.
    """
    response = requests.get(f"{self.url}/judgement/details", params={"id": id})
    response.raise_for_status()

    # The details endpoint reports a missing document through an XML error
    # payload rather than a 404 status, so the body itself must be checked.
    payload = xmltodict.parse(response.content.decode("utf-8"))
    try:
        details = payload["judgement"]
    except KeyError:
        if "error" not in payload:
            raise
        raise DataNotFoundError(f"Not found details for document: {id}")

    assert isinstance(details, dict)
    return details


class DataNotFoundError(Exception):
    """Raised when the API has no content or details for a document."""
File renamed without changes.
18 changes: 18 additions & 0 deletions juddges/preprocessing/parser_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from abc import ABC, abstractmethod
from typing import Any


class DocParserBase(ABC):
    """Abstract interface for parsers that retrieve data from a document.

    Instances are callable; calling one is the same as calling ``parse``.
    """

    @property
    @abstractmethod
    def schema(self) -> list[str]:
        """List of strings describing the parser's output; set by subclasses."""

    @abstractmethod
    def parse(self, document: str) -> dict[str, Any]:
        """Parse ``document`` and return the retrieved data as a dict."""

    def __call__(self, document: str) -> dict[str, Any]:
        """Delegate to ``parse`` so the parser can be used as a function."""
        return self.parse(document)
80 changes: 80 additions & 0 deletions juddges/preprocessing/pl_court_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import re
from typing import Any, Generator
from xml.etree import ElementTree
from xml.etree.ElementTree import Element

from juddges.preprocessing.parser_base import DocParserBase

MULTIPLE_NEWLINES = re.compile(r"(\n\s*)+\n+")


class SimplePlJudgementsParser(DocParserBase):
    """Minimal parser for the simple XML format used by the Polish courts.

    It pulls the text out of the XML document without adhering to any
    specific document structure.
    """

    @property
    def schema(self) -> list[str]:
        """Keys of the dict returned by ``parse``."""
        return ["num_pages", "vol_number", "vol_type", "text"]

    def parse(self, document: str) -> dict[str, Any]:
        """Parse an XML judgement into its volume metadata and plain text.

        The root element must carry the ``xToPage``, ``xVolNmbr`` and
        ``xVolType`` attributes and have exactly one direct ``xBlock`` child,
        from which the text is extracted.
        """
        root = ElementTree.fromstring(document)

        blocks = root.findall("xBlock")
        assert len(blocks) == 1, "There should be only one xBlock element"
        content_root, *_ = blocks

        attributes = root.attrib
        num_pages = int(attributes["xToPage"])
        vol_number = int(attributes["xVolNmbr"])
        vol_type = attributes["xVolType"]
        text = self.extract_text(content_root)

        return {
            "num_pages": num_pages,
            "vol_number": vol_number,
            "vol_type": vol_type,
            "text": text,
        }

    @staticmethod
    def extract_text(element: Element) -> str:
        """Concatenate all text found under ``element`` into one string.

        Each fragment has surrounding spaces removed and is skipped if nothing
        remains. Every match of ``MULTIPLE_NEWLINES`` in the concatenation is
        replaced with ``"\\n\\n"`` and the final result is stripped.
        """
        fragments = []
        for raw in element.itertext():
            if raw is None:
                continue
            # Only spaces are stripped, so newlines inside fragments survive
            # and can be normalised below.
            stripped = raw.strip(" ")
            if stripped:
                fragments.append(stripped)

        joined = "".join(fragments)
        return MULTIPLE_NEWLINES.sub("\n\n", joined).strip()


def itertext(element: Element, prefix: str = "") -> Generator[str, None, None]:
"""Extension of the Element.itertext method to handle special tags in pl court XML."""
tag = element.tag
if not isinstance(tag, str) and tag is not None:
return

t: str | None
match (tag, element.attrib):
case ("xName", {"xSffx": suffix}):
element.tail = element.tail.strip() if element.tail else None
t = f"{element.text}{suffix} "
case ("xEnum", _):
bullet_elem = element.find("xBullet")
if bullet_elem:
prefix = bullet_elem.text or ""
element.remove(bullet_elem)
t = ""
case ("xEnumElem", _):
t = prefix
case _:
t = element.text

if t:
yield t

for e in element:
yield from itertext(e, prefix)
t = e.tail

if t:
yield t
144 changes: 144 additions & 0 deletions notebooks/1_analyse_dataset.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2024-03-15T11:07:39.123324510Z",
"start_time": "2024-03-15T11:07:39.065139618Z"
}
},
"outputs": [],
"source": [
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8a2c7d4858169a2",
"metadata": {
"ExecuteTime": {
"end_time": "2024-03-15T11:12:01.007272795Z",
"start_time": "2024-03-15T11:11:47.709404815Z"
},
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"ds = pl.read_parquet(\"../data/datasets/pl/raw\", use_pyarrow=True)\n",
"\n",
"dt_fmt = \"%Y-%m-%d %H:%M:%S%.f %Z\"\n",
"dt_unit = \"ms\" # due to https://github.com/pola-rs/polars/issues/13592\n",
"\n",
"ds = ds.with_columns(\n",
" ds[\"date\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
" ds[\"publicationDate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
" ds[\"lastUpdate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
" ds[\"courtId\"].cast(pl.Int32),\n",
" ds[\"departmentId\"].cast(pl.Int32),\n",
" ds[\"type\"].cast(pl.Categorical),\n",
")\n",
"\n",
"ds.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35e65fe2dd9a4bce",
"metadata": {
"ExecuteTime": {
"end_time": "2024-03-15T11:12:01.074286418Z",
"start_time": "2024-03-15T11:12:00.912825131Z"
},
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"ds.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab23ff37327a377a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-03-15T11:15:27.800725934Z",
"start_time": "2024-03-15T11:15:27.753971240Z"
},
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"ds[\"type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11446c299cdf1700",
"metadata": {
"ExecuteTime": {
"end_time": "2024-03-15T11:15:38.059155473Z",
"start_time": "2024-03-15T11:15:38.053450756Z"
},
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
"print(f\"Missing content: {ds['content'].null_count() / len(ds)}\")\n",
"print(f\"Missing theis: {ds['thesis'].null_count() / len(ds)}\")\n",
"print(f\"Missing excerpt: {ds['excerpt'].null_count() / len(ds)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "891ffbad",
"metadata": {},
"outputs": [],
"source": [
"ds[\"excerpt\"].str.strip_chars().str.len_chars().to_pandas().plot.hist(\n",
" bins=50, log=True, title=\"Excerpt #chars distribution\"\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 981501c

Please sign in to comment.