Parse pl judgements (#4)

* Improve error handling in download_pl_content.py * Add dataset dump script * Add pl dataset to DVC * Add simple data analysis notebook * Extract text from pl judgements * Refine text extraction and add analysis * Add additional details download and ingest * Refine extraction and ingest extracted data to mongo * Add script for chunked embeddings --------- Co-authored-by: Jakub Binkowski <[email protected]>
pwr-ai · Apr 3, 2024 · 981501c · 981501c
1 parent 1355b9c
commit 981501c
Show file tree

Hide file tree

Showing 19 changed files with 886 additions and 112 deletions.
diff --git a/data/.gitignore b/data/.gitignore
diff --git a/data/datasets/pl/.gitignore b/data/datasets/pl/.gitignore
@@ -0,0 +1,2 @@
+/raw
+/text
diff --git a/data/datasets/pl/raw.dvc b/data/datasets/pl/raw.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 801ebfe4c29d0564abfce7006536adc8.dir
+  size: 5466475038
+  nfiles: 9
+  hash: md5
+  path: raw
diff --git a/data/datasets/pl/text.dvc b/data/datasets/pl/text.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: cac0bc44e36e68d606eff7500d627bd1.dir
+  size: 22741832080
+  nfiles: 11
+  hash: md5
+  path: text
diff --git a/data/dummy_file.txt.dvc b/data/dummy_file.txt.dvc
diff --git a/juddges/_modidx.py b/juddges/_modidx.py
@@ -5,4 +5,4 @@
                 'doc_host': 'https://laugustyniak.github.io',
                 'git_url': 'https://github.com/laugustyniak/juddges',
                 'lib_path': 'juddges'},
-  'syms': {'juddges.data.pl_court_api': {}}}
+  'syms': {'juddges.data.pl_court_api': {}, 'juddges.preprocessing.parser_base': {}, 'juddges.preprocessing.pl_court_parser': {}}}
diff --git a/juddges/data/pl_court_api.py b/juddges/data/pl_court_api.py
@@ -3,12 +3,41 @@
 import requests
 import xmltodict
 from loguru import logger
+from requests import HTTPError
 
 
 class PolishCourtAPI:
     def __init__(self) -> None:
         self.url = "https://apiorzeczenia.wroclaw.sa.gov.pl/ncourt-api"
 
+    @property
+    def schema(self) -> dict[str, list[str]]:
+        """Field names handled by this client, grouped under three keys.
+
+        The groups are ``"judgement"``, ``"content"`` and ``"details"``; the
+        ``"details"`` list is the one ``get_cleaned_details`` filters by.
+        """
+        judgement_fields = [
+            "_id",
+            "signature",
+            "date",
+            "publicationDate",
+            "lastUpdate",
+            "courtId",
+            "departmentId",
+            "type",
+            "excerpt",
+        ]
+        content_fields = ["content"]
+        details_fields = [
+            "chairman",
+            "judges",
+            "themePhrases",
+            "references",
+            "legalBases",
+            "recorder",
+            "decision",
+            "reviser",
+            "publisher",
+        ]
+        return {
+            "judgement": judgement_fields,
+            "content": content_fields,
+            "details": details_fields,
+        }
+
     def get_number_of_judgements(self, params: dict[str, Any] | None = None) -> int:
         if params is None:
             params = {}
@@ -33,11 +62,45 @@ def get_judgements(self, params: dict[str, Any]) -> list[dict[str, Any]]:
 
         return judgements
 
-    def get_content(self, id: str) -> str:
+    def get_content(self, id: str) -> dict[str, Any]:
+        """Download the content of one judgement and return it under the key ``"content"``.
+
+        A 404 response is turned into ``DataNotFoundError``; any other HTTP
+        error is re-raised unchanged. The response body is decoded as UTF-8.
+        """
         params = {"id": id}
         endpoint = f"{self.url}/judgement/content"
         res = requests.get(endpoint, params=params)
-        res.raise_for_status()
+
+        try:
+            res.raise_for_status()
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise DataNotFoundError(f"Not found content for document: {id}")
+            raise
+
         content = res.content.decode("utf-8")
 
-        return content
+        return {"content": content}
+
+    def get_cleaned_details(self, id: str) -> dict[str, Any]:
+        """Download details, keeping only the fields listed in ``schema["details"]``.
+
+        This avoids repeating the fields already retrieved in get_judgements.
+        """
+        raw_details = self.get_details(id)
+        cleaned: dict[str, Any] = {}
+        for field, value in raw_details.items():
+            if field in self.schema["details"]:
+                cleaned[field] = value
+        return cleaned
+
+    def get_details(self, id: str) -> dict[str, Any]:
+        """Download the details of a single judgement.
+
+        Args:
+            id: Identifier of the judgement, sent as the ``id`` query parameter.
+
+        Returns:
+            The parsed content of the ``judgement`` root element of the response.
+
+        Raises:
+            requests.HTTPError: If the API answers with an error status code.
+            DataNotFoundError: If the response holds an ``error`` element
+                instead of a ``judgement`` element.
+            KeyError: If the response holds neither of these two elements.
+            TypeError: If the ``judgement`` element was not parsed into a dict.
+        """
+        params = {"id": id}
+        endpoint = f"{self.url}/judgement/details"
+        res = requests.get(endpoint, params=params)
+        res.raise_for_status()
+
+        # for details, API returns XML with error info instead of 404 status code
+        data = xmltodict.parse(res.content.decode("utf-8"))
+        try:
+            details = data["judgement"]
+        except KeyError as err:
+            if "error" in data:
+                raise DataNotFoundError(f"Not found details for document: {id}") from err
+            raise
+
+        # Explicit check rather than ``assert`` so it is not stripped by ``python -O``.
+        if not isinstance(details, dict):
+            raise TypeError(
+                f"Expected a dict of details for document: {id}, "
+                f"got {type(details).__name__}"
+            )
+        return details
+
+
+class DataNotFoundError(Exception):
+    """Raised when the API has no content or details for the requested document."""
diff --git a/notebooks/.gitkeep → juddges/preprocessing/__init__.py b/notebooks/.gitkeep → juddges/preprocessing/__init__.py
diff --git a/juddges/preprocessing/parser_base.py b/juddges/preprocessing/parser_base.py
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class DocParserBase(ABC):
+    """Abstract interface for parsers that retrieve data from a document.
+
+    Instances are callable: calling one is the same as calling ``parse``.
+    """
+
+    @property
+    @abstractmethod
+    def schema(self) -> list[str]:
+        """List of field names defined by the concrete parser."""
+
+    @abstractmethod
+    def parse(self, document: str) -> dict[str, Any]:
+        """Extract data from ``document`` and return it as a dict."""
+
+    def __call__(self, document: str) -> dict[str, Any]:
+        parsed = self.parse(document)
+        return parsed
diff --git a/juddges/preprocessing/pl_court_parser.py b/juddges/preprocessing/pl_court_parser.py
@@ -0,0 +1,80 @@
+import re
+from typing import Any, Generator
+from xml.etree import ElementTree
+from xml.etree.ElementTree import Element
+
+from juddges.preprocessing.parser_base import DocParserBase
+
+# Whitespace run that starts and ends with a newline, i.e. holds at least two
+# newlines; extract_text replaces every such run with "\n\n".
+MULTIPLE_NEWLINES = re.compile(r"(\n\s*)+\n+")
+
+
+class SimplePlJudgementsParser(DocParserBase):
+    """Minimal parser for the simple XML format used by the Polish courts.
+
+    Only the plain text and a few root attributes are extracted; no specific
+    document structure is reproduced.
+
+    """
+
+    @property
+    def schema(self) -> list[str]:
+        fields = ["num_pages", "vol_number", "vol_type", "text"]
+        return fields
+
+    def parse(self, document: str) -> dict[str, Any]:
+        """Parse an XML string into the fields listed in ``schema``.
+
+        The root attributes ``xToPage``, ``xVolNmbr`` and ``xVolType`` fill
+        ``num_pages``, ``vol_number`` and ``vol_type``; ``text`` is extracted
+        from the single ``xBlock`` child of the root.
+        """
+        root = ElementTree.fromstring(document)
+
+        blocks = root.findall("xBlock")
+        assert len(blocks) == 1, "There should be only one xBlock element"
+        content_root = blocks[0]
+
+        attrs = root.attrib
+        num_pages = int(attrs["xToPage"])
+        vol_number = int(attrs["xVolNmbr"])
+        vol_type = attrs["xVolType"]
+        text = self.extract_text(content_root)
+
+        return {
+            "num_pages": num_pages,
+            "vol_number": vol_number,
+            "vol_type": vol_type,
+            "text": text,
+        }
+
+    @staticmethod
+    def extract_text(element: Element) -> str:
+        """Concatenate all text found under ``element`` and tidy up blank lines.
+
+        Every fragment has its leading and trailing spaces (``" "`` only)
+        removed and fragments left empty are skipped. Runs matched by
+        ``MULTIPLE_NEWLINES`` are then replaced with ``"\\n\\n"`` and the
+        result is stripped.
+        """
+        fragments = (
+            fragment.strip(" ")
+            for fragment in element.itertext()
+            if fragment is not None
+        )
+        joined = "".join(piece for piece in fragments if piece)
+        return MULTIPLE_NEWLINES.sub("\n\n", joined).strip()
+
+
+def itertext(element: Element, prefix: str = "") -> Generator[str, None, None]:
+    """Extension of the Element.itertext method to handle special tags in pl court XML.
+
+    Text fragments are yielded in document order, with these special cases:
+
+    * ``xName`` carrying an ``xSffx`` attribute yields its text followed by the
+      suffix and a space; its tail is stripped of surrounding whitespace,
+    * ``xEnum`` takes the text of its ``xBullet`` child as ``prefix`` for the
+      elements below it and removes that child from the tree,
+    * ``xEnumElem`` yields ``prefix`` in place of its own text.
+
+    Note that the tree is modified during iteration (``xBullet`` removal and
+    tail stripping).
+
+    Args:
+        element: Root of the subtree to read text from.
+        prefix: Text yielded for every ``xEnumElem`` found below ``element``.
+
+    Yields:
+        The non-empty text fragments of the subtree.
+    """
+    tag = element.tag
+    # Skip nodes whose tag is neither a string nor None.
+    if not isinstance(tag, str) and tag is not None:
+        return
+
+    t: str | None
+    match (tag, element.attrib):
+        case ("xName", {"xSffx": suffix}):
+            element.tail = element.tail.strip() if element.tail else None
+            # ``element.text`` may be None; do not emit the literal "None".
+            t = f"{element.text or ''}{suffix} "
+        case ("xEnum", _):
+            bullet_elem = element.find("xBullet")
+            # Compare with None: an Element without children is falsy, so a
+            # plain truth test would ignore a bullet that only holds text.
+            if bullet_elem is not None:
+                prefix = bullet_elem.text or ""
+                element.remove(bullet_elem)
+            t = ""
+        case ("xEnumElem", _):
+            t = prefix
+        case _:
+            t = element.text
+
+    if t:
+        yield t
+
+    for e in element:
+        yield from itertext(e, prefix)
+        t = e.tail
+
+        if t:
+            yield t
diff --git a/notebooks/1_analyse_dataset.ipynb b/notebooks/1_analyse_dataset.ipynb
@@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-15T11:07:39.123324510Z",
+     "start_time": "2024-03-15T11:07:39.065139618Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import polars as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8a2c7d4858169a2",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-15T11:12:01.007272795Z",
+     "start_time": "2024-03-15T11:11:47.709404815Z"
+    },
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ds = pl.read_parquet(\"../data/datasets/pl/raw\", use_pyarrow=True)\n",
+    "\n",
+    "dt_fmt = \"%Y-%m-%d %H:%M:%S%.f %Z\"\n",
+    "dt_unit = \"ms\"  # due to https://github.com/pola-rs/polars/issues/13592\n",
+    "\n",
+    "ds = ds.with_columns(\n",
+    "    ds[\"date\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
+    "    ds[\"publicationDate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
+    "    ds[\"lastUpdate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n",
+    "    ds[\"courtId\"].cast(pl.Int32),\n",
+    "    ds[\"departmentId\"].cast(pl.Int32),\n",
+    "    ds[\"type\"].cast(pl.Categorical),\n",
+    ")\n",
+    "\n",
+    "ds.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35e65fe2dd9a4bce",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-15T11:12:01.074286418Z",
+     "start_time": "2024-03-15T11:12:00.912825131Z"
+    },
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ds.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab23ff37327a377a",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-15T11:15:27.800725934Z",
+     "start_time": "2024-03-15T11:15:27.753971240Z"
+    },
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ds[\"type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11446c299cdf1700",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-15T11:15:38.059155473Z",
+     "start_time": "2024-03-15T11:15:38.053450756Z"
+    },
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "print(f\"Missing content: {ds['content'].null_count() / len(ds)}\")\n",
+    "print(f\"Missing theis: {ds['thesis'].null_count() / len(ds)}\")\n",
+    "print(f\"Missing excerpt: {ds['excerpt'].null_count() / len(ds)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "891ffbad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds[\"excerpt\"].str.strip_chars().str.len_chars().to_pandas().plot.hist(\n",
+    "    bins=50, log=True, title=\"Excerpt #chars distribution\"\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}