-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Improve error handling in download_pl_content.py * Add dataset dump script * Add pl dataset to DVC * Add simple data analysis notebook * Extract text from pl judgements * Refine text extraction and add analysis * Add additional details download and ingest * Refine extraction and ingest extracted data to mongo * Add script for chunked embeddings --------- Co-authored-by: Jakub Binkowski <[email protected]>
- Loading branch information
Showing
19 changed files
with
886 additions
and
112 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
/raw | ||
/text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
outs: | ||
- md5: 801ebfe4c29d0564abfce7006536adc8.dir | ||
size: 5466475038 | ||
nfiles: 9 | ||
hash: md5 | ||
path: raw |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
outs: | ||
- md5: cac0bc44e36e68d606eff7500d627bd1.dir | ||
size: 22741832080 | ||
nfiles: 11 | ||
hash: md5 | ||
path: text |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
|
||
class DocParserBase(ABC):
    """Abstract interface for parsers that pull structured data out of a document.

    Concrete parsers implement :attr:`schema` and :meth:`parse`; instances are
    callable, and calling one is the same as calling :meth:`parse`.
    """

    @property
    @abstractmethod
    def schema(self) -> list[str]:
        """List of strings describing the parser output (defined by subclasses)."""

    @abstractmethod
    def parse(self, document: str) -> dict[str, Any]:
        """Turn the raw ``document`` string into a dictionary of parsed values."""

    def __call__(self, document: str) -> dict[str, Any]:
        """Delegate to :meth:`parse` so a parser instance can be used as a function."""
        parsed = self.parse(document)
        return parsed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import re | ||
from typing import Any, Generator | ||
from xml.etree import ElementTree | ||
from xml.etree.ElementTree import Element | ||
|
||
from juddges.preprocessing.parser_base import DocParserBase | ||
|
||
MULTIPLE_NEWLINES = re.compile(r"(\n\s*)+\n+") | ||
|
||
|
||
class SimplePlJudgementsParser(DocParserBase):
    """The simplest parser for the simple XML format used by the Polish courts.

    It reads three attributes from the root element and flattens the text of
    the single ``xBlock`` child, without adhering to any specific structure.
    """

    @property
    def schema(self) -> list[str]:
        """Keys of the dictionary returned by :meth:`parse`."""
        return ["num_pages", "vol_number", "vol_type", "text"]

    def parse(self, document: str) -> dict[str, Any]:
        """Parse one judgement given as an XML string.

        Args:
            document: XML source of the judgement.

        Returns:
            Dictionary with the keys listed in :attr:`schema`: ``num_pages`` and
            ``vol_number`` as ``int``, ``vol_type`` as the raw attribute value,
            and ``text`` as produced by :meth:`extract_text`.

        Raises:
            AssertionError: If the root does not have exactly one direct
                ``xBlock`` child.
            KeyError: If the root lacks ``xToPage``, ``xVolNmbr`` or ``xVolType``.
            ValueError: If ``xToPage`` or ``xVolNmbr`` is not an integer literal.
        """
        et = ElementTree.fromstring(document)

        xblock_elements = et.findall("xBlock")
        if len(xblock_elements) != 1:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError("There should be only one xBlock element")
        (content_root,) = xblock_elements

        return {
            "num_pages": int(et.attrib["xToPage"]),
            "vol_number": int(et.attrib["xVolNmbr"]),
            "vol_type": et.attrib["xVolType"],
            "text": self.extract_text(content_root),
        }

    @staticmethod
    def extract_text(element: Element) -> str:
        """Return the text under ``element`` as one normalised string.

        Every text fragment is stripped of leading/trailing spaces (only the
        space character; newlines are kept), the fragments are concatenated
        without a separator, runs matching ``MULTIPLE_NEWLINES`` are collapsed
        to a single blank line, and the result is stripped.
        """
        # NOTE(review): this iterates with the stock ``Element.itertext``; the
        # module-level ``itertext`` helper is not called here - confirm that
        # this is intended.
        fragments = (
            fragment.strip(" ")
            for fragment in element.itertext()
            if fragment is not None
        )
        # Fragments that were only spaces become "" and vanish in the join.
        text = "".join(fragments)

        return MULTIPLE_NEWLINES.sub("\n\n", text).strip()
|
||
|
||
def itertext(element: Element, prefix: str = "") -> Generator[str, None, None]: | ||
"""Extension of the Element.itertext method to handle special tags in pl court XML.""" | ||
tag = element.tag | ||
if not isinstance(tag, str) and tag is not None: | ||
return | ||
|
||
t: str | None | ||
match (tag, element.attrib): | ||
case ("xName", {"xSffx": suffix}): | ||
element.tail = element.tail.strip() if element.tail else None | ||
t = f"{element.text}{suffix} " | ||
case ("xEnum", _): | ||
bullet_elem = element.find("xBullet") | ||
if bullet_elem: | ||
prefix = bullet_elem.text or "" | ||
element.remove(bullet_elem) | ||
t = "" | ||
case ("xEnumElem", _): | ||
t = prefix | ||
case _: | ||
t = element.text | ||
|
||
if t: | ||
yield t | ||
|
||
for e in element: | ||
yield from itertext(e, prefix) | ||
t = e.tail | ||
|
||
if t: | ||
yield t |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "initial_id", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-03-15T11:07:39.123324510Z", | ||
"start_time": "2024-03-15T11:07:39.065139618Z" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import polars as pl" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c8a2c7d4858169a2", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-03-15T11:12:01.007272795Z", | ||
"start_time": "2024-03-15T11:11:47.709404815Z" | ||
}, | ||
"collapsed": false, | ||
"jupyter": { | ||
"outputs_hidden": false | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"ds = pl.read_parquet(\"../data/datasets/pl/raw\", use_pyarrow=True)\n", | ||
"\n", | ||
"dt_fmt = \"%Y-%m-%d %H:%M:%S%.f %Z\"\n", | ||
"dt_unit = \"ms\" # due to https://github.com/pola-rs/polars/issues/13592\n", | ||
"\n", | ||
"ds = ds.with_columns(\n", | ||
" ds[\"date\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n", | ||
" ds[\"publicationDate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n", | ||
" ds[\"lastUpdate\"].str.to_datetime(format=dt_fmt, time_unit=dt_unit),\n", | ||
" ds[\"courtId\"].cast(pl.Int32),\n", | ||
" ds[\"departmentId\"].cast(pl.Int32),\n", | ||
" ds[\"type\"].cast(pl.Categorical),\n", | ||
")\n", | ||
"\n", | ||
"ds.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "35e65fe2dd9a4bce", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-03-15T11:12:01.074286418Z", | ||
"start_time": "2024-03-15T11:12:00.912825131Z" | ||
}, | ||
"collapsed": false, | ||
"jupyter": { | ||
"outputs_hidden": false | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"ds.describe()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ab23ff37327a377a", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-03-15T11:15:27.800725934Z", | ||
"start_time": "2024-03-15T11:15:27.753971240Z" | ||
}, | ||
"collapsed": false, | ||
"jupyter": { | ||
"outputs_hidden": false | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"ds[\"type\"].value_counts()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "11446c299cdf1700", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-03-15T11:15:38.059155473Z", | ||
"start_time": "2024-03-15T11:15:38.053450756Z" | ||
}, | ||
"collapsed": false, | ||
"jupyter": { | ||
"outputs_hidden": false | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"print(f\"Missing content: {ds['content'].null_count() / len(ds)}\")\n", | ||
"print(f\"Missing thesis: {ds['thesis'].null_count() / len(ds)}\")\n", | ||
"print(f\"Missing excerpt: {ds['excerpt'].null_count() / len(ds)}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "891ffbad", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ds[\"excerpt\"].str.strip_chars().str.len_chars().to_pandas().plot.hist(\n", | ||
" bins=50, log=True, title=\"Excerpt #chars distribution\"\n", | ||
")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.