From b1af4636d3d1c1b727cafa13f3adba9b16bdcb52 Mon Sep 17 00:00:00 2001 From: Albert Sawczyn Date: Fri, 30 Aug 2024 17:08:53 +0200 Subject: [PATCH] feat: add instruct readme --- data/datasets/en/readme/.gitignore | 2 + dvc.lock | 42 +- dvc.yaml | 28 +- .../02_Dataset_Description_Instruct.ipynb | 14 +- ...ataset_Description_en_court_instruct.ipynb | 381 ++++++++++++++++++ scripts/dataset/push_instruct_readme.py | 39 +- 6 files changed, 466 insertions(+), 40 deletions(-) create mode 100644 data/datasets/en/readme/.gitignore create mode 100644 nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb diff --git a/data/datasets/en/readme/.gitignore b/data/datasets/en/readme/.gitignore new file mode 100644 index 0000000..47b637c --- /dev/null +++ b/data/datasets/en/readme/.gitignore @@ -0,0 +1,2 @@ +/raw +/instruct diff --git a/dvc.lock b/dvc.lock index 0dd0e21..3042332 100644 --- a/dvc.lock +++ b/dvc.lock @@ -169,20 +169,6 @@ stages: hash: md5 md5: 3b3589929112cb2f199044d240e87bcc size: 305 - instruct_dataset_readme: - cmd: jupyter nbconvert --no-input --to markdown --execute nbs/Data/03_Dataset_Description_Instruct.ipynb - --output-dir data/datasets/pl/readme/instruct --output README - deps: - - path: nbs/Data/03_Dataset_Description_Instruct.ipynb - hash: md5 - md5: 27e6d517445028d45e5c40b22febece4 - size: 16215 - outs: - - path: data/datasets/pl/readme/instruct/ - hash: md5 - md5: de02794df3d74d86f8610f040a17dcbe.dir - size: 144326 - nfiles: 5 predict@Unsloth-Llama-3-8B-Instruct: cmd: PYTHONPATH=. 
python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct deps: @@ -4947,3 +4933,31 @@ stages: hash: md5 md5: 4edc8fe239f53890d71291f61b6cc96c size: 486 + instruct_dataset_readme@1: + cmd: jupyter nbconvert --no-input --to markdown --execute "nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb" + --output-dir data/datasets/en/readme/instruct --output README + deps: + - path: nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb + hash: md5 + md5: da1eb2132a552e2de52e42fc827ecef8 + size: 14220 + outs: + - path: data/datasets/en/readme/instruct + hash: md5 + md5: dd812ae1518c3c934945b916aa9e5bfc.dir + size: 67727 + nfiles: 3 + instruct_dataset_readme@0: + cmd: jupyter nbconvert --no-input --to markdown --execute "nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb" + --output-dir data/datasets/pl/readme/instruct --output README + deps: + - path: nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb + hash: md5 + md5: 1212931a887a1e4ab1939234554b4795 + size: 16026 + outs: + - path: data/datasets/pl/readme/instruct + hash: md5 + md5: 5a7c4f784f826e2214d39decfd1c4625.dir + size: 206279 + nfiles: 7 diff --git a/dvc.yaml b/dvc.yaml index d02b182..8dc3918 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -17,17 +17,23 @@ stages: - data/datasets/pl/readme/raw/ instruct_dataset_readme: - cmd: >- - jupyter nbconvert - --no-input - --to markdown - --execute nbs/Data/03_Dataset_Description_Instruct.ipynb - --output-dir data/datasets/pl/readme/instruct - --output README - deps: - - nbs/Data/03_Dataset_Description_Instruct.ipynb - outs: - - data/datasets/pl/readme/instruct/ + foreach: + - notebook: "nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb" + output_dir: data/datasets/pl/readme/instruct + - notebook: "nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb" + output_dir: data/datasets/en/readme/instruct + do: + cmd: >- + jupyter nbconvert + --no-input + --to markdown + --execute "${item.notebook}" + --output-dir ${item.output_dir} 
+ --output README + deps: + - ${item.notebook} + outs: + - ${item.output_dir} build_instruct_dataset_pl: cmd: >- diff --git a/nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb b/nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb index 2f3fa11..82cf0a8 100644 --- a/nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb +++ b/nbs/Dataset Cards/02_Dataset_Description_Instruct.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "---\n", + "{{ card_data }}\n", + "---" + ], + "id": "ba43c85ce14e7af6" + }, { "cell_type": "code", "execution_count": null, @@ -74,7 +84,7 @@ "## Dataset Description\n", "\n", "* **Homepage: TBA**\n", - "* **Repository: [github](https://github.com/pwr-ai/JuDDGES)**\n", + "* **Repository: https://github.com/pwr-ai/JuDDGES**\n", "* **Paper: TBA**\n", "* **Point of Contact: lukasz.augustyniak@pwr.edu.pl; jakub.binkowski@pwr.edu.pl; albert.sawczyn@pwr.edu.pl**\n", "\n", @@ -258,7 +268,7 @@ "\n", "### Licensing Information\n", "\n", - "[More Information Needed]\n", + "We license the actual packaging of these data under Attribution 4.0 International (CC BY 4.0) https://creativecommons.org/licenses/by/4.0/\n", "\n", "### Citation Information\n", "\n", diff --git a/nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb b/nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb new file mode 100644 index 0000000..b5c0b76 --- /dev/null +++ b/nbs/Dataset Cards/05_Dataset_Description_en_court_instruct.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "---\n", + "{{ card_data }}\n", + "---" + ], + "id": "c0dfa673eb39fed7" + }, + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "source": [ + "import datasets\n", + "import transformers\n", + "import warnings\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as ticker\n", + "import pandas as pd\n", + "import polars as 
pl\n", + "import seaborn as sns\n", + "import yaml\n", + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer\n", + "from IPython.display import display\n", + "\n", + "\n", + "warnings.filterwarnings('ignore')\n", + "sns.set_theme(\"notebook\")\n", + "transformers.logging.set_verbosity_error()\n", + "datasets.logging.set_verbosity_error()\n", + "datasets.utils.disable_progress_bars()" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "ds = load_dataset(\"JuDDGES/en-court-instruct\") ", + "id": "a433d1c1541dbba3", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Dataset Card for [JuDDGES/en-court-instruct](https://huggingface.co/datasets/JuDDGES/en-court-instruct)\n", + "\n", + "## Table of Contents\n", + "- [Table of Contents](#table-of-contents)\n", + "- [Dataset Description](#dataset-description)\n", + " - [Dataset Summary](#dataset-summary)\n", + " - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)\n", + " - [Languages](#languages)\n", + "- [Dataset Structure](#dataset-structure)\n", + " - [Data Instances](#data-instances)\n", + " - [Data Fields](#data-fields)\n", + " - [Data Splits](#data-splits)\n", + "- [Dataset Creation](#dataset-creation)\n", + " - [Curation Rationale](#curation-rationale)\n", + " - [Source Data](#source-data)\n", + " - [Annotations](#annotations)\n", + " - [Personal and Sensitive Information](#personal-and-sensitive-information)\n", + "- [Considerations for Using the Data](#considerations-for-using-the-data)\n", + " - [Social Impact of Dataset](#social-impact-of-dataset)\n", + " - [Discussion of Biases](#discussion-of-biases)\n", + " - [Other Known Limitations](#other-known-limitations)\n", + "- [Additional Information](#additional-information)\n", + " - [Dataset Curators](#dataset-curators)\n", + " - [Licensing Information](#licensing-information)\n", + " - 
[Citation Information](#citation-information)\n", + " - [Contributions](#contributions)\n", + "- [Statistics](#statistics)\n", + "\n", + "## Dataset Description\n", + "\n", + "* **Homepage: TBA**\n", + "* **Repository: https://github.com/pwr-ai/JuDDGES**\n", + "* **Paper: TBA**\n", + "* **Point of Contact: lukasz.augustyniak@pwr.edu.pl; jakub.binkowski@pwr.edu.pl; albert.sawczyn@pwr.edu.pl**\n", + "\n", + "### Dataset Summary\n", + "\n", + "The data was acquired from publicly available judgments from the Court of Appeal (Criminal Division) ([link](https://caselaw.nationalarchives.gov.uk/judgments/advanced_search?court=ewca/crim)) of England and Wales. These judgments are available in HTML format on the National Archives website for online reading. They can be downloaded as XML or PDF files under the crown copyright license ([link](https://www.judiciary.uk/copyright/)) and the Open Government license (see Appendix 6 in the paper). These licenses encourage the use and re-use of the information available under them freely and flexibly, with only a few conditions ([link](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/)).\n", + "\n", + "This dataset is designed for fine-tuning large language models (LLMs) for information extraction tasks and is formatted as instructions. For the raw dataset, see [`JuDDGES/en-court-raw`](https://huggingface.co/datasets/JuDDGES/en-court-raw).\n", + "\n", + "### Supported Tasks and Leaderboards\n", + "\n", + "* `information-extraction`: The dataset can be used for information extraction tasks.\n", + "* `text-generation`: The dataset can be used for text generation tasks, as the dataset is formatted as instructions.\n", + "\n", + "### Languages\n", + "\n", + "en-EN English \n", + "\n", + "## Dataset Structure\n", + "\n", + "### Data Instances\n", + "\n", + "
\n", + " Click to expand \n", + "\n", + "```json" + ], + "id": "303d51383a8ed440" + }, + { + "metadata": {}, + "cell_type": "code", + "source": "display(ds[\"train\"][0])", + "id": "5bdc791e40046a68", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "```\n", + "\n", + " \n", + "
\n", + "\n", + "### Data Fields\n", + "\n", + "\n", + "| Feature name | Feature description | Type |\n", + "|------------------|-------------------------------------------------------------------------------------------------------------------------------------------|------------|\n", + "| prompt | The prompt template provided for extracting information from the judgement. It contains placeholder `{context}` for the judgement content. | `string` |\n", + "| context | The full text content of the judgement | `string` |\n", + "| output | The extracted information in YAML format based on the provided context | `string` |\n", + "\n", + "\n", + "### Data Splits" + ], + "id": "b9733cc4fa3a4e89" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "data = []\n", + "for split in ds.keys():\n", + " data.append({\"split\": split, \"# samples\": len(ds[split])})\n", + "\n", + "df = pd.DataFrame(data)\n", + "df[\"% samples\"] = (df[\"# samples\"] / df[\"# samples\"].sum() * 100).round(2)\n", + "# print(df.to_markdown(index=False))" + ], + "id": "420de6e0dea88ac8", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "| split | # samples | % samples |\n", + "|:--------|------------:|------------:|\n", + "| train | 3365 | 62.72 |\n", + "| test | 2000 | 37.28 |" + ], + "id": "5d73998511dbab50" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## Dataset Creation\n", + "\n", + "For details on the dataset creation, see the paper [TBA]() and the code repository [here](https://github.com/pwr-ai/JuDDGES).\n", + "\n", + "### Curation Rationale\n", + "\n", + "Created to enable cross-jurisdictional legal analytics.\n", + "\n", + "### Source Data\n", + "\n", + "#### Initial Data Collection and Normalization\n", + "\n", + "1. Utilize the raw dataset [`JuDDGES/en-court-raw`](https://huggingface.co/datasets/JuDDGES/en-court-raw).\n", + "1. 
First, we identified information from the metadata that is contained in the text of the judgement. Therefore, the following fields were selected for extraction as targets:\n", + " * `citation`\n", + " * `date`\n", + " * `judges`\n", + "1. **Data filtering**: In order to ensure high quality of the dataset, we performed a preprocessing procedure, as described below.\n", + " 1. Removal of bad judgements - if the judgement text is too short, which we found to correspond to missing content, the judgement is removed. \n", + " 1. Dataset extraction - extracting *date* from judgement content with a regular expression. \n", + " 1. Removing examples wherein attributes are not a substring of judgment text - due to inherent errors in acquired data, some attribute values might be mistyped; hence, we filter them out. \n", + " (Data cleaning removes 685 examples, and the instruction dataset finally consists of 5365 examples.)\n", + "1. **Generating instructions**: After cleaning we generate instructions for information extraction. Specifically, we define the same prompt for each document, as follows:\n", + "\n", + " ```text\n", + " You are extracting information from the court judgments.\n", + " Extract specified values strictly from the provided judgement. If information is not provided in the judgement, leave the field with null value.\n", + " Please return the response in the identical YAML format:\n", + " '''yaml\n", + " citation: \n", + " date: \n", + " judges: \n", + " '''\n", + " =====\n", + " {context}\n", + " ======\n", + " ```\n", + " where `{context}` is replaced by the text of each judgement.\n", + "\n", + "\n", + "#### Who are the source language producers?\n", + "\n", + "Produced by human legal professionals (judges, court clerks). Demographics were not analysed. Sourced from public court databases.\n", + "\n", + "### Annotations\n", + "\n", + "#### Annotation process\n", + "\n", + "No annotation was performed by us. 
All features were provided.\n", + "\n", + "#### Who are the annotators?\n", + "\n", + "As above.\n", + "\n", + "### Personal and Sensitive Information\n", + "\n", + "Data comply with GDPR and The Data Protection Act 2018 (DPA). (See more in Section 3.4 in the paper.)\n", + "\n", + "## Considerations for Using the Data\n", + "\n", + "### Social Impact of Dataset\n", + "\n", + "[More Information Needed]\n", + "\n", + "### Discussion of Biases\n", + "\n", + "[More Information Needed]\n", + "\n", + "### Other Known Limitations\n", + "\n", + "[More Information Needed]\n", + "\n", + "## Additional Information\n", + "\n", + "### Dataset Curators\n", + "\n", + "[More Information Needed]\n", + "\n", + "### Licensing Information\n", + "\n", + "We license the actual packaging of these data under Open Government Licence https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/\n", + "\n", + "### Citation Information\n", + "\n", + "[More Information Needed]" + ], + "id": "8be5f759f6f160be" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Statistics", + "id": "d7b25ff1687b865b" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "data = yaml.safe_load(ds[\"train\"][\"output\"][0].replace(\"```yaml\", \"\").replace(\"```\", \"\"))\n", + "data[\"date\"] = pd.to_datetime(data[\"date\"])" + ], + "id": "e2293cc34f189526", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "def parse_output(output: str) -> dict:\n", + " data = yaml.safe_load(output.replace(\"```yaml\", \"\").replace(\"```\", \"\"))\n", + " data[\"date\"] = pd.to_datetime(data[\"date\"])\n", + " return data\n", + "\n", + "ds = ds.map(parse_output, input_columns=\"output\", num_proc=20)" + ], + "id": "11f8d704ca49f4cd", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "en_ds = pl.concat([pl.from_arrow(ds[\"train\"].data.table), 
pl.from_arrow(ds[\"test\"].data.table)])\n", + "en_ds = en_ds.with_columns(pl.Series(name=\"subset\", values=[\"train\"] * len(ds[\"train\"]) + [\"test\"] * len(ds[\"test\"]))) " + ], + "id": "ed074d9bb142d194", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "num_judges = en_ds.with_columns([pl.col(\"judges\").list.len().alias(\"num_judges\")]).select([\"subset\", \"num_judges\"]).to_pandas()\n", + "ax = sns.histplot(data=num_judges, x=\"num_judges\", hue=\"subset\", bins=num_judges[\"num_judges\"].nunique(), stat=\"percent\", common_norm=False)\n", + "ax.set(xlabel=\"#Judges per judgement\", ylabel=\"%\", title=\"#Judges per single judgement\")\n", + "plt.show()" + ], + "id": "3345ba747a9ac0d9", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B\")\n", + "\n", + "def tokenize(batch: dict[str, list]) -> list[int]: \n", + " tokenized = tokenizer(batch[\"context\"], add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False, return_length=True)\n", + " return {\"length\": tokenized[\"length\"]}\n", + "\n", + "ds = ds.map(tokenize, batched=True, batch_size=16, remove_columns=[\"context\"], num_proc=20)" + ], + "id": "2475982aee74523d", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "context_len_train = ds[\"train\"].to_pandas()\n", + "context_len_train[\"subset\"] = \"train\"\n", + "context_len_test = ds[\"test\"].to_pandas()\n", + "context_len_test[\"subset\"] = \"test\"\n", + "context_len = pd.concat([context_len_train, context_len_test])\n", + "\n", + "ax = sns.histplot(data=context_len, x=\"length\", bins=50, hue=\"subset\")\n", + "ax.set(xlabel=\"#Tokens\", ylabel=\"Count\", title=\"#Tokens distribution in context (llama-3 tokenizer)\", yscale=\"log\")\n", + 
"ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))\n", + "plt.show()" + ], + "id": "6a2dddfeb0e48da4", + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/dataset/push_instruct_readme.py b/scripts/dataset/push_instruct_readme.py index a5d0988..c107887 100644 --- a/scripts/dataset/push_instruct_readme.py +++ b/scripts/dataset/push_instruct_readme.py @@ -9,36 +9,49 @@ DATASET_CARD_TEMPLATE_DIR = Path("data/datasets/pl/readme/instruct") +PL_CARD_DATA = DatasetCardData( + language="pl", + multilinguality="monolingual", + size_categories="100K None: assert any(file.name == "README.md" for file in dataset_card_template_dir.iterdir()) assert any(file.name == "README_files" for file in dataset_card_template_dir.iterdir()) - card_data = DatasetCardData( - language="pl", - multilinguality="monolingual", - size_categories="100K