From 2e06a6a14c905658196d1848f2f3b4278673c142 Mon Sep 17 00:00:00 2001 From: Albert Sawczyn Date: Mon, 10 Jun 2024 15:38:41 +0200 Subject: [PATCH] refactor: clean notebooks --- dvc.lock | 36 ++--- nbs/Data/02_Dataset_Description_Raw.ipynb | 142 ++++++----------- .../03_Dataset_Description_Instruct.ipynb | 148 +++++++----------- 3 files changed, 118 insertions(+), 208 deletions(-) diff --git a/dvc.lock b/dvc.lock index 575abeb..7a3675d 100644 --- a/dvc.lock +++ b/dvc.lock @@ -79,7 +79,7 @@ stages: deps: - hash: md5 md5: df2f1d464152f87737c8ebb5b0673854 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json size: 2179383 - hash: md5 @@ -89,7 +89,7 @@ stages: outs: - hash: md5 md5: 521a731cc2c45d3eda0656a8e69d505b - path: + path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json size: 307 evaluate@Unsloth-Llama-3-8B-Instruct-fine-tuned: @@ -98,7 +98,7 @@ stages: deps: - hash: md5 md5: 9199da7e04fb35cc1ce2bbe9dd5cd274 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json size: 1891254 - hash: md5 @@ -108,7 +108,7 @@ stages: outs: - hash: md5 md5: 6a0eb30a14687342bc86ae80253cd60c - path: + path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct-fine-tuned.json size: 306 evaluate@Unsloth-Mistral-7B-Instruct-v0.3: @@ -117,7 +117,7 @@ stages: deps: - hash: md5 md5: c2e03f3fbd29c744023bdac7e1007265 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json size: 2007040 - hash: md5 @@ -127,7 +127,7 @@ stages: outs: - hash: md5 md5: 091b8888275600052dd2dcdd36a55588 - path: + path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3.json size: 305 evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: @@ -136,7 +136,7 @@ stages: deps: - hash: md5 md5: a4fda5774b367e8924cf07f3bf271922 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json size: 1834778 - hash: md5 @@ -146,7 +146,7 @@ stages: outs: - hash: md5 md5: 3b3589929112cb2f199044d240e87bcc - path: + path: data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json size: 305 predict@Bielik-7B-Instruct-v0.1: @@ -227,7 +227,7 @@ stages: outs: - hash: md5 md5: adf03a2b51a7a9cd4431c884a89f6497 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Mistral-7B-Instruct-v0.2-fine-tuned.json size: 1843278 predict@Unsloth-Llama-3-8B-Instruct: @@ -248,7 +248,7 @@ stages: outs: - hash: md5 md5: df2f1d464152f87737c8ebb5b0673854 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json size: 2179383 predict@Unsloth-Llama-3-8B-Instruct-fine-tuned: @@ -269,7 +269,7 @@ stages: outs: - hash: md5 md5: 9199da7e04fb35cc1ce2bbe9dd5cd274 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json size: 1891254 predict@Unsloth-Mistral-7B-Instruct-v0.3: @@ -290,7 +290,7 @@ stages: outs: - hash: md5 md5: c2e03f3fbd29c744023bdac7e1007265 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json size: 2007040 predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: @@ -311,7 +311,7 @@ stages: outs: - hash: md5 md5: a4fda5774b367e8924cf07f3bf271922 - path: + path: data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json size: 1834778 sft@Mistral-7B-Instruct-v0.2: @@ -436,8 +436,8 @@ stages: nfiles: 17 - path: nbs/Data/02_Dataset_Description_Raw.ipynb hash: md5 - md5: 520ae4cd67c4e72e97301a496383adf4 - size: 74776 + md5: d3d7509d084b85676857e13a2f20b82a + size: 73872 outs: - path: data/datasets/pl/readme/raw/ hash: md5 @@ -450,11 +450,11 @@ stages: deps: - path: nbs/Data/03_Dataset_Description_Instruct.ipynb hash: md5 - md5: c403ede420e9c30b07920bc528bf8c7e - size: 16852 + md5: 27e6d517445028d45e5c40b22febece4 + size: 16215 outs: - path: data/datasets/pl/readme/instruct/ hash: md5 md5: de02794df3d74d86f8610f040a17dcbe.dir size: 144326 - nfiles: 5 \ No newline at end of file + nfiles: 5 diff --git a/nbs/Data/02_Dataset_Description_Raw.ipynb b/nbs/Data/02_Dataset_Description_Raw.ipynb index ae4bb2e..8cba868 100644 --- a/nbs/Data/02_Dataset_Description_Raw.ipynb +++ b/nbs/Data/02_Dataset_Description_Raw.ipynb @@ -2,14 +2,10 @@ "cells": [ { "cell_type": "code", + "execution_count": null, "id": "initial_id", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "import warnings\n", "\n", @@ -27,29 +23,22 @@ "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()\n", "datasets.utils.disable_progress_bars()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "3105d222", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "raw_ds = pl.scan_parquet(source=\"../../data/datasets/pl/raw/*\")" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "bac42f58ea3c3d96", + "metadata": {}, "source": [ "---\n", "language: {{language}}\n", @@ -60,8 +49,7 @@ "pretty_name: {{pretty_name}}\n", "tags: {{tags}}\n", "---" - ], - "id": "bac42f58ea3c3d96" + ] }, { "cell_type": "markdown", @@ -292,21 +280,23 @@ }, { "cell_type": "code", + "execution_count": null, "id": "5c2f63ac", "metadata": {}, + "outputs": [], "source": [ "court_distribution = raw_ds.drop_nulls(subset=\"court_name\").select(\"court_name\").group_by(\"court_name\").len().sort(\"len\", descending=True).collect().to_pandas()\n", "ax = sns.histplot(data=court_distribution, x=\"len\", log_scale=True, kde=True)\n", "ax.set(title=\"Distribution of judgments per court\", xlabel=\"#Judgements in single court\", ylabel=\"Count\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "12acf455", "metadata": {}, + "outputs": [], "source": [ "judgements_per_year = raw_ds.select(\"date\").collect()[\"date\"].str.split(\" \").list.get(0).str.to_date().dt.year().value_counts().sort(\"date\").to_pandas()\n", "judgements_per_year = judgements_per_year[judgements_per_year[\"date\"] < 2024]\n", @@ -316,14 +306,14 @@ "ax.set(xlabel=\"Year\", ylabel=\"Number of Judgements\", title=\"Yearly Number of Judgements\", yscale=\"log\")\n", "plt.xticks(rotation=90)\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "3df2d2fa", "metadata": {}, + "outputs": [], "source": [ "types = raw_ds.fill_null(value=\"\").select(\"type\").group_by(\"type\").len().sort(\"len\", descending=True).collect().to_pandas()\n", "\n", @@ -331,69 +321,51 @@ "ax = sns.barplot(data=types, x=\"len\", y=\"type\", errorbar=None, ax=ax)\n", "ax.set(xlabel=\"Count\", ylabel=\"Type\", title=\"Judgement types cardinality\", xscale=\"log\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "e0801346", "metadata": {}, + "outputs": [], "source": [ "num_judges = raw_ds.with_columns([pl.col(\"judges\").list.len().alias(\"num_judges\")]).select(\"num_judges\").sort(\"num_judges\").collect().to_pandas()\n", "ax = sns.histplot(data=num_judges, x=\"num_judges\", bins=num_judges[\"num_judges\"].nunique())\n", "ax.set(xlabel=\"#Judges per judgement\", ylabel=\"Count\", yscale=\"log\", title=\"#Judges per single judgement\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "758f41b7", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "num_lb = raw_ds.with_columns([pl.col(\"legalBases\").list.len().alias(\"num_lb\")]).select(\"num_lb\").sort(\"num_lb\").collect().to_pandas()\n", "ax = sns.histplot(data=num_lb, x=\"num_lb\", bins=num_lb[\"num_lb\"].nunique())\n", "ax.set(xlabel=\"#Legal bases\", ylabel=\"Count\", yscale=\"log\", title=\"#Legal bases per judgement\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "b1f2f3de", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "raw_text_ds = load_dataset(\"parquet\", data_dir=\"../../data/datasets/pl/raw/\", columns=[\"_id\", \"text\"])\n", "raw_text_ds = raw_text_ds.filter(lambda x: x[\"text\"] is not None)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "030652c5", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B\")\n", "\n", @@ -402,20 +374,14 @@ " return {\"length\": tokenized[\"length\"]}\n", "\n", "raw_text_ds = raw_text_ds.map(tokenize, batched=True, batch_size=16, remove_columns=[\"text\"], num_proc=20)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "b8f46bd1", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "judgement_len = raw_text_ds[\"train\"].to_pandas()\n", "\n", @@ -423,20 +389,14 @@ "ax.set(xlabel=\"#Tokens\", ylabel=\"Count\", title=\"#Tokens distribution in judgements (llama-3 tokenizer)\", yscale=\"log\")\n", "ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "4b180955", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "per_type_tokens = raw_ds.fill_null(value=\"\").select([\"_id\", \"type\"]).collect().to_pandas().set_index(\"_id\").join(judgement_len.set_index(\"_id\"))\n", "\n", @@ -444,28 +404,14 @@ "ax = sns.boxenplot(data=per_type_tokens, y=\"type\", x=\"length\")\n", "ax.set(xscale=\"log\", title=\"Judgement token count per type\", xlabel=\"#Tokens\", ylabel=\"Type\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" } }, "nbformat": 4, diff --git a/nbs/Data/03_Dataset_Description_Instruct.ipynb b/nbs/Data/03_Dataset_Description_Instruct.ipynb index 82c1da7..998b302 100644 --- a/nbs/Data/03_Dataset_Description_Instruct.ipynb +++ b/nbs/Data/03_Dataset_Description_Instruct.ipynb @@ -2,14 +2,10 @@ "cells": [ { "cell_type": "code", + "execution_count": null, "id": "initial_id", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "import datasets\n", "import transformers\n", @@ -30,25 +26,17 @@ "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()\n", "datasets.utils.disable_progress_bars()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "3105d222", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "ds = load_dataset(\"JuDDGES/pl-court-instruct\") " - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -132,11 +120,11 @@ }, { "cell_type": "code", + "execution_count": null, "id": "3f161970acf83cfa", "metadata": {}, - "source": "display(ds[\"train\"][0])", "outputs": [], - "execution_count": null + "source": "display(ds[\"train\"][0])" }, { "cell_type": "markdown", @@ -164,8 +152,11 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "ee96bab3205ad17a", + "metadata": {}, + "outputs": [], "source": [ "data = []\n", "for split in ds.keys():\n", @@ -174,25 +165,23 @@ "df = pd.DataFrame(data)\n", "df[\"% samples\"] = (df[\"# samples\"] / df[\"# samples\"].sum() * 100).round(2)\n", "# print(df.to_markdown(index=False))" - ], - "id": "ee96bab3205ad17a", - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "4ee99a119109fc75", + "metadata": {}, "source": [ "| split | # samples | % samples |\n", "|:--------|------------:|------------:|\n", "| train | 238851 | 99.17 |\n", "| test | 2000 | 0.83 |" - ], - "id": "4ee99a119109fc75" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "970a616415592b60", + "metadata": {}, "source": [ "\n", "\n", @@ -289,8 +278,7 @@ "### Citation Information\n", "\n", "[More Information Needed]" - ], - "id": "970a616415592b60" + ] }, { "cell_type": "markdown", @@ -301,19 +289,22 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "bd1df108f7be20e5", + "metadata": {}, + "outputs": [], "source": [ "data = yaml.safe_load(ds[\"train\"][\"output\"][0].replace(\"```yaml\", \"\").replace(\"```\", \"\"))\n", "data[\"date\"] = pd.to_datetime(data[\"date\"])" - ], - "id": "bd1df108f7be20e5", - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "d29a063bc04e4df5", + "metadata": {}, + "outputs": [], "source": [ "def parse_output(output: str) -> dict:\n", " data = yaml.safe_load(output.replace(\"```yaml\", \"\").replace(\"```\", \"\"))\n", @@ -321,38 +312,38 @@ " return data\n", "\n", "ds = ds.map(parse_output, input_columns=\"output\", num_proc=20)" - ], - "id": "d29a063bc04e4df5", - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "cd31a01d116567", + "metadata": {}, + "outputs": [], "source": [ "pl_ds = pl.concat([pl.from_arrow(ds[\"train\"].data.table), pl.from_arrow(ds[\"test\"].data.table)])\n", "pl_ds = pl_ds.with_columns(pl.Series(name=\"subset\", values=[\"train\"] * len(ds[\"train\"]) + [\"test\"] * len(ds[\"test\"]))) " - ], - "id": "cd31a01d116567", - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "5c2f63ac", "metadata": {}, + "outputs": [], "source": [ "court_distribution = pl_ds.select([\"subset\", \"court_name\"]).group_by([\"subset\", \"court_name\"]).len().sort(\"len\", descending=True).to_pandas()\n", "ax = sns.histplot(data=court_distribution, x=\"len\", hue=\"subset\", log_scale=True, kde=True, stat=\"percent\", common_norm=False )\n", "ax.set(title=\"Distribution of judgments per court\", xlabel=\"#Judgements in single court\", ylabel=\"percent\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "4201a1725cbbca26", + "metadata": {}, + "outputs": [], "source": [ "judgements_per_year = pl_ds.select([\"subset\", \"date\"])[[\"subset\", \"date\"]]\n", "judgements_per_year = judgements_per_year.with_columns(judgements_per_year[\"date\"].dt.year()) \n", @@ -365,34 +356,27 @@ "ax.set(xlabel=\"Year\", ylabel=\"% Judgements\", title=\"Yearly Number of Judgements\", yscale=\"log\")\n", "plt.xticks(rotation=90)\n", "plt.show()" - ], - "id": "4201a1725cbbca26", - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "e0801346", "metadata": {}, + "outputs": [], "source": [ "num_judges = pl_ds.with_columns([pl.col(\"judges\").list.len().alias(\"num_judges\")]).select([\"subset\", \"num_judges\"]).to_pandas()\n", "ax = sns.histplot(data=num_judges, x=\"num_judges\", hue=\"subset\", bins=num_judges[\"num_judges\"].nunique(), stat=\"percent\", common_norm=False)\n", "ax.set(xlabel=\"#Judges per judgement\", ylabel=\"%\", title=\"#Judges per single judgement\")\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "030652c5", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B\")\n", "\n", @@ -401,20 +385,14 @@ " return {\"length\": tokenized[\"length\"]}\n", "\n", "ds = ds.map(tokenize, batched=True, batch_size=16, remove_columns=[\"context\"], num_proc=20)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "b8f46bd1", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, + "outputs": [], "source": [ "context_len_train = ds[\"train\"].to_pandas()\n", "context_len_train[\"subset\"] = \"train\"\n", @@ -426,28 +404,14 @@ "ax.set(xlabel=\"#Tokens\", ylabel=\"Count\", title=\"#Tokens distribution in context (llama-3 tokenizer)\", yscale=\"log\")\n", "ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))\n", "plt.show()" - ], - "outputs": [], - "execution_count": null + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" } }, "nbformat": 4,