From 18d4ecdd9cb59e21ebfc7561bee7215b1e822459 Mon Sep 17 00:00:00 2001 From: Jakub Binkowski Date: Mon, 3 Jun 2024 16:32:39 +0200 Subject: [PATCH] Finish sft experiments (#21) * Added notebook for inspecting SFT results * Add llama-unsloth to zero-shot eval * Reproduce prediction on 1500 step of Unsloth-Mistral * Add Bielik LLM --- configs/model/Bielik-7B-Instruct-v0.1.yaml | 8 + ...h-Mistral-7B-Instruct-v0.3-fine-tuned.yaml | 10 + .../.gitignore | 1 + .../predict/pl-court-instruct/.gitignore | 3 + .../metrics_Bielik-7B-Instruct-v0.1.json | 12 + .../metrics_Unsloth-Llama-3-8B-Instruct.json | 12 + ...h-Mistral-7B-Instruct-v0.3-fine-tuned.json | 12 + .../pl-court-instruct/metrics_summary.md | 16 +- dvc.lock | 141 +++++++- dvc.yaml | 9 +- nbs/Data/02_Analyse_sft.ipynb | 312 ++++++++++++++++++ 11 files changed, 524 insertions(+), 12 deletions(-) create mode 100644 configs/model/Bielik-7B-Instruct-v0.1.yaml create mode 100644 configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml create mode 100644 data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json create mode 100644 data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json create mode 100644 nbs/Data/02_Analyse_sft.ipynb diff --git a/configs/model/Bielik-7B-Instruct-v0.1.yaml b/configs/model/Bielik-7B-Instruct-v0.1.yaml new file mode 100644 index 0000000..aa7a872 --- /dev/null +++ b/configs/model/Bielik-7B-Instruct-v0.1.yaml @@ -0,0 +1,8 @@ +name: speakleash/Bielik-7B-Instruct-v0.1 +tokenizer_name: ${.name} + +adapter_path: null + +max_seq_length: 4_000 +padding: longest +batch_size: 1 diff --git a/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml b/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml new file mode 100644 index 0000000..91c2c7a --- /dev/null +++ b/configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml @@ -0,0 +1,10 @@ +name: unsloth/mistral-7b-instruct-v0.3-bnb-4bit +tokenizer_name: ${.name} + +adapter_path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct/checkpoint-1500 + +max_seq_length: 20_000 +padding: longest +batch_size: 1 + +use_unsloth: true diff --git a/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore b/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore new file mode 100644 index 0000000..c5110ed --- /dev/null +++ b/data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/.gitignore @@ -0,0 +1 @@ +/pl-court-instruct diff --git a/data/experiments/predict/pl-court-instruct/.gitignore b/data/experiments/predict/pl-court-instruct/.gitignore index f2c04e6..283c20e 100644 --- a/data/experiments/predict/pl-court-instruct/.gitignore +++ b/data/experiments/predict/pl-court-instruct/.gitignore @@ -6,3 +6,6 @@ /outputs_Mistral-7B-Instruct-v0.2-fine-tuned.json /outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json /outputs_Unsloth-Mistral-7B-Instruct-v0.3.json +/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json +/outputs_Unsloth-Llama-3-8B-Instruct.json +/outputs_Bielik-7B-Instruct-v0.1.json diff --git a/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json b/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json new file mode 100644 index 0000000..e6d130c --- /dev/null +++ 
b/data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.2468319535255432, + "field_chrf": { + "court_name": 0.7368742823600769, + "date": 0.7829525470733643, + "department_name": 0.626532793045044, + "judges": 0.30981674790382385, + "legal_bases": 0.3045749366283417, + "recorder": 0.5168337821960449, + "signature": 0.4849330484867096 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json new file mode 100644 index 0000000..0e8eb94 --- /dev/null +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.4385761320590973, + "field_chrf": { + "court_name": 0.8789530396461487, + "date": 0.9822721481323242, + "department_name": 0.9057374000549316, + "judges": 0.9149863123893738, + "legal_bases": 0.42645466327667236, + "recorder": 0.7640316486358643, + "signature": 0.7549777626991272 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json new file mode 100644 index 0000000..e0a55ee --- /dev/null +++ b/data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json @@ -0,0 +1,12 @@ +{ + "full_text_chrf": 0.8193286061286926, + "field_chrf": { + "court_name": 0.9964265823364258, + "date": 0.9885857701301575, + "department_name": 0.9962303042411804, + "judges": 0.981475830078125, + "legal_bases": 0.7374544143676758, + "recorder": 0.9933416843414307, + "signature": 0.9780842661857605 + } +} \ No newline at end of file diff --git a/data/experiments/predict/pl-court-instruct/metrics_summary.md b/data/experiments/predict/pl-court-instruct/metrics_summary.md index 497a5be..b6e1f18 100644 --- a/data/experiments/predict/pl-court-instruct/metrics_summary.md +++ b/data/experiments/predict/pl-court-instruct/metrics_summary.md @@ -1,7 +1,9 @@ -| llm | full_text_chrf | court_name | date | department_name | judges | legal_bases | recorder | signature | -|:---------------------------------------|-----------------:|-------------:|-------:|------------------:|---------:|--------------:|-----------:|------------:| -| Meta-Llama-3-8B-Instruct | 0.247 | 0.862 | 0.971 | 0.833 | 0.882 | 0.287 | 0.805 | 0.778 | -| Mistral-7B-Instruct-v0.2 | 0.432 | 0.839 | 0.922 | 0.850 | 0.879 | 0.333 | 0.837 | 0.145 | -| Mistral-7B-Instruct-v0.2-fine-tuned | 0.772 | 0.987 | 0.990 | 0.965 | 0.952 | 0.600 | 0.979 | 0.972 | -| Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 | -| Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 | \ No newline at end of file +| llm | full_text_chrf | court_name | date | department_name | judges | legal_bases | recorder | signature | +|:--------------------------------------------|-----------------:|-------------:|-------:|------------------:|---------:|--------------:|-----------:|------------:| +| Meta-Llama-3-8B-Instruct | 0.247 | 0.862 | 0.971 | 0.833 | 0.882 | 0.287 | 0.805 | 0.778 | +| Mistral-7B-Instruct-v0.2 | 0.432 | 0.839 | 0.922 | 0.850 | 0.879 | 0.333 | 0.837 | 0.145 | +| Mistral-7B-Instruct-v0.2-fine-tuned | 0.772 | 0.987 | 0.990 | 0.965 | 0.952 | 0.600 | 0.979 | 0.972 | 
+| Unsloth-Llama-3-8B-Instruct | 0.439 | 0.879 | 0.982 | 0.906 | 0.915 | 0.426 | 0.764 | 0.755 | +| Unsloth-Llama-3-8B-Instruct-fine-tuned | 0.828 | 0.995 | 0.989 | 0.986 | 0.977 | 0.601 | 0.993 | 0.994 | +| Unsloth-Mistral-7B-Instruct-v0.3 | 0.477 | 0.830 | 0.987 | 0.900 | 0.870 | 0.419 | 0.943 | 0.567 | +| Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned | 0.819 | 0.996 | 0.989 | 0.996 | 0.981 | 0.737 | 0.993 | 0.978 | \ No newline at end of file diff --git a/dvc.lock b/dvc.lock index 848d8a3..1587886 100644 --- a/dvc.lock +++ b/dvc.lock @@ -320,8 +320,8 @@ stages: outs: - path: data/experiments/predict/pl-court-instruct/metrics_summary.md hash: md5 - md5: a72452f53099f61de9d653af1a596a3a - size: 1119 + md5: 80c3922982cb8a41468063481dbf695c + size: 1484 evaluate@Unsloth-Mistral-7B-Instruct-v0.3: cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3.json @@ -341,3 +341,140 @@ stages: hash: md5 md5: 091b8888275600052dd2dcdd36a55588 size: 305 + predict@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned + deps: + - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.yaml + hash: md5 + md5: 8e8b380ef9bc65715cb833ce104cda20 + size: 256 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: a4fda5774b367e8924cf07f3bf271922 + size: 1834778 + evaluate@Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned: + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + deps: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: a4fda5774b367e8924cf07f3bf271922 + size: 1834778 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: + data/experiments/predict/pl-court-instruct/metrics_Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned.json + hash: md5 + md5: 3b3589929112cb2f199044d240e87bcc + size: 305 + predict@Unsloth-Llama-3-8B-Instruct: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Unsloth-Llama-3-8B-Instruct + deps: + - path: configs/model/Unsloth-Llama-3-8B-Instruct.yaml + hash: md5 + md5: e97bb2e6bf39f75edea7714d6ba58b77 + size: 160 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: df2f1d464152f87737c8ebb5b0673854 + size: 2179383 + evaluate@Unsloth-Llama-3-8B-Instruct: + cmd: PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + deps: + - path: + data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: df2f1d464152f87737c8ebb5b0673854 + size: 2179383 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: + data/experiments/predict/pl-court-instruct/metrics_Unsloth-Llama-3-8B-Instruct.json + hash: md5 + md5: 521a731cc2c45d3eda0656a8e69d505b + size: 307 + predict@Bielik-7B-Instruct-v0.1: + cmd: PYTHONPATH=. python scripts/sft/predict.py model=Bielik-7B-Instruct-v0.1 + deps: + - path: configs/model/Bielik-7B-Instruct-v0.1.yaml + hash: md5 + md5: ea2309177451ac16db4c2c7a5b7aed3b + size: 140 + - path: configs/predict.yaml + hash: md5 + md5: e6b047cf62e612a32381d6221eb99b4e + size: 416 + - path: scripts/sft/predict.py + hash: md5 + md5: 69e4844a715c9c5c75e1127a06472ad4 + size: 3148 + outs: + - path: data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 58f1b7a5d06cca3989c8b373c5429162 + size: 2033178 + sft_unsloth@Unsloth-Mistral-7B-Instruct-v0.3: + cmd: PYTHONPATH=. python scripts/sft/fine_tune_unsloth.py model=Unsloth-Mistral-7B-Instruct-v0.3 + deps: + - path: configs/fine_tuning.yaml + hash: md5 + md5: 9cd6fd320530e1c8ded7d9c369b8a082 + size: 440 + - path: configs/model/Unsloth-Mistral-7B-Instruct-v0.3.yaml + hash: md5 + md5: 71dbbb0a8a2454c7c0210e2d1acd859d + size: 167 + - path: scripts/sft/fine_tune_unsloth.py + hash: md5 + md5: c8a06fdcb01188a621b5fc9cc579ea56 + size: 6904 + outs: + - path: data/experiments/fine-tune/Unsloth-Mistral-7B-Instruct-v0.3/pl-court-instruct + hash: md5 + md5: 914a39b11765124b6548bfa3f5ef64e1.dir + size: 4084044746 + nfiles: 192 + evaluate@Bielik-7B-Instruct-v0.1: + cmd: PYTHONPATH=. python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + deps: + - path: data/experiments/predict/pl-court-instruct/outputs_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 58f1b7a5d06cca3989c8b373c5429162 + size: 2033178 + - path: scripts/sft/evaluate.py + hash: md5 + md5: 5ee442a9a3525af7596bf24c3d724a1d + size: 570 + outs: + - path: data/experiments/predict/pl-court-instruct/metrics_Bielik-7B-Instruct-v0.1.json + hash: md5 + md5: 2d1b6a392152f2e022a33553265e141a + size: 306 diff --git a/dvc.yaml b/dvc.yaml index 43b34df..27960b4 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -58,8 +58,11 @@ stages: - Meta-Llama-3-8B-Instruct - Mistral-7B-Instruct-v0.2 - Mistral-7B-Instruct-v0.2-fine-tuned + - Bielik-7B-Instruct-v0.1 + - Unsloth-Llama-3-8B-Instruct - Unsloth-Llama-3-8B-Instruct-fine-tuned - Unsloth-Mistral-7B-Instruct-v0.3 + - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned cmd: >- PYTHONPATH=. python scripts/sft/predict.py model=${item.model} deps: @@ -72,11 +75,11 @@ stages: evaluate: matrix: model: - - Meta-Llama-3-8B-Instruct - - Mistral-7B-Instruct-v0.2 - - Mistral-7B-Instruct-v0.2-fine-tuned + - Unsloth-Llama-3-8B-Instruct - Unsloth-Llama-3-8B-Instruct-fine-tuned - Unsloth-Mistral-7B-Instruct-v0.3 + - Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned + - Bielik-7B-Instruct-v0.1 cmd: >- PYTHONPATH=. 
python scripts/sft/evaluate.py --output-file data/experiments/predict/pl-court-instruct/outputs_${item.model}.json diff --git a/nbs/Data/02_Analyse_sft.ipynb b/nbs/Data/02_Analyse_sft.ipynb new file mode 100644 index 0000000..5e77e17 --- /dev/null +++ b/nbs/Data/02_Analyse_sft.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import json\n", + "from multiprocessing import Pool\n", + "from statistics import mean\n", + "from typing import Any\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from tqdm.auto import tqdm\n", + "from ipywidgets import interact\n", + "\n", + "from juddges.utils.misc import parse_yaml\n", + "from juddges.metrics.info_extraction import evaluate_extraction\n", + "\n", + "pd.options.display.float_format = '{:,.3f}'.format\n", + "warnings.filterwarnings('ignore', message=\"To copy construct from a tensor, it is recommended to use\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
llmfull_text_chrfcourt_namedatedepartment_namejudgeslegal_basesrecordersignature
2Meta-Llama-3-8B-Instruct0.2470.8620.9710.8330.8820.2870.8050.778
0Mistral-7B-Instruct-v0.20.4320.8390.9220.8500.8790.3330.8370.145
3Mistral-7B-Instruct-v0.2-fine-tuned0.7720.9870.9900.9650.9520.6000.9790.972
4Unsloth-Llama-3-8B-Instruct-fine-tuned0.8280.9950.9890.9860.9770.6010.9930.994
1Unsloth-Mistral-7B-Instruct-v0.30.4770.8300.9870.9000.8700.4190.9430.567
5Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned0.7980.9950.9880.9860.9670.6080.9870.976
\n", + "
" + ], + "text/plain": [ + " llm full_text_chrf court_name \\\n", + "2 Meta-Llama-3-8B-Instruct 0.247 0.862 \n", + "0 Mistral-7B-Instruct-v0.2 0.432 0.839 \n", + "3 Mistral-7B-Instruct-v0.2-fine-tuned 0.772 0.987 \n", + "4 Unsloth-Llama-3-8B-Instruct-fine-tuned 0.828 0.995 \n", + "1 Unsloth-Mistral-7B-Instruct-v0.3 0.477 0.830 \n", + "5 Unsloth-Mistral-7B-Instruct-v0.3-fine-tuned 0.798 0.995 \n", + "\n", + " date department_name judges legal_bases recorder signature \n", + "2 0.971 0.833 0.882 0.287 0.805 0.778 \n", + "0 0.922 0.850 0.879 0.333 0.837 0.145 \n", + "3 0.990 0.965 0.952 0.600 0.979 0.972 \n", + "4 0.989 0.986 0.977 0.601 0.993 0.994 \n", + "1 0.987 0.900 0.870 0.419 0.943 0.567 \n", + "5 0.988 0.986 0.967 0.608 0.987 0.976 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = []\n", + "for f in Path(\"data/experiments/predict/pl-court-instruct\").glob(\"metrics_*.json\"):\n", + " model_name = f.stem.replace(\"metrics_\", \"\")\n", + " with f.open() as file:\n", + " m_res = json.load(file)\n", + " results.append(\n", + " {\"llm\": model_name}\n", + " | {\"full_text_chrf\": m_res[\"full_text_chrf\"]}\n", + " | m_res[\"field_chrf\"]\n", + " )\n", + "\n", + "pd.DataFrame(results).sort_values(\"llm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect results" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUTS_PATH = \"data/experiments/predict/pl-court-instruct/outputs_Unsloth-Llama-3-8B-Instruct-fine-tuned.json\"\n", + "\n", + "with open(OUTPUTS_PATH) as file:\n", + " data = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26e97e0494064714ab86ea2f0ea4b0b6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2000 [00:00 dict[str, Any]:\n", + " item[\"metrics\"] = evaluate_extraction([item])\n", + " item[\"metrics\"][\"mean_field\"] = mean(item[\"metrics\"][\"field_chrf\"].values())\n", + " item[\"gold\"] = parse_yaml(item[\"gold\"])\n", + " try:\n", + " item[\"answer\"] = parse_yaml(item[\"answer\"])\n", + " except:\n", + " item[\"answer\"] = None\n", + " return item\n", + "\n", + "num_invalid_answers = 0\n", + "results = []\n", + "with Pool(10) as pool:\n", + " for item in tqdm(pool.imap(eval_item, data), total=len(data)):\n", + " results.append(item)\n", + " if item[\"answer\"] is None:\n", + " num_invalid_answers += 1\n", + "\n", + "print(f\"Number of invalid answers: {num_invalid_answers} / {len(data)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26fcb134773c4fbc9fbb60645895253c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_valid = [item for item in results if item[\"answer\"] is not None]\n", + "data_valid = sorted(data_valid, key=lambda x: x[\"metrics\"][\"mean_field\"])\n", + "\n", + "def item_to_df(idx: int) -> pd.DataFrame:\n", + " item = data_valid[idx]\n", + " return pd.DataFrame({\n", + " \"gold\": item[\"gold\"],\n", + " \"answer\": item[\"answer\"],\n", + " \"metrics\": 
item[\"metrics\"][\"field_chrf\"],\n", + " })\n", + "\n", + "\n", + "interact(item_to_df, idx=range(len(data_valid)));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "juddges", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}